Ausgabe der neuen DB Einträge
This commit is contained in:
parent
bad48e1627
commit
cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
442
venv/lib/python3.9/site-packages/itemloaders/__init__.py
Normal file
442
venv/lib/python3.9/site-packages/itemloaders/__init__.py
Normal file
|
|
@ -0,0 +1,442 @@
|
|||
"""
|
||||
Item Loader
|
||||
|
||||
See documentation in docs/topics/loaders.rst
|
||||
"""
|
||||
from contextlib import suppress
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from parsel.utils import extract_regex, flatten
|
||||
|
||||
from itemloaders.common import wrap_loader_context
|
||||
from itemloaders.processors import Identity
|
||||
from itemloaders.utils import arg_to_iter
|
||||
|
||||
|
||||
def unbound_method(method):
    """
    Return the underlying function of ``method`` when its qualified name
    contains no dot; otherwise return ``method`` unchanged.

    This allows single-argument functions to be used as input or output
    processors without declaring an unused first ``self`` argument.
    Objects lacking ``__qualname__`` or ``__func__`` are returned as they are.
    """
    try:
        qualified_name = method.__qualname__
        if '.' not in qualified_name:
            return method.__func__
    except AttributeError:
        # No qualified name or no wrapped function: nothing to unwrap.
        pass
    return method
|
||||
|
||||
|
||||
class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter the :class:`ItemLoader` class
    provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    :param parent: An optional loader whose item and collected values this
        loader shares; set by :meth:`nested_xpath` and :meth:`nested_css`.
    :type parent: :class:`ItemLoader` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.
        Refer to <loaders-context> for more information about the Loader Context.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    default_item_class = dict
    default_input_processor = Identity()
    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, parent=None, **context):
        self.selector = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context['item'] = item
        self.context = context
        self.parent = parent
        self._local_values = {}
        # values from initial item
        for field_name, value in ItemAdapter(item).items():
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(value)

    @property
    def _values(self):
        """Collected values: the parent's when a parent is set, else our own."""
        if self.parent is not None:
            return self.parent._values
        else:
            return self._local_values

    @property
    def item(self):
        """Item being populated: the parent's when a parent is set, else our own."""
        if self.parent is not None:
            return self.parent.item
        else:
            return self._local_item

    def nested_xpath(self, xpath, **context):
        """
        Create a nested loader with an xpath selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
        """
        selector = self.selector.xpath(xpath)
        context.update(selector=selector)
        subloader = self.__class__(
            item=self.item, parent=self, **context
        )
        return subloader

    def nested_css(self, css, **context):
        """
        Create a nested loader with a css selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
        """
        selector = self.selector.css(css)
        context.update(selector=selector)
        subloader = self.__class__(
            item=self.item, parent=self, **context
        )
        return subloader

    def add_value(self, field_name, value, *processors, **kw):
        """
        Process and then add the given ``value`` for the given field.

        The value is first passed through :meth:`get_value` by giving the
        ``processors`` and ``kwargs``, and then passed through the
        :ref:`field input processor <processors>` and its result
        appended to the data collected for that field. If the field already
        contains collected data, the new data is added.

        The given ``field_name`` can be ``None``, in which case values for
        multiple fields may be added. And the processed value should be a dict
        with field_name mapped to values.

        Examples::

            loader.add_value('name', 'Color TV')
            loader.add_value('colours', ['white', 'blue'])
            loader.add_value('length', '100')
            loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
            loader.add_value(None, {'name': 'foo', 'sex': 'male'})
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)

    def replace_value(self, field_name, value, *processors, **kw):
        """
        Similar to :meth:`add_value` but replaces the collected data with the
        new value instead of adding it.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)

    def _add_value(self, field_name, value):
        """Run ``value`` through the field's input processor and append the
        result to the collected values; falsy results are not stored."""
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name, value):
        """Drop whatever was collected for ``field_name``, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(self, value, *processors, **kw):
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern

        :raises ValueError: if any processor raises an exception

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        regex = kw.get('re', None)
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            if value is None:
                break
            # Keep the unwrapped processor so the error message names it.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                raise ValueError("Error with processor %s value=%r error='%s: %s'" %
                                 (_proc.__class__.__name__, value,
                                  type(e).__name__, str(e)))
        return value

    def load_item(self):
        """
        Populate the item with the data collected so far, and return it. The
        data collected is first passed through the :ref:`output processors
        <processors>` to get the final value to assign to each item field.
        Fields whose output value is ``None`` are not assigned.
        """
        adapter = ItemAdapter(self.item)
        # Iterate over a snapshot of the field names.
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            if value is not None:
                adapter[field_name] = value

        return adapter.item

    def get_output_value(self, field_name):
        """
        Return the collected values parsed using the output processor, for the
        given field. This method doesn't populate or modify the item at all.

        :raises ValueError: if the output processor raises an exception
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        value = self._values.get(field_name, [])
        try:
            return proc(value)
        except Exception as e:
            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
                             (field_name, value, type(e).__name__, str(e)))

    def get_collected_values(self, field_name):
        """Return the collected values for the given field."""
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name):
        """Return the input processor for ``field_name``.

        Lookup order: a ``<field_name>_in`` attribute of the loader, then the
        ``input_processor`` entry of the item's field metadata, then
        :attr:`default_input_processor`.
        """
        proc = getattr(self, '%s_in' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'input_processor',
                self.default_input_processor
            )
        return unbound_method(proc)

    def get_output_processor(self, field_name):
        """Return the output processor for ``field_name``.

        Lookup order: a ``<field_name>_out`` attribute of the loader, then the
        ``output_processor`` entry of the item's field metadata, then
        :attr:`default_output_processor`.
        """
        proc = getattr(self, '%s_out' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'output_processor',
                self.default_output_processor
            )
        return unbound_method(proc)

    def _get_item_field_attr(self, field_name, key, default=None):
        """Return ``key`` from the item's metadata for ``field_name``, or ``default``."""
        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
        return field_meta.get(key, default)

    def _process_input_value(self, field_name, value):
        """Apply the field's input processor to ``value``.

        :raises ValueError: if the input processor raises an exception
        """
        proc = self.get_input_processor(field_name)
        # Keep the unwrapped processor so the error message names it.
        _proc = proc
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with input processor %s: field=%r value=%r "
                "error='%s: %s'" % (_proc.__class__.__name__, field_name,
                                    value, type(e).__name__, str(e)))

    def _check_selector_method(self):
        """Raise :exc:`RuntimeError` when the loader has no selector."""
        if self.selector is None:
            # The trailing space after %s keeps the class name separated from
            # "must" in the rendered message.
            raise RuntimeError(
                "To use XPath or CSS selectors, %s "
                "must be instantiated with a selector" % self.__class__.__name__
            )

    def add_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`add_xpath` but replaces collected data instead of adding it.
        """
        values = self._get_xpathvalues(xpath, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_xpath(self, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
        value, which is used to extract a list of unicode strings from the
        selector associated with this :class:`ItemLoader`.

        :param xpath: the XPath to extract data from
        :type xpath: str

        :param re: a regular expression to use for extracting data from the
            selected XPath region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_xpath('//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_xpathvalues(self, xpaths, **kw):
        """Return the flattened ``getall()`` results of every given XPath.

        ``kw`` is accepted but not used here.
        """
        self._check_selector_method()
        xpaths = arg_to_iter(xpaths)
        return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)

    def add_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_css` for ``kwargs``.

        :param css: the CSS selector to extract data from
        :type css: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_css('name', 'p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_css('price', 'p#price', re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`add_css` but replaces collected data instead of adding it.
        """
        values = self._get_cssvalues(css, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_css(self, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param css: the CSS selector to extract data from
        :type css: str

        :param re: a regular expression to use for extracting data from the
            selected CSS region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_css('p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_cssvalues(self, csss, **kw):
        """Return the flattened ``getall()`` results of every given CSS selector.

        ``kw`` is accepted but not used here.
        """
        self._check_selector_method()
        csss = arg_to_iter(csss)
        return flatten(self.selector.css(css).getall() for css in csss)
|
||||
14
venv/lib/python3.9/site-packages/itemloaders/common.py
Normal file
14
venv/lib/python3.9/site-packages/itemloaders/common.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
"""Common functions used in Item Loaders code"""
|
||||
|
||||
from functools import partial
|
||||
from itemloaders.utils import get_func_args
|
||||
|
||||
|
||||
def wrap_loader_context(function, context):
    """Pre-load ``context`` into functions that take a ``loader_context``.

    If ``loader_context`` is among the argument names reported by
    ``get_func_args`` for ``function``, a ``functools.partial`` with
    ``loader_context=context`` already bound is returned. Any other
    function is returned unchanged.
    """
    accepts_context = 'loader_context' in get_func_args(function)
    if not accepts_context:
        return function
    return partial(function, loader_context=context)
|
||||
233
venv/lib/python3.9/site-packages/itemloaders/processors.py
Normal file
233
venv/lib/python3.9/site-packages/itemloaders/processors.py
Normal file
|
|
@ -0,0 +1,233 @@
|
|||
"""
|
||||
This module provides some commonly used processors for Item Loaders.
|
||||
|
||||
See documentation in docs/topics/loaders.rst
|
||||
"""
|
||||
from collections import ChainMap
|
||||
|
||||
from itemloaders.utils import arg_to_iter
|
||||
from itemloaders.common import wrap_loader_context
|
||||
|
||||
|
||||
class MapCompose:
    """
    A processor built from the composition of the given functions, much like
    the :class:`Compose` processor. What differs is how intermediate results
    travel from one function to the next:

    The input value of this processor is *iterated* and the first function is
    applied to each element. The results of those calls (one per element) are
    concatenated into a new iterable, which is then fed element by element to
    the second function, and so on until the last function has been applied
    to every value collected so far. The outputs of the last function are
    concatenated to form the output of this processor.

    Each function may return a single value or a list of values; lists are
    flattened together with the results the same function produced for the
    other input values. A function may also return ``None``, in which case
    that result is dropped from further processing along the chain.

    This processor is a convenient way to compose functions that only work
    with single values (instead of iterables). For this reason the
    :class:`MapCompose` processor is typically used as input processor, since
    data is often extracted using the
    :meth:`~parsel.selector.Selector.extract` method of `parsel selectors`_,
    which returns a list of unicode strings.

    The example below should clarify how it works:

    >>> def filter_world(x):
    ...     return None if x == 'world' else x
    ...
    >>> from itemloaders.processors import MapCompose
    >>> proc = MapCompose(filter_world, str.upper)
    >>> proc(['hello', 'world', 'this', 'is', 'something'])
    ['HELLO', 'THIS', 'IS', 'SOMETHING']

    As with the Compose processor, functions can receive Loader contexts, and
    ``__init__`` method keyword arguments are used as default context values.
    See :class:`Compose` processor for more info.

    .. _`parsel selectors`: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.selector.Selector.extract
    """

    def __init__(self, *functions, **default_loader_context):
        self.default_loader_context = default_loader_context
        self.functions = functions

    def __call__(self, value, loader_context=None):
        """Map every function, in order, over the (growing) list of values.

        :raises ValueError: if any function raises an exception
        """
        current = arg_to_iter(value)
        context = self.default_loader_context
        if loader_context:
            # The active loader context takes precedence over the defaults.
            context = ChainMap(loader_context, context)
        # All functions are wrapped up front, before any of them is called.
        stages = [wrap_loader_context(f, context) for f in self.functions]
        for func in stages:
            collected = []
            for element in current:
                try:
                    collected += arg_to_iter(func(element))
                except Exception as exc:
                    raise ValueError("Error in MapCompose with "
                                     "%s value=%r error='%s: %s'" %
                                     (str(func), value, type(exc).__name__,
                                      str(exc)))
            current = collected
        return current
|
||||
|
||||
|
||||
class Compose:
    """
    A processor built from the composition of the given functions: the input
    value is handed to the first function, that function's result is handed
    to the second one, and so on, until the last function returns the output
    value of this processor.

    By default, processing stops on a ``None`` value. This behaviour can be
    changed by passing keyword argument ``stop_on_none=False``.

    Example:

    >>> from itemloaders.processors import Compose
    >>> proc = Compose(lambda v: v[0], str.upper)
    >>> proc(['hello', 'world'])
    'HELLO'

    Each function can optionally receive a ``loader_context`` parameter. For
    those which do, this processor will pass the currently active :ref:`Loader
    context <loaders-context>` through that parameter.

    The keyword arguments passed in the ``__init__`` method are used as the default
    Loader context values passed to each function call. However, the final
    Loader context values passed to functions are overridden with the currently
    active Loader context accessible through the :attr:`ItemLoader.context
    <itemloaders.ItemLoader.context>` attribute.
    """

    def __init__(self, *functions, **default_loader_context):
        self.default_loader_context = default_loader_context
        self.functions = functions
        self.stop_on_none = default_loader_context.get('stop_on_none', True)

    def __call__(self, value, loader_context=None):
        """Thread ``value`` through every function in order and return the result.

        :raises ValueError: if any function raises an exception
        """
        context = self.default_loader_context
        if loader_context:
            # The active loader context takes precedence over the defaults.
            context = ChainMap(loader_context, context)
        # All functions are wrapped up front, before any of them is called.
        stages = [wrap_loader_context(f, context) for f in self.functions]
        for func in stages:
            if self.stop_on_none and value is None:
                break
            try:
                value = func(value)
            except Exception as exc:
                raise ValueError("Error in Compose with "
                                 "%s value=%r error='%s: %s'" %
                                 (str(func), value, type(exc).__name__, str(exc)))
        return value
|
||||
|
||||
|
||||
class TakeFirst:
    """
    Return the first value that is neither ``None`` nor the empty string
    from the values received, which makes it a typical output processor for
    single-valued fields. It takes no ``__init__`` method arguments and does
    not accept Loader contexts.

    Example:

    >>> from itemloaders.processors import TakeFirst
    >>> proc = TakeFirst()
    >>> proc(['', 'one', 'two', 'three'])
    'one'
    """

    def __call__(self, values):
        """Return the first usable value, or ``None`` when there is none."""
        usable = (
            candidate for candidate in values
            if candidate is not None and candidate != ''
        )
        return next(usable, None)
|
||||
|
||||
|
||||
class Identity:
    """
    The simplest processor: it hands back the values it receives, untouched.
    It takes no ``__init__`` method arguments and does not accept Loader
    contexts.

    Example:

    >>> from itemloaders.processors import Identity
    >>> proc = Identity()
    >>> proc(['one', 'two', 'three'])
    ['one', 'two', 'three']
    """

    def __call__(self, values):
        """Return ``values`` itself (the same object, not a copy)."""
        return values
|
||||
|
||||
|
||||
class SelectJmes:
    """
    Query the input for the jmespath expression given at instantiation and
    return the answer.

    Requires : jmespath(https://github.com/jmespath/jmespath)
    Note: SelectJmes accepts only one input element at a time.

    Example:

    >>> from itemloaders.processors import SelectJmes, Compose, MapCompose
    >>> proc = SelectJmes("foo") #for direct use on lists and dictionaries
    >>> proc({'foo': 'bar'})
    'bar'
    >>> proc({'foo': {'bar': 'baz'}})
    {'bar': 'baz'}

    Working with Json:

    >>> import json
    >>> proc_single_json_str = Compose(json.loads, SelectJmes("foo"))
    >>> proc_single_json_str('{"foo": "bar"}')
    'bar'
    >>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('foo')))
    >>> proc_json_list('[{"foo":"bar"}, {"baz":"tar"}]')
    ['bar']
    """

    def __init__(self, json_path):
        # Imported here rather than at module level, so this module can be
        # imported without jmespath being installed.
        import jmespath

        self.json_path = json_path
        self.compiled_path = jmespath.compile(json_path)

    def __call__(self, value):
        """Run the compiled jmespath query against ``value``.

        :param value: a data structure (dict, list) to extract from
        :return: Element extracted according to jmespath query
        """
        compiled = self.compiled_path
        return compiled.search(value)
|
||||
|
||||
|
||||
class Join:
    """
    Return the values joined with the separator given in the ``__init__``
    method, ``' '`` by default. It does not accept Loader contexts.

    With the default separator this processor is equivalent to the
    function: ``' '.join``

    Examples:

    >>> from itemloaders.processors import Join
    >>> proc = Join()
    >>> proc(['one', 'two', 'three'])
    'one two three'
    >>> proc = Join('<br>')
    >>> proc(['one', 'two', 'three'])
    'one<br>two<br>three'
    """

    def __init__(self, separator=' '):
        self.separator = separator

    def __call__(self, values):
        """Return ``values`` joined by the configured separator."""
        glue = self.separator
        return glue.join(values)
|
||||
73
venv/lib/python3.9/site-packages/itemloaders/utils.py
Normal file
73
venv/lib/python3.9/site-packages/itemloaders/utils.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""
|
||||
Copy/paste from scrapy source at the moment, to ensure tests are working.
|
||||
Refactoring to come later
|
||||
"""
|
||||
import inspect
|
||||
from functools import partial
|
||||
|
||||
from itemadapter import is_item
|
||||
|
||||
|
||||
# Iterable types that arg_to_iter() still treats as one single value.
_ITERABLE_SINGLE_VALUES = str, bytes
|
||||
|
||||
|
||||
def arg_to_iter(arg):
    """Convert an argument to an iterable.

    ``None`` becomes an empty list. An object with ``__iter__`` is returned
    as is, unless it is one of the ``_ITERABLE_SINGLE_VALUES`` types or
    ``is_item`` reports it as an item. Everything else is wrapped in a
    one-element list.

    Exception: if arg is a dict, [arg] will be returned
    """
    if arg is None:
        return []
    is_multi_valued = (
        hasattr(arg, '__iter__')
        and not isinstance(arg, _ITERABLE_SINGLE_VALUES)
        and not is_item(arg)
    )
    return arg if is_multi_valued else [arg]
|
||||
|
||||
|
||||
def get_func_args(func, stripself=False):
    """Return the argument name list of a callable.

    :param func: a function, class, bound method, ``functools.partial``
        or any other callable object
    :param stripself: when true, drop the first argument name of a plain
        function (used internally to discard ``self``)
    :return: list of positional-or-keyword argument names; an empty list
        for method descriptors and other routines that are not plain
        Python functions
    :raises TypeError: if ``func`` is not callable
    """
    if inspect.isfunction(func):
        # Read the names straight from getfullargspec(): the ArgSpec tuple
        # previously built on the way no longer exists in inspect on
        # Python 3.11+.
        func_args = list(inspect.getfullargspec(func).args)
    elif inspect.isclass(func):
        return get_func_args(func.__init__, True)
    elif inspect.ismethod(func):
        return get_func_args(func.__func__, True)
    elif isinstance(func, partial):
        # Checked before ismethoddescriptor(): newer Python versions give
        # partial objects a __get__, which would otherwise make them look
        # like argument-less method descriptors.
        return [x for x in get_func_args(func.func)[len(func.args):]
                if not (func.keywords and x in func.keywords)]
    elif inspect.ismethoddescriptor(func):
        return []
    elif hasattr(func, '__call__'):
        if inspect.isroutine(func):
            return []
        elif getattr(func, '__name__', None) == '__call__':
            return []
        else:
            return get_func_args(func.__call__, True)
    else:
        raise TypeError('%s is not callable' % type(func))
    # Guard against functions that declare no named arguments at all.
    if stripself and func_args:
        func_args.pop(0)
    return func_args
|
||||
|
||||
|
||||
def _getargspec_py23(func):
|
||||
"""_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
|
||||
defaults)
|
||||
|
||||
Was identical to inspect.getargspec() in python2, but uses
|
||||
inspect.getfullargspec() for python3 behind the scenes to avoid
|
||||
DeprecationWarning.
|
||||
|
||||
>>> def f(a, b=2, *ar, **kw):
|
||||
... pass
|
||||
|
||||
>>> _getargspec_py23(f)
|
||||
ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
|
||||
"""
|
||||
return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])
|
||||
Loading…
Add table
Add a link
Reference in a new issue