Ausgabe der neuen DB Einträge

This commit is contained in:
hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@ -0,0 +1,442 @@
"""
Item Loader
See documentation in docs/topics/loaders.rst
"""
from contextlib import suppress
from itemadapter import ItemAdapter
from parsel.utils import extract_regex, flatten
from itemloaders.common import wrap_loader_context
from itemloaders.processors import Identity
from itemloaders.utils import arg_to_iter
def unbound_method(method):
"""
Allow to use single-argument functions as input or output processors
(no need to define an unused first 'self' argument)
"""
with suppress(AttributeError):
if '.' not in method.__qualname__:
return method.__func__
return method
class ItemLoader:
"""
Return a new Item Loader for populating the given item. If no item is
given, one is instantiated automatically using the class in
:attr:`default_item_class`.
When instantiated with a :param ``selector`` parameter the :class:`ItemLoader` class
provides convenient mechanisms for extracting data from web pages
using parsel_ selectors.
:param item: The item instance to populate using subsequent calls to
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
or :meth:`~ItemLoader.add_value`.
:type item: :class:`dict` object
:param selector: The selector to extract data from, when using the
:meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
(resp. :meth:`replace_css`) method.
:type selector: :class:`~parsel.selector.Selector` object
The item, selector and the remaining keyword arguments are
assigned to the Loader context (accessible through the :attr:`context` attribute).
.. attribute:: item
The item object being parsed by this Item Loader.
This is mostly used as a property so when attempting to override this
value, you may want to check out :attr:`default_item_class` first.
.. attribute:: context
The currently active :ref:`Context <loaders-context>` of this Item Loader.
Refer to <loaders-context> for more information about the Loader Context.
.. attribute:: default_item_class
An Item class (or factory), used to instantiate items when not given in
the ``__init__`` method.
.. warning:: Currently, this factory/class needs to be
callable/instantiated without any arguments.
If you are using ``dataclasses``, please consider the following
alternative::
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class Product:
name: Optional[str] = field(default=None)
price: Optional[float] = field(default=None)
.. attribute:: default_input_processor
The default input processor to use for those fields which don't specify
one.
.. attribute:: default_output_processor
The default output processor to use for those fields which don't specify
one.
.. attribute:: selector
The :class:`~parsel.selector.Selector` object to extract data from.
It's the selector given in the ``__init__`` method.
This attribute is meant to be read-only.
.. _parsel: https://parsel.readthedocs.io/en/latest/
"""
default_item_class = dict
default_input_processor = Identity()
default_output_processor = Identity()
def __init__(self, item=None, selector=None, parent=None, **context):
self.selector = selector
context.update(selector=selector)
if item is None:
item = self.default_item_class()
self._local_item = item
context['item'] = item
self.context = context
self.parent = parent
self._local_values = {}
# values from initial item
for field_name, value in ItemAdapter(item).items():
self._values.setdefault(field_name, [])
self._values[field_name] += arg_to_iter(value)
@property
def _values(self):
if self.parent is not None:
return self.parent._values
else:
return self._local_values
@property
def item(self):
if self.parent is not None:
return self.parent.item
else:
return self._local_item
def nested_xpath(self, xpath, **context):
"""
Create a nested loader with an xpath selector.
The supplied selector is applied relative to selector associated
with this :class:`ItemLoader`. The nested loader shares the item
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
"""
selector = self.selector.xpath(xpath)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
def nested_css(self, css, **context):
"""
Create a nested loader with a css selector.
The supplied selector is applied relative to selector associated
with this :class:`ItemLoader`. The nested loader shares the item
with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
:meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.
"""
selector = self.selector.css(css)
context.update(selector=selector)
subloader = self.__class__(
item=self.item, parent=self, **context
)
return subloader
def add_value(self, field_name, value, *processors, **kw):
"""
Process and then add the given ``value`` for the given field.
The value is first passed through :meth:`get_value` by giving the
``processors`` and ``kwargs``, and then passed through the
:ref:`field input processor <processors>` and its result
appended to the data collected for that field. If the field already
contains collected data, the new data is added.
The given ``field_name`` can be ``None``, in which case values for
multiple fields may be added. And the processed value should be a dict
with field_name mapped to values.
Examples::
loader.add_value('name', 'Color TV')
loader.add_value('colours', ['white', 'blue'])
loader.add_value('length', '100')
loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
loader.add_value(None, {'name': 'foo', 'sex': 'male'})
"""
value = self.get_value(value, *processors, **kw)
if value is None:
return
if not field_name:
for k, v in value.items():
self._add_value(k, v)
else:
self._add_value(field_name, value)
def replace_value(self, field_name, value, *processors, **kw):
"""
Similar to :meth:`add_value` but replaces the collected data with the
new value instead of adding it.
"""
value = self.get_value(value, *processors, **kw)
if value is None:
return
if not field_name:
for k, v in value.items():
self._replace_value(k, v)
else:
self._replace_value(field_name, value)
def _add_value(self, field_name, value):
value = arg_to_iter(value)
processed_value = self._process_input_value(field_name, value)
if processed_value:
self._values.setdefault(field_name, [])
self._values[field_name] += arg_to_iter(processed_value)
def _replace_value(self, field_name, value):
self._values.pop(field_name, None)
self._add_value(field_name, value)
def get_value(self, value, *processors, **kw):
"""
Process the given ``value`` by the given ``processors`` and keyword
arguments.
Available keyword arguments:
:param re: a regular expression to use for extracting data from the
given value using :func:`~parsel.utils.extract_regex` method,
applied before processors
:type re: str or typing.Pattern
Examples:
>>> from itemloaders import ItemLoader
>>> from itemloaders.processors import TakeFirst
>>> loader = ItemLoader()
>>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
'FOO'
"""
regex = kw.get('re', None)
if regex:
value = arg_to_iter(value)
value = flatten(extract_regex(regex, x) for x in value)
for proc in processors:
if value is None:
break
_proc = proc
proc = wrap_loader_context(proc, self.context)
try:
value = proc(value)
except Exception as e:
raise ValueError("Error with processor %s value=%r error='%s: %s'" %
(_proc.__class__.__name__, value,
type(e).__name__, str(e)))
return value
def load_item(self):
"""
Populate the item with the data collected so far, and return it. The
data collected is first passed through the :ref:`output processors
<processors>` to get the final value to assign to each item field.
"""
adapter = ItemAdapter(self.item)
for field_name in tuple(self._values):
value = self.get_output_value(field_name)
if value is not None:
adapter[field_name] = value
return adapter.item
def get_output_value(self, field_name):
"""
Return the collected values parsed using the output processor, for the
given field. This method doesn't populate or modify the item at all.
"""
proc = self.get_output_processor(field_name)
proc = wrap_loader_context(proc, self.context)
value = self._values.get(field_name, [])
try:
return proc(value)
except Exception as e:
raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
(field_name, value, type(e).__name__, str(e)))
def get_collected_values(self, field_name):
"""Return the collected values for the given field."""
return self._values.get(field_name, [])
def get_input_processor(self, field_name):
proc = getattr(self, '%s_in' % field_name, None)
if not proc:
proc = self._get_item_field_attr(
field_name,
'input_processor',
self.default_input_processor
)
return unbound_method(proc)
def get_output_processor(self, field_name):
proc = getattr(self, '%s_out' % field_name, None)
if not proc:
proc = self._get_item_field_attr(
field_name,
'output_processor',
self.default_output_processor
)
return unbound_method(proc)
def _get_item_field_attr(self, field_name, key, default=None):
field_meta = ItemAdapter(self.item).get_field_meta(field_name)
return field_meta.get(key, default)
def _process_input_value(self, field_name, value):
proc = self.get_input_processor(field_name)
_proc = proc
proc = wrap_loader_context(proc, self.context)
try:
return proc(value)
except Exception as e:
raise ValueError(
"Error with input processor %s: field=%r value=%r "
"error='%s: %s'" % (_proc.__class__.__name__, field_name,
value, type(e).__name__, str(e)))
def _check_selector_method(self):
if self.selector is None:
raise RuntimeError(
"To use XPath or CSS selectors, %s"
"must be instantiated with a selector" % self.__class__.__name__
)
def add_xpath(self, field_name, xpath, *processors, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
value, which is used to extract a list of strings from the
selector associated with this :class:`ItemLoader`.
See :meth:`get_xpath` for ``kwargs``.
:param xpath: the XPath to extract data from
:type xpath: str
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.add_xpath('name', '//p[@class="product-name"]')
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
"""
values = self._get_xpathvalues(xpath, **kw)
self.add_value(field_name, values, *processors, **kw)
def replace_xpath(self, field_name, xpath, *processors, **kw):
"""
Similar to :meth:`add_xpath` but replaces collected data instead of adding it.
"""
values = self._get_xpathvalues(xpath, **kw)
self.replace_value(field_name, values, *processors, **kw)
def get_xpath(self, xpath, *processors, **kw):
"""
Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
value, which is used to extract a list of unicode strings from the
selector associated with this :class:`ItemLoader`.
:param xpath: the XPath to extract data from
:type xpath: str
:param re: a regular expression to use for extracting data from the
selected XPath region
:type re: str or typing.Pattern
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.get_xpath('//p[@class="product-name"]')
# HTML snippet: <p id="price">the price is $1200</p>
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
"""
values = self._get_xpathvalues(xpath, **kw)
return self.get_value(values, *processors, **kw)
def _get_xpathvalues(self, xpaths, **kw):
self._check_selector_method()
xpaths = arg_to_iter(xpaths)
return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)
def add_css(self, field_name, css, *processors, **kw):
"""
Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
instead of a value, which is used to extract a list of unicode strings
from the selector associated with this :class:`ItemLoader`.
See :meth:`get_css` for ``kwargs``.
:param css: the CSS selector to extract data from
:type css: str
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.add_css('name', 'p.product-name')
# HTML snippet: <p id="price">the price is $1200</p>
loader.add_css('price', 'p#price', re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
self.add_value(field_name, values, *processors, **kw)
def replace_css(self, field_name, css, *processors, **kw):
"""
Similar to :meth:`add_css` but replaces collected data instead of adding it.
"""
values = self._get_cssvalues(css, **kw)
self.replace_value(field_name, values, *processors, **kw)
def get_css(self, css, *processors, **kw):
"""
Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
instead of a value, which is used to extract a list of unicode strings
from the selector associated with this :class:`ItemLoader`.
:param css: the CSS selector to extract data from
:type css: str
:param re: a regular expression to use for extracting data from the
selected CSS region
:type re: str or typing.Pattern
Examples::
# HTML snippet: <p class="product-name">Color TV</p>
loader.get_css('p.product-name')
# HTML snippet: <p id="price">the price is $1200</p>
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
"""
values = self._get_cssvalues(css, **kw)
return self.get_value(values, *processors, **kw)
def _get_cssvalues(self, csss, **kw):
self._check_selector_method()
csss = arg_to_iter(csss)
return flatten(self.selector.css(css).getall() for css in csss)

View file

@ -0,0 +1,14 @@
"""Common functions used in Item Loaders code"""
from functools import partial
from itemloaders.utils import get_func_args
def wrap_loader_context(function, context):
"""Wrap functions that receive loader_context to contain the context
"pre-loaded" and expose a interface that receives only one argument
"""
if 'loader_context' in get_func_args(function):
return partial(function, loader_context=context)
else:
return function

View file

@ -0,0 +1,233 @@
"""
This module provides some commonly used processors for Item Loaders.
See documentation in docs/topics/loaders.rst
"""
from collections import ChainMap
from itemloaders.utils import arg_to_iter
from itemloaders.common import wrap_loader_context
class MapCompose:
"""
A processor which is constructed from the composition of the given
functions, similar to the :class:`Compose` processor. The difference with
this processor is the way internal results are passed among functions,
which is as follows:
The input value of this processor is *iterated* and the first function is
applied to each element. The results of these function calls (one for each element)
are concatenated to construct a new iterable, which is then used to apply the
second function, and so on, until the last function is applied to each
value of the list of values collected so far. The output values of the last
function are concatenated together to produce the output of this processor.
Each particular function can return a value or a list of values, which is
flattened with the list of values returned by the same function applied to
the other input values. The functions can also return ``None`` in which
case the output of that function is ignored for further processing over the
chain.
This processor provides a convenient way to compose functions that only
work with single values (instead of iterables). For this reason the
:class:`MapCompose` processor is typically used as input processor, since
data is often extracted using the
:meth:`~parsel.selector.Selector.extract` method of `parsel selectors`_,
which returns a list of unicode strings.
The example below should clarify how it works:
>>> def filter_world(x):
... return None if x == 'world' else x
...
>>> from itemloaders.processors import MapCompose
>>> proc = MapCompose(filter_world, str.upper)
>>> proc(['hello', 'world', 'this', 'is', 'something'])
['HELLO', 'THIS', 'IS', 'SOMETHING']
As with the Compose processor, functions can receive Loader contexts, and
``__init__`` method keyword arguments are used as default context values.
See :class:`Compose` processor for more info.
.. _`parsel selectors`: https://parsel.readthedocs.io/en/latest/parsel.html#parsel.selector.Selector.extract
"""
def __init__(self, *functions, **default_loader_context):
self.functions = functions
self.default_loader_context = default_loader_context
def __call__(self, value, loader_context=None):
values = arg_to_iter(value)
if loader_context:
context = ChainMap(loader_context, self.default_loader_context)
else:
context = self.default_loader_context
wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
for func in wrapped_funcs:
next_values = []
for v in values:
try:
next_values += arg_to_iter(func(v))
except Exception as e:
raise ValueError("Error in MapCompose with "
"%s value=%r error='%s: %s'" %
(str(func), value, type(e).__name__,
str(e)))
values = next_values
return values
class Compose:
"""
A processor which is constructed from the composition of the given
functions. This means that each input value of this processor is passed to
the first function, and the result of that function is passed to the second
function, and so on, until the last function returns the output value of
this processor.
By default, stop process on ``None`` value. This behaviour can be changed by
passing keyword argument ``stop_on_none=False``.
Example:
>>> from itemloaders.processors import Compose
>>> proc = Compose(lambda v: v[0], str.upper)
>>> proc(['hello', 'world'])
'HELLO'
Each function can optionally receive a ``loader_context`` parameter. For
those which do, this processor will pass the currently active :ref:`Loader
context <loaders-context>` through that parameter.
The keyword arguments passed in the ``__init__`` method are used as the default
Loader context values passed to each function call. However, the final
Loader context values passed to functions are overridden with the currently
active Loader context accessible through the :attr:`ItemLoader.context
<itemloaders.ItemLoader.context>` attribute.
"""
def __init__(self, *functions, **default_loader_context):
self.functions = functions
self.stop_on_none = default_loader_context.get('stop_on_none', True)
self.default_loader_context = default_loader_context
def __call__(self, value, loader_context=None):
if loader_context:
context = ChainMap(loader_context, self.default_loader_context)
else:
context = self.default_loader_context
wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
for func in wrapped_funcs:
if value is None and self.stop_on_none:
break
try:
value = func(value)
except Exception as e:
raise ValueError("Error in Compose with "
"%s value=%r error='%s: %s'" %
(str(func), value, type(e).__name__, str(e)))
return value
class TakeFirst:
"""
Returns the first non-null/non-empty value from the values received,
so it's typically used as an output processor to single-valued fields.
It doesn't receive any ``__init__`` method arguments, nor does it accept Loader contexts.
Example:
>>> from itemloaders.processors import TakeFirst
>>> proc = TakeFirst()
>>> proc(['', 'one', 'two', 'three'])
'one'
"""
def __call__(self, values):
for value in values:
if value is not None and value != '':
return value
class Identity:
"""
The simplest processor, which doesn't do anything. It returns the original
values unchanged. It doesn't receive any ``__init__`` method arguments, nor does it
accept Loader contexts.
Example:
>>> from itemloaders.processors import Identity
>>> proc = Identity()
>>> proc(['one', 'two', 'three'])
['one', 'two', 'three']
"""
def __call__(self, values):
return values
class SelectJmes:
"""
Query the input string for the jmespath (given at instantiation), and return the answer
Requires : jmespath(https://github.com/jmespath/jmespath)
Note: SelectJmes accepts only one input element at a time.
Example:
>>> from itemloaders.processors import SelectJmes, Compose, MapCompose
>>> proc = SelectJmes("foo") #for direct use on lists and dictionaries
>>> proc({'foo': 'bar'})
'bar'
>>> proc({'foo': {'bar': 'baz'}})
{'bar': 'baz'}
Working with Json:
>>> import json
>>> proc_single_json_str = Compose(json.loads, SelectJmes("foo"))
>>> proc_single_json_str('{"foo": "bar"}')
'bar'
>>> proc_json_list = Compose(json.loads, MapCompose(SelectJmes('foo')))
>>> proc_json_list('[{"foo":"bar"}, {"baz":"tar"}]')
['bar']
"""
def __init__(self, json_path):
self.json_path = json_path
import jmespath
self.compiled_path = jmespath.compile(self.json_path)
def __call__(self, value):
"""Query value for the jmespath query and return answer
:param value: a data structure (dict, list) to extract from
:return: Element extracted according to jmespath query
"""
return self.compiled_path.search(value)
class Join:
"""
Returns the values joined with the separator given in the ``__init__`` method, which
defaults to ``' '``. It doesn't accept Loader contexts.
When using the default separator, this processor is equivalent to the
function: ``' '.join``
Examples:
>>> from itemloaders.processors import Join
>>> proc = Join()
>>> proc(['one', 'two', 'three'])
'one two three'
>>> proc = Join('<br>')
>>> proc(['one', 'two', 'three'])
'one<br>two<br>three'
"""
def __init__(self, separator=' '):
self.separator = separator
def __call__(self, values):
return self.separator.join(values)

View file

@ -0,0 +1,73 @@
"""
Copy/paste from scrapy source at the moment, to ensure tests are working.
Refactoring to come later
"""
import inspect
from functools import partial
from itemadapter import is_item
_ITERABLE_SINGLE_VALUES = str, bytes
def arg_to_iter(arg):
"""Convert an argument to an iterable. The argument can be a None, single
value, or an iterable.
Exception: if arg is a dict, [arg] will be returned
"""
if arg is None:
return []
elif (
hasattr(arg, '__iter__')
and not isinstance(arg, _ITERABLE_SINGLE_VALUES)
and not is_item(arg)
):
return arg
else:
return [arg]
def get_func_args(func, stripself=False):
"""Return the argument name list of a callable"""
if inspect.isfunction(func):
func_args, _, _, _ = _getargspec_py23(func)
elif inspect.isclass(func):
return get_func_args(func.__init__, True)
elif inspect.ismethod(func):
return get_func_args(func.__func__, True)
elif inspect.ismethoddescriptor(func):
return []
elif isinstance(func, partial):
return [x for x in get_func_args(func.func)[len(func.args):]
if not (func.keywords and x in func.keywords)]
elif hasattr(func, '__call__'):
if inspect.isroutine(func):
return []
elif getattr(func, '__name__', None) == '__call__':
return []
else:
return get_func_args(func.__call__, True)
else:
raise TypeError('%s is not callable' % type(func))
if stripself:
func_args.pop(0)
return func_args
def _getargspec_py23(func):
"""_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
defaults)
Was identical to inspect.getargspec() in python2, but uses
inspect.getfullargspec() for python3 behind the scenes to avoid
DeprecationWarning.
>>> def f(a, b=2, *ar, **kw):
... pass
>>> _getargspec_py23(f)
ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
"""
return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])