Ausgabe der neuen DB Einträge

This commit is contained in:
hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@ -0,0 +1,14 @@
"""
Parsel lets you extract text from XML/HTML documents using XPath
or CSS selectors
"""
__author__ = 'Scrapy project'
__email__ = 'info@scrapy.org'
__version__ = '1.6.0'
from parsel.selector import Selector, SelectorList # NOQA
from parsel.csstranslator import css2xpath # NOQA
from parsel import xpathfuncs # NOQA
xpathfuncs.setup()

View file

@ -0,0 +1,115 @@
# lru_cache is in the stdlib on Python 3; on Python 2 fall back to the
# functools32 backport package.
try:
    from functools import lru_cache
except ImportError:
    from functools32 import lru_cache

from cssselect import GenericTranslator as OriginalGenericTranslator
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect.xpath import _unicode_safe_getattr, ExpressionError
from cssselect.parser import FunctionalPseudoElement
class XPathExpr(OriginalXPathExpr):
    """cssselect XPath expression extended to target text nodes
    (``::text``) or attribute values (``::attr(name)``)."""

    # Extra state carried alongside the base cssselect expression:
    textnode = False   # select text() nodes instead of elements
    attribute = None   # attribute name to select via /@name

    @classmethod
    def from_xpath(cls, xpath, textnode=False, attribute=None):
        """Clone *xpath* into this class, attaching the pseudo-element flags."""
        expr = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
        expr.textnode = textnode
        expr.attribute = attribute
        return expr

    def __str__(self):
        result = super(XPathExpr, self).__str__()
        if self.textnode:
            if result == '*':
                result = 'text()'
            elif result.endswith('::*/*'):
                result = result[:-3] + 'text()'
            else:
                result += '/text()'
        if self.attribute is not None:
            if result.endswith('::*/*'):
                result = result[:-2]
            result += '/@%s' % self.attribute
        return result

    def join(self, combiner, other):
        # Combine paths as cssselect does, then propagate the flags from
        # the right-hand expression.
        super(XPathExpr, self).join(combiner, other)
        self.textnode = other.textnode
        self.attribute = other.attribute
        return self
class TranslatorMixin(object):
    """This mixin adds support to CSS pseudo elements via dynamic dispatch.

    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.
    """

    def xpath_element(self, selector):
        # Wrap the translated expression in our richer XPathExpr so that
        # pseudo-element flags can be attached later.
        base = super(TranslatorMixin, self).xpath_element(selector)
        return XPathExpr.from_xpath(base)

    def xpath_pseudo_element(self, xpath, pseudo_element):
        """
        Dispatch method that transforms XPath to support pseudo-element
        """
        if isinstance(pseudo_element, FunctionalPseudoElement):
            # Functional form, e.g. ::attr(href) -> xpath_attr_functional_pseudo_element
            method_name = 'xpath_%s_functional_pseudo_element' % (
                pseudo_element.name.replace('-', '_'))
            handler = _unicode_safe_getattr(self, method_name, None)
            if not handler:
                raise ExpressionError(
                    "The functional pseudo-element ::%s() is unknown"
                    % pseudo_element.name)
            return handler(xpath, pseudo_element)
        # Simple form, e.g. ::text -> xpath_text_simple_pseudo_element
        method_name = 'xpath_%s_simple_pseudo_element' % (
            pseudo_element.replace('-', '_'))
        handler = _unicode_safe_getattr(self, method_name, None)
        if not handler:
            raise ExpressionError(
                "The pseudo-element ::%s is unknown"
                % pseudo_element)
        return handler(xpath)

    def xpath_attr_functional_pseudo_element(self, xpath, function):
        """Support selecting attribute values using ::attr() pseudo-element
        """
        if function.argument_types() not in (['STRING'], ['IDENT']):
            raise ExpressionError(
                "Expected a single string or ident for ::attr(), got %r"
                % function.arguments)
        return XPathExpr.from_xpath(xpath,
                                    attribute=function.arguments[0].value)

    def xpath_text_simple_pseudo_element(self, xpath):
        """Support selecting text nodes using ::text pseudo-element"""
        return XPathExpr.from_xpath(xpath, textnode=True)
class GenericTranslator(TranslatorMixin, OriginalGenericTranslator):
    """CSS-to-XPath translator for generic XML documents."""

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime; acceptable here because
    # translators are created as long-lived shared instances.
    @lru_cache(maxsize=256)
    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        # Memoized: repeated identical CSS queries skip re-translation.
        return super(GenericTranslator, self).css_to_xpath(css, prefix)
class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
    """CSS-to-XPath translator with HTML-specific semantics."""

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime; acceptable here because
    # translators are created as long-lived shared instances.
    @lru_cache(maxsize=256)
    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        # Memoized: repeated identical CSS queries skip re-translation.
        return super(HTMLTranslator, self).css_to_xpath(css, prefix)
# Shared translator backing the css2xpath() convenience function below.
_translator = HTMLTranslator()


def css2xpath(query):
    "Return translated XPath version of a given CSS query"
    return _translator.css_to_xpath(query)

View file

@ -0,0 +1,405 @@
"""
XPath selectors based on lxml
"""
import sys
import six
from lxml import etree, html
from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator
class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``Selector.remove()`` when the selection wraps something
    that is not an lxml element (e.g. a string produced by a ``::text`` /
    ``text()`` selection) and therefore has no tree to remove it from."""
    pass
class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``Selector.remove()`` when the element has no parent
    (e.g. it is the document root) and thus cannot be detached."""
    pass
class SafeXMLParser(etree.XMLParser):
    """XMLParser that disables entity resolution by default, mitigating
    XML entity-expansion / external-entity (XXE) style attacks."""

    def __init__(self, *args, **kwargs):
        # setdefault: callers may still opt back in to entity resolution.
        kwargs.setdefault('resolve_entities', False)
        super(SafeXMLParser, self).__init__(*args, **kwargs)
# Per-selector-type configuration: the lxml parser class, the CSS-to-XPath
# translator instance and the lxml serialization method used by .get().
_ctgroup = {
    'html': {'_parser': html.HTMLParser,
             '_csstranslator': HTMLTranslator(),
             '_tostring_method': 'html'},
    'xml': {'_parser': SafeXMLParser,
            '_csstranslator': GenericTranslator(),
            '_tostring_method': 'xml'},
}
def _st(st):
    """Validate a selector type string, defaulting ``None`` to ``'html'``.

    Raises ValueError for anything not listed in ``_ctgroup``.
    """
    if st is None:
        return 'html'
    if st in _ctgroup:
        return st
    raise ValueError('Invalid type: %s' % st)
def create_root_node(text, parser_cls, base_url=None):
    """Create root node for text using given parser class.
    """
    # NUL bytes would make lxml fail; an empty/whitespace-only document
    # falls back to a minimal <html/> body.
    cleaned = text.strip().replace('\x00', '').encode('utf8')
    body = cleaned if cleaned else b'<html/>'
    parser = parser_cls(recover=True, encoding='utf8')
    root = etree.fromstring(body, parser=parser, base_url=base_url)
    if root is None:
        # Even with recover=True the parser may produce no tree at all;
        # retry with the minimal document so callers always get a root.
        root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
    return root
class SelectorList(list):
    """
    The :class:`SelectorList` class is a subclass of the builtin ``list``
    class, which provides a few additional methods.
    """

    # __getslice__ is deprecated but `list` builtin implements it only in Py2
    def __getslice__(self, i, j):
        o = super(SelectorList, self).__getslice__(i, j)
        return self.__class__(o)

    def __getitem__(self, pos):
        # Slicing yields another SelectorList; single indexing yields the item.
        o = super(SelectorList, self).__getitem__(pos)
        return self.__class__(o) if isinstance(pos, slice) else o

    def __getstate__(self):
        # Contained selectors wrap lxml nodes, which cannot be pickled.
        raise TypeError("can't pickle SelectorList objects")

    def xpath(self, xpath, namespaces=None, **kwargs):
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results flattened as another :class:`SelectorList`.

        ``xpath`` is the same argument as the one in :meth:`Selector.xpath`

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
        Contrary to ``register_namespace()``, these prefixes are not
        saved for future calls.

        Any additional named arguments can be used to pass values for XPath
        variables in the XPath expression, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")
        """
        return self.__class__(flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]))

    def css(self, query):
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`SelectorList`.

        ``query`` is the same argument as the one in :meth:`Selector.css`
        """
        return self.__class__(flatten([x.css(query) for x in self]))

    def re(self, regex, replace_entities=True):
        """
        Call the ``.re()`` method for each element in this list and return
        their results flattened, as a list of unicode strings.

        By default, character entity references are replaced by their
        corresponding character (except for ``&amp;`` and ``&lt;``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return flatten([x.re(regex, replace_entities=replace_entities) for x in self])

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Call the ``.re()`` method for the first element in this list and
        return the result in an unicode string. If the list is empty or the
        regex doesn't match anything, return the default value (``None`` if
        the argument is not provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&amp;`` and ``&lt;``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        # Iterate lazily and stop at the first produced match.
        for el in iflatten(x.re(regex, replace_entities=replace_entities) for x in self):
            return el
        return default

    def getall(self):
        """
        Call the ``.get()`` method for each element in this list and return
        their results flattened, as a list of unicode strings.
        """
        return [x.get() for x in self]
    extract = getall

    def get(self, default=None):
        """
        Return the result of ``.get()`` for the first element in this list.
        If the list is empty, return the default value.
        """
        for x in self:
            return x.get()
        return default
    extract_first = get

    @property
    def attrib(self):
        """Return the attributes dictionary for the first element.
        If the list is empty, return an empty dict.
        """
        for x in self:
            return x.attrib
        return {}

    def remove(self):
        """
        Remove matched nodes from the parent for each element in this list.
        """
        for x in self:
            x.remove()
class Selector(object):
    """
    :class:`Selector` allows you to select parts of an XML or HTML text using CSS
    or XPath expressions and extract data from it.

    ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3

    ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
    If ``type`` is ``None``, the selector defaults to ``"html"``.

    ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
    See [`lxml` documentation](https://lxml.de/api/index.html) ``lxml.etree.fromstring`` for more information.
    """

    # __slots__ avoids a per-instance __dict__; '__weakref__' is listed so
    # instances remain weakly referenceable.
    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
                 '__weakref__', '_parser', '_csstranslator', '_tostring_method']

    _default_type = None
    # Namespace prefixes made available in every XPath query.
    _default_namespaces = {
        "re": "http://exslt.org/regular-expressions",

        # supported in libxslt:
        # set:difference
        # set:has-same-node
        # set:intersection
        # set:leading
        # set:trailing
        "set": "http://exslt.org/sets"
    }
    # Passed as smart_strings to lxml's xpath() in .xpath() below.
    _lxml_smart_strings = False
    selectorlist_cls = SelectorList

    def __init__(self, text=None, type=None, namespaces=None, root=None,
                 base_url=None, _expr=None):
        # Resolve the selector type ('html' by default) and its
        # parser/translator/serializer configuration.
        self.type = st = _st(type or self._default_type)
        self._parser = _ctgroup[st]['_parser']
        self._csstranslator = _ctgroup[st]['_csstranslator']
        self._tostring_method = _ctgroup[st]['_tostring_method']

        if text is not None:
            if not isinstance(text, six.text_type):
                msg = "text argument should be of type %s, got %s" % (
                    six.text_type, text.__class__)
                raise TypeError(msg)
            # Parsing `text` takes precedence over any supplied `root`.
            root = self._get_root(text, base_url)
        elif root is None:
            raise ValueError("Selector needs either text or root argument")

        self.namespaces = dict(self._default_namespaces)
        if namespaces is not None:
            self.namespaces.update(namespaces)
        self.root = root
        self._expr = _expr

    def __getstate__(self):
        # lxml trees cannot be pickled.
        raise TypeError("can't pickle Selector objects")

    def _get_root(self, text, base_url=None):
        return create_root_node(text, self._parser, base_url=base_url)

    def xpath(self, query, namespaces=None, **kwargs):
        """
        Find nodes matching the xpath ``query`` and return the result as a
        :class:`SelectorList` instance with all elements flattened. List
        elements implement :class:`Selector` interface too.

        ``query`` is a string containing the XPATH query to apply.

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
        Contrary to ``register_namespace()``, these prefixes are not
        saved for future calls.

        Any additional named arguments can be used to pass values for XPath
        variables in the XPath expression, e.g.::

            selector.xpath('//a[href=$url]', url="http://www.example.com")
        """
        try:
            xpathev = self.root.xpath
        except AttributeError:
            # Non-element root (e.g. a string from a previous text() query)
            # cannot be queried further; return an empty list.
            return self.selectorlist_cls([])

        nsp = dict(self.namespaces)
        if namespaces is not None:
            nsp.update(namespaces)
        try:
            result = xpathev(query, namespaces=nsp,
                             smart_strings=self._lxml_smart_strings,
                             **kwargs)
        except etree.XPathError as exc:
            # Re-raise lxml's error as ValueError, including the query text.
            msg = u"XPath error: %s in %s" % (exc, query)
            msg = msg if six.PY3 else msg.encode('unicode_escape')
            six.reraise(ValueError, ValueError(msg), sys.exc_info()[2])

        # lxml returns a scalar for non-node-set results (bool, float, str).
        if type(result) is not list:
            result = [result]

        result = [self.__class__(root=x, _expr=query,
                                 namespaces=self.namespaces,
                                 type=self.type)
                  for x in result]
        return self.selectorlist_cls(result)

    def css(self, query):
        """
        Apply the given CSS selector and return a :class:`SelectorList` instance.

        ``query`` is a string containing the CSS selector to apply.

        In the background, CSS queries are translated into XPath queries using
        `cssselect`_ library and run ``.xpath()`` method.

        .. _cssselect: https://pypi.python.org/pypi/cssselect/
        """
        return self.xpath(self._css2xpath(query))

    def _css2xpath(self, query):
        return self._csstranslator.css_to_xpath(query)

    def re(self, regex, replace_entities=True):
        """
        Apply the given regex and return a list of unicode strings with the
        matches.

        ``regex`` can be either a compiled regular expression or a string which
        will be compiled to a regular expression using ``re.compile(regex)``.

        By default, character entity references are replaced by their
        corresponding character (except for ``&amp;`` and ``&lt;``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return extract_regex(regex, self.get(), replace_entities=replace_entities)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Apply the given regex and return the first unicode string which
        matches. If there is no match, return the default value (``None`` if
        the argument is not provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&amp;`` and ``&lt;``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

    def get(self):
        """
        Serialize and return the matched nodes in a single unicode string.
        Percent encoded content is unquoted.
        """
        try:
            return etree.tostring(self.root,
                                  method=self._tostring_method,
                                  encoding='unicode',
                                  with_tail=False)
        except (AttributeError, TypeError):
            # Root is not an lxml-serializable node: XPath boolean results
            # render as '1'/'0', anything else via text conversion.
            if self.root is True:
                return u'1'
            elif self.root is False:
                return u'0'
            else:
                return six.text_type(self.root)
    extract = get

    def getall(self):
        """
        Serialize and return the matched node in a 1-element list of unicode strings.
        """
        return [self.get()]

    def register_namespace(self, prefix, uri):
        """
        Register the given namespace to be used in this :class:`Selector`.
        Without registering namespaces you can't select or extract data from
        non-standard namespaces. See :ref:`selector-examples-xml`.
        """
        self.namespaces[prefix] = uri

    def remove_namespaces(self):
        """
        Remove all namespaces, allowing to traverse the document using
        namespace-less xpaths. See :ref:`removing-namespaces`.
        """
        for el in self.root.iter('*'):
            if el.tag.startswith('{'):
                el.tag = el.tag.split('}', 1)[1]
            # loop on element attributes also
            for an in el.attrib.keys():
                if an.startswith('{'):
                    el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
        # remove namespace declarations
        etree.cleanup_namespaces(self.root)

    def remove(self):
        """
        Remove matched nodes from the parent element.
        """
        try:
            parent = self.root.getparent()
        except AttributeError:
            # 'str' object has no attribute 'getparent'
            raise CannotRemoveElementWithoutRoot(
                "The node you're trying to remove has no root, "
                "are you trying to remove a pseudo-element? "
                "Try to use 'li' as a selector instead of 'li::text' or "
                "'//li' instead of '//li/text()', for example."
            )

        try:
            parent.remove(self.root)
        except AttributeError:
            # 'NoneType' object has no attribute 'remove'
            raise CannotRemoveElementWithoutParent(
                "The node you're trying to remove has no parent, "
                "are you trying to remove a root element?"
            )

    @property
    def attrib(self):
        """Return the attributes dictionary for underlying element.
        """
        return dict(self.root.attrib)

    def __bool__(self):
        """
        Return ``True`` if there is any real content selected or ``False``
        otherwise. In other words, the boolean value of a :class:`Selector` is
        given by the contents it selects.
        """
        return bool(self.get())
    __nonzero__ = __bool__

    def __str__(self):
        data = repr(shorten(self.get(), width=40))
        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
    __repr__ = __str__

View file

@ -0,0 +1,94 @@
import re
import six
from w3lib.html import replace_entities as w3lib_replace_entities
def flatten(x):
    """flatten(sequence) -> list

    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables).

    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    return list(iflatten(x))
def iflatten(x):
    """iflatten(sequence) -> Iterator

    Similar to ``.flatten()``, but returns iterator instead"""
    for item in x:
        if not _is_listlike(item):
            # Scalars and string-likes pass through unchanged.
            yield item
        else:
            # Recursively flatten nested iterables (via flatten, which
            # itself delegates back here).
            for sub in flatten(item):
                yield sub
def _is_listlike(x):
    """Return True for iterable objects that are not text or bytes.

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    # Strings/bytes are iterable but must be treated as scalars.
    if isinstance(x, (six.text_type, bytes)):
        return False
    return hasattr(x, "__iter__")
def extract_regex(regex, text, replace_entities=True):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    if 'extract' not in regex.groupindex:
        # full regex or numbered groups
        strings = regex.findall(text)
    else:
        # named group: take only its value from the first match, if any
        match = regex.search(text)
        extracted = None if match is None else match.group('extract')
        strings = [] if extracted is None else [extracted]

    # findall may yield tuples when the pattern has several groups.
    strings = flatten(strings)
    if not replace_entities:
        return strings
    return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings]
def shorten(text, width, suffix='...'):
    """Truncate the given text to fit in the given width.

    When truncation happens the result ends with ``suffix`` (or with a
    tail of it, when ``width`` is smaller than the suffix itself).

    Raises ValueError when ``width`` is negative.
    """
    if width < 0:
        raise ValueError('width must be equal or greater than 0')
    if len(text) <= width:
        return text
    tail = len(suffix)
    if width > tail:
        return text[:width - tail] + suffix
    return suffix[tail - width:]

View file

@ -0,0 +1,61 @@
import re
from lxml import etree
from six import string_types
from w3lib.html import HTML5_WHITESPACE
# Pattern matching one or more HTML5 whitespace characters; its bound
# .sub is used by has_class() to normalize class-attribute separators.
regex = '[{}]+'.format(HTML5_WHITESPACE)
replace_html5_whitespaces = re.compile(regex).sub
def set_xpathfunc(fname, func):
    """Register a custom extension function to use in XPath expressions.

    The function ``func`` registered under ``fname`` identifier will be called
    for every matching node, being passed a ``context`` parameter as well as
    any parameters passed from the corresponding XPath expression.

    If ``func`` is ``None``, the extension function will be removed.

    See more `in lxml documentation`_.

    .. _`in lxml documentation`: http://lxml.de/extensions.html#xpath-extension-functions
    """
    # The anonymous (None) namespace makes the function callable without a prefix.
    namespace = etree.FunctionNamespace(None)
    if func is None:
        del namespace[fname]
    else:
        namespace[fname] = func
def setup():
    # Register this module's built-in XPath extension functions with lxml.
    set_xpathfunc('has-class', has_class)
def has_class(context, *classes):
    """has-class function.

    Return True if all ``classes`` are present in element's class attr.
    """
    eval_ctx = context.eval_context
    if not eval_ctx.get('args_checked'):
        # Arguments are identical for every node of one XPath evaluation,
        # so validate them only once and remember it in the eval context.
        if not classes:
            raise ValueError(
                'XPath error: has-class must have at least 1 argument')
        if not all(isinstance(name, string_types) for name in classes):
            raise ValueError(
                'XPath error: has-class arguments must be strings')
        eval_ctx['args_checked'] = True

    class_attr = context.context_node.get('class')
    if class_attr is None:
        return False

    # Pad with spaces and collapse HTML5 whitespace so that substring
    # checks match whole class tokens only.
    padded = replace_html5_whitespaces(' ', ' ' + class_attr + ' ')
    return all(' ' + name + ' ' in padded for name in classes)