Ausgabe der neuen DB Einträge

2022-01-02 21:50:48 +01:00 · 2022-01-02 21:50:48 +01:00 · cfbbb9ee3d
commit cfbbb9ee3d
parent bad48e1627
2399 changed files with 843193 additions and 43 deletions
--- a/venv/lib/python3.9/site-packages/w3lib/html.py
+++ b/venv/lib/python3.9/site-packages/w3lib/html.py
@@ -0,0 +1,336 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for dealing with markup text
+"""
+
+import warnings
+import re
+import six
+from six import moves
+
+from w3lib.util import to_bytes, to_unicode
+from w3lib.url import safe_url_string
+
+# Entity reference: ``&`` followed by a name, a decimal ``#NNN`` or a
+# hexadecimal ``#xHHH`` value; the trailing semicolon is optional and is
+# captured separately so callers can tell terminated entities apart.
+_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
+# Any markup tag: ``<`` followed by a letter, ``/`` or ``!``, then lazily
+# everything (newlines included) up to the next ``>``.
+_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
+# ``<base ... href="...">``: group 1 is the quoted href value without the
+# whitespace that may surround it inside the quotes.
+_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
+# ``<meta http-equiv=refresh content="N; url=...">``: ``int`` is the delay
+# (digits with an optional fractional part), ``url`` is everything up to the
+# quote character that matches the opening one.
+_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
+# CDATA section split into its opening marker, payload and closing marker;
+# group 1 spans the whole section.
+_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
+
+# Characters removed by strip_html5_whitespace(): space, tab, line feed,
+# carriage return and form feed.
+HTML5_WHITESPACE = ' \t\n\r\x0c'
+
+
+def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+    r"""Deprecated alias of :func:`replace_entities`.
+
+    Emits a ``DeprecationWarning`` and then forwards `text`, `keep`,
+    `remove_illegal` and `encoding` to :func:`replace_entities`, returning
+    its result unchanged.
+
+    .. warning::
+
+        This function is deprecated and will be removed in future.
+        Please use :func:`replace_entities` instead.
+    """
+
+    warnings.warn(
+        "`w3lib.html.remove_entities` function is deprecated and "
+        "will be removed in future releases. Please use "
+        "`w3lib.html.replace_entities` instead.",
+        DeprecationWarning,
+        # Attribute the warning to the caller's line instead of this wrapper
+        # so the deprecated call site can actually be located.
+        stacklevel=2
+    )
+
+    return replace_entities(text, keep, remove_illegal, encoding)
+
+def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+    u"""Remove entities from the given `text` by converting them to their
+    corresponding unicode character.
+
+    `text` can be a unicode string or a byte string encoded in the given
+    `encoding` (which defaults to 'utf-8').
+
+    If `keep` is passed (with a list of entity names) those entities will
+    be kept (they won't be removed). The entity name found in `text` is
+    lowercased before the lookup, so `keep` should contain lowercase names.
+
+    It supports both numeric entities (``&#nnnn;`` and ``&#xhhhh;``)
+    and named entities (such as ``&nbsp;`` or ``&gt;``).
+
+    If `remove_illegal` is ``True``, entities that can't be converted are
+    removed when they are terminated by a semicolon; unterminated ones are
+    left untouched.
+    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
+    is". For more information see the tests.
+
+    Always returns a unicode string (with the entities removed).
+
+    >>> import w3lib.html
+    >>> w3lib.html.replace_entities(b'Price: &pound;100')
+    u'Price: \\xa3100'
+    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
+    Price: £100
+    >>>
+
+    """
+
+    def convert_entity(m):
+        # Substitution callback: map one matched entity to its replacement.
+        groups = m.groupdict()
+        number = None
+        if groups.get('dec'):
+            number = int(groups['dec'], 10)
+        elif groups.get('hex'):
+            number = int(groups['hex'], 16)
+        elif groups.get('named'):
+            entity_name = groups['named']
+            if entity_name.lower() in keep:
+                return m.group(0)
+            else:
+                # Try the exact spelling first, then the lowercase one.
+                number = (moves.html_entities.name2codepoint.get(entity_name) or
+                    moves.html_entities.name2codepoint.get(entity_name.lower()))
+        if number is not None:
+            # Numeric character references in the 80-9F range are typically
+            # interpreted by browsers as representing the characters mapped
+            # to bytes 80-9F in the Windows-1252 encoding. For more info
+            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
+            try:
+                if 0x80 <= number <= 0x9f:
+                    return six.int2byte(number).decode('cp1252')
+                else:
+                    return six.unichr(number)
+            except (ValueError, OverflowError):
+                # ValueError: code point outside the valid range (or an
+                # undefined cp1252 byte). OverflowError: absurdly large
+                # references such as ``&#99999999999999999999;`` can
+                # overflow the character constructor. Both mean the entity
+                # cannot be converted.
+                pass
+
+        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)
+
+    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
+
+def has_entities(text, encoding=None):
+    """Tell whether `text` contains at least one entity reference.
+
+    `text` is converted with ``to_unicode`` using `encoding` and then
+    searched for a named, decimal or hexadecimal entity.
+    """
+    decoded = to_unicode(text, encoding)
+    first_hit = _ent_re.search(decoded)
+    return first_hit is not None
+
+def replace_tags(text, token='', encoding=None):
+    """Substitute every markup tag in `text` with `token`.
+
+    With the default empty `token` the tags are simply dropped.
+
+    `text` can be a unicode string or a regular string encoded as `encoding`
+    (or ``'utf-8'`` if `encoding` is not given.)
+
+    The result is always a unicode string.
+
+    Examples:
+
+    >>> import w3lib.html
+    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
+    u'This text contains some tag'
+    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
+    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
+    >>>
+
+    """
+
+    markup = to_unicode(text, encoding)
+    return _tag_re.sub(token, markup)
+
+
+# An HTML comment: ``<!--`` up to the first ``-->``, or up to the end of the
+# text when the comment is never closed.
+_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
+def remove_comments(text, encoding=None):
+    """Strip HTML comments from `text` and return the result as unicode.
+
+    >>> import w3lib.html
+    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
+    u'test  whatever'
+    >>>
+
+    """
+
+    return _REMOVECOMMENTS_RE.sub(u'', to_unicode(text, encoding))
+
+def remove_tags(text, which_ones=(), keep=(), encoding=None):
+    """ Strip HTML tags (but not their content) from `text`.
+
+    `which_ones` and `keep` are both tuples of tag names, compared
+    case-insensitively. There are four cases:
+
+    ==============  ============= ==========================================
+    ``which_ones``  ``keep``      what it does
+    ==============  ============= ==========================================
+    **not empty**   empty         remove all tags in ``which_ones``
+    empty           **not empty** remove all tags except the ones in ``keep``
+    empty           empty         remove all tags
+    **not empty**   **not empty** not allowed
+    ==============  ============= ==========================================
+
+
+    Remove all tags:
+
+    >>> import w3lib.html
+    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
+    >>> w3lib.html.remove_tags(doc)
+    u'This is a link: example'
+    >>>
+
+    Keep only some tags:
+
+    >>> w3lib.html.remove_tags(doc, keep=('div',))
+    u'<div>This is a link: example</div>'
+    >>>
+
+    Remove only specific tags:
+
+    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
+    u'<div><p>This is a link: example</p></div>'
+    >>>
+
+    You can't remove some and keep some:
+
+    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
+    Traceback (most recent call last):
+        ...
+    ValueError: Cannot use both which_ones and keep
+    >>>
+
+    """
+    if which_ones and keep:
+        raise ValueError('Cannot use both which_ones and keep')
+
+    removable = {name.lower() for name in which_ones}
+    protected = {name.lower() for name in keep}
+
+    def _strip_or_keep(match):
+        # Group 1 is the tag name; decide from whichever set is in use.
+        name = match.group(1).lower()
+        if removable:
+            drop = name in removable
+        else:
+            drop = name not in protected
+        if drop:
+            return u''
+        return match.group(0)
+
+    tag_pattern = re.compile('</?([^ >/]+).*?>', re.DOTALL | re.IGNORECASE)
+    return tag_pattern.sub(_strip_or_keep, to_unicode(text, encoding))
+
+def remove_tags_with_content(text, which_ones=(), encoding=None):
+    """Delete the listed tags together with everything they enclose.
+
+    `which_ones` is a tuple of tag names whose elements (opening tag,
+    content and closing tag, or the self-closing form) are removed.
+    When it is empty the text is returned unmodified (as unicode).
+
+    >>> import w3lib.html
+    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
+    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
+    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
+    >>>
+
+    """
+
+    text = to_unicode(text, encoding)
+    if not which_ones:
+        return text
+
+    # One alternative per tag: either a full element or a self-closing tag.
+    alternatives = []
+    for tag in which_ones:
+        alternatives.append(r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag))
+    element_pattern = re.compile('|'.join(alternatives), re.DOTALL | re.IGNORECASE)
+    return element_pattern.sub(u'', text)
+
+
+def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'',
+        encoding=None):
+    """Replace escape characters in `text`.
+
+    `which_ones` is a tuple with the escape characters to replace.
+    It defaults to ``\\n``, ``\\t`` and ``\\r``.
+
+    `replace_by` is the string substituted for each of those characters.
+    It defaults to ``''``, meaning the escape characters are removed.
+
+    Returns the resulting unicode string.
+
+    """
+
+    result = to_unicode(text, encoding)
+    for escape_char in which_ones:
+        substitute = to_unicode(replace_by, encoding)
+        result = result.replace(escape_char, substitute)
+    return result
+
+def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
+    """
+    Take markup as text (a unicode string, or a byte string that is
+    decoded using `encoding`) and:
+
+    1. replace entities (except the ones in `keep`) in every part of it
+        that is not inside a CDATA section
+    2. extract the content of each CDATA section without modifying it
+    3. drop the CDATA markers themselves
+
+    Returns the reassembled unicode string.
+
+    """
+
+    text = to_unicode(text, encoding)
+    pieces = []
+    cursor = 0
+    for cdata in _cdata_re.finditer(text):
+        start, end = cdata.span(1)
+        # Text before this CDATA section: entities are replaced.
+        pieces.append(replace_entities(text[cursor:start], keep=keep, remove_illegal=remove_illegal))
+        # The CDATA payload is copied verbatim.
+        pieces.append(cdata.group('cdata_d'))
+        cursor = end
+    # Whatever follows the last CDATA section (or the whole text if none).
+    pieces.append(replace_entities(text[cursor:], keep=keep, remove_illegal=remove_illegal))
+    return u''.join(pieces)
+
+def get_base_url(text, baseurl='', encoding='utf-8'):
+    """Return the base url declared by a ``<base href=...>`` tag in the
+    given HTML `text`, resolved against the given `baseurl`.
+
+    When no such declaration is found, `baseurl` itself is returned
+    (after being passed through ``safe_url_string``).
+
+    """
+
+    found = _baseurl_re.search(to_unicode(text, encoding))
+    safe_base = safe_url_string(baseurl)
+    if found is None:
+        return safe_base
+
+    declared = safe_url_string(found.group(1), encoding=encoding)
+    return moves.urllib.parse.urljoin(safe_base, declared)
+
+def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
+    """Find a ``<meta http-equiv="refresh">`` redirect in the given HTML
+    `text` and return a tuple ``(interval, url)``, where interval is a
+    float with the delay in seconds and url is the redirect target joined
+    with `baseurl`.
+
+    Elements listed in `ignore_tags` are removed together with their
+    content before searching; entities are replaced and HTML comments
+    are stripped as well.
+
+    If no meta redirect is found, ``(None, None)`` is returned.
+
+    Any error raised while decoding `text` with `encoding` propagates to
+    the caller.
+
+    """
+
+    if six.PY2:
+        baseurl = to_bytes(baseurl, encoding)
+    # No debug output on failure: the decoding error simply propagates.
+    text = to_unicode(text, encoding)
+    text = remove_tags_with_content(text, ignore_tags)
+    text = remove_comments(replace_entities(text))
+    m = _meta_refresh_re.search(text)
+    if m is None:
+        return None, None
+
+    interval = float(m.group('int'))
+    # The url may carry stray spaces or quotes inside the content value.
+    url = safe_url_string(m.group('url').strip(' "\''), encoding)
+    url = moves.urllib.parse.urljoin(baseurl, url)
+    return interval, url
+
+
+def strip_html5_whitespace(text):
+    r"""
+    Remove leading and trailing HTML5 space characters from `text` (see
+    https://www.w3.org/TR/html5/infrastructure.html#space-character).
+
+    This is handy when processing HTML element attributes that hold URLs,
+    such as ``href``, ``src`` or form ``action``: the HTML5 standard
+    defines them as "valid URL potentially surrounded by spaces"
+    or "valid non-empty URL potentially surrounded by spaces".
+
+    >>> strip_html5_whitespace(' hello\n')
+    'hello'
+    """
+    stripped = text.strip(HTML5_WHITESPACE)
+    return stripped