Output of the new DB entries
This commit is contained in:
parent
bad48e1627
commit
cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/utils/url.py (164 lines, Normal file)
@@ -0,0 +1,164 @@
"""
|
||||
This module contains general purpose URL functions not found in the standard
|
||||
library.
|
||||
|
||||
Some of the functions that used to be imported from this module have been moved
|
||||
to the w3lib.url module. Always import those from there instead.
|
||||
"""
|
||||
import posixpath
|
||||
import re
|
||||
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
|
||||
|
||||
# scrapy.utils.url was moved to w3lib.url and import * ensures this
|
||||
# move doesn't break old code
|
||||
from w3lib.url import *
|
||||
from w3lib.url import _safe_chars, _unquotepath # noqa: F401
|
||||
from scrapy.utils.python import to_unicode
|
||||
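# Note (added for clarity, not in the upstream file): helpers used below but
# not defined here, such as add_or_replace_parameter and any_to_uri, come from
# the ``from w3lib.url import *`` re-export above.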


def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if not host:
        return False
    domains = [d.lower() for d in domains]
    return any((host == d) or (host.endswith(f'.{d}')) for d in domains)
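# Usage sketch (illustrative, not part of the upstream file): a URL matches on
# an exact host or a subdomain suffix, never on a bare substring.
# >>> url_is_from_any_domain("http://shop.example.com/item", ["example.com"])
# True
# >>> url_is_from_any_domain("http://notexample.com/", ["example.com"])
# False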


def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))
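# Usage sketch (hypothetical spider, not part of the upstream file): any object
# with a ``name`` and, optionally, ``allowed_domains`` works here.
# >>> class MySpider:
# ...     name = 'example.com'
# ...     allowed_domains = ['example.org']
# >>> url_is_from_spider("http://www.example.org/page", MySpider)
# True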


def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
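# Illustrative call (not part of the upstream file); the extension is
# lowercased before the membership test, so the check is case-insensitive:
# >>> url_has_any_extension("http://example.com/a/report.PDF", {'.pdf', '.doc'})
# True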


def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
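# Illustrative call (not part of the upstream file): already-parsed input is
# returned unchanged, so the function is safe to apply twice.
# >>> parsed = parse_url("http://example.com/a")
# >>> parse_url(parsed) is parsed
# True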


def escape_ajax(url):
    """
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url."""
    match = re.match(r"^\w+://", url, flags=re.I)
    if not match:
        parts = urlparse(url)
        scheme = "http:" if parts.netloc else "http://"
        url = scheme + url

    return url
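# Illustrative calls (not part of the upstream file): scheme-relative URLs
# ("//host") already parse with a netloc, so only "http:" is prefixed, while
# bare hostnames get the full "http://".
# >>> add_http_if_no_scheme("//example.com/page")
# 'http://example.com/page'
# >>> add_http_if_no_scheme("example.com/page")
# 'http://example.com/page'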


def _is_posix_path(string):
    return bool(
        re.match(
            r'''
            ^               # start with...
            (
                \.          # ...a single dot,
                (
                    \. | [^/\.]+  # optionally followed by
                )?          # either a second dot or some characters
                |
                ~           # $HOME
            )?              # optional match of ".", ".." or ".blabla"
            /               # at least one "/" for a file path,
            .               # and something after the "/"
            ''',
            string,
            flags=re.VERBOSE,
        )
    )
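# Illustrative calls (not part of the upstream file): the regex accepts
# absolute, relative, and home-anchored paths, but not bare hostnames.
# >>> _is_posix_path("/etc/hosts"), _is_posix_path("./rel/path"), _is_posix_path("~/notes")
# (True, True, True)
# >>> _is_posix_path("example.com/page")
# False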


def _is_windows_path(string):
    return bool(
        re.match(
            r'''
            ^
            (
                [a-z]:\\
                | \\\\
            )
            ''',
            string,
            flags=re.IGNORECASE | re.VERBOSE,
        )
    )
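# Illustrative calls (not part of the upstream file): drive-letter and UNC
# prefixes both match.
# >>> _is_windows_path(r"C:\Users\me\file.txt"), _is_windows_path(r"\\server\share")
# (True, True)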


def _is_filesystem_path(string):
    return _is_posix_path(string) or _is_windows_path(string)


def guess_scheme(url):
    """Add a URL scheme if missing: file:// for filepath-like input or
    http:// otherwise."""
    if _is_filesystem_path(url):
        return any_to_uri(url)
    return add_http_if_no_scheme(url)
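# Illustrative calls (not part of the upstream file; the file:// form shown
# assumes a POSIX host, since any_to_uri builds a platform-specific URI):
# >>> guess_scheme("/etc/hosts")
# 'file:///etc/hosts'
# >>> guess_scheme("example.com")
# 'http://example.com'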


def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):
    """Strip a URL string of some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces the path component with "/", also dropping
      the query and fragment components; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    """
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
        netloc = netloc.split('@')[-1]
    if strip_default_port and parsed_url.port:
        if (parsed_url.scheme, parsed_url.port) in (('http', 80),
                                                    ('https', 443),
                                                    ('ftp', 21)):
            netloc = netloc.replace(f':{parsed_url.port}', '')
    return urlunparse((
        parsed_url.scheme,
        netloc,
        '/' if origin_only else parsed_url.path,
        '' if origin_only else parsed_url.params,
        '' if origin_only else parsed_url.query,
        '' if strip_fragment else parsed_url.fragment
    ))
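# Illustrative calls (not part of the upstream file): with the defaults,
# credentials, the default port, and the fragment are all removed.
# >>> strip_url("http://user:pass@www.example.com:80/index.php?q=1#top")
# 'http://www.example.com/index.php?q=1'
# >>> strip_url("https://www.example.com/path?q=1#top", origin_only=True)
# 'https://www.example.com/'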