Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@@ -0,0 +1,58 @@
"""
Depth Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import logging
from scrapy.http import Request
logger = logging.getLogger(__name__)
class DepthMiddleware:
def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
self.prio = prio
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
maxdepth = settings.getint('DEPTH_LIMIT')
verbose = settings.getbool('DEPTH_STATS_VERBOSE')
prio = settings.getint('DEPTH_PRIORITY')
return cls(maxdepth, crawler.stats, verbose, prio)
def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request):
depth = response.meta['depth'] + 1
request.meta['depth'] = depth
if self.prio:
request.priority -= depth * self.prio
if self.maxdepth and depth > self.maxdepth:
logger.debug(
"Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
{'maxdepth': self.maxdepth, 'requrl': request.url},
extra={'spider': spider}
)
return False
else:
if self.verbose_stats:
self.stats.inc_value(f'request_depth_count/{depth}',
spider=spider)
self.stats.max_value('request_depth_max', depth,
spider=spider)
return True
# base case (depth=0)
if 'depth' not in response.meta:
response.meta['depth'] = 0
if self.verbose_stats:
self.stats.inc_value('request_depth_count/0', spider=spider)
return (r for r in result or () if _filter(r))
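The three settings read in from_crawler above are the whole configuration surface of this middleware. A minimal sketch of a project settings.py (the values are illustrative, not Scrapy's defaults):

# settings.py -- sketch; DEPTH_LIMIT defaults to 0 (unlimited) in Scrapy
DEPTH_LIMIT = 2             # drop requests more than two links away from the start URLs
DEPTH_STATS_VERBOSE = True  # also collect per-level request_depth_count/<n> stats
DEPTH_PRIORITY = 1          # positive values lower the priority of deeper requests

Because process_spider_output subtracts depth * prio from each request's priority, a positive DEPTH_PRIORITY biases the crawl breadth-first (shallow pages are scheduled earlier), while a negative value biases it depth-first.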


@@ -0,0 +1,57 @@
"""
HttpError Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import logging
from scrapy.exceptions import IgnoreRequest
logger = logging.getLogger(__name__)
class HttpError(IgnoreRequest):
"""A non-200 response was filtered"""
def __init__(self, response, *args, **kwargs):
self.response = response
super().__init__(*args, **kwargs)
class HttpErrorMiddleware:
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def __init__(self, settings):
self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
def process_spider_input(self, response, spider):
if 200 <= response.status < 300: # common case
return
meta = response.meta
if 'handle_httpstatus_all' in meta:
return
if 'handle_httpstatus_list' in meta:
allowed_statuses = meta['handle_httpstatus_list']
elif self.handle_httpstatus_all:
return
else:
allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
if response.status in allowed_statuses:
return
raise HttpError(response, 'Ignoring non-200 response')
def process_spider_exception(self, response, exception, spider):
if isinstance(exception, HttpError):
spider.crawler.stats.inc_value('httperror/response_ignored_count')
spider.crawler.stats.inc_value(
f'httperror/response_ignored_status_count/{response.status}'
)
logger.info(
"Ignoring response %(response)r: HTTP status code is not handled or not allowed",
{'response': response}, extra={'spider': spider},
)
return []
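The lookup order in process_spider_input above gives three opt-in levels: per-request meta, a spider attribute, and the HTTPERROR_ALLOWED_CODES / HTTPERROR_ALLOW_ALL settings. A hedged sketch of the first two (spider name and URLs are placeholders):

import scrapy

class StatusDemoSpider(scrapy.Spider):
    name = 'status-demo'            # hypothetical spider, for illustration only
    handle_httpstatus_list = [404]  # spider level: let 404 responses reach the callbacks

    def start_requests(self):
        # request level: meta takes precedence over the spider attribute
        yield scrapy.Request('https://example.com/maybe-missing',
                             meta={'handle_httpstatus_list': [403, 404]})
        # request level: let every status through for this request only
        yield scrapy.Request('https://example.com/anything',
                             meta={'handle_httpstatus_all': True})

    def parse(self, response):
        self.logger.info('%s returned %d', response.url, response.status)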


@@ -0,0 +1,85 @@
"""
Offsite Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import re
import logging
import warnings
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
logger = logging.getLogger(__name__)
class OffsiteMiddleware:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def process_spider_output(self, response, result, spider):
for x in result:
if isinstance(x, Request):
if x.dont_filter or self.should_follow(x, spider):
yield x
else:
domain = urlparse_cached(x).hostname
if domain and domain not in self.domains_seen:
self.domains_seen.add(domain)
logger.debug(
"Filtered offsite request to %(domain)r: %(request)s",
{'domain': domain, 'request': x}, extra={'spider': spider})
self.stats.inc_value('offsite/domains', spider=spider)
self.stats.inc_value('offsite/filtered', spider=spider)
else:
yield x
def should_follow(self, request, spider):
regex = self.host_regex
# hostname can be None for wrong urls (like javascript links)
host = urlparse_cached(request).hostname or ''
return bool(regex.search(host))
def get_host_regex(self, spider):
"""Override this method to implement a different offsite policy"""
allowed_domains = getattr(spider, 'allowed_domains', None)
if not allowed_domains:
return re.compile('') # allow all by default
url_pattern = re.compile(r"^https?://.*$")
port_pattern = re.compile(r":\d+$")
domains = []
for domain in allowed_domains:
if domain is None:
continue
elif url_pattern.match(domain):
message = ("allowed_domains accepts only domains, not URLs. "
f"Ignoring URL entry {domain} in allowed_domains.")
warnings.warn(message, URLWarning)
elif port_pattern.search(domain):
message = ("allowed_domains accepts only domains without ports. "
f"Ignoring entry {domain} in allowed_domains.")
warnings.warn(message, PortWarning)
else:
domains.append(re.escape(domain))
regex = fr'^(.*\.)?({"|".join(domains)})$'
return re.compile(regex)
def spider_opened(self, spider):
self.host_regex = self.get_host_regex(spider)
self.domains_seen = set()
class URLWarning(Warning):
pass
class PortWarning(Warning):
pass
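The filtering decision reduces to the host regex built by get_host_regex: an allowed domain matches itself and any of its subdomains, but not lookalike hosts. A small self-contained check of that pattern (the domains are illustrative):

import re

allowed_domains = ['example.com', 'docs.example.org']
host_regex = re.compile(fr'^(.*\.)?({"|".join(re.escape(d) for d in allowed_domains)})$')

assert host_regex.search('example.com')               # the domain itself: followed
assert host_regex.search('shop.example.com')          # any subdomain: followed
assert not host_regex.search('example.com.evil.net')  # suffix spoofing: filtered
assert not host_regex.search('myexample.com')         # lookalike host: filtered

Requests created with dont_filter=True bypass the check entirely, and anything that is not a Request (items, for example) is passed through unchanged.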


@@ -0,0 +1,361 @@
"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
import warnings
from urllib.parse import urlparse
from w3lib.url import safe_url_string
from scrapy.http import Request, Response
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.python import to_unicode
from scrapy.utils.misc import load_object
from scrapy.utils.url import strip_url
LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
POLICY_SCRAPY_DEFAULT = "scrapy-default"
class ReferrerPolicy:
NOREFERRER_SCHEMES = LOCAL_SCHEMES
def referrer(self, response_url, request_url):
raise NotImplementedError()
def stripped_referrer(self, url):
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.strip_url(url)
def origin_referrer(self, url):
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.origin(url)
def strip_url(self, url, origin_only=False):
"""
https://www.w3.org/TR/referrer-policy/#strip-url
If url is null, return no referrer.
If url's scheme is a local scheme, then return no referrer.
Set url's username to the empty string.
Set url's password to null.
Set url's fragment to null.
If the origin-only flag is true, then:
Set url's path to null.
Set url's query to null.
Return url.
"""
if not url:
return None
return strip_url(url,
strip_credentials=True,
strip_fragment=True,
strip_default_port=True,
origin_only=origin_only)
def origin(self, url):
"""Return serialized origin (scheme, host, path) for a request or response URL."""
return self.strip_url(url, origin_only=True)
def potentially_trustworthy(self, url):
# Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
parsed_url = urlparse(url)
if parsed_url.scheme in ('data',):
return False
return self.tls_protected(url)
def tls_protected(self, url):
return urlparse(url).scheme in ('https', 'ftps')
class NoReferrerPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
The simplest policy is "no-referrer", which specifies that no referrer information
is to be sent along with requests made from a particular request client to any origin.
The header will be omitted entirely.
"""
name = POLICY_NO_REFERRER
def referrer(self, response_url, request_url):
return None
class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade
The "no-referrer-when-downgrade" policy sends a full URL along with requests
from a TLS-protected environment settings object to a potentially trustworthy URL,
and requests from clients which are not TLS-protected to any origin.
Requests from TLS-protected clients to non-potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
This is a user agent's default behavior, if no policy is otherwise specified.
"""
name = POLICY_NO_REFERRER_WHEN_DOWNGRADE
def referrer(self, response_url, request_url):
if not self.tls_protected(response_url) or self.tls_protected(request_url):
return self.stripped_referrer(response_url)
class SameOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin
The "same-origin" policy specifies that a full URL, stripped for use as a referrer,
is sent as referrer information when making same-origin requests from a particular request client.
Cross-origin requests, on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_SAME_ORIGIN
def referrer(self, response_url, request_url):
if self.origin(response_url) == self.origin(request_url):
return self.stripped_referrer(response_url)
class OriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-origin
The "origin" policy specifies that only the ASCII serialization
of the origin of the request client is sent as referrer information
when making both same-origin requests and cross-origin requests
from a particular request client.
"""
name = POLICY_ORIGIN
def referrer(self, response_url, request_url):
return self.origin_referrer(response_url)
class StrictOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin
The "strict-origin" policy sends the ASCII serialization
of the origin of the request client when making requests:
- from a TLS-protected environment settings object to a potentially trustworthy URL, and
- from non-TLS-protected environment settings objects to any origin.
Requests from TLS-protected request clients to non- potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_STRICT_ORIGIN
def referrer(self, response_url, request_url):
if (
self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
or not self.tls_protected(response_url)
):
return self.origin_referrer(response_url)
class OriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin
The "origin-when-cross-origin" policy specifies that a full URL,
stripped for use as a referrer, is sent as referrer information
when making same-origin requests from a particular request client,
and only the ASCII serialization of the origin of the request client
is sent as referrer information when making cross-origin requests
from a particular request client.
"""
name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url, request_url):
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
else:
return origin
class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin
The "strict-origin-when-cross-origin" policy specifies that a full URL,
stripped for use as a referrer, is sent as referrer information
when making same-origin requests from a particular request client,
and only the ASCII serialization of the origin of the request client
when making cross-origin requests:
- from a TLS-protected environment settings object to a potentially trustworthy URL, and
- from non-TLS-protected environment settings objects to any origin.
Requests from TLS-protected clients to non- potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url, request_url):
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
elif (
self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
or not self.tls_protected(response_url)
):
return self.origin_referrer(response_url)
class UnsafeUrlPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url
The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer,
is sent along with both cross-origin requests
and same-origin requests made from a particular request client.
Note: The policy's name doesn't lie; it is unsafe.
This policy will leak origins and paths from TLS-protected resources
to insecure origins.
Carefully consider the impact of setting such a policy for potentially sensitive documents.
"""
name = POLICY_UNSAFE_URL
def referrer(self, response_url, request_url):
return self.stripped_referrer(response_url)
class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
"""
A variant of "no-referrer-when-downgrade",
with the addition that "Referer" is not sent if the parent request was
using ``file://`` or ``s3://`` scheme.
"""
NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
name = POLICY_SCRAPY_DEFAULT
_policy_classes = {p.name: p for p in (
NoReferrerPolicy,
NoReferrerWhenDowngradePolicy,
SameOriginPolicy,
OriginPolicy,
StrictOriginPolicy,
OriginWhenCrossOriginPolicy,
StrictOriginWhenCrossOriginPolicy,
UnsafeUrlPolicy,
DefaultReferrerPolicy,
)}
# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
_policy_classes[''] = NoReferrerWhenDowngradePolicy
def _load_policy_class(policy, warning_only=False):
"""
Expect a string for the path to the policy class,
otherwise try to interpret the string as a standard value
from https://www.w3.org/TR/referrer-policy/#referrer-policies
"""
try:
return load_object(policy)
except ValueError:
try:
return _policy_classes[policy.lower()]
except KeyError:
msg = f"Could not load referrer policy {policy!r}"
if not warning_only:
raise RuntimeError(msg)
else:
warnings.warn(msg, RuntimeWarning)
return None
class RefererMiddleware:
def __init__(self, settings=None):
self.default_policy = DefaultReferrerPolicy
if settings is not None:
self.default_policy = _load_policy_class(
settings.get('REFERRER_POLICY'))
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('REFERER_ENABLED'):
raise NotConfigured
mw = cls(crawler.settings)
# Note: this hook is a bit of a hack to intercept redirections
crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)
return mw
def policy(self, resp_or_url, request):
"""
Determine Referrer-Policy to use from a parent Response (or URL),
and a Request to be sent.
- if a valid policy is set in Request meta, it is used.
- if the policy is set in meta but is wrong (e.g. a typo error),
the policy from settings is used
- if the policy is not set in Request meta,
but there is a Referrer-policy header in the parent response,
it is used if valid
- otherwise, the policy from settings is used.
"""
policy_name = request.meta.get('referrer_policy')
if policy_name is None:
if isinstance(resp_or_url, Response):
policy_header = resp_or_url.headers.get('Referrer-Policy')
if policy_header is not None:
policy_name = to_unicode(policy_header.decode('latin1'))
if policy_name is None:
return self.default_policy()
cls = _load_policy_class(policy_name, warning_only=True)
return cls() if cls else self.default_policy()
def process_spider_output(self, response, result, spider):
def _set_referer(r):
if isinstance(r, Request):
referrer = self.policy(response, r).referrer(response.url, r.url)
if referrer is not None:
r.headers.setdefault('Referer', referrer)
return r
return (_set_referer(r) for r in result or ())
def request_scheduled(self, request, spider):
# check redirected request to patch "Referer" header if necessary
redirected_urls = request.meta.get('redirect_urls', [])
if redirected_urls:
request_referrer = request.headers.get('Referer')
# we don't patch the referrer value if there is none
if request_referrer is not None:
# the request's referrer header value acts as a surrogate
# for the parent response URL
#
# Note: if the 3xx response contained a Referrer-Policy header,
# the information is not available using this hook
parent_url = safe_url_string(request_referrer)
policy_referrer = self.policy(parent_url, request).referrer(
parent_url, request.url)
if policy_referrer != request_referrer:
if policy_referrer is None:
request.headers.pop('Referer')
else:
request.headers['Referer'] = policy_referrer
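A short sketch of how the individual policies differ, assuming the classes are importable as in upstream Scrapy (scrapy.spidermiddlewares.referer); the URLs are placeholders:

from scrapy.spidermiddlewares.referer import SameOriginPolicy, OriginPolicy

parent = 'https://example.com/section/page?q=1'

same_origin = SameOriginPolicy()
# same-origin request: the full stripped parent URL is sent
assert same_origin.referrer(parent, 'https://example.com/next') is not None
# cross-origin request: no Referer header at all
assert same_origin.referrer(parent, 'https://other.example.net/') is None

# origin policies send only scheme://host, for same- and cross-origin requests alike
print(OriginPolicy().referrer(parent, 'https://other.example.net/'))  # e.g. https://example.com/

Project-wide, the policy is chosen with the REFERRER_POLICY setting (either a name from the table above or a dotted path to a policy class); a single Request can override it with meta={'referrer_policy': 'no-referrer'}, following the precedence rules documented in RefererMiddleware.policy.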


@@ -0,0 +1,37 @@
"""
Url Length Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import logging
from scrapy.http import Request
from scrapy.exceptions import NotConfigured
logger = logging.getLogger(__name__)
class UrlLengthMiddleware:
def __init__(self, maxlength):
self.maxlength = maxlength
@classmethod
def from_settings(cls, settings):
maxlength = settings.getint('URLLENGTH_LIMIT')
if not maxlength:
raise NotConfigured
return cls(maxlength)
def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request) and len(request.url) > self.maxlength:
logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
{'maxlength': self.maxlength, 'url': request.url},
extra={'spider': spider})
return False
else:
return True
return (r for r in result or () if _filter(r))
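The only knob is URLLENGTH_LIMIT, whose Scrapy default is 2083 (the historic Internet Explorer maximum URL length). A minimal check of the filter, assuming the class is importable as in upstream Scrapy:

from scrapy.http import Request
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware

mw = UrlLengthMiddleware(maxlength=30)
requests = [
    Request('https://example.com/ok'),           # 22 characters: kept
    Request('https://example.com/' + 'x' * 50),  # 70 characters: dropped
]
kept = list(mw.process_spider_output(response=None, result=requests, spider=None))
assert len(kept) == 1 and kept[0].url.endswith('/ok')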