Ausgabe der neuen DB Einträge

2022-01-02 21:50:48 +01:00 · 2022-01-02 21:50:48 +01:00 · cfbbb9ee3d
commit cfbbb9ee3d
parent bad48e1627
2399 changed files with 843193 additions and 43 deletions
--- a/venv/lib/python3.9/site-packages/scrapy/utils/request.py
+++ b/venv/lib/python3.9/site-packages/scrapy/utils/request.py
@@ -0,0 +1,100 @@
+"""
+This module provides some useful functions for working with
+scrapy.http.Request objects
+"""
+
+import hashlib
+import weakref
+from urllib.parse import urlunparse
+
+from w3lib.http import basic_auth_header
+from w3lib.url import canonicalize_url
+
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy.utils.python import to_bytes, to_unicode
+
+
+_fingerprint_cache = weakref.WeakKeyDictionary()
+
+
+def request_fingerprint(request, include_headers=None, keep_fragments=False):
+    """
+    Return the request fingerprint.
+
+    The request fingerprint is a hash that uniquely identifies the resource the
+    request points to. For example, take the following two urls:
+
+    http://www.example.com/query?id=111&cat=222
+    http://www.example.com/query?cat=222&id=111
+
+    Even though those are two different URLs, both point to the same resource
+    and are equivalent (i.e. they should return the same response).
+
+    Another example are cookies used to store session ids. Suppose the
+    following page is only accessible to authenticated users:
+
+    http://www.example.com/members/offers.html
+
+    Lots of sites use a cookie to store the session id, which adds a random
+    component to the HTTP Request and thus should be ignored when calculating
+    the fingerprint.
+
+    For this reason, request headers are ignored by default when calculating
+    the fingerprint. If you want to include specific headers use the
+    include_headers argument, which is a list of Request headers to include.
+
+    Also, servers usually ignore fragments in urls when handling requests,
+    so they are also ignored by default when calculating the fingerprint.
+    If you want to include them, set the keep_fragments argument to True
+    (for instance when handling requests with a headless browser).
+
+    """
+    if include_headers:
+        include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
+    cache = _fingerprint_cache.setdefault(request, {})
+    cache_key = (include_headers, keep_fragments)
+    if cache_key not in cache:
+        fp = hashlib.sha1()
+        fp.update(to_bytes(request.method))
+        fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
+        fp.update(request.body or b'')
+        if include_headers:
+            for hdr in include_headers:
+                if hdr in request.headers:
+                    fp.update(hdr)
+                    for v in request.headers.getlist(hdr):
+                        fp.update(v)
+        cache[cache_key] = fp.hexdigest()
+    return cache[cache_key]
+
+
+def request_authenticate(request, username, password):
+    """Authenticate the given request (in place) using the HTTP basic access
+    authentication mechanism (RFC 2617) and the given username and password
+    """
+    request.headers['Authorization'] = basic_auth_header(username, password)
+
+
+def request_httprepr(request):
+    """Return the raw HTTP representation (as bytes) of the given request.
+    This is provided only for reference since it's not the actual stream of
+    bytes that will be sent when performing the request (that's controlled
+    by Twisted).
+    """
+    parsed = urlparse_cached(request)
+    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
+    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
+    s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
+    if request.headers:
+        s += request.headers.to_string() + b"\r\n"
+    s += b"\r\n"
+    s += request.body
+    return s
+
+
+def referer_str(request):
+    """ Return Referer HTTP header suitable for logging. """
+    referrer = request.headers.get('Referer')
+    if referrer is None:
+        return referrer
+    return to_unicode(referrer, errors='replace')