Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/extensions/httpcache.py (372 lines, new normal file)
@@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary

from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint


logger = logging.getLogger(__name__)

class DummyPolicy:

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True

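# The cache policy is selected with the HTTPCACHE_POLICY setting; DummyPolicy
# above is Scrapy's default and caches every non-ignored response without ever
# re-validating it, while RFC2616Policy below honors HTTP caching headers.
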
class RFC2616Policy:

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self._cc_parsed = WeakKeyDictionary()
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        ]

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other is eligible for caching
        return True

def should_cache_response(self, response, request):
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
|
||||
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
|
||||
# Status code 206 is not included because cache can not deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
# obey directive "Cache-Control: no-store"
|
||||
if b'no-store' in cc:
|
||||
return False
|
||||
# Never cache 304 (Not Modified) responses
|
||||
elif response.status == 304:
|
||||
return False
|
||||
# Cache unconditionally if configured to do so
|
||||
elif self.always_store:
|
||||
return True
|
||||
# Any hint on response expiration is good
|
||||
elif b'max-age' in cc or b'Expires' in response.headers:
|
||||
return True
|
||||
# Firefox fallbacks this statuses to one year expiration if none is set
|
||||
elif response.status in (300, 301, 308):
|
||||
return True
|
||||
# Other statuses without expiration requires at least one validator
|
||||
elif response.status in (200, 203, 401):
|
||||
return b'Last-Modified' in response.headers or b'ETag' in response.headers
|
||||
# Any other is probably not eligible for caching
|
||||
# Makes no sense to cache responses that does not contain expiration
|
||||
# info and can not be revalidated
|
||||
else:
|
||||
return False
|
||||
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
ccreq = self._parse_cachecontrol(request)
|
||||
if b'no-cache' in cc or b'no-cache' in ccreq:
|
||||
return False
|
||||
|
||||
now = time()
|
||||
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
|
||||
currentage = self._compute_current_age(cachedresponse, request, now)
|
||||
|
||||
reqmaxage = self._get_max_age(ccreq)
|
||||
if reqmaxage is not None:
|
||||
freshnesslifetime = min(freshnesslifetime, reqmaxage)
|
||||
|
||||
if currentage < freshnesslifetime:
|
||||
return True
|
||||
|
||||
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
|
||||
# From RFC2616: "Indicates that the client is willing to
|
||||
# accept a response that has exceeded its expiration time.
|
||||
# If max-stale is assigned a value, then the client is
|
||||
# willing to accept a response that has exceeded its
|
||||
# expiration time by no more than the specified number of
|
||||
# seconds. If no value is assigned to max-stale, then the
|
||||
# client is willing to accept a stale response of any age."
|
||||
staleage = ccreq[b'max-stale']
|
||||
if staleage is None:
|
||||
return True
|
||||
|
||||
try:
|
||||
if currentage < freshnesslifetime + max(0, int(staleage)):
|
||||
return True
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Cached response is stale, try to set validators if any
|
||||
self._set_conditional_validators(request, cachedresponse)
|
||||
return False
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
# Use the cached response if the new response is a server error,
|
||||
# as long as the old response didn't specify must-revalidate.
|
||||
if response.status >= 500:
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
if b'must-revalidate' not in cc:
|
||||
return True
|
||||
|
||||
# Use the cached response if the server says it hasn't changed.
|
||||
return response.status == 304
|
||||
|
||||
def _set_conditional_validators(self, request, cachedresponse):
|
||||
if b'Last-Modified' in cachedresponse.headers:
|
||||
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
|
||||
|
||||
if b'ETag' in cachedresponse.headers:
|
||||
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
|
||||
|
||||
def _get_max_age(self, cc):
|
||||
try:
|
||||
return max(0, int(cc[b'max-age']))
|
||||
except (KeyError, ValueError):
|
||||
return None
|
||||
|
||||
def _compute_freshness_lifetime(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
|
||||
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
|
||||
cc = self._parse_cachecontrol(response)
|
||||
maxage = self._get_max_age(cc)
|
||||
if maxage is not None:
|
||||
return maxage
|
||||
|
||||
# Parse date header or synthesize it if none exists
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
|
||||
# Try HTTP/1.0 Expires header
|
||||
if b'Expires' in response.headers:
|
||||
expires = rfc1123_to_epoch(response.headers[b'Expires'])
|
||||
# When parsing Expires header fails RFC 2616 section 14.21 says we
|
||||
# should treat this as an expiration time in the past.
|
||||
return max(0, expires - date) if expires else 0
|
||||
|
||||
# Fallback to heuristic using last-modified header
|
||||
# This is not in RFC but on Firefox caching implementation
|
||||
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
|
||||
if lastmodified and lastmodified <= date:
|
||||
return (date - lastmodified) / 10
|
||||
|
||||
# This request can be cached indefinitely
|
||||
if response.status in (300, 301, 308):
|
||||
return self.MAXAGE
|
||||
|
||||
# Insufficient information to compute fresshness lifetime
|
||||
return 0
|
||||
|
||||
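    # Example of the heuristic above: with no max-age or Expires present, a
    # response whose Last-Modified is 10 days older than its Date gets a
    # freshness lifetime of (date - lastmodified) / 10, i.e. one day.
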
def _compute_current_age(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeCurrentAge
|
||||
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
|
||||
currentage = 0
|
||||
# If Date header is not set we assume it is a fast connection, and
|
||||
# clock is in sync with the server
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
if now > date:
|
||||
currentage = now - date
|
||||
|
||||
if b'Age' in response.headers:
|
||||
try:
|
||||
age = int(response.headers[b'Age'])
|
||||
currentage = max(currentage, age)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return currentage
|
||||
|
||||
|
||||
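# To enable RFC2616Policy, set HTTPCACHE_POLICY to
# 'scrapy.extensions.httpcache.RFC2616Policy'. The storage backends below are
# selected with HTTPCACHE_STORAGE; FilesystemCacheStorage is the default.
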
class DbmCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
|
||||
self.db = None
|
||||
|
||||
def open_spider(self, spider):
|
||||
dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
|
||||
self.db = self.dbmodule.open(dbpath, 'c')
|
||||
|
||||
logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
|
||||
|
||||
def close_spider(self, spider):
|
||||
self.db.close()
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return # not cached
|
||||
url = data['url']
|
||||
status = data['status']
|
||||
headers = Headers(data['headers'])
|
||||
body = data['body']
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
key = self._request_key(request)
|
||||
data = {
|
||||
'status': response.status,
|
||||
'url': response.url,
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
}
|
||||
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
|
||||
self.db[f'{key}_time'] = str(time())
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
key = self._request_key(request)
|
||||
db = self.db
|
||||
tkey = f'{key}_time'
|
||||
if tkey not in db:
|
||||
return # not found
|
||||
|
||||
ts = db[tkey]
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return # expired
|
||||
|
||||
return pickle.loads(db[f'{key}_data'])
|
||||
|
||||
def _request_key(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
|
||||
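# DbmCacheStorage keys each entry by the request fingerprint and stores two
# records per request: '<key>_data' (the pickled response dict) and
# '<key>_time' (the store timestamp used for the HTTPCACHE_EXPIRATION_SECS
# check in _read_data above).
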
class FilesystemCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
|
||||
self._open = gzip.open if self.use_gzip else open
|
||||
|
||||
def open_spider(self, spider):
|
||||
logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
|
||||
extra={'spider': spider})
|
||||
|
||||
def close_spider(self, spider):
|
||||
pass
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
"""Return response if present in cache, or None otherwise."""
|
||||
metadata = self._read_meta(spider, request)
|
||||
if metadata is None:
|
||||
return # not cached
|
||||
rpath = self._get_request_path(spider, request)
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
|
||||
body = f.read()
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata.get('response_url')
|
||||
status = metadata['status']
|
||||
headers = Headers(headers_raw_to_dict(rawheaders))
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
"""Store the given response in the cache."""
|
||||
rpath = self._get_request_path(spider, request)
|
||||
if not os.path.exists(rpath):
|
||||
os.makedirs(rpath)
|
||||
metadata = {
|
||||
'url': request.url,
|
||||
'method': request.method,
|
||||
'status': response.status,
|
||||
'response_url': response.url,
|
||||
'timestamp': time(),
|
||||
}
|
||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(to_bytes(repr(metadata)))
|
||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=4)
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
f.write(response.body)
|
||||
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
key = request_fingerprint(request)
|
||||
return os.path.join(self.cachedir, spider.name, key[0:2], key)
|
||||
|
||||
def _read_meta(self, spider, request):
|
||||
rpath = self._get_request_path(spider, request)
|
||||
metapath = os.path.join(rpath, 'pickled_meta')
|
||||
if not os.path.exists(metapath):
|
||||
return # not found
|
||||
mtime = os.stat(metapath).st_mtime
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with self._open(metapath, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
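# FilesystemCacheStorage lays each entry out on disk as
# <HTTPCACHE_DIR>/<spider.name>/<fingerprint[0:2]>/<fingerprint>/ containing
# meta, pickled_meta, request_headers, request_body, response_headers and
# response_body (gzip-compressed when HTTPCACHE_GZIP is enabled).
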
def parse_cachecontrol(header):
|
||||
"""Parse Cache-Control header
|
||||
|
||||
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
|
||||
|
||||
>>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
|
||||
... b'max-age': b'3600'}
|
||||
True
|
||||
>>> parse_cachecontrol(b'') == {}
|
||||
True
|
||||
|
||||
"""
|
||||
directives = {}
|
||||
for directive in header.split(b','):
|
||||
key, sep, val = directive.strip().partition(b'=')
|
||||
if key:
|
||||
directives[key.lower()] = val if sep else None
|
||||
return directives
|
||||
|
||||
|
||||
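# rfc1123_to_epoch parses an HTTP date such as b'Sun, 06 Nov 1994 08:49:37 GMT'
# into a POSIX timestamp, returning None if the value is missing or malformed.
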
def rfc1123_to_epoch(date_str):
|
||||
try:
|
||||
date_str = to_unicode(date_str, encoding='ascii')
|
||||
return mktime_tz(parsedate_tz(date_str))
|
||||
except Exception:
|
||||
return None
|
||||
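A minimal settings.py sketch for exercising this extension in a Scrapy project; the values below are illustrative, not part of this commit:

# settings.py (illustrative values)
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0  # 0 means cached entries never expire
HTTPCACHE_GZIP = False
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503, 504]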