Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/extensions/httpcache.py (372 lines, new normal file)
@@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary

from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint


logger = logging.getLogger(__name__)

class DummyPolicy:

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True

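# The cache policy is selected with the HTTPCACHE_POLICY setting; DummyPolicy
# above is Scrapy's default and caches every non-ignored response without ever
# re-validating it, while RFC2616Policy below honors HTTP caching headers.
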
class RFC2616Policy:

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self._cc_parsed = WeakKeyDictionary()
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        ]

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other is eligible for caching
        return True

def should_cache_response(self, response, request):
|
||||
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
|
||||
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
|
||||
# Status code 206 is not included because cache can not deal with partial contents
|
||||
cc = self._parse_cachecontrol(response)
|
||||
# obey directive "Cache-Control: no-store"
|
||||
if b'no-store' in cc:
|
||||
return False
|
||||
# Never cache 304 (Not Modified) responses
|
||||
elif response.status == 304:
|
||||
return False
|
||||
# Cache unconditionally if configured to do so
|
||||
elif self.always_store:
|
||||
return True
|
||||
# Any hint on response expiration is good
|
||||
elif b'max-age' in cc or b'Expires' in response.headers:
|
||||
return True
|
||||
# Firefox fallbacks this statuses to one year expiration if none is set
|
||||
elif response.status in (300, 301, 308):
|
||||
return True
|
||||
# Other statuses without expiration requires at least one validator
|
||||
elif response.status in (200, 203, 401):
|
||||
return b'Last-Modified' in response.headers or b'ETag' in response.headers
|
||||
# Any other is probably not eligible for caching
|
||||
# Makes no sense to cache responses that does not contain expiration
|
||||
# info and can not be revalidated
|
||||
else:
|
||||
return False
|
||||
|
||||
def is_cached_response_fresh(self, cachedresponse, request):
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
ccreq = self._parse_cachecontrol(request)
|
||||
if b'no-cache' in cc or b'no-cache' in ccreq:
|
||||
return False
|
||||
|
||||
now = time()
|
||||
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
|
||||
currentage = self._compute_current_age(cachedresponse, request, now)
|
||||
|
||||
reqmaxage = self._get_max_age(ccreq)
|
||||
if reqmaxage is not None:
|
||||
freshnesslifetime = min(freshnesslifetime, reqmaxage)
|
||||
|
||||
if currentage < freshnesslifetime:
|
||||
return True
|
||||
|
||||
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
|
||||
# From RFC2616: "Indicates that the client is willing to
|
||||
# accept a response that has exceeded its expiration time.
|
||||
# If max-stale is assigned a value, then the client is
|
||||
# willing to accept a response that has exceeded its
|
||||
# expiration time by no more than the specified number of
|
||||
# seconds. If no value is assigned to max-stale, then the
|
||||
# client is willing to accept a stale response of any age."
|
||||
staleage = ccreq[b'max-stale']
|
||||
if staleage is None:
|
||||
return True
|
||||
|
||||
try:
|
||||
if currentage < freshnesslifetime + max(0, int(staleage)):
|
||||
return True
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Cached response is stale, try to set validators if any
|
||||
self._set_conditional_validators(request, cachedresponse)
|
||||
return False
|
||||
|
||||
def is_cached_response_valid(self, cachedresponse, response, request):
|
||||
# Use the cached response if the new response is a server error,
|
||||
# as long as the old response didn't specify must-revalidate.
|
||||
if response.status >= 500:
|
||||
cc = self._parse_cachecontrol(cachedresponse)
|
||||
if b'must-revalidate' not in cc:
|
||||
return True
|
||||
|
||||
# Use the cached response if the server says it hasn't changed.
|
||||
return response.status == 304
|
||||
|
||||
def _set_conditional_validators(self, request, cachedresponse):
|
||||
if b'Last-Modified' in cachedresponse.headers:
|
||||
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
|
||||
|
||||
if b'ETag' in cachedresponse.headers:
|
||||
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
|
||||
|
||||
def _get_max_age(self, cc):
|
||||
try:
|
||||
return max(0, int(cc[b'max-age']))
|
||||
except (KeyError, ValueError):
|
||||
return None
|
||||
|
||||
def _compute_freshness_lifetime(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
|
||||
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
|
||||
cc = self._parse_cachecontrol(response)
|
||||
maxage = self._get_max_age(cc)
|
||||
if maxage is not None:
|
||||
return maxage
|
||||
|
||||
# Parse date header or synthesize it if none exists
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
|
||||
# Try HTTP/1.0 Expires header
|
||||
if b'Expires' in response.headers:
|
||||
expires = rfc1123_to_epoch(response.headers[b'Expires'])
|
||||
# When parsing Expires header fails RFC 2616 section 14.21 says we
|
||||
# should treat this as an expiration time in the past.
|
||||
return max(0, expires - date) if expires else 0
|
||||
|
||||
# Fallback to heuristic using last-modified header
|
||||
# This is not in RFC but on Firefox caching implementation
|
||||
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
|
||||
if lastmodified and lastmodified <= date:
|
||||
return (date - lastmodified) / 10
|
||||
|
||||
# This request can be cached indefinitely
|
||||
if response.status in (300, 301, 308):
|
||||
return self.MAXAGE
|
||||
|
||||
# Insufficient information to compute fresshness lifetime
|
||||
return 0
|
||||
|
||||
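    # Example of the heuristic above: with no max-age or Expires present, a
    # response whose Last-Modified is 10 days older than its Date gets a
    # freshness lifetime of (date - lastmodified) / 10, i.e. one day.
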
def _compute_current_age(self, response, request, now):
|
||||
# Reference nsHttpResponseHead::ComputeCurrentAge
|
||||
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
|
||||
currentage = 0
|
||||
# If Date header is not set we assume it is a fast connection, and
|
||||
# clock is in sync with the server
|
||||
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
|
||||
if now > date:
|
||||
currentage = now - date
|
||||
|
||||
if b'Age' in response.headers:
|
||||
try:
|
||||
age = int(response.headers[b'Age'])
|
||||
currentage = max(currentage, age)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return currentage
|
||||
|
||||
|
||||
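# To enable RFC2616Policy, set HTTPCACHE_POLICY to
# 'scrapy.extensions.httpcache.RFC2616Policy'. The storage backends below are
# selected with HTTPCACHE_STORAGE; FilesystemCacheStorage is the default.
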
class DbmCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
|
||||
self.db = None
|
||||
|
||||
def open_spider(self, spider):
|
||||
dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
|
||||
self.db = self.dbmodule.open(dbpath, 'c')
|
||||
|
||||
logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
|
||||
|
||||
def close_spider(self, spider):
|
||||
self.db.close()
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
data = self._read_data(spider, request)
|
||||
if data is None:
|
||||
return # not cached
|
||||
url = data['url']
|
||||
status = data['status']
|
||||
headers = Headers(data['headers'])
|
||||
body = data['body']
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
key = self._request_key(request)
|
||||
data = {
|
||||
'status': response.status,
|
||||
'url': response.url,
|
||||
'headers': dict(response.headers),
|
||||
'body': response.body,
|
||||
}
|
||||
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
|
||||
self.db[f'{key}_time'] = str(time())
|
||||
|
||||
def _read_data(self, spider, request):
|
||||
key = self._request_key(request)
|
||||
db = self.db
|
||||
tkey = f'{key}_time'
|
||||
if tkey not in db:
|
||||
return # not found
|
||||
|
||||
ts = db[tkey]
|
||||
if 0 < self.expiration_secs < time() - float(ts):
|
||||
return # expired
|
||||
|
||||
return pickle.loads(db[f'{key}_data'])
|
||||
|
||||
def _request_key(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
|
||||
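# DbmCacheStorage keys each entry by the request fingerprint and stores two
# records per request: '<key>_data' (the pickled response dict) and
# '<key>_time' (the store timestamp used for the HTTPCACHE_EXPIRATION_SECS
# check in _read_data above).
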
class FilesystemCacheStorage:
|
||||
|
||||
def __init__(self, settings):
|
||||
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
|
||||
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
|
||||
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
|
||||
self._open = gzip.open if self.use_gzip else open
|
||||
|
||||
def open_spider(self, spider):
|
||||
logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
|
||||
extra={'spider': spider})
|
||||
|
||||
def close_spider(self, spider):
|
||||
pass
|
||||
|
||||
def retrieve_response(self, spider, request):
|
||||
"""Return response if present in cache, or None otherwise."""
|
||||
metadata = self._read_meta(spider, request)
|
||||
if metadata is None:
|
||||
return # not cached
|
||||
rpath = self._get_request_path(spider, request)
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
|
||||
body = f.read()
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
|
||||
rawheaders = f.read()
|
||||
url = metadata.get('response_url')
|
||||
status = metadata['status']
|
||||
headers = Headers(headers_raw_to_dict(rawheaders))
|
||||
respcls = responsetypes.from_args(headers=headers, url=url)
|
||||
response = respcls(url=url, headers=headers, status=status, body=body)
|
||||
return response
|
||||
|
||||
def store_response(self, spider, request, response):
|
||||
"""Store the given response in the cache."""
|
||||
rpath = self._get_request_path(spider, request)
|
||||
if not os.path.exists(rpath):
|
||||
os.makedirs(rpath)
|
||||
metadata = {
|
||||
'url': request.url,
|
||||
'method': request.method,
|
||||
'status': response.status,
|
||||
'response_url': response.url,
|
||||
'timestamp': time(),
|
||||
}
|
||||
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
|
||||
f.write(to_bytes(repr(metadata)))
|
||||
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
|
||||
pickle.dump(metadata, f, protocol=4)
|
||||
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(response.headers))
|
||||
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
|
||||
f.write(response.body)
|
||||
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
|
||||
f.write(headers_dict_to_raw(request.headers))
|
||||
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
|
||||
f.write(request.body)
|
||||
|
||||
def _get_request_path(self, spider, request):
|
||||
key = request_fingerprint(request)
|
||||
return os.path.join(self.cachedir, spider.name, key[0:2], key)
|
||||
|
||||
def _read_meta(self, spider, request):
|
||||
rpath = self._get_request_path(spider, request)
|
||||
metapath = os.path.join(rpath, 'pickled_meta')
|
||||
if not os.path.exists(metapath):
|
||||
return # not found
|
||||
mtime = os.stat(metapath).st_mtime
|
||||
if 0 < self.expiration_secs < time() - mtime:
|
||||
return # expired
|
||||
with self._open(metapath, 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
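# FilesystemCacheStorage lays each entry out on disk as
# <HTTPCACHE_DIR>/<spider.name>/<fingerprint[0:2]>/<fingerprint>/ containing
# meta, pickled_meta, request_headers, request_body, response_headers and
# response_body (gzip-compressed when HTTPCACHE_GZIP is enabled).
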
def parse_cachecontrol(header):
|
||||
"""Parse Cache-Control header
|
||||
|
||||
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
|
||||
|
||||
>>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
|
||||
... b'max-age': b'3600'}
|
||||
True
|
||||
>>> parse_cachecontrol(b'') == {}
|
||||
True
|
||||
|
||||
"""
|
||||
directives = {}
|
||||
for directive in header.split(b','):
|
||||
key, sep, val = directive.strip().partition(b'=')
|
||||
if key:
|
||||
directives[key.lower()] = val if sep else None
|
||||
return directives
|
||||
|
||||
|
||||
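# rfc1123_to_epoch parses an HTTP date such as b'Sun, 06 Nov 1994 08:49:37 GMT'
# into a POSIX timestamp, returning None if the value is missing or malformed.
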
def rfc1123_to_epoch(date_str):
|
||||
try:
|
||||
date_str = to_unicode(date_str, encoding='ascii')
|
||||
return mktime_tz(parsedate_tz(date_str))
|
||||
except Exception:
|
||||
return None
|
||||
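A minimal settings.py sketch for exercising this extension in a Scrapy project; the values below are illustrative, not part of this commit:

# settings.py (illustrative values)
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0  # 0 means cached entries never expire
HTTPCACHE_GZIP = False
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503, 504]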