Output of the new DB entries

This commit is contained in:
hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@@ -0,0 +1,68 @@
"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.
See documentation in docs/topics/extensions.rst
"""
from collections import defaultdict
from scrapy import signals
from scrapy.exceptions import NotConfigured
class CloseSpider:
def __init__(self, crawler):
self.crawler = crawler
self.close_on = {
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
}
if not any(self.close_on.values()):
raise NotConfigured
self.counter = defaultdict(int)
if self.close_on.get('errorcount'):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
if self.close_on.get('pagecount'):
crawler.signals.connect(self.page_count, signal=signals.response_received)
if self.close_on.get('timeout'):
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
if self.close_on.get('itemcount'):
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def error_count(self, failure, response, spider):
self.counter['errorcount'] += 1
if self.counter['errorcount'] == self.close_on['errorcount']:
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
def page_count(self, response, request, spider):
self.counter['pagecount'] += 1
if self.counter['pagecount'] == self.close_on['pagecount']:
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
def spider_opened(self, spider):
from twisted.internet import reactor
self.task = reactor.callLater(self.close_on['timeout'],
self.crawler.engine.close_spider, spider,
reason='closespider_timeout')
def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
def spider_closed(self, spider):
task = getattr(self, 'task', False)
if task and task.active():
task.cancel()
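
Each close condition above is driven by a numeric setting; a zero or missing value leaves that condition disconnected, and if all four are unset the extension raises NotConfigured. A minimal sketch of the corresponding project settings (values are illustrative):

CLOSESPIDER_TIMEOUT = 3600      # seconds after spider_opened before forcing a close
CLOSESPIDER_ITEMCOUNT = 1000    # close once this many items have been scraped
CLOSESPIDER_PAGECOUNT = 0       # 0 disables the page-count condition
CLOSESPIDER_ERRORCOUNT = 10     # close after this many spider errors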


@@ -0,0 +1,46 @@
"""
Extension for collecting core stats like items scraped and start/finish times
"""
from datetime import datetime
from scrapy import signals
class CoreStats:
def __init__(self, stats):
self.stats = stats
self.start_time = None
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
crawler.signals.connect(o.response_received, signal=signals.response_received)
return o
def spider_opened(self, spider):
self.start_time = datetime.utcnow()
self.stats.set_value('start_time', self.start_time, spider=spider)
def spider_closed(self, spider, reason):
finish_time = datetime.utcnow()
elapsed_time = finish_time - self.start_time
elapsed_time_seconds = elapsed_time.total_seconds()
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
self.stats.set_value('finish_time', finish_time, spider=spider)
self.stats.set_value('finish_reason', reason, spider=spider)
def item_scraped(self, item, spider):
self.stats.inc_value('item_scraped_count', spider=spider)
def response_received(self, spider):
self.stats.inc_value('response_received_count', spider=spider)
def item_dropped(self, item, spider, exception):
reason = exception.__class__.__name__
self.stats.inc_value('item_dropped_count', spider=spider)
self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
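
The keys written by CoreStats (start_time, finish_time, finish_reason, elapsed_time_seconds, item_scraped_count, item_dropped_count, item_dropped_reasons_count/<reason>, response_received_count) can be read back through the same stats object. A minimal sketch of a hypothetical helper that summarizes them after a run:

def summarize(stats):
    # stats is the crawler.stats collector used above; get_value mirrors set_value/inc_value
    elapsed = stats.get_value('elapsed_time_seconds') or 0.0
    scraped = stats.get_value('item_scraped_count', 0)
    reason = stats.get_value('finish_reason')
    return f"{scraped} items in {elapsed:.1f}s (finish_reason: {reason})"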


@@ -0,0 +1,64 @@
"""
Extensions for debugging Scrapy
See documentation in docs/topics/extensions.rst
"""
import sys
import signal
import logging
import traceback
import threading
from pdb import Pdb
from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs
logger = logging.getLogger(__name__)
class StackTraceDump:
def __init__(self, crawler=None):
self.crawler = crawler
try:
signal.signal(signal.SIGUSR2, self.dump_stacktrace)
signal.signal(signal.SIGQUIT, self.dump_stacktrace)
except AttributeError:
# win32 platforms don't support SIGUSR signals
pass
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def dump_stacktrace(self, signum, frame):
log_args = {
'stackdumps': self._thread_stacks(),
'enginestatus': format_engine_status(self.crawler.engine),
'liverefs': format_live_refs(),
}
logger.info("Dumping stack trace and engine status\n"
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
log_args, extra={'crawler': self.crawler})
def _thread_stacks(self):
id2name = dict((th.ident, th.name) for th in threading.enumerate())
dumps = ''
for id_, frame in sys._current_frames().items():
name = id2name.get(id_, '')
dump = ''.join(traceback.format_stack(frame))
dumps += f"# Thread: {name}({id_})\n{dump}\n"
return dumps
class Debugger:
def __init__(self):
try:
signal.signal(signal.SIGUSR2, self._enter_debugger)
except AttributeError:
# win32 platforms don't support SIGUSR signals
pass
def _enter_debugger(self, signum, frame):
Pdb().set_trace(frame.f_back)
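
Both extensions react to POSIX signals, so the dump or debugger can be triggered from outside the running process (not on win32, as noted above). A minimal sketch; the pid value is a placeholder you would look up yourself:

import os
import signal

pid = 12345  # placeholder: pid of the running Scrapy process
os.kill(pid, signal.SIGQUIT)   # StackTraceDump logs thread stacks, engine status and live refs
os.kill(pid, signal.SIGUSR2)   # registered by both classes above; the handler installed last wins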


@@ -0,0 +1,480 @@
"""
Feed Exports extension
See documentation in docs/topics/feed-exports.rst
"""
import logging
import os
import re
import sys
import warnings
from datetime import datetime
from tempfile import NamedTemporaryFile
from urllib.parse import unquote, urlparse
from twisted.internet import defer, threads
from w3lib.url import file_uri_to_path
from zope.interface import implementer, Interface
from scrapy import signals
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.conf import feed_complete_default_values_from_settings
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import get_func_args, without_none_values
logger = logging.getLogger(__name__)
def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
argument_names = get_func_args(builder)
if 'feed_options' in argument_names:
kwargs['feed_options'] = feed_options
else:
warnings.warn(
"{} does not support the 'feed_options' keyword argument. Add a "
"'feed_options' parameter to its signature to remove this "
"warning. This parameter will become mandatory in a future "
"version of Scrapy."
.format(builder.__qualname__),
category=ScrapyDeprecationWarning
)
return builder(*preargs, uri, *args, **kwargs)
class IFeedStorage(Interface):
"""Interface that all Feed Storages must implement"""
def __init__(uri, *, feed_options=None):
"""Initialize the storage with the parameters given in the URI and the
feed-specific options (see :setting:`FEEDS`)"""
def open(spider):
"""Open the storage for the given spider. It must return a file-like
object that will be used for the exporters"""
def store(file):
"""Store the given file stream"""
@implementer(IFeedStorage)
class BlockingFeedStorage:
def open(self, spider):
path = spider.crawler.settings['FEED_TEMPDIR']
if path and not os.path.isdir(path):
raise OSError('Not a Directory: ' + str(path))
return NamedTemporaryFile(prefix='feed-', dir=path)
def store(self, file):
return threads.deferToThread(self._store_in_thread, file)
def _store_in_thread(self, file):
raise NotImplementedError
@implementer(IFeedStorage)
class StdoutFeedStorage:
def __init__(self, uri, _stdout=None, *, feed_options=None):
if not _stdout:
_stdout = sys.stdout.buffer
self._stdout = _stdout
if feed_options and feed_options.get('overwrite', False) is True:
logger.warning('Standard output (stdout) storage does not support '
'overwriting. To suppress this warning, remove the '
'overwrite option from your FEEDS setting, or set '
'it to False.')
def open(self, spider):
return self._stdout
def store(self, file):
pass
@implementer(IFeedStorage)
class FileFeedStorage:
def __init__(self, uri, *, feed_options=None):
self.path = file_uri_to_path(uri)
feed_options = feed_options or {}
self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'
def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
return open(self.path, self.write_mode)
def store(self, file):
file.close()
class S3FeedStorage(BlockingFeedStorage):
def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
feed_options=None):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
u = urlparse(uri)
self.bucketname = u.hostname
self.access_key = u.username or access_key
self.secret_key = u.password or secret_key
self.keyname = u.path[1:] # remove first "/"
self.acl = acl
import botocore.session
session = botocore.session.get_session()
self.s3_client = session.create_client(
's3', aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key)
if feed_options and feed_options.get('overwrite', True) is False:
logger.warning('S3 does not support appending to files. To '
'suppress this warning, remove the overwrite '
'option from your FEEDS setting or set it to True.')
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
feed_options=feed_options,
)
def _store_in_thread(self, file):
file.seek(0)
kwargs = {'ACL': self.acl} if self.acl else {}
self.s3_client.put_object(
Bucket=self.bucketname, Key=self.keyname, Body=file,
**kwargs)
file.close()
class GCSFeedStorage(BlockingFeedStorage):
def __init__(self, uri, project_id, acl):
self.project_id = project_id
self.acl = acl
u = urlparse(uri)
self.bucket_name = u.hostname
self.blob_name = u.path[1:] # remove first "/"
@classmethod
def from_crawler(cls, crawler, uri):
return cls(
uri,
crawler.settings['GCS_PROJECT_ID'],
crawler.settings['FEED_STORAGE_GCS_ACL'] or None
)
def _store_in_thread(self, file):
file.seek(0)
from google.cloud.storage import Client
client = Client(project=self.project_id)
bucket = client.get_bucket(self.bucket_name)
blob = bucket.blob(self.blob_name)
blob.upload_from_file(file, predefined_acl=self.acl)
class FTPFeedStorage(BlockingFeedStorage):
def __init__(self, uri, use_active_mode=False, *, feed_options=None):
u = urlparse(uri)
self.host = u.hostname
self.port = int(u.port or '21')
self.username = u.username
self.password = unquote(u.password or '')
self.path = u.path
self.use_active_mode = use_active_mode
self.overwrite = not feed_options or feed_options.get('overwrite', True)
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
feed_options=feed_options,
)
def _store_in_thread(self, file):
ftp_store_file(
path=self.path, file=file, host=self.host,
port=self.port, username=self.username,
password=self.password, use_active_mode=self.use_active_mode,
overwrite=self.overwrite,
)
class _FeedSlot:
def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template):
self.file = file
self.exporter = exporter
self.storage = storage
# feed params
self.batch_id = batch_id
self.format = format
self.store_empty = store_empty
self.uri_template = uri_template
self.uri = uri
# flags
self.itemcount = 0
self._exporting = False
def start_exporting(self):
if not self._exporting:
self.exporter.start_exporting()
self._exporting = True
def finish_exporting(self):
if self._exporting:
self.exporter.finish_exporting()
self._exporting = False
class FeedExporter:
@classmethod
def from_crawler(cls, crawler):
exporter = cls(crawler)
crawler.signals.connect(exporter.open_spider, signals.spider_opened)
crawler.signals.connect(exporter.close_spider, signals.spider_closed)
crawler.signals.connect(exporter.item_scraped, signals.item_scraped)
return exporter
def __init__(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
self.feeds = {}
self.slots = []
if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
raise NotConfigured
# Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
if self.settings['FEED_URI']:
warnings.warn(
'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
category=ScrapyDeprecationWarning, stacklevel=2,
)
uri = str(self.settings['FEED_URI']) # handle pathlib.Path objects
feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
# End: Backward compatibility for FEED_URI and FEED_FORMAT settings
# 'FEEDS' setting takes precedence over 'FEED_URI'
for uri, feed_options in self.settings.getdict('FEEDS').items():
uri = str(uri) # handle pathlib.Path objects
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
self.storages = self._load_components('FEED_STORAGES')
self.exporters = self._load_components('FEED_EXPORTERS')
for uri, feed_options in self.feeds.items():
if not self._storage_supported(uri, feed_options):
raise NotConfigured
if not self._settings_are_valid():
raise NotConfigured
if not self._exporter_supported(feed_options['format']):
raise NotConfigured
def open_spider(self, spider):
for uri, feed_options in self.feeds.items():
uri_params = self._get_uri_params(spider, feed_options['uri_params'])
self.slots.append(self._start_new_batch(
batch_id=1,
uri=uri % uri_params,
feed_options=feed_options,
spider=spider,
uri_template=uri,
))
def close_spider(self, spider):
deferred_list = []
for slot in self.slots:
d = self._close_slot(slot, spider)
deferred_list.append(d)
return defer.DeferredList(deferred_list) if deferred_list else None
def _close_slot(self, slot, spider):
if not slot.itemcount and not slot.store_empty:
# We need to call slot.storage.store nonetheless to get the file
# properly closed.
return defer.maybeDeferred(slot.storage.store, slot.file)
slot.finish_exporting()
logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
log_args = {'format': slot.format,
'itemcount': slot.itemcount,
'uri': slot.uri}
d = defer.maybeDeferred(slot.storage.store, slot.file)
# Use `largs=log_args` to copy log_args into function's scope
# instead of using `log_args` from the outer scope
d.addCallback(
lambda _, largs=log_args: logger.info(
logfmt % "Stored", largs, extra={'spider': spider}
)
)
d.addErrback(
lambda f, largs=log_args: logger.error(
logfmt % "Error storing", largs,
exc_info=failure_to_exc_info(f), extra={'spider': spider}
)
)
return d
def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
"""
Redirect the output data stream to a new file.
Executed multiple times if the FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
:param batch_id: sequence number of current batch
:param uri: uri of the new batch to start
:param feed_options: dict with parameters of feed
:param spider: user spider
:param uri_template: URI template containing %(batch_time)s or %(batch_id)d, used to build the next batch URI
"""
storage = self._get_storage(uri, feed_options)
file = storage.open(spider)
exporter = self._get_exporter(
file=file,
format=feed_options['format'],
fields_to_export=feed_options['fields'],
encoding=feed_options['encoding'],
indent=feed_options['indent'],
**feed_options['item_export_kwargs'],
)
slot = _FeedSlot(
file=file,
exporter=exporter,
storage=storage,
uri=uri,
format=feed_options['format'],
store_empty=feed_options['store_empty'],
batch_id=batch_id,
uri_template=uri_template,
)
if slot.store_empty:
slot.start_exporting()
return slot
def item_scraped(self, item, spider):
slots = []
for slot in self.slots:
slot.start_exporting()
slot.exporter.export_item(item)
slot.itemcount += 1
# create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
if (
self.feeds[slot.uri_template]['batch_item_count']
and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
):
uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
self._close_slot(slot, spider)
slots.append(self._start_new_batch(
batch_id=slot.batch_id + 1,
uri=slot.uri_template % uri_params,
feed_options=self.feeds[slot.uri_template],
spider=spider,
uri_template=slot.uri_template,
))
else:
slots.append(slot)
self.slots = slots
def _load_components(self, setting_prefix):
conf = without_none_values(self.settings.getwithbase(setting_prefix))
d = {}
for k, v in conf.items():
try:
d[k] = load_object(v)
except NotConfigured:
pass
return d
def _exporter_supported(self, format):
if format in self.exporters:
return True
logger.error("Unknown feed format: %(format)s", {'format': format})
def _settings_are_valid(self):
"""
If the FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified, the URI has to contain
%(batch_time)s or %(batch_id)d to distinguish the partial output files
"""
for uri_template, values in self.feeds.items():
if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
logger.error(
'%(batch_time)s or %(batch_id)d must be in the feed URI ({}) if FEED_EXPORT_BATCH_ITEM_COUNT '
'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count'
''.format(uri_template)
)
return False
return True
def _storage_supported(self, uri, feed_options):
scheme = urlparse(uri).scheme
if scheme in self.storages:
try:
self._get_storage(uri, feed_options)
return True
except NotConfigured as e:
logger.error("Disabled feed storage scheme: %(scheme)s. "
"Reason: %(reason)s",
{'scheme': scheme, 'reason': str(e)})
else:
logger.error("Unknown feed storage scheme: %(scheme)s",
{'scheme': scheme})
def _get_instance(self, objcls, *args, **kwargs):
return create_instance(
objcls, self.settings, getattr(self, 'crawler', None),
*args, **kwargs)
def _get_exporter(self, file, format, *args, **kwargs):
return self._get_instance(self.exporters[format], file, *args, **kwargs)
def _get_storage(self, uri, feed_options):
"""Fork of create_instance specific to feed storage classes
It supports not passing the *feed_options* parameter to classes that
do not support it, issuing a deprecation warning instead.
"""
feedcls = self.storages[urlparse(uri).scheme]
crawler = getattr(self, 'crawler', None)
def build_instance(builder, *preargs):
return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
if crawler and hasattr(feedcls, 'from_crawler'):
instance = build_instance(feedcls.from_crawler, crawler)
method_name = 'from_crawler'
elif hasattr(feedcls, 'from_settings'):
instance = build_instance(feedcls.from_settings, self.settings)
method_name = 'from_settings'
else:
instance = build_instance(feedcls)
method_name = '__new__'
if instance is None:
raise TypeError("%s.%s returned None" % (feedcls.__qualname__, method_name))
return instance
def _get_uri_params(self, spider, uri_params, slot=None):
params = {}
for k in dir(spider):
params[k] = getattr(spider, k)
utc_now = datetime.utcnow()
params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
params['batch_time'] = utc_now.isoformat().replace(':', '-')
params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
uripar_function = load_object(uri_params) if uri_params else lambda x, y: None
uripar_function(params, spider)
return params
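
A minimal sketch of a FEEDS setting that exercises the batching path above; the option keys mirror the feed_options read by FeedExporter, the URI placeholders come from _get_uri_params, and %(batch_id)d satisfies the check in _settings_are_valid (values are illustrative, assuming the stock JSON exporter is registered under 'json'):

FEEDS = {
    'exports/%(name)s/items-%(batch_id)d-%(batch_time)s.json': {
        'format': 'json',
        'encoding': 'utf8',
        'store_empty': False,
        'batch_item_count': 100,   # a new batch file is started every 100 items
        'overwrite': True,         # honoured by the file, S3 and FTP storages above
    },
}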


@@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint
logger = logging.getLogger(__name__)
class DummyPolicy:
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]
def should_cache_request(self, request):
return urlparse_cached(request).scheme not in self.ignore_schemes
def should_cache_response(self, response, request):
return response.status not in self.ignore_http_codes
def is_cached_response_fresh(self, cachedresponse, request):
return True
def is_cached_response_valid(self, cachedresponse, response, request):
return True
class RFC2616Policy:
MAXAGE = 3600 * 24 * 365 # one year
def __init__(self, settings):
self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self._cc_parsed = WeakKeyDictionary()
self.ignore_response_cache_controls = [
to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
]
def _parse_cachecontrol(self, r):
if r not in self._cc_parsed:
cch = r.headers.get(b'Cache-Control', b'')
parsed = parse_cachecontrol(cch)
if isinstance(r, Response):
for key in self.ignore_response_cache_controls:
parsed.pop(key, None)
self._cc_parsed[r] = parsed
return self._cc_parsed[r]
def should_cache_request(self, request):
if urlparse_cached(request).scheme in self.ignore_schemes:
return False
cc = self._parse_cachecontrol(request)
# obey user-agent directive "Cache-Control: no-store"
if b'no-store' in cc:
return False
# Any other is eligible for caching
return True
def should_cache_response(self, response, request):
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
# Status code 206 is not included because the cache cannot deal with partial content
cc = self._parse_cachecontrol(response)
# obey directive "Cache-Control: no-store"
if b'no-store' in cc:
return False
# Never cache 304 (Not Modified) responses
elif response.status == 304:
return False
# Cache unconditionally if configured to do so
elif self.always_store:
return True
# Any hint on response expiration is good
elif b'max-age' in cc or b'Expires' in response.headers:
return True
# Firefox falls back to a one-year expiration for these statuses if none is set
elif response.status in (300, 301, 308):
return True
# Other statuses without expiration info require at least one validator
elif response.status in (200, 203, 401):
return b'Last-Modified' in response.headers or b'ETag' in response.headers
# Anything else is probably not eligible for caching; it makes no sense
# to cache responses that do not contain expiration info and cannot be
# revalidated
else:
return False
def is_cached_response_fresh(self, cachedresponse, request):
cc = self._parse_cachecontrol(cachedresponse)
ccreq = self._parse_cachecontrol(request)
if b'no-cache' in cc or b'no-cache' in ccreq:
return False
now = time()
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
currentage = self._compute_current_age(cachedresponse, request, now)
reqmaxage = self._get_max_age(ccreq)
if reqmaxage is not None:
freshnesslifetime = min(freshnesslifetime, reqmaxage)
if currentage < freshnesslifetime:
return True
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
# From RFC2616: "Indicates that the client is willing to
# accept a response that has exceeded its expiration time.
# If max-stale is assigned a value, then the client is
# willing to accept a response that has exceeded its
# expiration time by no more than the specified number of
# seconds. If no value is assigned to max-stale, then the
# client is willing to accept a stale response of any age."
staleage = ccreq[b'max-stale']
if staleage is None:
return True
try:
if currentage < freshnesslifetime + max(0, int(staleage)):
return True
except ValueError:
pass
# Cached response is stale, try to set validators if any
self._set_conditional_validators(request, cachedresponse)
return False
def is_cached_response_valid(self, cachedresponse, response, request):
# Use the cached response if the new response is a server error,
# as long as the old response didn't specify must-revalidate.
if response.status >= 500:
cc = self._parse_cachecontrol(cachedresponse)
if b'must-revalidate' not in cc:
return True
# Use the cached response if the server says it hasn't changed.
return response.status == 304
def _set_conditional_validators(self, request, cachedresponse):
if b'Last-Modified' in cachedresponse.headers:
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
if b'ETag' in cachedresponse.headers:
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
def _get_max_age(self, cc):
try:
return max(0, int(cc[b'max-age']))
except (KeyError, ValueError):
return None
def _compute_freshness_lifetime(self, response, request, now):
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
cc = self._parse_cachecontrol(response)
maxage = self._get_max_age(cc)
if maxage is not None:
return maxage
# Parse date header or synthesize it if none exists
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
# Try HTTP/1.0 Expires header
if b'Expires' in response.headers:
expires = rfc1123_to_epoch(response.headers[b'Expires'])
# When parsing Expires header fails RFC 2616 section 14.21 says we
# should treat this as an expiration time in the past.
return max(0, expires - date) if expires else 0
# Fall back to a heuristic using the Last-Modified header.
# This is not in the RFC, but mirrors Firefox's caching implementation
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
if lastmodified and lastmodified <= date:
return (date - lastmodified) / 10
# This request can be cached indefinitely
if response.status in (300, 301, 308):
return self.MAXAGE
# Insufficient information to compute the freshness lifetime
return 0
def _compute_current_age(self, response, request, now):
# Reference nsHttpResponseHead::ComputeCurrentAge
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
currentage = 0
# If Date header is not set we assume it is a fast connection, and
# clock is in sync with the server
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
if now > date:
currentage = now - date
if b'Age' in response.headers:
try:
age = int(response.headers[b'Age'])
currentage = max(currentage, age)
except ValueError:
pass
return currentage
class DbmCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
self.db = None
def open_spider(self, spider):
dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
self.db = self.dbmodule.open(dbpath, 'c')
logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
def close_spider(self, spider):
self.db.close()
def retrieve_response(self, spider, request):
data = self._read_data(spider, request)
if data is None:
return # not cached
url = data['url']
status = data['status']
headers = Headers(data['headers'])
body = data['body']
respcls = responsetypes.from_args(headers=headers, url=url)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
def store_response(self, spider, request, response):
key = self._request_key(request)
data = {
'status': response.status,
'url': response.url,
'headers': dict(response.headers),
'body': response.body,
}
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
self.db[f'{key}_time'] = str(time())
def _read_data(self, spider, request):
key = self._request_key(request)
db = self.db
tkey = f'{key}_time'
if tkey not in db:
return # not found
ts = db[tkey]
if 0 < self.expiration_secs < time() - float(ts):
return # expired
return pickle.loads(db[f'{key}_data'])
def _request_key(self, request):
return request_fingerprint(request)
class FilesystemCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
self._open = gzip.open if self.use_gzip else open
def open_spider(self, spider):
logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
extra={'spider': spider})
def close_spider(self, spider):
pass
def retrieve_response(self, spider, request):
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is None:
return # not cached
rpath = self._get_request_path(spider, request)
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
body = f.read()
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
rawheaders = f.read()
url = metadata.get('response_url')
status = metadata['status']
headers = Headers(headers_raw_to_dict(rawheaders))
respcls = responsetypes.from_args(headers=headers, url=url)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
def store_response(self, spider, request, response):
"""Store the given response in the cache."""
rpath = self._get_request_path(spider, request)
if not os.path.exists(rpath):
os.makedirs(rpath)
metadata = {
'url': request.url,
'method': request.method,
'status': response.status,
'response_url': response.url,
'timestamp': time(),
}
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
f.write(to_bytes(repr(metadata)))
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
pickle.dump(metadata, f, protocol=4)
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
f.write(headers_dict_to_raw(response.headers))
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
f.write(response.body)
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
f.write(headers_dict_to_raw(request.headers))
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
f.write(request.body)
def _get_request_path(self, spider, request):
key = request_fingerprint(request)
return os.path.join(self.cachedir, spider.name, key[0:2], key)
def _read_meta(self, spider, request):
rpath = self._get_request_path(spider, request)
metapath = os.path.join(rpath, 'pickled_meta')
if not os.path.exists(metapath):
return # not found
mtime = os.stat(metapath).st_mtime
if 0 < self.expiration_secs < time() - mtime:
return # expired
with self._open(metapath, 'rb') as f:
return pickle.load(f)
def parse_cachecontrol(header):
"""Parse Cache-Control header
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
>>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
... b'max-age': b'3600'}
True
>>> parse_cachecontrol(b'') == {}
True
"""
directives = {}
for directive in header.split(b','):
key, sep, val = directive.strip().partition(b'=')
if key:
directives[key.lower()] = val if sep else None
return directives
def rfc1123_to_epoch(date_str):
try:
date_str = to_unicode(date_str, encoding='ascii')
return mktime_tz(parsedate_tz(date_str))
except Exception:
return None
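
The two module-level helpers can be exercised on their own. A minimal sketch, assuming this file is importable under its usual Scrapy path (scrapy.extensions.httpcache); the date value is illustrative:

from time import time

from scrapy.extensions.httpcache import parse_cachecontrol, rfc1123_to_epoch  # assumed path

cc = parse_cachecontrol(b'public, max-age=3600, no-transform')
# cc == {b'public': None, b'max-age': b'3600', b'no-transform': None}

expires = rfc1123_to_epoch(b'Sun, 02 Jan 2022 21:50:48 GMT')
remaining = max(0, (expires or time()) - time())   # seconds of freshness left, 0 if already expired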


@@ -0,0 +1,52 @@
import logging
from twisted.internet import task
from scrapy.exceptions import NotConfigured
from scrapy import signals
logger = logging.getLogger(__name__)
class LogStats:
"""Log basic scraping stats periodically"""
def __init__(self, stats, interval=60.0):
self.stats = stats
self.interval = interval
self.multiplier = 60.0 / self.interval
self.task = None
@classmethod
def from_crawler(cls, crawler):
interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
if not interval:
raise NotConfigured
o = cls(crawler.stats, interval)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_opened(self, spider):
self.pagesprev = 0
self.itemsprev = 0
self.task = task.LoopingCall(self.log, spider)
self.task.start(self.interval)
def log(self, spider):
items = self.stats.get_value('item_scraped_count', 0)
pages = self.stats.get_value('response_received_count', 0)
irate = (items - self.itemsprev) * self.multiplier
prate = (pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = pages, items
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
"scraped %(items)d items (at %(itemrate)d items/min)")
log_args = {'pages': pages, 'pagerate': prate,
'items': items, 'itemrate': irate}
logger.info(msg, log_args, extra={'spider': spider})
def spider_closed(self, spider, reason):
if self.task and self.task.running:
self.task.stop()
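
The multiplier converts per-interval deltas into per-minute rates. A quick check of the arithmetic with the values LogStats would see (numbers are illustrative):

interval = 60.0
multiplier = 60.0 / interval                 # 1.0 when the interval is one minute
items, itemsprev = 250, 130
pages, pagesprev = 480, 400
irate = (items - itemsprev) * multiplier     # 120 items/min since the previous log line
prate = (pages - pagesprev) * multiplier     # 80 pages/min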


@@ -0,0 +1,33 @@
"""
MemoryDebugger extension
See documentation in docs/topics/extensions.rst
"""
import gc
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs
class MemoryDebugger:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
raise NotConfigured
o = cls(crawler.stats)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider, reason):
gc.collect()
self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
for cls, wdict in live_refs.items():
if not wdict:
continue
self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)
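
The extension is gated on a single boolean setting and only writes its memdebug/* stats when the spider closes. A minimal sketch of enabling it in a project's settings.py:

MEMDEBUG_ENABLED = True   # activates MemoryDebugger; results land under memdebug/* in the stats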


@@ -0,0 +1,126 @@
"""
MemoryUsage extension
See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module
from twisted.internet import task
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status
logger = logging.getLogger(__name__)
class MemoryUsage:
def __init__(self, crawler):
if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
raise NotConfigured
try:
# stdlib's resource module is only available on unix platforms.
self.resource = import_module('resource')
except ImportError:
raise NotConfigured
self.crawler = crawler
self.warned = False
self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
self.mail = MailSender.from_settings(crawler.settings)
crawler.signals.connect(self.engine_started, signal=signals.engine_started)
crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def get_virtual_size(self):
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
if sys.platform != 'darwin':
# on macOS ru_maxrss is in bytes, on Linux it is in KB
size *= 1024
return size
def engine_started(self):
self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
self.tasks = []
tsk = task.LoopingCall(self.update)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
if self.limit:
tsk = task.LoopingCall(self._check_limit)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
if self.warning:
tsk = task.LoopingCall(self._check_warning)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
def engine_stopped(self):
for tsk in self.tasks:
if tsk.running:
tsk.stop()
def update(self):
self.crawler.stats.max_value('memusage/max', self.get_virtual_size())
def _check_limit(self):
if self.get_virtual_size() > self.limit:
self.crawler.stats.set_value('memusage/limit_reached', 1)
mem = self.limit/1024/1024
logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} terminated: "
f"memory usage exceeded {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/limit_notified', 1)
open_spiders = self.crawler.engine.open_spiders
if open_spiders:
for spider in open_spiders:
self.crawler.engine.close_spider(spider, 'memusage_exceeded')
else:
self.crawler.stop()
def _check_warning(self):
if self.warned: # warn only once
return
if self.get_virtual_size() > self.warning:
self.crawler.stats.set_value('memusage/warning_reached', 1)
mem = self.warning/1024/1024
logger.warning("Memory usage reached %(memusage)dM",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} warning: "
f"memory usage reached {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/warning_notified', 1)
self.warned = True
def _send_report(self, rcpts, subject):
"""send notification mail with some additional useful info"""
stats = self.crawler.stats
s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
s += "\r\n"
s += pformat(get_engine_status(self.crawler.engine))
s += "\r\n"
self.mail.send(rcpts, subject, s)
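
A minimal sketch of the settings this extension reads (values and address are illustrative):

MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                      # hard limit; spiders are closed with 'memusage_exceeded'
MEMUSAGE_WARNING_MB = 1536                    # soft limit; warns (and mails) only once
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0        # how often the LoopingCall tasks run
MEMUSAGE_NOTIFY_MAIL = ['ops@example.com']    # optional recipients for the report mail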


@@ -0,0 +1,40 @@
import os
import pickle
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir
class SpiderState:
"""Store and load spider state during a scraping job"""
def __init__(self, jobdir=None):
self.jobdir = jobdir
@classmethod
def from_crawler(cls, crawler):
jobdir = job_dir(crawler.settings)
if not jobdir:
raise NotConfigured
obj = cls(jobdir)
crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
return obj
def spider_closed(self, spider):
if self.jobdir:
with open(self.statefn, 'wb') as f:
pickle.dump(spider.state, f, protocol=4)
def spider_opened(self, spider):
if self.jobdir and os.path.exists(self.statefn):
with open(self.statefn, 'rb') as f:
spider.state = pickle.load(f)
else:
spider.state = {}
@property
def statefn(self):
return os.path.join(self.jobdir, 'spider.state')
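
Because SpiderState assigns spider.state on open and pickles it on close, a spider can keep simple data across interrupted runs. A minimal sketch, assuming a job directory is configured so job_dir() returns a path (the URL is a placeholder):

import scrapy

class CountingSpider(scrapy.Spider):
    name = 'counting'
    start_urls = ['https://example.com']   # placeholder

    def parse(self, response):
        # self.state is injected by SpiderState.spider_opened and persisted by spider_closed
        self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1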


@@ -0,0 +1,34 @@
"""
StatsMailer extension sends an email when a spider finishes scraping.
Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
"""
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
class StatsMailer:
def __init__(self, stats, recipients, mail):
self.stats = stats
self.recipients = recipients
self.mail = mail
@classmethod
def from_crawler(cls, crawler):
recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
if not recipients:
raise NotConfigured
mail = MailSender.from_settings(crawler.settings)
o = cls(crawler.stats, recipients, mail)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider):
spider_stats = self.stats.get_stats(spider)
body = "Global stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
body += f"\n\n{spider.name} stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)
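
The extension is enabled simply by listing recipients; MailSender.from_settings picks the mail transport up from the same crawler settings. A minimal sketch (address is illustrative):

STATSMAILER_RCPTS = ['stats@example.com']   # non-empty list enables StatsMailer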


@@ -0,0 +1,114 @@
"""
Scrapy Telnet Console extension
See documentation in docs/topics/telnetconsole.rst
"""
import pprint
import logging
import traceback
import binascii
import os
from twisted.internet import protocol
try:
from twisted.conch import manhole, telnet
from twisted.conch.insults import insults
TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
_TWISTED_CONCH_TRACEBACK = traceback.format_exc()
TWISTED_CONCH_AVAILABLE = False
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.trackref import print_live_refs
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.decorators import defers
logger = logging.getLogger(__name__)
# signal to update telnet variables
# args: telnet_vars
update_telnet_vars = object()
class TelnetConsole(protocol.ServerFactory):
def __init__(self, crawler):
if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
raise NotConfigured
if not TWISTED_CONCH_AVAILABLE:
raise NotConfigured(
'TELNETCONSOLE_ENABLED setting is True but required twisted '
'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
self.crawler = crawler
self.noisy = False
self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
self.host = crawler.settings['TELNETCONSOLE_HOST']
self.username = crawler.settings['TELNETCONSOLE_USERNAME']
self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
if not self.password:
self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
logger.info('Telnet Password: %s', self.password)
self.crawler.signals.connect(self.start_listening, signals.engine_started)
self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def start_listening(self):
self.port = listen_tcp(self.portrange, self.host, self)
h = self.port.getHost()
logger.info("Telnet console listening on %(host)s:%(port)d",
{'host': h.host, 'port': h.port},
extra={'crawler': self.crawler})
def stop_listening(self):
self.port.stopListening()
def protocol(self):
class Portal:
"""An implementation of IPortal"""
@defers
def login(self_, credentials, mind, *interfaces):
if not (
credentials.username == self.username.encode('utf8')
and credentials.checkPassword(self.password.encode('utf8'))
):
raise ValueError("Invalid credentials")
protocol = telnet.TelnetBootstrapProtocol(
insults.ServerProtocol,
manhole.Manhole,
self._get_telnet_vars()
)
return (interfaces[0], protocol, lambda: None)
return telnet.TelnetTransport(
telnet.AuthenticatingTelnetProtocol,
Portal()
)
def _get_telnet_vars(self):
# Note: if you add entries here also update topics/telnetconsole.rst
telnet_vars = {
'engine': self.crawler.engine,
'spider': self.crawler.engine.spider,
'slot': self.crawler.engine.slot,
'crawler': self.crawler,
'extensions': self.crawler.extensions,
'stats': self.crawler.stats,
'settings': self.crawler.settings,
'est': lambda: print_engine_status(self.crawler.engine),
'p': pprint.pprint,
'prefs': print_live_refs,
'help': "This is Scrapy telnet console. For more info see: "
"https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
}
self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
return telnet_vars
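
Because _get_telnet_vars sends the update_telnet_vars signal before returning, other components can extend the console namespace. A minimal sketch of a hypothetical extension doing that (the import path is assumed to be the usual scrapy.extensions.telnet):

from scrapy.extensions.telnet import update_telnet_vars  # assumed module path

class ExtraTelnetVars:
    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        crawler.signals.connect(obj.add_vars, signal=update_telnet_vars)
        return obj

    def add_vars(self, telnet_vars):
        # telnet_vars is the dict built in _get_telnet_vars above; anything added
        # here becomes available as a name inside the telnet console session
        telnet_vars['hello'] = lambda: 'hello from ExtraTelnetVars'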


@@ -0,0 +1,93 @@
import logging
from scrapy.exceptions import NotConfigured
from scrapy import signals
logger = logging.getLogger(__name__)
class AutoThrottle:
def __init__(self, crawler):
self.crawler = crawler
if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
raise NotConfigured
self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def _spider_opened(self, spider):
self.mindelay = self._min_delay(spider)
self.maxdelay = self._max_delay(spider)
spider.download_delay = self._start_delay(spider)
def _min_delay(self, spider):
s = self.crawler.settings
return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
def _max_delay(self, spider):
return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
def _start_delay(self, spider):
return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
def _response_downloaded(self, response, request, spider):
key, slot = self._get_slot(request, spider)
latency = request.meta.get('download_latency')
if latency is None or slot is None:
return
olddelay = slot.delay
self._adjust_delay(slot, latency, response)
if self.debug:
diff = slot.delay - olddelay
size = len(response.body)
conc = len(slot.transferring)
logger.info(
"slot: %(slot)s | conc:%(concurrency)2d | "
"delay:%(delay)5d ms (%(delaydiff)+d) | "
"latency:%(latency)5d ms | size:%(size)6d bytes",
{
'slot': key, 'concurrency': conc,
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
'latency': latency * 1000, 'size': size
},
extra={'spider': spider}
)
def _get_slot(self, request, spider):
key = request.meta.get('download_slot')
return key, self.crawler.engine.downloader.slots.get(key)
def _adjust_delay(self, slot, latency, response):
"""Define delay adjustment policy"""
# If a server needs `latency` seconds to respond then
# we should send a request each `latency/N` seconds
# to have N requests processed in parallel
target_delay = latency / self.target_concurrency
# Adjust the delay to make it closer to target_delay
new_delay = (slot.delay + target_delay) / 2.0
# If target delay is bigger than old delay, then use it instead of mean.
# It works better with problematic sites.
new_delay = max(target_delay, new_delay)
# Make sure self.mindelay <= new_delay <= self.maxdelay
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
# Don't adjust the delay if the response status != 200 and the new delay is
# smaller than the old one, as error pages (and redirects) are usually small
# and so tend to reduce latency, which would create positive feedback by
# reducing the delay instead of increasing it.
if response.status != 200 and new_delay <= slot.delay:
return
slot.delay = new_delay
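
Working through _adjust_delay by hand makes the policy concrete. A short sketch with illustrative numbers (target concurrency of 2, bounds chosen arbitrarily):

slot_delay, latency, target_concurrency = 2.0, 3.0, 2.0
target_delay = latency / target_concurrency           # 1.5 s between requests for 2 in flight
new_delay = (slot_delay + target_delay) / 2.0          # 1.75 s: move halfway toward the target
new_delay = max(target_delay, new_delay)               # never undershoot the target
new_delay = min(max(0.5, new_delay), 60.0)             # clamp to [mindelay, maxdelay] (illustrative bounds)
# A non-200 response that would lower the delay is ignored, per the check above.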