Output of the new DB entries ("Ausgabe der neuen DB Einträge")
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/extensions/closespider.py (new file, 68 lines)
@@ -0,0 +1,68 @@
"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.

See documentation in docs/topics/extensions.rst
"""

from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured


class CloseSpider:

    def __init__(self, crawler):
        self.crawler = crawler

        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
        }

        if not any(self.close_on.values()):
            raise NotConfigured

        self.counter = defaultdict(int)

        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def error_count(self, failure, response, spider):
        self.counter['errorcount'] += 1
        if self.counter['errorcount'] == self.close_on['errorcount']:
            self.crawler.engine.close_spider(spider, 'closespider_errorcount')

    def page_count(self, response, request, spider):
        self.counter['pagecount'] += 1
        if self.counter['pagecount'] == self.close_on['pagecount']:
            self.crawler.engine.close_spider(spider, 'closespider_pagecount')

    def spider_opened(self, spider):
        from twisted.internet import reactor
        self.task = reactor.callLater(self.close_on['timeout'],
                                      self.crawler.engine.close_spider, spider,
                                      reason='closespider_timeout')

    def item_scraped(self, item, spider):
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount')

    def spider_closed(self, spider):
        task = getattr(self, 'task', False)
        if task and task.active():
            task.cancel()
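For reference, CloseSpider is driven entirely by the CLOSESPIDER_* settings it reads above; a minimal settings.py sketch with illustrative (non-default) values that would arm all four shutdown conditions:

# settings.py - illustrative values, not Scrapy defaults
CLOSESPIDER_TIMEOUT = 3600      # seconds; close the spider after one hour
CLOSESPIDER_ITEMCOUNT = 1000    # close after this many scraped items
CLOSESPIDER_PAGECOUNT = 5000    # close after this many received responses
CLOSESPIDER_ERRORCOUNT = 10     # close after this many spider errors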
venv/lib/python3.9/site-packages/scrapy/extensions/corestats.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""
Extension for collecting core stats like items scraped and start/finish times
"""
from datetime import datetime

from scrapy import signals


class CoreStats:

    def __init__(self, stats):
        self.stats = stats
        self.start_time = None

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(o.response_received, signal=signals.response_received)
        return o

    def spider_opened(self, spider):
        self.start_time = datetime.utcnow()
        self.stats.set_value('start_time', self.start_time, spider=spider)

    def spider_closed(self, spider, reason):
        finish_time = datetime.utcnow()
        elapsed_time = finish_time - self.start_time
        elapsed_time_seconds = elapsed_time.total_seconds()
        self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
        self.stats.set_value('finish_time', finish_time, spider=spider)
        self.stats.set_value('finish_reason', reason, spider=spider)

    def item_scraped(self, item, spider):
        self.stats.inc_value('item_scraped_count', spider=spider)

    def response_received(self, spider):
        self.stats.inc_value('response_received_count', spider=spider)

    def item_dropped(self, item, spider, exception):
        reason = exception.__class__.__name__
        self.stats.inc_value('item_dropped_count', spider=spider)
        self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)
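The values CoreStats records end up in the crawler's stats collector and can be read back from any spider callback; a hypothetical sketch (spider name and URL are placeholders):

# sketch: reading stats collected by CoreStats from inside a spider callback
import scrapy

class StatsAwareSpider(scrapy.Spider):
    name = 'stats_aware'                      # hypothetical spider
    start_urls = ['https://example.com']

    def parse(self, response):
        # 'item_scraped_count' is incremented by CoreStats.item_scraped()
        scraped = self.crawler.stats.get_value('item_scraped_count', 0)
        self.logger.info("items scraped so far: %s", scraped)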
venv/lib/python3.9/site-packages/scrapy/extensions/debug.py (new file, 64 lines)
@@ -0,0 +1,64 @@
"""
Extensions for debugging Scrapy

See documentation in docs/topics/extensions.rst
"""

import sys
import signal
import logging
import traceback
import threading
from pdb import Pdb

from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs

logger = logging.getLogger(__name__)


class StackTraceDump:

    def __init__(self, crawler=None):
        self.crawler = crawler
        try:
            signal.signal(signal.SIGUSR2, self.dump_stacktrace)
            signal.signal(signal.SIGQUIT, self.dump_stacktrace)
        except AttributeError:
            # win32 platforms don't support SIGUSR signals
            pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def dump_stacktrace(self, signum, frame):
        log_args = {
            'stackdumps': self._thread_stacks(),
            'enginestatus': format_engine_status(self.crawler.engine),
            'liverefs': format_live_refs(),
        }
        logger.info("Dumping stack trace and engine status\n"
                    "%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
                    log_args, extra={'crawler': self.crawler})

    def _thread_stacks(self):
        id2name = dict((th.ident, th.name) for th in threading.enumerate())
        dumps = ''
        for id_, frame in sys._current_frames().items():
            name = id2name.get(id_, '')
            dump = ''.join(traceback.format_stack(frame))
            dumps += f"# Thread: {name}({id_})\n{dump}\n"
        return dumps


class Debugger:
    def __init__(self):
        try:
            signal.signal(signal.SIGUSR2, self._enter_debugger)
        except AttributeError:
            # win32 platforms don't support SIGUSR signals
            pass

    def _enter_debugger(self, signum, frame):
        Pdb().set_trace(frame.f_back)
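StackTraceDump only registers the handlers; the dump has to be triggered externally by sending one of the handled signals to the running crawl process on a POSIX platform. A sketch (PID 1234 is a placeholder for the Scrapy process id):

# sketch: trigger StackTraceDump from another process on a POSIX system
import os
import signal

os.kill(1234, signal.SIGUSR2)   # or signal.SIGQUIT; Windows has neither signal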
venv/lib/python3.9/site-packages/scrapy/extensions/feedexport.py (new file, 480 lines)
@@ -0,0 +1,480 @@
"""
Feed Exports extension

See documentation in docs/topics/feed-exports.rst
"""

import logging
import os
import re
import sys
import warnings
from datetime import datetime
from tempfile import NamedTemporaryFile
from urllib.parse import unquote, urlparse

from twisted.internet import defer, threads
from w3lib.url import file_uri_to_path
from zope.interface import implementer, Interface

from scrapy import signals
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.conf import feed_complete_default_values_from_settings
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import get_func_args, without_none_values


logger = logging.getLogger(__name__)


def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
    argument_names = get_func_args(builder)
    if 'feed_options' in argument_names:
        kwargs['feed_options'] = feed_options
    else:
        warnings.warn(
            "{} does not support the 'feed_options' keyword argument. Add a "
            "'feed_options' parameter to its signature to remove this "
            "warning. This parameter will become mandatory in a future "
            "version of Scrapy."
            .format(builder.__qualname__),
            category=ScrapyDeprecationWarning
        )
    return builder(*preargs, uri, *args, **kwargs)


class IFeedStorage(Interface):
    """Interface that all Feed Storages must implement"""

    def __init__(uri, *, feed_options=None):
        """Initialize the storage with the parameters given in the URI and the
        feed-specific options (see :setting:`FEEDS`)"""

    def open(spider):
        """Open the storage for the given spider. It must return a file-like
        object that will be used for the exporters"""

    def store(file):
        """Store the given file stream"""


@implementer(IFeedStorage)
class BlockingFeedStorage:

    def open(self, spider):
        path = spider.crawler.settings['FEED_TEMPDIR']
        if path and not os.path.isdir(path):
            raise OSError('Not a Directory: ' + str(path))

        return NamedTemporaryFile(prefix='feed-', dir=path)

    def store(self, file):
        return threads.deferToThread(self._store_in_thread, file)

    def _store_in_thread(self, file):
        raise NotImplementedError


@implementer(IFeedStorage)
class StdoutFeedStorage:

    def __init__(self, uri, _stdout=None, *, feed_options=None):
        if not _stdout:
            _stdout = sys.stdout.buffer
        self._stdout = _stdout
        if feed_options and feed_options.get('overwrite', False) is True:
            logger.warning('Standard output (stdout) storage does not support '
                           'overwriting. To suppress this warning, remove the '
                           'overwrite option from your FEEDS setting, or set '
                           'it to False.')

    def open(self, spider):
        return self._stdout

    def store(self, file):
        pass


@implementer(IFeedStorage)
class FileFeedStorage:

    def __init__(self, uri, *, feed_options=None):
        self.path = file_uri_to_path(uri)
        feed_options = feed_options or {}
        self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'

    def open(self, spider):
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return open(self.path, self.write_mode)

    def store(self, file):
        file.close()


class S3FeedStorage(BlockingFeedStorage):

    def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
                 feed_options=None):
        if not is_botocore_available():
            raise NotConfigured('missing botocore library')
        u = urlparse(uri)
        self.bucketname = u.hostname
        self.access_key = u.username or access_key
        self.secret_key = u.password or secret_key
        self.keyname = u.path[1:]  # remove first "/"
        self.acl = acl
        import botocore.session
        session = botocore.session.get_session()
        self.s3_client = session.create_client(
            's3', aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key)
        if feed_options and feed_options.get('overwrite', True) is False:
            logger.warning('S3 does not support appending to files. To '
                           'suppress this warning, remove the overwrite '
                           'option from your FEEDS setting or set it to True.')

    @classmethod
    def from_crawler(cls, crawler, uri, *, feed_options=None):
        return build_storage(
            cls,
            uri,
            access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
            secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
            acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
            feed_options=feed_options,
        )

    def _store_in_thread(self, file):
        file.seek(0)
        kwargs = {'ACL': self.acl} if self.acl else {}
        self.s3_client.put_object(
            Bucket=self.bucketname, Key=self.keyname, Body=file,
            **kwargs)
        file.close()


class GCSFeedStorage(BlockingFeedStorage):

    def __init__(self, uri, project_id, acl):
        self.project_id = project_id
        self.acl = acl
        u = urlparse(uri)
        self.bucket_name = u.hostname
        self.blob_name = u.path[1:]  # remove first "/"

    @classmethod
    def from_crawler(cls, crawler, uri):
        return cls(
            uri,
            crawler.settings['GCS_PROJECT_ID'],
            crawler.settings['FEED_STORAGE_GCS_ACL'] or None
        )

    def _store_in_thread(self, file):
        file.seek(0)
        from google.cloud.storage import Client
        client = Client(project=self.project_id)
        bucket = client.get_bucket(self.bucket_name)
        blob = bucket.blob(self.blob_name)
        blob.upload_from_file(file, predefined_acl=self.acl)


class FTPFeedStorage(BlockingFeedStorage):

    def __init__(self, uri, use_active_mode=False, *, feed_options=None):
        u = urlparse(uri)
        self.host = u.hostname
        self.port = int(u.port or '21')
        self.username = u.username
        self.password = unquote(u.password or '')
        self.path = u.path
        self.use_active_mode = use_active_mode
        self.overwrite = not feed_options or feed_options.get('overwrite', True)

    @classmethod
    def from_crawler(cls, crawler, uri, *, feed_options=None):
        return build_storage(
            cls,
            uri,
            crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
            feed_options=feed_options,
        )

    def _store_in_thread(self, file):
        ftp_store_file(
            path=self.path, file=file, host=self.host,
            port=self.port, username=self.username,
            password=self.password, use_active_mode=self.use_active_mode,
            overwrite=self.overwrite,
        )


class _FeedSlot:
    def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template):
        self.file = file
        self.exporter = exporter
        self.storage = storage
        # feed params
        self.batch_id = batch_id
        self.format = format
        self.store_empty = store_empty
        self.uri_template = uri_template
        self.uri = uri
        # flags
        self.itemcount = 0
        self._exporting = False

    def start_exporting(self):
        if not self._exporting:
            self.exporter.start_exporting()
            self._exporting = True

    def finish_exporting(self):
        if self._exporting:
            self.exporter.finish_exporting()
            self._exporting = False


class FeedExporter:

    @classmethod
    def from_crawler(cls, crawler):
        exporter = cls(crawler)
        crawler.signals.connect(exporter.open_spider, signals.spider_opened)
        crawler.signals.connect(exporter.close_spider, signals.spider_closed)
        crawler.signals.connect(exporter.item_scraped, signals.item_scraped)
        return exporter

    def __init__(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        self.feeds = {}
        self.slots = []

        if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
            raise NotConfigured

        # Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
        if self.settings['FEED_URI']:
            warnings.warn(
                'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
                'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
                category=ScrapyDeprecationWarning, stacklevel=2,
            )
            uri = str(self.settings['FEED_URI'])  # handle pathlib.Path objects
            feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
        # End: Backward compatibility for FEED_URI and FEED_FORMAT settings

        # 'FEEDS' setting takes precedence over 'FEED_URI'
        for uri, feed_options in self.settings.getdict('FEEDS').items():
            uri = str(uri)  # handle pathlib.Path objects
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)

        self.storages = self._load_components('FEED_STORAGES')
        self.exporters = self._load_components('FEED_EXPORTERS')
        for uri, feed_options in self.feeds.items():
            if not self._storage_supported(uri, feed_options):
                raise NotConfigured
            if not self._settings_are_valid():
                raise NotConfigured
            if not self._exporter_supported(feed_options['format']):
                raise NotConfigured

    def open_spider(self, spider):
        for uri, feed_options in self.feeds.items():
            uri_params = self._get_uri_params(spider, feed_options['uri_params'])
            self.slots.append(self._start_new_batch(
                batch_id=1,
                uri=uri % uri_params,
                feed_options=feed_options,
                spider=spider,
                uri_template=uri,
            ))

    def close_spider(self, spider):
        deferred_list = []
        for slot in self.slots:
            d = self._close_slot(slot, spider)
            deferred_list.append(d)
        return defer.DeferredList(deferred_list) if deferred_list else None

    def _close_slot(self, slot, spider):
        if not slot.itemcount and not slot.store_empty:
            # We need to call slot.storage.store nonetheless to get the file
            # properly closed.
            return defer.maybeDeferred(slot.storage.store, slot.file)
        slot.finish_exporting()
        logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
        log_args = {'format': slot.format,
                    'itemcount': slot.itemcount,
                    'uri': slot.uri}
        d = defer.maybeDeferred(slot.storage.store, slot.file)

        # Use `largs=log_args` to copy log_args into function's scope
        # instead of using `log_args` from the outer scope
        d.addCallback(
            lambda _, largs=log_args: logger.info(
                logfmt % "Stored", largs, extra={'spider': spider}
            )
        )
        d.addErrback(
            lambda f, largs=log_args: logger.error(
                logfmt % "Error storing", largs,
                exc_info=failure_to_exc_info(f), extra={'spider': spider}
            )
        )
        return d

    def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
        """
        Redirect the output data stream to a new file.
        Execute multiple times if FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
        :param batch_id: sequence number of current batch
        :param uri: uri of the new batch to start
        :param feed_options: dict with parameters of feed
        :param spider: user spider
        :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri
        """
        storage = self._get_storage(uri, feed_options)
        file = storage.open(spider)
        exporter = self._get_exporter(
            file=file,
            format=feed_options['format'],
            fields_to_export=feed_options['fields'],
            encoding=feed_options['encoding'],
            indent=feed_options['indent'],
            **feed_options['item_export_kwargs'],
        )
        slot = _FeedSlot(
            file=file,
            exporter=exporter,
            storage=storage,
            uri=uri,
            format=feed_options['format'],
            store_empty=feed_options['store_empty'],
            batch_id=batch_id,
            uri_template=uri_template,
        )
        if slot.store_empty:
            slot.start_exporting()
        return slot

    def item_scraped(self, item, spider):
        slots = []
        for slot in self.slots:
            slot.start_exporting()
            slot.exporter.export_item(item)
            slot.itemcount += 1
            # create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
            if (
                self.feeds[slot.uri_template]['batch_item_count']
                and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
            ):
                uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
                self._close_slot(slot, spider)
                slots.append(self._start_new_batch(
                    batch_id=slot.batch_id + 1,
                    uri=slot.uri_template % uri_params,
                    feed_options=self.feeds[slot.uri_template],
                    spider=spider,
                    uri_template=slot.uri_template,
                ))
            else:
                slots.append(slot)
        self.slots = slots

    def _load_components(self, setting_prefix):
        conf = without_none_values(self.settings.getwithbase(setting_prefix))
        d = {}
        for k, v in conf.items():
            try:
                d[k] = load_object(v)
            except NotConfigured:
                pass
        return d

    def _exporter_supported(self, format):
        if format in self.exporters:
            return True
        logger.error("Unknown feed format: %(format)s", {'format': format})

    def _settings_are_valid(self):
        """
        If FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain
        %(batch_time)s or %(batch_id)d to distinguish different files of partial output
        """
        for uri_template, values in self.feeds.items():
            if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
                logger.error(
                    '%(batch_time)s or %(batch_id)d must be in the feed URI ({}) if FEED_EXPORT_BATCH_ITEM_COUNT '
                    'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
                    'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count'
                    ''.format(uri_template)
                )
                return False
        return True

    def _storage_supported(self, uri, feed_options):
        scheme = urlparse(uri).scheme
        if scheme in self.storages:
            try:
                self._get_storage(uri, feed_options)
                return True
            except NotConfigured as e:
                logger.error("Disabled feed storage scheme: %(scheme)s. "
                             "Reason: %(reason)s",
                             {'scheme': scheme, 'reason': str(e)})
        else:
            logger.error("Unknown feed storage scheme: %(scheme)s",
                         {'scheme': scheme})

    def _get_instance(self, objcls, *args, **kwargs):
        return create_instance(
            objcls, self.settings, getattr(self, 'crawler', None),
            *args, **kwargs)

    def _get_exporter(self, file, format, *args, **kwargs):
        return self._get_instance(self.exporters[format], file, *args, **kwargs)

    def _get_storage(self, uri, feed_options):
        """Fork of create_instance specific to feed storage classes

        It supports not passing the *feed_options* parameters to classes that
        do not support it, and issuing a deprecation warning instead.
        """
        feedcls = self.storages[urlparse(uri).scheme]
        crawler = getattr(self, 'crawler', None)

        def build_instance(builder, *preargs):
            return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)

        if crawler and hasattr(feedcls, 'from_crawler'):
            instance = build_instance(feedcls.from_crawler, crawler)
            method_name = 'from_crawler'
        elif hasattr(feedcls, 'from_settings'):
            instance = build_instance(feedcls.from_settings, self.settings)
            method_name = 'from_settings'
        else:
            instance = build_instance(feedcls)
            method_name = '__new__'
        if instance is None:
            raise TypeError("%s.%s returned None" % (feedcls.__qualname__, method_name))
        return instance

    def _get_uri_params(self, spider, uri_params, slot=None):
        params = {}
        for k in dir(spider):
            params[k] = getattr(spider, k)
        utc_now = datetime.utcnow()
        params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
        params['batch_time'] = utc_now.isoformat().replace(':', '-')
        params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
        uripar_function = load_object(uri_params) if uri_params else lambda x, y: None
        uripar_function(params, spider)
        return params
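FeedExporter is configured through the FEEDS setting (or the deprecated FEED_URI/FEED_FORMAT pair handled above). A settings.py sketch using the feed options this file reads - format, overwrite, store_empty and batch_item_count with a batch-aware URI template - could look like this (path and values are illustrative):

# settings.py - illustrative feed configuration
FEEDS = {
    'output/items-%(batch_id)d.json': {
        'format': 'json',
        'encoding': 'utf8',
        'store_empty': False,
        'overwrite': True,
        'batch_item_count': 100,   # triggers rotation via _start_new_batch()
    },
}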
venv/lib/python3.9/site-packages/scrapy/extensions/httpcache.py (new file, 372 lines)
@@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary

from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint


logger = logging.getLogger(__name__)


class DummyPolicy:

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True


class RFC2616Policy:

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self._cc_parsed = WeakKeyDictionary()
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        ]

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other is eligible for caching
        return True

    def should_cache_response(self, response, request):
        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because cache can not deal with partial contents
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif b'max-age' in cc or b'Expires' in response.headers:
            return True
        # Firefox falls back to a one-year expiration for these statuses if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration require at least one validator
        elif response.status in (200, 203, 401):
            return b'Last-Modified' in response.headers or b'ETag' in response.headers
        # Any other is probably not eligible for caching
        # Makes no sense to cache responses that do not contain expiration
        # info and can not be revalidated
        else:
            return False

    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if b'no-cache' in cc or b'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if b'max-stale' in ccreq and b'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq[b'max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if b'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if b'Last-Modified' in cachedresponse.headers:
            request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']

        if b'ETag' in cachedresponse.headers:
            request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']

    def _get_max_age(self, cc):
        try:
            return max(0, int(cc[b'max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
        cc = self._parse_cachecontrol(response)
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now

        # Try HTTP/1.0 Expires header
        if b'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers[b'Expires'])
            # When parsing Expires header fails RFC 2616 section 14.21 says we
            # should treat this as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fallback to heuristic using last-modified header
        # This is not in RFC but on Firefox caching implementation
        lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # This request can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute freshness lifetime
        return 0

    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
        currentage = 0
        # If Date header is not set we assume it is a fast connection, and
        # clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
        if now > date:
            currentage = now - date

        if b'Age' in response.headers:
            try:
                age = int(response.headers[b'Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage


class DbmCacheStorage:

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
        self.db = self.dbmodule.open(dbpath, 'c')

        logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})

    def close_spider(self, spider):
        self.db.close()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
        self.db[f'{key}_time'] = str(time())

    def _read_data(self, spider, request):
        key = self._request_key(request)
        db = self.db
        tkey = f'{key}_time'
        if tkey not in db:
            return  # not found

        ts = db[tkey]
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired

        return pickle.loads(db[f'{key}_data'])

    def _request_key(self, request):
        return request_fingerprint(request)


class FilesystemCacheStorage:

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
                     extra={'spider': spider})

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=4)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(metapath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)


def parse_cachecontrol(header):
    """Parse Cache-Control header

    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9

    >>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
    ...                                                 b'max-age': b'3600'}
    True
    >>> parse_cachecontrol(b'') == {}
    True

    """
    directives = {}
    for directive in header.split(b','):
        key, sep, val = directive.strip().partition(b'=')
        if key:
            directives[key.lower()] = val if sep else None
    return directives


def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
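These policies and storages are selected and tuned entirely through settings; a sketch of an HTTP-cache configuration (class paths as documented by Scrapy, values illustrative) might be:

# settings.py - illustrative HTTP cache configuration
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 86400          # 0 means "never expire"
HTTPCACHE_GZIP = True                      # used by FilesystemCacheStorage
HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'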
venv/lib/python3.9/site-packages/scrapy/extensions/logstats.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import logging

from twisted.internet import task

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class LogStats:
    """Log basic scraping stats periodically"""

    def __init__(self, stats, interval=60.0):
        self.stats = stats
        self.interval = interval
        self.multiplier = 60.0 / self.interval
        self.task = None

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
        if not interval:
            raise NotConfigured
        o = cls(crawler.stats, interval)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.pagesprev = 0
        self.itemsprev = 0

        self.task = task.LoopingCall(self.log, spider)
        self.task.start(self.interval)

    def log(self, spider):
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
        self.pagesprev, self.itemsprev = pages, items

        msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
               "scraped %(items)d items (at %(itemrate)d items/min)")
        log_args = {'pages': pages, 'pagerate': prate,
                    'items': items, 'itemrate': irate}
        logger.info(msg, log_args, extra={'spider': spider})

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()
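LogStats is driven by a single setting read in from_crawler(); the sketch below lowers the reporting interval (the value is illustrative, 60 seconds is the usual default):

# settings.py - log crawl rates every 30 seconds instead of every minute
LOGSTATS_INTERVAL = 30.0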
venv/lib/python3.9/site-packages/scrapy/extensions/memdebug.py (new file, 33 lines)
@@ -0,0 +1,33 @@
"""
MemoryDebugger extension

See documentation in docs/topics/extensions.rst
"""

import gc

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs


class MemoryDebugger:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider, reason):
        gc.collect()
        self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
        for cls, wdict in live_refs.items():
            if not wdict:
                continue
            self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)
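MemoryDebugger stays inactive unless explicitly enabled; a one-line sketch:

# settings.py - record memdebug/* stats when the spider closes
MEMDEBUG_ENABLED = True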
venv/lib/python3.9/site-packages/scrapy/extensions/memusage.py (new file, 126 lines)
@@ -0,0 +1,126 @@
"""
MemoryUsage extension

See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module

from twisted.internet import task

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status

logger = logging.getLogger(__name__)


class MemoryUsage:

    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        try:
            # stdlib's resource module is only available on unix platforms.
            self.resource = import_module('resource')
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def get_virtual_size(self):
        size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
        if sys.platform != 'darwin':
            # on macOS ru_maxrss is in bytes, on Linux it is in KB
            size *= 1024
        return size

    def engine_started(self):
        self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(self.check_interval, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(self.check_interval, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(self.check_interval, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        self.crawler.stats.max_value('memusage/max', self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            mem = self.limit/1024/1024
            logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
                         {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} terminated: "
                    f"memory usage exceeded {mem}M at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)

            open_spiders = self.crawler.engine.open_spiders
            if open_spiders:
                for spider in open_spiders:
                    self.crawler.engine.close_spider(spider, 'memusage_exceeded')
            else:
                self.crawler.stop()

    def _check_warning(self):
        if self.warned:  # warn only once
            return
        if self.get_virtual_size() > self.warning:
            self.crawler.stats.set_value('memusage/warning_reached', 1)
            mem = self.warning/1024/1024
            logger.warning("Memory usage reached %(memusage)dM",
                           {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} warning: "
                    f"memory usage reached {mem}M at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/warning_notified', 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        stats = self.crawler.stats
        s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
        s += f"Maximum memory usage           : {stats.get_value('memusage/max')/1024/1024}M\r\n"
        s += f"Current memory usage           : {self.get_virtual_size()/1024/1024}M\r\n"

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"
        self.mail.send(rcpts, subject, s)
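The watchdog above is configured through the MEMUSAGE_* settings it reads in __init__; an illustrative sketch (values and address are placeholders):

# settings.py - illustrative memory watchdog configuration
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                 # hard limit: close spiders / stop the crawler
MEMUSAGE_WARNING_MB = 1536               # soft limit: warn once (and mail, if configured)
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0
MEMUSAGE_NOTIFY_MAIL = ['ops@example.com']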
venv/lib/python3.9/site-packages/scrapy/extensions/spiderstate.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import os
import pickle

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir


class SpiderState:
    """Store and load spider state during a scraping job"""

    def __init__(self, jobdir=None):
        self.jobdir = jobdir

    @classmethod
    def from_crawler(cls, crawler):
        jobdir = job_dir(crawler.settings)
        if not jobdir:
            raise NotConfigured

        obj = cls(jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj

    def spider_closed(self, spider):
        if self.jobdir:
            with open(self.statefn, 'wb') as f:
                pickle.dump(spider.state, f, protocol=4)

    def spider_opened(self, spider):
        if self.jobdir and os.path.exists(self.statefn):
            with open(self.statefn, 'rb') as f:
                spider.state = pickle.load(f)
        else:
            spider.state = {}

    @property
    def statefn(self):
        return os.path.join(self.jobdir, 'spider.state')
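SpiderState only activates when a job directory is configured (job_dir() reads the JOBDIR setting); with it in place, spider.state behaves as a dict that is pickled on close and restored on open. A sketch with an illustrative path:

# settings.py - persist spider state between runs
JOBDIR = 'crawls/my-job-1'

# inside a spider callback, state then acts like a persistent dict, e.g.:
#     self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1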
venv/lib/python3.9/site-packages/scrapy/extensions/statsmailer.py (new file, 34 lines)
@@ -0,0 +1,34 @@
"""
StatsMailer extension sends an email when a spider finishes scraping.

Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
"""

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured


class StatsMailer:

    def __init__(self, stats, recipients, mail):
        self.stats = stats
        self.recipients = recipients
        self.mail = mail

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
        if not recipients:
            raise NotConfigured
        mail = MailSender.from_settings(crawler.settings)
        o = cls(crawler.stats, recipients, mail)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider):
        spider_stats = self.stats.get_stats(spider)
        body = "Global stats\n\n"
        body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
        body += f"\n\n{spider.name} stats\n\n"
        body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
        return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)
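A sketch of the settings that would enable this mailer; the recipient list is what from_crawler() checks, and MailSender.from_settings() additionally reads Scrapy's standard mail settings (addresses and host are placeholders):

# settings.py - illustrative stats mail configuration
STATSMAILER_RCPTS = ['ops@example.com']
MAIL_FROM = 'scrapy@example.com'
MAIL_HOST = 'localhost'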
venv/lib/python3.9/site-packages/scrapy/extensions/telnet.py (new file, 114 lines)
@@ -0,0 +1,114 @@
"""
Scrapy Telnet Console extension

See documentation in docs/topics/telnetconsole.rst
"""

import pprint
import logging
import traceback
import binascii
import os

from twisted.internet import protocol
try:
    from twisted.conch import manhole, telnet
    from twisted.conch.insults import insults
    TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
    _TWISTED_CONCH_TRACEBACK = traceback.format_exc()
    TWISTED_CONCH_AVAILABLE = False

from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.trackref import print_live_refs
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.decorators import defers


logger = logging.getLogger(__name__)

# signal to update telnet variables
# args: telnet_vars
update_telnet_vars = object()


class TelnetConsole(protocol.ServerFactory):

    def __init__(self, crawler):
        if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                'TELNETCONSOLE_ENABLED setting is True but required twisted '
                'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
        self.crawler = crawler
        self.noisy = False
        self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
        self.host = crawler.settings['TELNETCONSOLE_HOST']
        self.username = crawler.settings['TELNETCONSOLE_USERNAME']
        self.password = crawler.settings['TELNETCONSOLE_PASSWORD']

        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
            logger.info('Telnet Password: %s', self.password)

        self.crawler.signals.connect(self.start_listening, signals.engine_started)
        self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_listening(self):
        self.port = listen_tcp(self.portrange, self.host, self)
        h = self.port.getHost()
        logger.info("Telnet console listening on %(host)s:%(port)d",
                    {'host': h.host, 'port': h.port},
                    extra={'crawler': self.crawler})

    def stop_listening(self):
        self.port.stopListening()

    def protocol(self):
        class Portal:
            """An implementation of IPortal"""
            @defers
            def login(self_, credentials, mind, *interfaces):
                if not (
                    credentials.username == self.username.encode('utf8')
                    and credentials.checkPassword(self.password.encode('utf8'))
                ):
                    raise ValueError("Invalid credentials")

                protocol = telnet.TelnetBootstrapProtocol(
                    insults.ServerProtocol,
                    manhole.Manhole,
                    self._get_telnet_vars()
                )
                return (interfaces[0], protocol, lambda: None)

        return telnet.TelnetTransport(
            telnet.AuthenticatingTelnetProtocol,
            Portal()
        )

    def _get_telnet_vars(self):
        # Note: if you add entries here also update topics/telnetconsole.rst
        telnet_vars = {
            'engine': self.crawler.engine,
            'spider': self.crawler.engine.spider,
            'slot': self.crawler.engine.slot,
            'crawler': self.crawler,
            'extensions': self.crawler.extensions,
            'stats': self.crawler.stats,
            'settings': self.crawler.settings,
            'est': lambda: print_engine_status(self.crawler.engine),
            'p': pprint.pprint,
            'prefs': print_live_refs,
            'help': "This is Scrapy telnet console. For more info see: "
                    "https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
        }
        self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
        return telnet_vars
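The console is controlled by the TELNETCONSOLE_* settings read above; an illustrative sketch (the port range and username shown match Scrapy's usual defaults, the rest is an assumption to adapt):

# settings.py - illustrative telnet console configuration
TELNETCONSOLE_ENABLED = True
TELNETCONSOLE_PORT = [6023, 6073]      # port range to try with listen_tcp()
TELNETCONSOLE_HOST = '127.0.0.1'
TELNETCONSOLE_USERNAME = 'scrapy'
TELNETCONSOLE_PASSWORD = None          # None -> a random password is generated and logged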
venv/lib/python3.9/site-packages/scrapy/extensions/throttle.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle:

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""

        # If a server needs `latency` seconds to respond then
        # we should send a request each `latency/N` seconds
        # to have N requests processed in parallel
        target_delay = latency / self.target_concurrency

        # Adjust the delay to make it closer to target_delay
        new_delay = (slot.delay + target_delay) / 2.0

        # If target delay is bigger than old delay, then use it instead of mean.
        # It works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, which would create
        # a positive feedback loop by lowering the delay instead of raising it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
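AutoThrottle is tuned through the AUTOTHROTTLE_* settings this file reads; an illustrative sketch:

# settings.py - illustrative AutoThrottle configuration
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5.0           # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60.0            # upper bound applied in _adjust_delay()
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0    # average parallel requests per remote site
AUTOTHROTTLE_DEBUG = True                # log every delay adjustment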