Output of the new DB entries
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/core/__init__.py (new file, 3 additions)
@@ -0,0 +1,3 @@
"""
Scrapy core library classes and functions.
"""

venv/lib/python3.9/site-packages/scrapy/core/downloader/__init__.py (new file, 201 additions)
@@ -0,0 +1,201 @@
|
|||
import random
|
||||
from time import time
|
||||
from datetime import datetime
|
||||
from collections import deque
|
||||
|
||||
from twisted.internet import defer, task
|
||||
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.resolver import dnscache
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||
from scrapy.core.downloader.handlers import DownloadHandlers
|
||||
|
||||
|
||||
class Slot:
|
||||
"""Downloader slot"""
|
||||
|
||||
def __init__(self, concurrency, delay, randomize_delay):
|
||||
self.concurrency = concurrency
|
||||
self.delay = delay
|
||||
self.randomize_delay = randomize_delay
|
||||
|
||||
self.active = set()
|
||||
self.queue = deque()
|
||||
self.transferring = set()
|
||||
self.lastseen = 0
|
||||
self.latercall = None
|
||||
|
||||
def free_transfer_slots(self):
|
||||
return self.concurrency - len(self.transferring)
|
||||
|
||||
def download_delay(self):
|
||||
if self.randomize_delay:
|
||||
return random.uniform(0.5 * self.delay, 1.5 * self.delay)
|
||||
return self.delay
|
||||
|
||||
def close(self):
|
||||
if self.latercall and self.latercall.active():
|
||||
self.latercall.cancel()
|
||||
|
||||
def __repr__(self):
|
||||
cls_name = self.__class__.__name__
|
||||
return (f"{cls_name}(concurrency={self.concurrency!r}, "
|
||||
f"delay={self.delay:.2f}, "
|
||||
f"randomize_delay={self.randomize_delay!r})")
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f"<downloader.Slot concurrency={self.concurrency!r} "
|
||||
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
|
||||
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
|
||||
f"len(transferring)={len(self.transferring)} "
|
||||
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
|
||||
)
|
||||
|
||||
|
||||
def _get_concurrency_delay(concurrency, spider, settings):
|
||||
delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||
if hasattr(spider, 'download_delay'):
|
||||
delay = spider.download_delay
|
||||
|
||||
if hasattr(spider, 'max_concurrent_requests'):
|
||||
concurrency = spider.max_concurrent_requests
|
||||
|
||||
return concurrency, delay
|
||||
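As a quick illustration of the override order implemented in _get_concurrency_delay() above, a minimal spider sketch (the class and values are hypothetical; the attribute names are the ones the function checks):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]
    download_delay = 2.0           # takes precedence over the DOWNLOAD_DELAY setting
    max_concurrent_requests = 4    # takes precedence over the concurrency passed in

    def parse(self, response):
        yield {"url": response.url}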
|
||||
|
||||
class Downloader:
|
||||
|
||||
DOWNLOAD_SLOT = 'download_slot'
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
self.slots = {}
|
||||
self.active = set()
|
||||
self.handlers = DownloadHandlers(crawler)
|
||||
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
|
||||
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
|
||||
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
|
||||
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
|
||||
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
|
||||
self._slot_gc_loop.start(60)
|
||||
|
||||
def fetch(self, request, spider):
|
||||
def _deactivate(response):
|
||||
self.active.remove(request)
|
||||
return response
|
||||
|
||||
self.active.add(request)
|
||||
dfd = self.middleware.download(self._enqueue_request, request, spider)
|
||||
return dfd.addBoth(_deactivate)
|
||||
|
||||
def needs_backout(self):
|
||||
return len(self.active) >= self.total_concurrency
|
||||
|
||||
def _get_slot(self, request, spider):
|
||||
key = self._get_slot_key(request, spider)
|
||||
if key not in self.slots:
|
||||
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
|
||||
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
|
||||
self.slots[key] = Slot(conc, delay, self.randomize_delay)
|
||||
|
||||
return key, self.slots[key]
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if self.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[self.DOWNLOAD_SLOT]
|
||||
|
||||
key = urlparse_cached(request).hostname or ''
|
||||
if self.ip_concurrency:
|
||||
key = dnscache.get(key, key)
|
||||
|
||||
return key
|
||||
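A small sketch of pinning requests to a specific slot via the download_slot meta key checked in _get_slot_key() above (URL and slot name are illustrative):

from scrapy import Request

req = Request(
    "https://api.example.com/items",
    meta={"download_slot": "api.example.com"},  # all requests with this key share one Slot
)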
|
||||
def _enqueue_request(self, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
request.meta[self.DOWNLOAD_SLOT] = key
|
||||
|
||||
def _deactivate(response):
|
||||
slot.active.remove(request)
|
||||
return response
|
||||
|
||||
slot.active.add(request)
|
||||
self.signals.send_catch_log(signal=signals.request_reached_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
deferred = defer.Deferred().addBoth(_deactivate)
|
||||
slot.queue.append((request, deferred))
|
||||
self._process_queue(spider, slot)
|
||||
return deferred
|
||||
|
||||
def _process_queue(self, spider, slot):
|
||||
from twisted.internet import reactor
|
||||
if slot.latercall and slot.latercall.active():
|
||||
return
|
||||
|
||||
# Delay queue processing if a download_delay is configured
|
||||
now = time()
|
||||
delay = slot.download_delay()
|
||||
if delay:
|
||||
penalty = delay - now + slot.lastseen
|
||||
if penalty > 0:
|
||||
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
|
||||
return
|
||||
|
||||
# Process enqueued requests if there are free slots to transfer for this slot
|
||||
while slot.queue and slot.free_transfer_slots() > 0:
|
||||
slot.lastseen = now
|
||||
request, deferred = slot.queue.popleft()
|
||||
dfd = self._download(slot, request, spider)
|
||||
dfd.chainDeferred(deferred)
|
||||
# prevent burst if inter-request delays were configured
|
||||
if delay:
|
||||
self._process_queue(spider, slot)
|
||||
break
|
||||
|
||||
def _download(self, slot, request, spider):
|
||||
# The order is very important for the following deferreds. Do not change!
|
||||
|
||||
# 1. Create the download deferred
|
||||
dfd = mustbe_deferred(self.handlers.download_request, request, spider)
|
||||
|
||||
# 2. Notify response_downloaded listeners about the recent download
|
||||
# before querying queue for next request
|
||||
def _downloaded(response):
|
||||
self.signals.send_catch_log(signal=signals.response_downloaded,
|
||||
response=response,
|
||||
request=request,
|
||||
spider=spider)
|
||||
return response
|
||||
dfd.addCallback(_downloaded)
|
||||
|
||||
# 3. After response arrives, remove the request from transferring
|
||||
# state to free up the transferring slot so it can be used by the
|
||||
# following requests (perhaps those which came from the downloader
|
||||
# middleware itself)
|
||||
slot.transferring.add(request)
|
||||
|
||||
def finish_transferring(_):
|
||||
slot.transferring.remove(request)
|
||||
self._process_queue(spider, slot)
|
||||
self.signals.send_catch_log(signal=signals.request_left_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
return _
|
||||
|
||||
return dfd.addBoth(finish_transferring)
|
||||
|
||||
def close(self):
|
||||
self._slot_gc_loop.stop()
|
||||
for slot in self.slots.values():
|
||||
slot.close()
|
||||
|
||||
def _slot_gc(self, age=60):
|
||||
mintime = time() - age
|
||||
for key, slot in list(self.slots.items()):
|
||||
if not slot.active and slot.lastseen + slot.delay < mintime:
|
||||
self.slots.pop(key).close()
|
||||
venv/lib/python3.9/site-packages/scrapy/core/downloader/contextfactory.py (new file, 94 additions)
@@ -0,0 +1,94 @@
|
|||
from OpenSSL import SSL
|
||||
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
|
||||
from twisted.web.client import BrowserLikePolicyForHTTPS
|
||||
from twisted.web.iweb import IPolicyForHTTPS
|
||||
from zope.interface.declarations import implementer
|
||||
|
||||
from scrapy.core.downloader.tls import ScrapyClientTLSOptions, DEFAULT_CIPHERS
|
||||
|
||||
|
||||
@implementer(IPolicyForHTTPS)
|
||||
class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
"""
|
||||
Non-peer-certificate verifying HTTPS context factory
|
||||
|
||||
Default OpenSSL method is TLS_METHOD (also called SSLv23_METHOD)
|
||||
which allows TLS protocol negotiation
|
||||
|
||||
'A TLS/SSL connection established with [this method] may
|
||||
understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.'
|
||||
"""
|
||||
|
||||
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._ssl_method = method
|
||||
self.tls_verbose_logging = tls_verbose_logging
|
||||
if tls_ciphers:
|
||||
self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
|
||||
else:
|
||||
self.tls_ciphers = DEFAULT_CIPHERS
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
|
||||
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
|
||||
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
|
||||
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
|
||||
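A sketch of the two settings read by from_settings() above, as they might appear in a project's settings.py (values are illustrative):

# settings.py
DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = True             # log protocol, cipher and certificate details
DOWNLOADER_CLIENT_TLS_CIPHERS = "DEFAULT:!aNULL:!eNULL"  # OpenSSL cipher string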
|
||||
def getCertificateOptions(self):
|
||||
# setting verify=True will require you to provide CAs
|
||||
# to verify against; in other words: it's not that simple
|
||||
|
||||
# backward-compatible SSL/TLS method:
|
||||
#
|
||||
# * this will respect `method` attribute in often recommended
|
||||
# `ScrapyClientContextFactory` subclass
|
||||
# (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
|
||||
#
|
||||
# * getattr() for `_ssl_method` attribute for context factories
|
||||
# not calling super().__init__
|
||||
return CertificateOptions(
|
||||
verify=False,
|
||||
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
|
||||
fixBrokenPeers=True,
|
||||
acceptableCiphers=self.tls_ciphers,
|
||||
)
|
||||
|
||||
# kept for old-style HTTP/1.0 downloader context twisted calls,
|
||||
# e.g. connectSSL()
|
||||
def getContext(self, hostname=None, port=None):
|
||||
return self.getCertificateOptions().getContext()
|
||||
|
||||
def creatorForNetloc(self, hostname, port):
|
||||
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
|
||||
verbose_logging=self.tls_verbose_logging)
|
||||
|
||||
|
||||
@implementer(IPolicyForHTTPS)
|
||||
class BrowserLikeContextFactory(ScrapyClientContextFactory):
|
||||
"""
|
||||
Twisted-recommended context factory for web clients.
|
||||
|
||||
Quoting the documentation of the :class:`~twisted.web.client.Agent` class:
|
||||
|
||||
The default is to use a
|
||||
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS`, so unless you
|
||||
have special requirements you can leave this as-is.
|
||||
|
||||
:meth:`creatorForNetloc` is the same as
|
||||
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS` except this context
|
||||
factory allows setting the TLS/SSL method to use.
|
||||
|
||||
The default OpenSSL method is ``TLS_METHOD`` (also called
|
||||
``SSLv23_METHOD``) which allows TLS protocol negotiation.
|
||||
"""
|
||||
def creatorForNetloc(self, hostname, port):
|
||||
|
||||
# trustRoot set to platformTrust() will use the platform's root CAs.
|
||||
#
|
||||
# This means that a website like https://www.cacert.org will be rejected
|
||||
# by default, since CAcert.org CA certificate is seldom shipped.
|
||||
return optionsForClientTLS(
|
||||
hostname=hostname.decode("ascii"),
|
||||
trustRoot=platformTrust(),
|
||||
extraCertificateOptions={'method': self._ssl_method},
|
||||
)
|
||||
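To opt into this certificate-verifying factory instead of the default non-verifying one, a project points the DOWNLOADER_CLIENTCONTEXTFACTORY setting at it; a minimal sketch:

# settings.py
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.BrowserLikeContextFactory"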
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/__init__.py (new file, 81 additions)
@@ -0,0 +1,81 @@
|
|||
"""Download handlers for different schemes"""
|
||||
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, NotSupported
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DownloadHandlers:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self._crawler = crawler
|
||||
self._schemes = {} # stores acceptable schemes on instancing
|
||||
self._handlers = {} # stores instanced handlers for schemes
|
||||
self._notconfigured = {} # remembers failed handlers
|
||||
handlers = without_none_values(
|
||||
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
|
||||
for scheme, clspath in handlers.items():
|
||||
self._schemes[scheme] = clspath
|
||||
self._load_handler(scheme, skip_lazy=True)
|
||||
|
||||
crawler.signals.connect(self._close, signals.engine_stopped)
|
||||
|
||||
def _get_handler(self, scheme):
|
||||
"""Lazy-load the downloadhandler for a scheme
|
||||
only on the first request for that scheme.
|
||||
"""
|
||||
if scheme in self._handlers:
|
||||
return self._handlers[scheme]
|
||||
if scheme in self._notconfigured:
|
||||
return None
|
||||
if scheme not in self._schemes:
|
||||
self._notconfigured[scheme] = 'no handler available for that scheme'
|
||||
return None
|
||||
|
||||
return self._load_handler(scheme)
|
||||
|
||||
def _load_handler(self, scheme, skip_lazy=False):
|
||||
path = self._schemes[scheme]
|
||||
try:
|
||||
dhcls = load_object(path)
|
||||
if skip_lazy and getattr(dhcls, 'lazy', True):
|
||||
return None
|
||||
dh = create_instance(
|
||||
objcls=dhcls,
|
||||
settings=self._crawler.settings,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
except NotConfigured as ex:
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
except Exception as ex:
|
||||
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
|
||||
{"clspath": path, "scheme": scheme},
|
||||
exc_info=True, extra={'crawler': self._crawler})
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
else:
|
||||
self._handlers[scheme] = dh
|
||||
return dh
|
||||
|
||||
def download_request(self, request, spider):
|
||||
scheme = urlparse_cached(request).scheme
|
||||
handler = self._get_handler(scheme)
|
||||
if not handler:
|
||||
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
|
||||
return handler.download_request(request, spider)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _close(self, *_a, **_kw):
|
||||
for dh in self._handlers.values():
|
||||
if hasattr(dh, 'close'):
|
||||
yield dh.close()
|
||||
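The handlers registered above come from the DOWNLOAD_HANDLERS setting; a sketch of overriding it in settings.py (setting a scheme to None disables its handler; the s3 path is the one used elsewhere in this commit):

# settings.py
DOWNLOAD_HANDLERS = {
    "ftp": None,  # disable FTP downloads entirely
    "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
}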
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/datauri.py (new file, 22 additions)
@@ -0,0 +1,22 @@
from w3lib.url import parse_data_uri

from scrapy.http import TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers


class DataURIDownloadHandler:
    lazy = False

    @defers
    def download_request(self, request, spider):
        uri = parse_data_uri(request.url)
        respcls = responsetypes.from_mimetype(uri.media_type)

        resp_kwargs = {}
        if (issubclass(respcls, TextResponse)
                and uri.media_type.split('/')[0] == 'text'):
            charset = uri.media_type_parameters.get('charset')
            resp_kwargs['encoding'] = charset

        return respcls(url=request.url, body=uri.data, **resp_kwargs)
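A minimal sketch of a request this handler would serve, assuming the standard Request API; the handler decodes the URI itself, so no network access is involved:

from scrapy import Request

req = Request("data:text/plain;charset=utf-8,Hello%20world")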
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/file.py (new file, 16 additions)
@@ -0,0 +1,16 @@
from w3lib.url import file_uri_to_path

from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers


class FileDownloadHandler:
    lazy = False

    @defers
    def download_request(self, request, spider):
        filepath = file_uri_to_path(request.url)
        with open(filepath, 'rb') as fo:
            body = fo.read()
        respcls = responsetypes.from_args(filename=filepath, body=body)
        return respcls(url=request.url, body=body)
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/ftp.py (new file, 119 additions)
@@ -0,0 +1,119 @@
"""
An asynchronous FTP file download handler for Scrapy that emulates an HTTP response.

FTP connection parameters are passed using the request meta field:
- ftp_user (required)
- ftp_password (required)
- ftp_passive (enabled by default) sets FTP connection passive mode
- ftp_local_filename
    - If not given, the file data comes in response.body, as a normal Scrapy Response,
      which implies that the entire file is held in memory.
    - If given, the file data is saved to a local file with the given name.
      This helps avoid memory issues when downloading very big files. In addition, for
      convenience the local file name is also given in the response body.

The status of the built HTTP response is, by default:
- 200 in case of success
- 404 in case the specified file was not found on the server (FTP code 550)

Otherwise, the corresponding FTP exception is raised.

The mapping from FTP command return codes to HTTP response codes is defined in the
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
that is not explicitly present among the map keys. You may need to override this
mapping if you want behaviour different from the default.

For a status 200 response, response.headers carries two keys:
    'Local Filename' - the value of the local filename, if given
    'Size' - the size of the downloaded data
"""
|
||||
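A usage sketch based on the meta keys described above (host, credentials and paths are placeholders):

from scrapy import Request

req = Request(
    "ftp://ftp.example.com/pub/report.csv",
    meta={
        "ftp_user": "anonymous",
        "ftp_password": "guest@example.com",
        # "ftp_local_filename": "/tmp/report.csv",  # stream to disk instead of memory
    },
)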
|
||||
import re
|
||||
from io import BytesIO
|
||||
from urllib.parse import unquote
|
||||
|
||||
from twisted.internet.protocol import ClientCreator, Protocol
|
||||
from twisted.protocols.ftp import CommandFailed, FTPClient
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
||||
class ReceivedDataProtocol(Protocol):
|
||||
def __init__(self, filename=None):
|
||||
self.__filename = filename
|
||||
self.body = open(filename, "wb") if filename else BytesIO()
|
||||
self.size = 0
|
||||
|
||||
def dataReceived(self, data):
|
||||
self.body.write(data)
|
||||
self.size += len(data)
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
return self.__filename
|
||||
|
||||
def close(self):
|
||||
self.body.close() if self.filename else self.body.seek(0)
|
||||
|
||||
|
||||
_CODE_RE = re.compile(r"\d+")
|
||||
|
||||
|
||||
class FTPDownloadHandler:
|
||||
lazy = False
|
||||
|
||||
CODE_MAPPING = {
|
||||
"550": 404,
|
||||
"default": 503,
|
||||
}
|
||||
|
||||
def __init__(self, settings):
|
||||
self.default_user = settings['FTP_USER']
|
||||
self.default_password = settings['FTP_PASSWORD']
|
||||
self.passive_mode = settings['FTP_PASSIVE_MODE']
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
from twisted.internet import reactor
|
||||
parsed_url = urlparse_cached(request)
|
||||
user = request.meta.get("ftp_user", self.default_user)
|
||||
password = request.meta.get("ftp_password", self.default_password)
|
||||
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
||||
self.passive_mode)) else 0
|
||||
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
|
||||
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
|
||||
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
|
||||
|
||||
def gotClient(self, client, request, filepath):
|
||||
self.client = client
|
||||
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
|
||||
return client.retrieveFile(filepath, protocol).addCallbacks(
|
||||
callback=self._build_response,
|
||||
callbackArgs=(request, protocol),
|
||||
errback=self._failed,
|
||||
errbackArgs=(request,),
|
||||
)
|
||||
|
||||
def _build_response(self, result, request, protocol):
|
||||
self.result = result
|
||||
respcls = responsetypes.from_args(url=request.url)
|
||||
protocol.close()
|
||||
body = protocol.filename or protocol.body.read()
|
||||
headers = {"local filename": protocol.filename or '', "size": protocol.size}
|
||||
return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
|
||||
|
||||
def _failed(self, result, request):
|
||||
message = result.getErrorMessage()
|
||||
if result.type == CommandFailed:
|
||||
m = _CODE_RE.search(message)
|
||||
if m:
|
||||
ftpcode = m.group()
|
||||
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
|
||||
return Response(url=request.url, status=httpcode, body=to_bytes(message))
|
||||
raise result.type(result.value)
|
||||
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/http.py (new file, 4 additions)
@@ -0,0 +1,4 @@
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import (
    HTTP11DownloadHandler as HTTPDownloadHandler,
)
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/http10.py (new file, 37 additions)
@@ -0,0 +1,37 @@
"""Download handlers for http and https schemes
"""
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_unicode


class HTTP10DownloadHandler:
    lazy = False

    def __init__(self, settings, crawler=None):
        self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
        self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
        self._settings = settings
        self._crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings, crawler)

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        factory = self.HTTPClientFactory(request)
        self._connect(factory)
        return factory.deferred

    def _connect(self, factory):
        from twisted.internet import reactor
        host, port = to_unicode(factory.host), factory.port
        if factory.scheme == b'https':
            client_context_factory = create_instance(
                objcls=self.ClientContextFactory,
                settings=self._settings,
                crawler=self._crawler,
            )
            return reactor.connectSSL(host, port, factory, client_context_factory)
        else:
            return reactor.connectTCP(host, port, factory)
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/http11.py (new file, 568 additions)
@@ -0,0 +1,568 @@
|
|||
"""Download handlers for http and https schemes"""
|
||||
|
||||
import ipaddress
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from time import time
|
||||
from urllib.parse import urldefrag
|
||||
|
||||
from twisted.internet import defer, protocol, ssl
|
||||
from twisted.internet.endpoints import TCP4ClientEndpoint
|
||||
from twisted.internet.error import TimeoutError
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
||||
from twisted.web.http import _DataLoss, PotentialDataLoss
|
||||
from twisted.web.http_headers import Headers as TxHeaders
|
||||
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
||||
from zope.interface import implementer
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader.tls import openssl_methods
|
||||
from scrapy.core.downloader.webclient import _parse
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
|
||||
from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.python import to_bytes, to_unicode
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTTP11DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings, crawler=None):
|
||||
self._crawler = crawler
|
||||
|
||||
from twisted.internet import reactor
|
||||
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
||||
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self._pool._factory.noisy = False
|
||||
|
||||
self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
|
||||
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
|
||||
# try method-aware context factory
|
||||
try:
|
||||
self._contextFactory = create_instance(
|
||||
objcls=self._contextFactoryClass,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
method=self._sslMethod,
|
||||
)
|
||||
except TypeError:
|
||||
# use context factory defaults
|
||||
self._contextFactory = create_instance(
|
||||
objcls=self._contextFactoryClass,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
)
|
||||
msg = f"""
|
||||
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
|
||||
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
|
||||
`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
|
||||
Please upgrade your context factory class to handle them or ignore them."""
|
||||
warnings.warn(msg)
|
||||
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
|
||||
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
|
||||
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
|
||||
self._disconnect_timeout = 1
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings, crawler)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
"""Return a deferred for the HTTP download"""
|
||||
agent = ScrapyAgent(
|
||||
contextFactory=self._contextFactory,
|
||||
pool=self._pool,
|
||||
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
||||
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
||||
fail_on_dataloss=self._fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
return agent.download_request(request)
|
||||
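A sketch of the per-spider attributes and per-request meta keys consulted above for size limits (class name and values are hypothetical):

import scrapy

class SizeAwareSpider(scrapy.Spider):
    name = "size_aware"
    download_maxsize = 10 * 1024 * 1024   # cancel responses larger than 10 MiB
    download_warnsize = 1024 * 1024       # warn above 1 MiB

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com/big-file",
            meta={"download_maxsize": 2 * 1024 * 1024},  # per-request override
        )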
|
||||
def close(self):
|
||||
from twisted.internet import reactor
|
||||
d = self._pool.closeCachedConnections()
|
||||
# closeCachedConnections will hang on network or server issues, so
|
||||
# we'll manually timeout the deferred.
|
||||
#
|
||||
# Twisted issue addressing this problem can be found here:
|
||||
# https://twistedmatrix.com/trac/ticket/7738.
|
||||
#
|
||||
# closeCachedConnections doesn't handle external errbacks, so we'll
|
||||
# issue a callback after `_disconnect_timeout` seconds.
|
||||
delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])
|
||||
|
||||
def cancel_delayed_call(result):
|
||||
if delayed_call.active():
|
||||
delayed_call.cancel()
|
||||
return result
|
||||
|
||||
d.addBoth(cancel_delayed_call)
|
||||
return d
|
||||
|
||||
|
||||
class TunnelError(Exception):
|
||||
"""An HTTP CONNECT tunnel could not be established by the proxy."""
|
||||
|
||||
|
||||
class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
"""An endpoint that tunnels through proxies to allow HTTPS downloads. To
|
||||
accomplish that, this endpoint sends an HTTP CONNECT to the proxy.
|
||||
The HTTP CONNECT is always sent when using this endpoint; this could be
improved, as the CONNECT is redundant if the connection associated with
this endpoint comes from the pool and a CONNECT has already been issued
for it.
|
||||
"""
|
||||
|
||||
_responseMatcher = re.compile(br'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,32})')
|
||||
|
||||
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
|
||||
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
|
||||
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
|
||||
self._tunnelReadyDeferred = defer.Deferred()
|
||||
self._tunneledHost = host
|
||||
self._tunneledPort = port
|
||||
self._contextFactory = contextFactory
|
||||
self._connectBuffer = bytearray()
|
||||
|
||||
def requestTunnel(self, protocol):
|
||||
"""Asks the proxy to open a tunnel."""
|
||||
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
|
||||
protocol.transport.write(tunnelReq)
|
||||
self._protocolDataReceived = protocol.dataReceived
|
||||
protocol.dataReceived = self.processProxyResponse
|
||||
self._protocol = protocol
|
||||
return protocol
|
||||
|
||||
def processProxyResponse(self, rcvd_bytes):
|
||||
"""Processes the response from the proxy. If the tunnel is successfully
|
||||
created, notifies the client that we are ready to send requests. If not,
|
||||
raises a TunnelError.
|
||||
"""
|
||||
self._connectBuffer += rcvd_bytes
|
||||
# make sure that enough (all) bytes are consumed
|
||||
# and that we've got all HTTP headers (ending with a blank line)
|
||||
# from the proxy so that we don't send those bytes to the TLS layer
|
||||
#
|
||||
# see https://github.com/scrapy/scrapy/issues/2491
|
||||
if b'\r\n\r\n' not in self._connectBuffer:
|
||||
return
|
||||
self._protocol.dataReceived = self._protocolDataReceived
|
||||
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
|
||||
if respm and int(respm.group('status')) == 200:
|
||||
# set proper Server Name Indication extension
|
||||
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
|
||||
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
|
||||
self._tunnelReadyDeferred.callback(self._protocol)
|
||||
else:
|
||||
if respm:
|
||||
extra = {'status': int(respm.group('status')),
|
||||
'reason': respm.group('reason').strip()}
|
||||
else:
|
||||
extra = rcvd_bytes[:32]
|
||||
self._tunnelReadyDeferred.errback(
|
||||
TunnelError('Could not open CONNECT tunnel with proxy '
|
||||
f'{self._host}:{self._port} [{extra!r}]')
|
||||
)
|
||||
|
||||
def connectFailed(self, reason):
|
||||
"""Propagates the errback to the appropriate deferred."""
|
||||
self._tunnelReadyDeferred.errback(reason)
|
||||
|
||||
def connect(self, protocolFactory):
|
||||
self._protocolFactory = protocolFactory
|
||||
connectDeferred = super().connect(protocolFactory)
|
||||
connectDeferred.addCallback(self.requestTunnel)
|
||||
connectDeferred.addErrback(self.connectFailed)
|
||||
return self._tunnelReadyDeferred
|
||||
|
||||
|
||||
def tunnel_request_data(host, port, proxy_auth_header=None):
|
||||
r"""
|
||||
Return binary content of a CONNECT request.
|
||||
|
||||
>>> from scrapy.utils.python import to_unicode as s
|
||||
>>> s(tunnel_request_data("example.com", 8080))
|
||||
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
|
||||
>>> s(tunnel_request_data("example.com", 8080, b"123"))
|
||||
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
|
||||
>>> s(tunnel_request_data(b"example.com", "8090"))
|
||||
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
|
||||
"""
|
||||
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
|
||||
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
|
||||
tunnel_req += b'Host: ' + host_value + b'\r\n'
|
||||
if proxy_auth_header:
|
||||
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
|
||||
tunnel_req += b'\r\n'
|
||||
return tunnel_req
|
||||
|
||||
|
||||
class TunnelingAgent(Agent):
|
||||
"""An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
|
||||
downloads. It may look strange that we have chosen to subclass Agent and not
|
||||
ProxyAgent but consider that after the tunnel is opened the proxy is
|
||||
transparent to the client; thus the agent should behave like there is no
|
||||
proxy involved.
|
||||
"""
|
||||
|
||||
def __init__(self, reactor, proxyConf, contextFactory=None,
|
||||
connectTimeout=None, bindAddress=None, pool=None):
|
||||
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
|
||||
self._proxyConf = proxyConf
|
||||
self._contextFactory = contextFactory
|
||||
|
||||
def _getEndpoint(self, uri):
|
||||
return TunnelingTCP4ClientEndpoint(
|
||||
reactor=self._reactor,
|
||||
host=uri.host,
|
||||
port=uri.port,
|
||||
proxyConf=self._proxyConf,
|
||||
contextFactory=self._contextFactory,
|
||||
timeout=self._endpointFactory._connectTimeout,
|
||||
bindAddress=self._endpointFactory._bindAddress,
|
||||
)
|
||||
|
||||
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
|
||||
# proxy host and port are required for HTTP pool `key`
|
||||
# otherwise, same remote host connection request could reuse
|
||||
# a cached tunneled connection to a different proxy
|
||||
key = key + self._proxyConf
|
||||
return super()._requestWithEndpoint(
|
||||
key=key,
|
||||
endpoint=endpoint,
|
||||
method=method,
|
||||
parsedURI=parsedURI,
|
||||
headers=headers,
|
||||
bodyProducer=bodyProducer,
|
||||
requestPath=requestPath,
|
||||
)
|
||||
|
||||
|
||||
class ScrapyProxyAgent(Agent):
|
||||
|
||||
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
|
||||
super().__init__(
|
||||
reactor=reactor,
|
||||
connectTimeout=connectTimeout,
|
||||
bindAddress=bindAddress,
|
||||
pool=pool,
|
||||
)
|
||||
self._proxyURI = URI.fromBytes(proxyURI)
|
||||
|
||||
def request(self, method, uri, headers=None, bodyProducer=None):
|
||||
"""
|
||||
Issue a new request via the configured proxy.
|
||||
"""
|
||||
# Cache *all* connections under the same key, since we are only
|
||||
# connecting to a single destination, the proxy:
|
||||
return self._requestWithEndpoint(
|
||||
key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
|
||||
endpoint=self._getEndpoint(self._proxyURI),
|
||||
method=method,
|
||||
parsedURI=URI.fromBytes(uri),
|
||||
headers=headers,
|
||||
bodyProducer=bodyProducer,
|
||||
requestPath=uri,
|
||||
)
|
||||
|
||||
|
||||
class ScrapyAgent:
|
||||
|
||||
_Agent = Agent
|
||||
_ProxyAgent = ScrapyProxyAgent
|
||||
_TunnelingAgent = TunnelingAgent
|
||||
|
||||
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
||||
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
|
||||
self._contextFactory = contextFactory
|
||||
self._connectTimeout = connectTimeout
|
||||
self._bindAddress = bindAddress
|
||||
self._pool = pool
|
||||
self._maxsize = maxsize
|
||||
self._warnsize = warnsize
|
||||
self._fail_on_dataloss = fail_on_dataloss
|
||||
self._txresponse = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _get_agent(self, request, timeout):
|
||||
from twisted.internet import reactor
|
||||
bindaddress = request.meta.get('bindaddress') or self._bindAddress
|
||||
proxy = request.meta.get('proxy')
|
||||
if proxy:
|
||||
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
|
||||
scheme = _parse(request.url)[0]
|
||||
proxyHost = to_unicode(proxyHost)
|
||||
omitConnectTunnel = b'noconnect' in proxyParams
|
||||
if omitConnectTunnel:
|
||||
warnings.warn("Using HTTPS proxies in the noconnect mode is deprecated. "
|
||||
"If you use Crawlera, it doesn't require this mode anymore, "
|
||||
"so you should update scrapy-crawlera to 1.3.0+ "
|
||||
"and remove '?noconnect' from the Crawlera URL.",
|
||||
ScrapyDeprecationWarning)
|
||||
if scheme == b'https' and not omitConnectTunnel:
|
||||
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
|
||||
proxyConf = (proxyHost, proxyPort, proxyAuth)
|
||||
return self._TunnelingAgent(
|
||||
reactor=reactor,
|
||||
proxyConf=proxyConf,
|
||||
contextFactory=self._contextFactory,
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
else:
|
||||
return self._ProxyAgent(
|
||||
reactor=reactor,
|
||||
proxyURI=to_bytes(proxy, encoding='ascii'),
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
|
||||
return self._Agent(
|
||||
reactor=reactor,
|
||||
contextFactory=self._contextFactory,
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
|
||||
def download_request(self, request):
|
||||
from twisted.internet import reactor
|
||||
timeout = request.meta.get('download_timeout') or self._connectTimeout
|
||||
agent = self._get_agent(request, timeout)
|
||||
|
||||
# request details
|
||||
url = urldefrag(request.url)[0]
|
||||
method = to_bytes(request.method)
|
||||
headers = TxHeaders(request.headers)
|
||||
if isinstance(agent, self._TunnelingAgent):
|
||||
headers.removeHeader(b'Proxy-Authorization')
|
||||
if request.body:
|
||||
bodyproducer = _RequestBodyProducer(request.body)
|
||||
else:
|
||||
bodyproducer = None
|
||||
start_time = time()
|
||||
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
|
||||
# set download latency
|
||||
d.addCallback(self._cb_latency, request, start_time)
|
||||
# response body is ready to be consumed
|
||||
d.addCallback(self._cb_bodyready, request)
|
||||
d.addCallback(self._cb_bodydone, request, url)
|
||||
# check download timeout
|
||||
self._timeout_cl = reactor.callLater(timeout, d.cancel)
|
||||
d.addBoth(self._cb_timeout, request, url, timeout)
|
||||
return d
|
||||
|
||||
def _cb_timeout(self, result, request, url, timeout):
|
||||
if self._timeout_cl.active():
|
||||
self._timeout_cl.cancel()
|
||||
return result
|
||||
# needed for HTTPS requests, otherwise _ResponseReader doesn't
|
||||
# receive connectionLost()
|
||||
if self._txresponse:
|
||||
self._txresponse._transport.stopProducing()
|
||||
|
||||
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
|
||||
|
||||
def _cb_latency(self, result, request, start_time):
|
||||
request.meta['download_latency'] = time() - start_time
|
||||
return result
|
||||
|
||||
def _cb_bodyready(self, txresponse, request):
|
||||
# deliverBody hangs for responses without body
|
||||
if txresponse.length == 0:
|
||||
return {
|
||||
"txresponse": txresponse,
|
||||
"body": b"",
|
||||
"flags": None,
|
||||
"certificate": None,
|
||||
"ip_address": None,
|
||||
}
|
||||
|
||||
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
||||
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
||||
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
|
||||
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
|
||||
|
||||
if maxsize and expected_size > maxsize:
|
||||
warning_msg = ("Cancelling download of %(url)s: expected response "
|
||||
"size (%(size)s) larger than download max size (%(maxsize)s).")
|
||||
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
|
||||
|
||||
logger.warning(warning_msg, warning_args)
|
||||
|
||||
txresponse._transport._producer.loseConnection()
|
||||
raise defer.CancelledError(warning_msg % warning_args)
|
||||
|
||||
if warnsize and expected_size > warnsize:
|
||||
logger.warning("Expected response size (%(size)s) larger than "
|
||||
"download warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'size': expected_size, 'warnsize': warnsize, 'request': request})
|
||||
|
||||
def _cancel(_):
|
||||
# Abort connection immediately.
|
||||
txresponse._transport._producer.abortConnection()
|
||||
|
||||
d = defer.Deferred(_cancel)
|
||||
txresponse.deliverBody(
|
||||
_ResponseReader(
|
||||
finished=d,
|
||||
txresponse=txresponse,
|
||||
request=request,
|
||||
maxsize=maxsize,
|
||||
warnsize=warnsize,
|
||||
fail_on_dataloss=fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
)
|
||||
|
||||
# save response for timeouts
|
||||
self._txresponse = txresponse
|
||||
|
||||
return d
|
||||
|
||||
def _cb_bodydone(self, result, request, url):
|
||||
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
|
||||
response = respcls(
|
||||
url=url,
|
||||
status=int(result["txresponse"].code),
|
||||
headers=headers,
|
||||
body=result["body"],
|
||||
flags=result["flags"],
|
||||
certificate=result["certificate"],
|
||||
ip_address=result["ip_address"],
|
||||
)
|
||||
if result.get("failure"):
|
||||
result["failure"].value.response = response
|
||||
return result["failure"]
|
||||
return response
|
||||
|
||||
|
||||
@implementer(IBodyProducer)
|
||||
class _RequestBodyProducer:
|
||||
|
||||
def __init__(self, body):
|
||||
self.body = body
|
||||
self.length = len(body)
|
||||
|
||||
def startProducing(self, consumer):
|
||||
consumer.write(self.body)
|
||||
return defer.succeed(None)
|
||||
|
||||
def pauseProducing(self):
|
||||
pass
|
||||
|
||||
def stopProducing(self):
|
||||
pass
|
||||
|
||||
|
||||
class _ResponseReader(protocol.Protocol):
|
||||
|
||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
|
||||
self._finished = finished
|
||||
self._txresponse = txresponse
|
||||
self._request = request
|
||||
self._bodybuf = BytesIO()
|
||||
self._maxsize = maxsize
|
||||
self._warnsize = warnsize
|
||||
self._fail_on_dataloss = fail_on_dataloss
|
||||
self._fail_on_dataloss_warned = False
|
||||
self._reached_warnsize = False
|
||||
self._bytes_received = 0
|
||||
self._certificate = None
|
||||
self._ip_address = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _finish_response(self, flags=None, failure=None):
|
||||
self._finished.callback({
|
||||
"txresponse": self._txresponse,
|
||||
"body": self._bodybuf.getvalue(),
|
||||
"flags": flags,
|
||||
"certificate": self._certificate,
|
||||
"ip_address": self._ip_address,
|
||||
"failure": failure,
|
||||
})
|
||||
|
||||
def connectionMade(self):
|
||||
if self._certificate is None:
|
||||
with suppress(AttributeError):
|
||||
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
||||
|
||||
if self._ip_address is None:
|
||||
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
|
||||
|
||||
def dataReceived(self, bodyBytes):
|
||||
# This may be called several times after cancel was called with buffered data.
|
||||
if self._finished.called:
|
||||
return
|
||||
|
||||
self._bodybuf.write(bodyBytes)
|
||||
self._bytes_received += len(bodyBytes)
|
||||
|
||||
bytes_received_result = self._crawler.signals.send_catch_log(
|
||||
signal=signals.bytes_received,
|
||||
data=bodyBytes,
|
||||
request=self._request,
|
||||
spider=self._crawler.spider,
|
||||
)
|
||||
for handler, result in bytes_received_result:
|
||||
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
|
||||
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": self._request, "handler": handler.__qualname__})
|
||||
self.transport._producer.loseConnection()
|
||||
failure = result if result.value.fail else None
|
||||
self._finish_response(flags=["download_stopped"], failure=failure)
|
||||
|
||||
if self._maxsize and self._bytes_received > self._maxsize:
|
||||
logger.warning("Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s) in request %(request)s.",
|
||||
{'bytes': self._bytes_received,
|
||||
'maxsize': self._maxsize,
|
||||
'request': self._request})
|
||||
# Clear buffer earlier to avoid keeping data in memory for a long time.
|
||||
self._bodybuf.truncate(0)
|
||||
self._finished.cancel()
|
||||
|
||||
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
|
||||
self._reached_warnsize = True
|
||||
logger.warning("Received more bytes than download "
|
||||
"warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'warnsize': self._warnsize,
|
||||
'request': self._request})
|
||||
|
||||
def connectionLost(self, reason):
|
||||
if self._finished.called:
|
||||
return
|
||||
|
||||
if reason.check(ResponseDone):
|
||||
self._finish_response()
|
||||
return
|
||||
|
||||
if reason.check(PotentialDataLoss):
|
||||
self._finish_response(flags=["partial"])
|
||||
return
|
||||
|
||||
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
||||
if not self._fail_on_dataloss:
|
||||
self._finish_response(flags=["dataloss"])
|
||||
return
|
||||
|
||||
elif not self._fail_on_dataloss_warned:
|
||||
logger.warning("Got data loss in %s. If you want to process broken "
|
||||
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
|
||||
" -- This message won't be shown in further requests",
|
||||
self._txresponse.request.absoluteURI.decode())
|
||||
self._fail_on_dataloss_warned = True
|
||||
|
||||
self._finished.errback(reason)
|
||||
venv/lib/python3.9/site-packages/scrapy/core/downloader/handlers/s3.py (new file, 82 additions)
@@ -0,0 +1,82 @@
|
|||
from urllib.parse import unquote
|
||||
|
||||
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.boto import is_botocore_available
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.misc import create_instance
|
||||
|
||||
|
||||
class S3DownloadHandler:
|
||||
|
||||
def __init__(self, settings, *,
|
||||
crawler=None,
|
||||
aws_access_key_id=None, aws_secret_access_key=None,
|
||||
httpdownloadhandler=HTTPDownloadHandler, **kw):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured('missing botocore library')
|
||||
|
||||
if not aws_access_key_id:
|
||||
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
|
||||
if not aws_secret_access_key:
|
||||
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
|
||||
|
||||
# If no credentials could be found anywhere,
|
||||
# consider this an anonymous connection request by default;
|
||||
# unless 'anon' was set explicitly (True/False).
|
||||
anon = kw.get('anon')
|
||||
if anon is None and not aws_access_key_id and not aws_secret_access_key:
|
||||
kw['anon'] = True
|
||||
self.anon = kw.get('anon')
|
||||
|
||||
self._signer = None
|
||||
import botocore.auth
|
||||
import botocore.credentials
|
||||
kw.pop('anon', None)
|
||||
if kw:
|
||||
raise TypeError(f'Unexpected keyword arguments: {kw}')
|
||||
if not self.anon:
|
||||
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
|
||||
self._signer = SignerCls(botocore.credentials.Credentials(
|
||||
aws_access_key_id, aws_secret_access_key))
|
||||
|
||||
_http_handler = create_instance(
|
||||
objcls=httpdownloadhandler,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
)
|
||||
self._download_http = _http_handler.download_request
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, **kwargs):
|
||||
return cls(crawler.settings, crawler=crawler, **kwargs)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
p = urlparse_cached(request)
|
||||
scheme = 'https' if request.meta.get('is_secure') else 'http'
|
||||
bucket = p.hostname
|
||||
path = p.path + '?' + p.query if p.query else p.path
|
||||
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
|
||||
if self.anon:
|
||||
request = request.replace(url=url)
|
||||
elif self._signer is not None:
|
||||
import botocore.awsrequest
|
||||
awsrequest = botocore.awsrequest.AWSRequest(
|
||||
method=request.method,
|
||||
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
|
||||
headers=request.headers.to_unicode_dict(),
|
||||
data=request.body)
|
||||
self._signer.add_auth(awsrequest)
|
||||
request = request.replace(
|
||||
url=url, headers=awsrequest.headers.items())
|
||||
else:
|
||||
signed_headers = self.conn.make_request(
|
||||
method=request.method,
|
||||
bucket=bucket,
|
||||
key=unquote(p.path),
|
||||
query_args=unquote(p.query),
|
||||
headers=request.headers,
|
||||
data=request.body,
|
||||
)
|
||||
request = request.replace(url=url, headers=signed_headers)
|
||||
return self._download_http(request, spider)
|
||||
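A sketch of how an s3:// request reaches this handler, assuming credentials are configured via the settings read in __init__ above (all values are placeholders):

# settings.py
AWS_ACCESS_KEY_ID = "AKIA...placeholder"
AWS_SECRET_ACCESS_KEY = "placeholder"

# spider code
from scrapy import Request

req = Request("s3://example-bucket/exports/items.json", meta={"is_secure": True})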
venv/lib/python3.9/site-packages/scrapy/core/downloader/middleware.py (new file, 84 additions)
@@ -0,0 +1,84 @@
|
|||
"""
|
||||
Downloader Middleware manager
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro
|
||||
from scrapy.utils.conf import build_component_list
|
||||
|
||||
|
||||
class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'downloader middleware'
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(
|
||||
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
|
||||
|
||||
def _add_middleware(self, mw):
|
||||
if hasattr(mw, 'process_request'):
|
||||
self.methods['process_request'].append(mw.process_request)
|
||||
if hasattr(mw, 'process_response'):
|
||||
self.methods['process_response'].appendleft(mw.process_response)
|
||||
if hasattr(mw, 'process_exception'):
|
||||
self.methods['process_exception'].appendleft(mw.process_exception)
|
||||
|
||||
def download(self, download_func, request, spider):
|
||||
@defer.inlineCallbacks
|
||||
def process_request(request):
|
||||
for method in self.methods['process_request']:
|
||||
response = yield deferred_from_coro(method(request=request, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_request must return None, Response or "
|
||||
f"Request, got {response.__class__.__name__}"
|
||||
)
|
||||
if response:
|
||||
return response
|
||||
return (yield download_func(request=request, spider=spider))
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_response(response):
|
||||
if response is None:
|
||||
raise TypeError("Received None in process_response")
|
||||
elif isinstance(response, Request):
|
||||
return response
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_response must return Response or Request, "
|
||||
f"got {type(response)}"
|
||||
)
|
||||
if isinstance(response, Request):
|
||||
return response
|
||||
return response
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_exception(failure):
|
||||
exception = failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_exception must return None, Response or "
|
||||
f"Request, got {type(response)}"
|
||||
)
|
||||
if response:
|
||||
return response
|
||||
return failure
|
||||
|
||||
deferred = mustbe_deferred(process_request, request)
|
||||
deferred.addErrback(process_exception)
|
||||
deferred.addCallback(process_response)
|
||||
return deferred
|
||||
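For reference, a minimal downloader middleware that satisfies the return-type contract enforced above (class and header name are illustrative; it would be enabled via the DOWNLOADER_MIDDLEWARES setting):

class ExampleHeaderMiddleware:
    def process_request(self, request, spider):
        request.headers.setdefault("X-Example", "1")
        return None  # continue down the chain; a Response or Request would short-circuit it

    def process_response(self, request, response, spider):
        return response  # must return a Response or a Request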
venv/lib/python3.9/site-packages/scrapy/core/downloader/tls.py (new file, 78 additions)
@@ -0,0 +1,78 @@
|
|||
import logging
|
||||
|
||||
from OpenSSL import SSL
|
||||
from service_identity.exceptions import CertificateError
|
||||
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
|
||||
from twisted.internet.ssl import AcceptableCiphers
|
||||
|
||||
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
METHOD_SSLv3 = 'SSLv3'
|
||||
METHOD_TLS = 'TLS'
|
||||
METHOD_TLSv10 = 'TLSv1.0'
|
||||
METHOD_TLSv11 = 'TLSv1.1'
|
||||
METHOD_TLSv12 = 'TLSv1.2'
|
||||
|
||||
|
||||
openssl_methods = {
|
||||
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
|
||||
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
|
||||
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
|
||||
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
|
||||
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
|
||||
}
|
||||
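A sketch of selecting one of the method names mapped above from a project's settings (this is the setting read by HTTP11DownloadHandler):

# settings.py
DOWNLOADER_CLIENT_TLS_METHOD = "TLSv1.2"  # one of the keys defined in openssl_methods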
|
||||
|
||||
class ScrapyClientTLSOptions(ClientTLSOptions):
|
||||
"""
|
||||
SSL Client connection creator ignoring certificate verification errors
|
||||
(for genuinely invalid certificates or bugs in verification code).
|
||||
|
||||
Same as Twisted's private _sslverify.ClientTLSOptions,
|
||||
except that VerificationError, CertificateError and ValueError
|
||||
exceptions are caught, so that the connection is not closed and only
warnings are logged. Also, HTTPS connection parameters logging is added.
|
||||
"""
|
||||
|
||||
def __init__(self, hostname, ctx, verbose_logging=False):
|
||||
super().__init__(hostname, ctx)
|
||||
self.verbose_logging = verbose_logging
|
||||
|
||||
def _identityVerifyingInfoCallback(self, connection, where, ret):
|
||||
if where & SSL.SSL_CB_HANDSHAKE_START:
|
||||
connection.set_tlsext_host_name(self._hostnameBytes)
|
||||
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
|
||||
if self.verbose_logging:
|
||||
logger.debug('SSL connection to %s using protocol %s, cipher %s',
|
||||
self._hostnameASCII,
|
||||
connection.get_protocol_version_name(),
|
||||
connection.get_cipher_name(),
|
||||
)
|
||||
server_cert = connection.get_peer_certificate()
|
||||
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
|
||||
x509name_to_string(server_cert.get_issuer()),
|
||||
x509name_to_string(server_cert.get_subject()),
|
||||
)
|
||||
key_info = get_temp_key_info(connection._ssl)
|
||||
if key_info:
|
||||
logger.debug('SSL temp key: %s', key_info)
|
||||
|
||||
try:
|
||||
verifyHostname(connection, self._hostnameASCII)
|
||||
except (CertificateError, VerificationError) as e:
|
||||
logger.warning(
|
||||
'Remote certificate is not valid for hostname "{}"; {}'.format(
|
||||
self._hostnameASCII, e))
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning(
|
||||
'Ignoring error while verifying certificate '
|
||||
'from host "{}" (exception: {})'.format(
|
||||
self._hostnameASCII, repr(e)))
|
||||
|
||||
|
||||
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')
|
||||
venv/lib/python3.9/site-packages/scrapy/core/downloader/webclient.py (new file, 212 additions)
@@ -0,0 +1,212 @@
|
|||
from time import time
from urllib.parse import urlparse, urlunparse, urldefrag

from twisted.web.http import HTTPClient
from twisted.internet import defer, reactor
from twisted.internet.protocol import ClientFactory

from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
from scrapy.responsetypes import responsetypes


def _parsed_url_args(parsed):
    # Assume parsed is urlparse-d from Request.url,
    # which was passed via safe_url_string and is ascii-only.
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    path = to_bytes(path, encoding="ascii")
    host = to_bytes(parsed.hostname, encoding="ascii")
    port = parsed.port
    scheme = to_bytes(parsed.scheme, encoding="ascii")
    netloc = to_bytes(parsed.netloc, encoding="ascii")
    if port is None:
        port = 443 if scheme == b'https' else 80
    return scheme, netloc, host, port, path


def _parse(url):
    """ Return tuple of (scheme, netloc, host, port, path),
    all in bytes except for port which is int.
    Assume url is from Request.url, which was passed via safe_url_string
    and is ascii-only.
    """
    url = url.strip()
    parsed = urlparse(url)
    return _parsed_url_args(parsed)


class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(
            defer.TimeoutError(f"Getting {self.factory.url} took longer "
                               f"than {self.factory.timeout} seconds."))


# This class used to inherit from Twisted's
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):

    protocol = ScrapyHTTPPageGetter

    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def _build_response(self, body, request):
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self._url)
        return respcls(url=self._url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own redirect logic, there is no need to
        # add the _waitForDisconnect callback.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0

    def __repr__(self):
        return f"<{self.__class__.__name__}: {self.url}>"

    def _cancelTimeout(self, result, timeoutCall):
        if timeoutCall.active():
            timeoutCall.cancel()
        return result

    def buildProtocol(self, addr):
        p = ClientFactory.buildProtocol(self, addr)
        p.followRedirect = self.followRedirect
        p.afterFoundGet = self.afterFoundGet
        if self.timeout:
            timeoutCall = reactor.callLater(self.timeout, p.timeout)
            self.deferred.addBoth(self._cancelTimeout, timeoutCall)
        return p

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers

    def gotStatus(self, version, status, message):
        """
        Set the status of the request on us.

        @param version: The HTTP version.
        @type version: L{bytes}
        @param status: The HTTP status code, an integer represented as a
            bytestring.
        @type status: L{bytes}
        @param message: The HTTP status message.
        @type message: L{bytes}
        """
        self.version, self.status, self.message = version, status, message

    def page(self, page):
        if self.waiting:
            self.waiting = 0
            self.deferred.callback(page)

    def noPage(self, reason):
        if self.waiting:
            self.waiting = 0
            self.deferred.errback(reason)

    def clientConnectionFailed(self, _, reason):
        """
        When a connection attempt fails, the request cannot be issued. If no
        result has yet been provided to the result Deferred, provide the
        connection failure reason as an error result.
        """
        if self.waiting:
            self.waiting = 0
            # If the connection attempt failed, there is nothing more to
            # disconnect, so just fire that Deferred now.
            self._disconnectedDeferred.callback(None)
            self.deferred.errback(reason)
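Editor's note: a minimal sketch of how this legacy HTTP/1.0 client is driven. The flow mirrors what Scrapy's HTTP/1.0 download handler does, but the snippet below is illustrative only (plain HTTP, no TLS context factory, placeholder URL), not the handler's exact code:

from twisted.internet import reactor
from scrapy import Request
from scrapy.core.downloader.webclient import ScrapyHTTPClientFactory

def download_with_legacy_client(request):
    # The factory parses the URL, prepares method/headers/body and exposes
    # factory.deferred, which fires with a Response built by _build_response().
    factory = ScrapyHTTPClientFactory(request, timeout=30)
    # ScrapyHTTPPageGetter (factory.protocol) speaks the wire protocol once connected.
    # For https you would use connectSSL() with a client context factory instead.
    reactor.connectTCP(factory.host.decode(), factory.port, factory)
    return factory.deferred

d = download_with_legacy_client(Request('http://example.com/'))
d.addCallback(lambda response: print(response.status, len(response.body)))
d.addErrback(lambda failure: print('download failed:', failure.value))
d.addBoth(lambda _: reactor.stop())
reactor.run()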
360
venv/lib/python3.9/site-packages/scrapy/core/engine.py
Normal file
@@ -0,0 +1,360 @@
"""
|
||||
This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
|
||||
|
||||
For more information see docs/topics/architecture.rst
|
||||
|
||||
"""
|
||||
import logging
|
||||
from time import time
|
||||
|
||||
from twisted.internet import defer, task
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.scraper import Scraper
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.http import Response, Request
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.reactor import CallLaterOnce
|
||||
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot:
|
||||
|
||||
def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
|
||||
self.closing = False
|
||||
self.inprogress = set() # requests in progress
|
||||
self.start_requests = iter(start_requests)
|
||||
self.close_if_idle = close_if_idle
|
||||
self.nextcall = nextcall
|
||||
self.scheduler = scheduler
|
||||
self.heartbeat = task.LoopingCall(nextcall.schedule)
|
||||
|
||||
def add_request(self, request):
|
||||
self.inprogress.add(request)
|
||||
|
||||
def remove_request(self, request):
|
||||
self.inprogress.remove(request)
|
||||
self._maybe_fire_closing()
|
||||
|
||||
def close(self):
|
||||
self.closing = defer.Deferred()
|
||||
self._maybe_fire_closing()
|
||||
return self.closing
|
||||
|
||||
def _maybe_fire_closing(self):
|
||||
if self.closing and not self.inprogress:
|
||||
if self.nextcall:
|
||||
self.nextcall.cancel()
|
||||
if self.heartbeat.running:
|
||||
self.heartbeat.stop()
|
||||
self.closing.callback(None)
|
||||
|
||||
|
||||
class ExecutionEngine:
|
||||
|
||||
def __init__(self, crawler, spider_closed_callback):
|
||||
self.crawler = crawler
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
self.logformatter = crawler.logformatter
|
||||
self.slot = None
|
||||
self.spider = None
|
||||
self.running = False
|
||||
self.paused = False
|
||||
self.scheduler_cls = load_object(self.settings['SCHEDULER'])
|
||||
downloader_cls = load_object(self.settings['DOWNLOADER'])
|
||||
self.downloader = downloader_cls(crawler)
|
||||
self.scraper = Scraper(crawler)
|
||||
self._spider_closed_callback = spider_closed_callback
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def start(self):
|
||||
"""Start the execution engine"""
|
||||
if self.running:
|
||||
raise RuntimeError("Engine already running")
|
||||
self.start_time = time()
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
|
||||
self.running = True
|
||||
self._closewait = defer.Deferred()
|
||||
yield self._closewait
|
||||
|
||||
def stop(self):
|
||||
"""Stop the execution engine gracefully"""
|
||||
if not self.running:
|
||||
raise RuntimeError("Engine not running")
|
||||
self.running = False
|
||||
dfd = self._close_all_spiders()
|
||||
return dfd.addBoth(lambda _: self._finish_stopping_engine())
|
||||
|
||||
def close(self):
|
||||
"""Close the execution engine gracefully.
|
||||
|
||||
If it has already been started, stop it. In all cases, close all spiders
|
||||
and the downloader.
|
||||
"""
|
||||
if self.running:
|
||||
# Will also close spiders and downloader
|
||||
return self.stop()
|
||||
elif self.open_spiders:
|
||||
# Will also close downloader
|
||||
return self._close_all_spiders()
|
||||
else:
|
||||
return defer.succeed(self.downloader.close())
|
||||
|
||||
def pause(self):
|
||||
"""Pause the execution engine"""
|
||||
self.paused = True
|
||||
|
||||
def unpause(self):
|
||||
"""Resume the execution engine"""
|
||||
self.paused = False
|
||||
|
||||
def _next_request(self, spider):
|
||||
slot = self.slot
|
||||
if not slot:
|
||||
return
|
||||
|
||||
if self.paused:
|
||||
return
|
||||
|
||||
while not self._needs_backout(spider):
|
||||
if not self._next_request_from_scheduler(spider):
|
||||
break
|
||||
|
||||
if slot.start_requests and not self._needs_backout(spider):
|
||||
try:
|
||||
request = next(slot.start_requests)
|
||||
except StopIteration:
|
||||
slot.start_requests = None
|
||||
except Exception:
|
||||
slot.start_requests = None
|
||||
logger.error('Error while obtaining start requests',
|
||||
exc_info=True, extra={'spider': spider})
|
||||
else:
|
||||
self.crawl(request, spider)
|
||||
|
||||
if self.spider_is_idle(spider) and slot.close_if_idle:
|
||||
self._spider_idle(spider)
|
||||
|
||||
def _needs_backout(self, spider):
|
||||
slot = self.slot
|
||||
return (
|
||||
not self.running
|
||||
or slot.closing
|
||||
or self.downloader.needs_backout()
|
||||
or self.scraper.slot.needs_backout()
|
||||
)
|
||||
|
||||
def _next_request_from_scheduler(self, spider):
|
||||
slot = self.slot
|
||||
request = slot.scheduler.next_request()
|
||||
if not request:
|
||||
return
|
||||
d = self._download(request, spider)
|
||||
d.addBoth(self._handle_downloader_output, request, spider)
|
||||
d.addErrback(lambda f: logger.info('Error while handling downloader output',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
d.addBoth(lambda _: slot.remove_request(request))
|
||||
d.addErrback(lambda f: logger.info('Error while removing request from slot',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
d.addBoth(lambda _: slot.nextcall.schedule())
|
||||
d.addErrback(lambda f: logger.info('Error while scheduling new request',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
return d
|
||||
|
||||
def _handle_downloader_output(self, response, request, spider):
|
||||
if not isinstance(response, (Request, Response, Failure)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Request, Response or Failure, got "
|
||||
f"{type(response)}: {response!r}"
|
||||
)
|
||||
# downloader middleware can return requests (for example, redirects)
|
||||
if isinstance(response, Request):
|
||||
self.crawl(response, spider)
|
||||
return
|
||||
# response is a Response or Failure
|
||||
d = self.scraper.enqueue_scrape(response, request, spider)
|
||||
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
return d
|
||||
|
||||
def spider_is_idle(self, spider):
|
||||
if not self.scraper.slot.is_idle():
|
||||
# scraper is not idle
|
||||
return False
|
||||
|
||||
if self.downloader.active:
|
||||
# downloader has pending requests
|
||||
return False
|
||||
|
||||
if self.slot.start_requests is not None:
|
||||
# not all start requests are handled
|
||||
return False
|
||||
|
||||
if self.slot.scheduler.has_pending_requests():
|
||||
# scheduler has pending requests
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@property
|
||||
def open_spiders(self):
|
||||
return [self.spider] if self.spider else []
|
||||
|
||||
def has_capacity(self):
|
||||
"""Does the engine have capacity to handle more spiders"""
|
||||
return not bool(self.slot)
|
||||
|
||||
def crawl(self, request, spider):
|
||||
if spider not in self.open_spiders:
|
||||
raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
|
||||
self.schedule(request, spider)
|
||||
self.slot.nextcall.schedule()
|
||||
|
||||
def schedule(self, request, spider):
|
||||
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
|
||||
if not self.slot.scheduler.enqueue_request(request):
|
||||
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
|
||||
|
||||
def download(self, request, spider):
|
||||
d = self._download(request, spider)
|
||||
d.addBoth(self._downloaded, self.slot, request, spider)
|
||||
return d
|
||||
|
||||
def _downloaded(self, response, slot, request, spider):
|
||||
slot.remove_request(request)
|
||||
return self.download(response, spider) if isinstance(response, Request) else response
|
||||
|
||||
def _download(self, request, spider):
|
||||
slot = self.slot
|
||||
slot.add_request(request)
|
||||
|
||||
def _on_success(response):
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Response or Request, got "
|
||||
f"{type(response)}: {response!r}"
|
||||
)
|
||||
if isinstance(response, Response):
|
||||
if response.request is None:
|
||||
response.request = request
|
||||
logkws = self.logformatter.crawled(response.request, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.response_received,
|
||||
response=response,
|
||||
request=response.request,
|
||||
spider=spider,
|
||||
)
|
||||
return response
|
||||
|
||||
def _on_complete(_):
|
||||
slot.nextcall.schedule()
|
||||
return _
|
||||
|
||||
dwld = self.downloader.fetch(request, spider)
|
||||
dwld.addCallbacks(_on_success)
|
||||
dwld.addBoth(_on_complete)
|
||||
return dwld
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
||||
if not self.has_capacity():
|
||||
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
|
||||
logger.info("Spider opened", extra={'spider': spider})
|
||||
nextcall = CallLaterOnce(self._next_request, spider)
|
||||
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
||||
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
|
||||
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
|
||||
self.slot = slot
|
||||
self.spider = spider
|
||||
yield scheduler.open(spider)
|
||||
yield self.scraper.open_spider(spider)
|
||||
self.crawler.stats.open_spider(spider)
|
||||
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
|
||||
slot.nextcall.schedule()
|
||||
slot.heartbeat.start(5)
|
||||
|
||||
def _spider_idle(self, spider):
|
||||
"""Called when a spider gets idle. This function is called when there
|
||||
are no remaining pages to download or schedule. It can be called
|
||||
multiple times. If some extension raises a DontCloseSpider exception
|
||||
(in the spider_idle signal handler) the spider is not closed until the
|
||||
next loop and this function is guaranteed to be called (at least) once
|
||||
again for this spider.
|
||||
"""
|
||||
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
|
||||
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
|
||||
return
|
||||
|
||||
if self.spider_is_idle(spider):
|
||||
self.close_spider(spider, reason='finished')
|
||||
|
||||
def close_spider(self, spider, reason='cancelled'):
|
||||
"""Close (cancel) spider and clear all its outstanding requests"""
|
||||
|
||||
slot = self.slot
|
||||
if slot.closing:
|
||||
return slot.closing
|
||||
logger.info("Closing spider (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider})
|
||||
|
||||
dfd = slot.close()
|
||||
|
||||
def log_failure(msg):
|
||||
def errback(failure):
|
||||
logger.error(
|
||||
msg,
|
||||
exc_info=failure_to_exc_info(failure),
|
||||
extra={'spider': spider}
|
||||
)
|
||||
return errback
|
||||
|
||||
dfd.addBoth(lambda _: self.downloader.close())
|
||||
dfd.addErrback(log_failure('Downloader close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
|
||||
dfd.addErrback(log_failure('Scraper close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: slot.scheduler.close(reason))
|
||||
dfd.addErrback(log_failure('Scheduler close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
|
||||
signal=signals.spider_closed, spider=spider, reason=reason))
|
||||
dfd.addErrback(log_failure('Error while sending spider_close signal'))
|
||||
|
||||
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
|
||||
dfd.addErrback(log_failure('Stats close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider}))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'slot', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning slot'))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'spider', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning spider'))
|
||||
|
||||
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
|
||||
|
||||
return dfd
|
||||
|
||||
def _close_all_spiders(self):
|
||||
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
|
||||
dlist = defer.DeferredList(dfds)
|
||||
return dlist
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _finish_stopping_engine(self):
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
|
||||
self._closewait.callback(None)
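Editor's note: a small, self-contained illustration (not engine code) of the addBoth/addErrback pairing used in _next_request_from_scheduler() and close_spider() above. Each cleanup step runs regardless of what happened before it, and a failure raised inside a step is only logged by its paired errback instead of breaking the rest of the chain:

from twisted.internet import defer

def run_steps():
    def failing_step(_):
        raise RuntimeError("step 2 failed")

    d = defer.Deferred()
    d.addBoth(lambda _: print("step 1"))
    d.addBoth(failing_step)                             # raises -> chain switches to errbacks
    d.addErrback(lambda f: print("logged:", f.value))   # swallow and log, like log_failure()
    d.addBoth(lambda _: print("step 3 still runs"))     # chain is back on the callback side
    d.callback(None)

run_steps()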
182
venv/lib/python3.9/site-packages/scrapy/core/scheduler.py
Normal file
@@ -0,0 +1,182 @@
import os
import json
import logging
import warnings
from os.path import join, exists

from queuelib import PriorityQueue

from scrapy.utils.misc import load_object, create_instance
from scrapy.utils.job import job_dir
from scrapy.utils.deprecate import ScrapyDeprecationWarning


logger = logging.getLogger(__name__)


class Scheduler:
    """
    Scrapy Scheduler. It allows enqueuing requests and then getting
    the next request to download. The Scheduler also handles duplicate
    filtering, via the dupefilter.

    Prioritization and queueing are not performed by the Scheduler.
    The user sets a ``priority`` field on each Request, and a PriorityQueue
    (defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
    to dequeue requests in the desired order.

    The Scheduler uses two PriorityQueue instances, configured to work
    in memory and on disk (optional). When the on-disk queue is present,
    it is used by default, and the in-memory queue is used as a fallback
    for cases where the disk queue can't handle a request (can't serialize it).

    :setting:`SCHEDULER_MEMORY_QUEUE` and :setting:`SCHEDULER_DISK_QUEUE`
    allow specifying the lower-level queue classes that the PriorityQueue
    instances are instantiated with, to keep requests in memory and on disk
    respectively.

    Overall, the Scheduler is an object which holds several PriorityQueue
    instances (in-memory and on-disk) and implements fallback logic for them.
    It also handles the dupefilter.
    """
    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None, crawler=None):
        self.df = dupefilter
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass
        self.dqclass = dqclass
        self.mqclass = mqclass
        self.logunser = logunser
        self.stats = stats
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = create_instance(dupefilter_cls, settings, crawler)
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        if pqclass is PriorityQueue:
            warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
                          " is no longer supported because of API changes; "
                          "please use 'scrapy.pqueues.ScrapyPriorityQueue'",
                          ScrapyDeprecationWarning)
            from scrapy.pqueues import ScrapyPriorityQueue
            pqclass = ScrapyPriorityQueue

        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
        logunser = settings.getbool('SCHEDULER_DEBUG')
        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                   mqclass=mqclass, crawler=crawler)

    def has_pending_requests(self):
        return len(self) > 0

    def open(self, spider):
        self.spider = spider
        self.mqs = self._mq()
        self.dqs = self._dq() if self.dqdir else None
        return self.df.open()

    def close(self, reason):
        if self.dqs:
            state = self.dqs.close()
            self._write_dqs_state(self.dqdir, state)
        return self.df.close(reason)

    def enqueue_request(self, request):
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True

    def next_request(self):
        request = self.mqs.pop()
        if request:
            self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
        else:
            request = self._dqpop()
            if request:
                self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
        if request:
            self.stats.inc_value('scheduler/dequeued', spider=self.spider)
        return request

    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            self.dqs.push(request)
        except ValueError as e:  # non serializable request
            if self.logunser:
                msg = ("Unable to serialize request: %(request)s - reason:"
                       " %(reason)s - no more unserializable requests will be"
                       " logged (stats being collected)")
                logger.warning(msg, {'request': request, 'reason': e},
                               exc_info=True, extra={'spider': self.spider})
                self.logunser = False
            self.stats.inc_value('scheduler/unserializable',
                                 spider=self.spider)
            return
        else:
            return True

    def _mqpush(self, request):
        self.mqs.push(request)

    def _dqpop(self):
        if self.dqs:
            return self.dqs.pop()

    def _mq(self):
        """ Create a new priority queue instance, with in-memory storage """
        return create_instance(self.pqclass,
                               settings=None,
                               crawler=self.crawler,
                               downstream_queue_cls=self.mqclass,
                               key='')

    def _dq(self):
        """ Create a new priority queue instance, with disk storage """
        state = self._read_dqs_state(self.dqdir)
        q = create_instance(self.pqclass,
                            settings=None,
                            crawler=self.crawler,
                            downstream_queue_cls=self.dqclass,
                            key=self.dqdir,
                            startprios=state)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _dqdir(self, jobdir):
        """ Return a folder name to keep disk queue state at """
        if jobdir:
            dqdir = join(jobdir, 'requests.queue')
            if not exists(dqdir):
                os.makedirs(dqdir)
            return dqdir

    def _read_dqs_state(self, dqdir):
        path = join(dqdir, 'active.json')
        if not exists(path):
            return ()
        with open(path) as f:
            return json.load(f)

    def _write_dqs_state(self, dqdir, state):
        with open(join(dqdir, 'active.json'), 'w') as f:
            json.dump(state, f)
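Editor's note: a short sketch of the disk/memory fallback described in the Scheduler docstring above. The spider class and URLs are placeholders, and it assumes a crawl started with a JOBDIR so the disk queue actually exists. Requests whose callbacks can be resolved as spider methods serialize fine and go to the disk queue; anything unserializable (for example a lambda callback) makes _dqpush() hit a ValueError and fall back to the memory queue:

from scrapy import Request, Spider

class ExampleSpider(Spider):  # placeholder spider, not part of this commit
    name = 'example'
    start_urls = ['https://example.com/']

    def parse(self, response):
        # serializable: callback is a spider method, so it can be pickled to disk
        yield Request('https://example.com/a', callback=self.parse)
        # unserializable callback: ends up in the in-memory queue instead
        yield Request('https://example.com/b', callback=lambda r: None)

# Run with:  scrapy crawl example -s JOBDIR=crawls/example-1
# then compare the scheduler/enqueued/disk and scheduler/enqueued/memory stats.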
260
venv/lib/python3.9/site-packages/scrapy/core/scraper.py
Normal file
@@ -0,0 +1,260 @@
"""This module implements the Scraper component which parses responses and
|
||||
extracts information from them"""
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet import defer
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
|
||||
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
|
||||
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot:
|
||||
"""Scraper slot (one per running spider)"""
|
||||
|
||||
MIN_RESPONSE_SIZE = 1024
|
||||
|
||||
def __init__(self, max_active_size=5000000):
|
||||
self.max_active_size = max_active_size
|
||||
self.queue = deque()
|
||||
self.active = set()
|
||||
self.active_size = 0
|
||||
self.itemproc_size = 0
|
||||
self.closing = None
|
||||
|
||||
def add_response_request(self, response, request):
|
||||
deferred = defer.Deferred()
|
||||
self.queue.append((response, request, deferred))
|
||||
if isinstance(response, Response):
|
||||
self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
|
||||
else:
|
||||
self.active_size += self.MIN_RESPONSE_SIZE
|
||||
return deferred
|
||||
|
||||
def next_response_request_deferred(self):
|
||||
response, request, deferred = self.queue.popleft()
|
||||
self.active.add(request)
|
||||
return response, request, deferred
|
||||
|
||||
def finish_response(self, response, request):
|
||||
self.active.remove(request)
|
||||
if isinstance(response, Response):
|
||||
self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)
|
||||
else:
|
||||
self.active_size -= self.MIN_RESPONSE_SIZE
|
||||
|
||||
def is_idle(self):
|
||||
return not (self.queue or self.active)
|
||||
|
||||
def needs_backout(self):
|
||||
return self.active_size > self.max_active_size
|
||||
|
||||
|
||||
class Scraper:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.slot = None
|
||||
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
|
||||
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
|
||||
self.itemproc = itemproc_cls.from_crawler(crawler)
|
||||
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
|
||||
self.crawler = crawler
|
||||
self.signals = crawler.signals
|
||||
self.logformatter = crawler.logformatter
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def open_spider(self, spider):
|
||||
"""Open the given spider for scraping and allocate resources for it"""
|
||||
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
|
||||
yield self.itemproc.open_spider(spider)
|
||||
|
||||
def close_spider(self, spider):
|
||||
"""Close a spider being scraped and release its resources"""
|
||||
slot = self.slot
|
||||
slot.closing = defer.Deferred()
|
||||
slot.closing.addCallback(self.itemproc.close_spider)
|
||||
self._check_if_closing(spider, slot)
|
||||
return slot.closing
|
||||
|
||||
def is_idle(self):
|
||||
"""Return True if there isn't any more spiders to process"""
|
||||
return not self.slot
|
||||
|
||||
def _check_if_closing(self, spider, slot):
|
||||
if slot.closing and slot.is_idle():
|
||||
slot.closing.callback(spider)
|
||||
|
||||
def enqueue_scrape(self, response, request, spider):
|
||||
slot = self.slot
|
||||
dfd = slot.add_response_request(response, request)
|
||||
|
||||
def finish_scraping(_):
|
||||
slot.finish_response(response, request)
|
||||
self._check_if_closing(spider, slot)
|
||||
self._scrape_next(spider, slot)
|
||||
return _
|
||||
|
||||
dfd.addBoth(finish_scraping)
|
||||
dfd.addErrback(
|
||||
lambda f: logger.error('Scraper bug processing %(request)s',
|
||||
{'request': request},
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
self._scrape_next(spider, slot)
|
||||
return dfd
|
||||
|
||||
def _scrape_next(self, spider, slot):
|
||||
while slot.queue:
|
||||
response, request, deferred = slot.next_response_request_deferred()
|
||||
self._scrape(response, request, spider).chainDeferred(deferred)
|
||||
|
||||
def _scrape(self, result, request, spider):
|
||||
"""
|
||||
Handle the downloaded response or failure through the spider callback/errback
|
||||
"""
|
||||
if not isinstance(result, (Response, Failure)):
|
||||
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
|
||||
dfd = self._scrape2(result, request, spider) # returns spider's processed output
|
||||
dfd.addErrback(self.handle_spider_error, request, result, spider)
|
||||
dfd.addCallback(self.handle_spider_output, request, result, spider)
|
||||
return dfd
|
||||
|
||||
def _scrape2(self, result, request, spider):
|
||||
"""
|
||||
Handle the different cases of request's result been a Response or a Failure
|
||||
"""
|
||||
if isinstance(result, Response):
|
||||
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
|
||||
else: # result is a Failure
|
||||
dfd = self.call_spider(result, request, spider)
|
||||
return dfd.addErrback(self._log_download_errors, result, request, spider)
|
||||
|
||||
def call_spider(self, result, request, spider):
|
||||
if isinstance(result, Response):
|
||||
if getattr(result, "request", None) is None:
|
||||
result.request = request
|
||||
callback = result.request.callback or spider._parse
|
||||
warn_on_generator_with_return_value(spider, callback)
|
||||
dfd = defer_succeed(result)
|
||||
dfd.addCallback(callback, **result.request.cb_kwargs)
|
||||
else: # result is a Failure
|
||||
result.request = request
|
||||
warn_on_generator_with_return_value(spider, request.errback)
|
||||
dfd = defer_fail(result)
|
||||
dfd.addErrback(request.errback)
|
||||
return dfd.addCallback(iterate_spider_output)
|
||||
|
||||
def handle_spider_error(self, _failure, request, response, spider):
|
||||
exc = _failure.value
|
||||
if isinstance(exc, CloseSpider):
|
||||
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
|
||||
return
|
||||
logkws = self.logformatter.spider_error(_failure, request, response, spider)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
exc_info=failure_to_exc_info(_failure),
|
||||
extra={'spider': spider}
|
||||
)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.spider_error,
|
||||
failure=_failure, response=response,
|
||||
spider=spider
|
||||
)
|
||||
self.crawler.stats.inc_value(
|
||||
f"spider_exceptions/{_failure.value.__class__.__name__}",
|
||||
spider=spider
|
||||
)
|
||||
|
||||
def handle_spider_output(self, result, request, response, spider):
|
||||
if not result:
|
||||
return defer_succeed(None)
|
||||
it = iter_errback(result, self.handle_spider_error, request, response, spider)
|
||||
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
|
||||
request, response, spider)
|
||||
return dfd
|
||||
|
||||
def _process_spidermw_output(self, output, request, response, spider):
|
||||
"""Process each Request/Item (given in the output parameter) returned
|
||||
from the given spider
|
||||
"""
|
||||
if isinstance(output, Request):
|
||||
self.crawler.engine.crawl(request=output, spider=spider)
|
||||
elif is_item(output):
|
||||
self.slot.itemproc_size += 1
|
||||
dfd = self.itemproc.process_item(output, spider)
|
||||
dfd.addBoth(self._itemproc_finished, output, response, spider)
|
||||
return dfd
|
||||
elif output is None:
|
||||
pass
|
||||
else:
|
||||
typename = type(output).__name__
|
||||
logger.error(
|
||||
'Spider must return request, item, or None, got %(typename)r in %(request)s',
|
||||
{'request': request, 'typename': typename},
|
||||
extra={'spider': spider},
|
||||
)
|
||||
|
||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
errors that got propagated thru here)
|
||||
"""
|
||||
if isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest):
|
||||
if download_failure.frames:
|
||||
logkws = self.logformatter.download_error(download_failure, request, spider)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
extra={'spider': spider},
|
||||
exc_info=failure_to_exc_info(download_failure),
|
||||
)
|
||||
else:
|
||||
errmsg = download_failure.getErrorMessage()
|
||||
if errmsg:
|
||||
logkws = self.logformatter.download_error(
|
||||
download_failure, request, spider, errmsg)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
extra={'spider': spider},
|
||||
)
|
||||
|
||||
if spider_failure is not download_failure:
|
||||
return spider_failure
|
||||
|
||||
def _itemproc_finished(self, output, item, response, spider):
|
||||
"""ItemProcessor finished for the given ``item`` and returned ``output``
|
||||
"""
|
||||
self.slot.itemproc_size -= 1
|
||||
if isinstance(output, Failure):
|
||||
ex = output.value
|
||||
if isinstance(ex, DropItem):
|
||||
logkws = self.logformatter.dropped(item, ex, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_dropped, item=item, response=response,
|
||||
spider=spider, exception=output.value)
|
||||
else:
|
||||
logkws = self.logformatter.item_error(item, ex, response, spider)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
|
||||
exc_info=failure_to_exc_info(output))
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_error, item=item, response=response,
|
||||
spider=spider, failure=output)
|
||||
else:
|
||||
logkws = self.logformatter.scraped(output, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_scraped, item=output, response=response,
|
||||
spider=spider)
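Editor's note: a tiny, standalone illustration (not part of this commit) of the backpressure bookkeeping in the scraper Slot above. Every queued response adds at least MIN_RESPONSE_SIZE bytes to active_size, and once needs_backout() turns True the engine's _needs_backout() stops pulling new requests from the scheduler. The max_active_size value below is illustrative; in a real crawl it comes from SCRAPER_SLOT_MAX_ACTIVE_SIZE:

from scrapy.core.scraper import Slot
from scrapy.http import Request, Response

slot = Slot(max_active_size=4096)                     # tiny limit for demonstration
req = Request('https://example.com')
slot.add_response_request(
    Response('https://example.com', body=b'x' * 5000), req)
print(slot.active_size)      # 5000 (body is larger than MIN_RESPONSE_SIZE)
print(slot.needs_backout())  # True -> the engine stops feeding more requests

slot.next_response_request_deferred()                 # hand the response to the spider
slot.finish_response(Response('https://example.com', body=b'x' * 5000), req)
print(slot.needs_backout())  # False again once the response is done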
128
venv/lib/python3.9/site-packages/scrapy/core/spidermw.py
Normal file
@@ -0,0 +1,128 @@
"""
|
||||
Spider Middleware manager
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
from itertools import islice
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.conf import build_component_list
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.python import MutableChain
|
||||
|
||||
|
||||
def _isiterable(possible_iterator):
|
||||
return hasattr(possible_iterator, '__iter__')
|
||||
|
||||
|
||||
def _fname(f):
|
||||
return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"
|
||||
|
||||
|
||||
class SpiderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'spider middleware'
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
|
||||
|
||||
def _add_middleware(self, mw):
|
||||
super()._add_middleware(mw)
|
||||
if hasattr(mw, 'process_spider_input'):
|
||||
self.methods['process_spider_input'].append(mw.process_spider_input)
|
||||
if hasattr(mw, 'process_start_requests'):
|
||||
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
|
||||
process_spider_output = getattr(mw, 'process_spider_output', None)
|
||||
self.methods['process_spider_output'].appendleft(process_spider_output)
|
||||
process_spider_exception = getattr(mw, 'process_spider_exception', None)
|
||||
self.methods['process_spider_exception'].appendleft(process_spider_exception)
|
||||
|
||||
def scrape_response(self, scrape_func, response, request, spider):
|
||||
|
||||
def process_spider_input(response):
|
||||
for method in self.methods['process_spider_input']:
|
||||
try:
|
||||
result = method(response=response, spider=spider)
|
||||
if result is not None:
|
||||
msg = (f"Middleware {_fname(method)} must return None "
|
||||
f"or raise an exception, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
except _InvalidOutput:
|
||||
raise
|
||||
except Exception:
|
||||
return scrape_func(Failure(), request, spider)
|
||||
return scrape_func(response, request, spider)
|
||||
|
||||
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
|
||||
try:
|
||||
for r in iterable:
|
||||
yield r
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
recover_to.extend(exception_result)
|
||||
|
||||
def process_spider_exception(_failure, start_index=0):
|
||||
exception = _failure.value
|
||||
# don't handle _InvalidOutput exception
|
||||
if isinstance(exception, _InvalidOutput):
|
||||
return _failure
|
||||
method_list = islice(self.methods['process_spider_exception'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
result = method(response=response, exception=exception, spider=spider)
|
||||
if _isiterable(result):
|
||||
# stop exception handling by handing control over to the
|
||||
# process_spider_output chain if an iterable has been returned
|
||||
return process_spider_output(result, method_index + 1)
|
||||
elif result is None:
|
||||
continue
|
||||
else:
|
||||
msg = (f"Middleware {_fname(method)} must return None "
|
||||
f"or an iterable, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
return _failure
|
||||
|
||||
def process_spider_output(result, start_index=0):
|
||||
# items in this iterable do not need to go through the process_spider_output
|
||||
# chain, they went through it already from the process_spider_exception method
|
||||
recovered = MutableChain()
|
||||
|
||||
method_list = islice(self.methods['process_spider_output'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
try:
|
||||
# might fail directly if the output value is not a generator
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), method_index + 1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
return exception_result
|
||||
if _isiterable(result):
|
||||
result = _evaluate_iterable(result, method_index + 1, recovered)
|
||||
else:
|
||||
msg = (f"Middleware {_fname(method)} must return an "
|
||||
f"iterable, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
|
||||
return MutableChain(result, recovered)
|
||||
|
||||
def process_callback_output(result):
|
||||
recovered = MutableChain()
|
||||
result = _evaluate_iterable(result, 0, recovered)
|
||||
return MutableChain(process_spider_output(result), recovered)
|
||||
|
||||
dfd = mustbe_deferred(process_spider_input, response)
|
||||
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
|
||||
return dfd
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
return self._process_chain('process_start_requests', start_requests, spider)
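Editor's note: a minimal spider-middleware sketch showing the four hooks that _add_middleware() above collects and that scrape_response()/process_start_requests() chain together. The class name and settings entry are illustrative, not part of this commit:

class ExampleSpiderMiddleware:
    """Hypothetical middleware; enable it via the SPIDER_MIDDLEWARES setting."""

    def process_spider_input(self, response, spider):
        # must return None or raise; any other return triggers _InvalidOutput
        return None

    def process_spider_output(self, response, result, spider):
        # must return an iterable of requests/items; here it just passes them through
        for x in result:
            yield x

    def process_spider_exception(self, response, exception, spider):
        # return None to let the failure continue down the chain,
        # or return an iterable to recover and re-enter the output chain
        return None

    def process_start_requests(self, start_requests, spider):
        # runs through the chain built by process_start_requests() above
        for r in start_requests:
            yield r

# settings.py (illustrative):
# SPIDER_MIDDLEWARES = {'myproject.middlewares.ExampleSpiderMiddleware': 543}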