Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@@ -0,0 +1,3 @@
"""
Scrapy core library classes and functions.
"""


@@ -0,0 +1,201 @@
import random
from time import time
from datetime import datetime
from collections import deque
from twisted.internet import defer, task
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.httpobj import urlparse_cached
from scrapy.resolver import dnscache
from scrapy import signals
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
from scrapy.core.downloader.handlers import DownloadHandlers
class Slot:
"""Downloader slot"""
def __init__(self, concurrency, delay, randomize_delay):
self.concurrency = concurrency
self.delay = delay
self.randomize_delay = randomize_delay
self.active = set()
self.queue = deque()
self.transferring = set()
self.lastseen = 0
self.latercall = None
def free_transfer_slots(self):
return self.concurrency - len(self.transferring)
def download_delay(self):
if self.randomize_delay:
return random.uniform(0.5 * self.delay, 1.5 * self.delay)
return self.delay
def close(self):
if self.latercall and self.latercall.active():
self.latercall.cancel()
def __repr__(self):
cls_name = self.__class__.__name__
return (f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})")
def __str__(self):
return (
f"<downloader.Slot concurrency={self.concurrency!r} "
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
f"len(transferring)={len(self.transferring)} "
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
)
def _get_concurrency_delay(concurrency, spider, settings):
delay = settings.getfloat('DOWNLOAD_DELAY')
if hasattr(spider, 'download_delay'):
delay = spider.download_delay
if hasattr(spider, 'max_concurrent_requests'):
concurrency = spider.max_concurrent_requests
return concurrency, delay
class Downloader:
DOWNLOAD_SLOT = 'download_slot'
def __init__(self, crawler):
self.settings = crawler.settings
self.signals = crawler.signals
self.slots = {}
self.active = set()
self.handlers = DownloadHandlers(crawler)
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
self._slot_gc_loop.start(60)
def fetch(self, request, spider):
def _deactivate(response):
self.active.remove(request)
return response
self.active.add(request)
dfd = self.middleware.download(self._enqueue_request, request, spider)
return dfd.addBoth(_deactivate)
def needs_backout(self):
return len(self.active) >= self.total_concurrency
def _get_slot(self, request, spider):
key = self._get_slot_key(request, spider)
if key not in self.slots:
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
self.slots[key] = Slot(conc, delay, self.randomize_delay)
return key, self.slots[key]
def _get_slot_key(self, request, spider):
if self.DOWNLOAD_SLOT in request.meta:
return request.meta[self.DOWNLOAD_SLOT]
key = urlparse_cached(request).hostname or ''
if self.ip_concurrency:
key = dnscache.get(key, key)
return key
def _enqueue_request(self, request, spider):
key, slot = self._get_slot(request, spider)
request.meta[self.DOWNLOAD_SLOT] = key
def _deactivate(response):
slot.active.remove(request)
return response
slot.active.add(request)
self.signals.send_catch_log(signal=signals.request_reached_downloader,
request=request,
spider=spider)
deferred = defer.Deferred().addBoth(_deactivate)
slot.queue.append((request, deferred))
self._process_queue(spider, slot)
return deferred
def _process_queue(self, spider, slot):
from twisted.internet import reactor
if slot.latercall and slot.latercall.active():
return
# Delay queue processing if a download_delay is configured
now = time()
delay = slot.download_delay()
if delay:
penalty = delay - now + slot.lastseen
if penalty > 0:
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
return
# Process enqueued requests if there are free slots to transfer for this slot
while slot.queue and slot.free_transfer_slots() > 0:
slot.lastseen = now
request, deferred = slot.queue.popleft()
dfd = self._download(slot, request, spider)
dfd.chainDeferred(deferred)
# prevent burst if inter-request delays were configured
if delay:
self._process_queue(spider, slot)
break
def _download(self, slot, request, spider):
# The order is very important for the following deferreds. Do not change!
# 1. Create the download deferred
dfd = mustbe_deferred(self.handlers.download_request, request, spider)
# 2. Notify response_downloaded listeners about the recent download
# before querying queue for next request
def _downloaded(response):
self.signals.send_catch_log(signal=signals.response_downloaded,
response=response,
request=request,
spider=spider)
return response
dfd.addCallback(_downloaded)
# 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader
# middleware itself)
slot.transferring.add(request)
def finish_transferring(_):
slot.transferring.remove(request)
self._process_queue(spider, slot)
self.signals.send_catch_log(signal=signals.request_left_downloader,
request=request,
spider=spider)
return _
return dfd.addBoth(finish_transferring)
def close(self):
self._slot_gc_loop.stop()
for slot in self.slots.values():
slot.close()
def _slot_gc(self, age=60):
mintime = time() - age
for key, slot in list(self.slots.items()):
if not slot.active and slot.lastseen + slot.delay < mintime:
self.slots.pop(key).close()
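
For illustration, a minimal sketch of the settings and spider attributes that the slot logic above reads; the concrete values and the ExampleSpider class are placeholders, not defaults.

import scrapy

# settings.py sketch for the knobs used by Downloader and Slot above
CONCURRENT_REQUESTS = 16                # Downloader.total_concurrency
CONCURRENT_REQUESTS_PER_DOMAIN = 8      # per-slot concurrency, keyed by hostname
CONCURRENT_REQUESTS_PER_IP = 0          # if > 0, slots are keyed by resolved IP instead
DOWNLOAD_DELAY = 1.0                    # base delay between requests of a slot
RANDOMIZE_DOWNLOAD_DELAY = True         # Slot.download_delay() returns uniform(0.5*d, 1.5*d)

# A spider may override the per-slot values, as _get_concurrency_delay() shows:
class ExampleSpider(scrapy.Spider):     # hypothetical spider, for illustration only
    name = "example"
    download_delay = 2.0                # takes precedence over DOWNLOAD_DELAY
    max_concurrent_requests = 4         # takes precedence over the per-slot concurrency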


@@ -0,0 +1,94 @@
from OpenSSL import SSL
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
from twisted.web.client import BrowserLikePolicyForHTTPS
from twisted.web.iweb import IPolicyForHTTPS
from zope.interface.declarations import implementer
from scrapy.core.downloader.tls import ScrapyClientTLSOptions, DEFAULT_CIPHERS
@implementer(IPolicyForHTTPS)
class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
"""
Non-peer-certificate verifying HTTPS context factory
Default OpenSSL method is TLS_METHOD (also called SSLv23_METHOD)
which allows TLS protocol negotiation
'A TLS/SSL connection established with [this method] may
understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.'
"""
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._ssl_method = method
self.tls_verbose_logging = tls_verbose_logging
if tls_ciphers:
self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
else:
self.tls_ciphers = DEFAULT_CIPHERS
@classmethod
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
def getCertificateOptions(self):
# setting verify=True will require you to provide CAs
# to verify against; in other words: it's not that simple
# backward-compatible SSL/TLS method:
#
# * this will respect the `method` attribute of the often recommended
# `ScrapyClientContextFactory` subclass
# (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
#
# * getattr() for `_ssl_method` attribute for context factories
# not calling super().__init__
return CertificateOptions(
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
fixBrokenPeers=True,
acceptableCiphers=self.tls_ciphers,
)
# kept for old-style HTTP/1.0 downloader context twisted calls,
# e.g. connectSSL()
def getContext(self, hostname=None, port=None):
return self.getCertificateOptions().getContext()
def creatorForNetloc(self, hostname, port):
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
verbose_logging=self.tls_verbose_logging)
@implementer(IPolicyForHTTPS)
class BrowserLikeContextFactory(ScrapyClientContextFactory):
"""
Twisted-recommended context factory for web clients.
Quoting the documentation of the :class:`~twisted.web.client.Agent` class:
The default is to use a
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS`, so unless you
have special requirements you can leave this as-is.
:meth:`creatorForNetloc` is the same as
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS` except this context
factory allows setting the TLS/SSL method to use.
The default OpenSSL method is ``TLS_METHOD`` (also called
``SSLv23_METHOD``) which allows TLS protocol negotiation.
"""
def creatorForNetloc(self, hostname, port):
# trustRoot set to platformTrust() will use the platform's root CAs.
#
# This means that a website like https://www.cacert.org will be rejected
# by default, since CAcert.org CA certificate is seldom shipped.
return optionsForClientTLS(
hostname=hostname.decode("ascii"),
trustRoot=platformTrust(),
extraCertificateOptions={'method': self._ssl_method},
)
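
A short sketch of how these factories are typically wired up through the settings; the class path below assumes the usual scrapy.core.downloader.contextfactory module location, and the cipher string is just an example value.

# settings.py sketch
DOWNLOADER_CLIENTCONTEXTFACTORY = (
    'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'
)
DOWNLOADER_CLIENT_TLS_METHOD = 'TLS'            # negotiate; see openssl_methods in tls.py below
DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = True    # log protocol, cipher and certificate details
DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT'       # OpenSSL cipher string, example value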


@@ -0,0 +1,81 @@
"""Download handlers for different schemes"""
import logging
from twisted.internet import defer
from scrapy import signals
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import without_none_values
logger = logging.getLogger(__name__)
class DownloadHandlers:
def __init__(self, crawler):
self._crawler = crawler
self._schemes = {} # stores acceptable schemes on instancing
self._handlers = {} # stores instanced handlers for schemes
self._notconfigured = {} # remembers failed handlers
handlers = without_none_values(
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
for scheme, clspath in handlers.items():
self._schemes[scheme] = clspath
self._load_handler(scheme, skip_lazy=True)
crawler.signals.connect(self._close, signals.engine_stopped)
def _get_handler(self, scheme):
"""Lazy-load the downloadhandler for a scheme
only on the first request for that scheme.
"""
if scheme in self._handlers:
return self._handlers[scheme]
if scheme in self._notconfigured:
return None
if scheme not in self._schemes:
self._notconfigured[scheme] = 'no handler available for that scheme'
return None
return self._load_handler(scheme)
def _load_handler(self, scheme, skip_lazy=False):
path = self._schemes[scheme]
try:
dhcls = load_object(path)
if skip_lazy and getattr(dhcls, 'lazy', True):
return None
dh = create_instance(
objcls=dhcls,
settings=self._crawler.settings,
crawler=self._crawler,
)
except NotConfigured as ex:
self._notconfigured[scheme] = str(ex)
return None
except Exception as ex:
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
{"clspath": path, "scheme": scheme},
exc_info=True, extra={'crawler': self._crawler})
self._notconfigured[scheme] = str(ex)
return None
else:
self._handlers[scheme] = dh
return dh
def download_request(self, request, spider):
scheme = urlparse_cached(request).scheme
handler = self._get_handler(scheme)
if not handler:
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
return handler.download_request(request, spider)
@defer.inlineCallbacks
def _close(self, *_a, **_kw):
for dh in self._handlers.values():
if hasattr(dh, 'close'):
yield dh.close()
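
The handlers dict above is built from DOWNLOAD_HANDLERS merged with DOWNLOAD_HANDLERS_BASE (via settings.getwithbase); a sketch of how a project might disable or extend entries, where the custom class path is hypothetical.

# settings.py sketch
DOWNLOAD_HANDLERS = {
    'ftp': None,                                 # None values are dropped by without_none_values(),
                                                 # so the scheme ends up "not configured"
    's3': 'myproject.handlers.CustomS3Handler',  # hypothetical custom handler path
}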


@@ -0,0 +1,22 @@
from w3lib.url import parse_data_uri
from scrapy.http import TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
class DataURIDownloadHandler:
lazy = False
@defers
def download_request(self, request, spider):
uri = parse_data_uri(request.url)
respcls = responsetypes.from_mimetype(uri.media_type)
resp_kwargs = {}
if (issubclass(respcls, TextResponse)
and uri.media_type.split('/')[0] == 'text'):
charset = uri.media_type_parameters.get('charset')
resp_kwargs['encoding'] = charset
return respcls(url=request.url, body=uri.data, **resp_kwargs)
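
An illustrative data: request that this handler resolves without any network access; the URI is an arbitrary example.

import scrapy

request = scrapy.Request('data:text/plain;charset=UTF-8,Hello%20world')
# parse_data_uri() yields media_type 'text/plain' and charset 'UTF-8',
# so responsetypes should pick a TextResponse with encoding='UTF-8'.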


@@ -0,0 +1,16 @@
from w3lib.url import file_uri_to_path
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
class FileDownloadHandler:
lazy = False
@defers
def download_request(self, request, spider):
filepath = file_uri_to_path(request.url)
with open(filepath, 'rb') as fo:
body = fo.read()
respcls = responsetypes.from_args(filename=filepath, body=body)
return respcls(url=request.url, body=body)
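
A file:// request handled entirely from local disk by the class above; the path is a placeholder.

import scrapy

request = scrapy.Request('file:///tmp/example.html')
# file_uri_to_path() -> '/tmp/example.html'; the body is read from disk and
# responsetypes picks the Response subclass from the filename and body.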


@@ -0,0 +1,119 @@
"""
An asynchronous FTP file download handler for Scrapy that emulates an HTTP response.
FTP connection parameters are passed using the request meta field:
- ftp_user (required)
- ftp_password (required)
- ftp_passive (enabled by default) sets FTP passive mode
- ftp_local_filename
- If not given, the file data is returned in response.body, as in a normal Scrapy Response,
which implies that the entire file is kept in memory.
- If given, the file data is saved to a local file with the given name.
This helps avoid memory issues when downloading very big files. In addition, for
convenience the local file name is also returned in the response body.
The status of the built response is, by default:
- 200 in case of success
- 404 in case the specified file was not found on the server (FTP code 550)
otherwise the corresponding FTP exception is raised.
The mapping from FTP command return codes to HTTP response codes is defined in the
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
that is not explicitly present among the map keys. You may need to override this
mapping if you want a behaviour different from the default.
For a status 200 response, response.headers will contain two keys:
'Local Filename' - the local filename, if given
'Size' - the size of the downloaded data
"""
import re
from io import BytesIO
from urllib.parse import unquote
from twisted.internet.protocol import ClientCreator, Protocol
from twisted.protocols.ftp import CommandFailed, FTPClient
from scrapy.http import Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
class ReceivedDataProtocol(Protocol):
def __init__(self, filename=None):
self.__filename = filename
self.body = open(filename, "wb") if filename else BytesIO()
self.size = 0
def dataReceived(self, data):
self.body.write(data)
self.size += len(data)
@property
def filename(self):
return self.__filename
def close(self):
self.body.close() if self.filename else self.body.seek(0)
_CODE_RE = re.compile(r"\d+")
class FTPDownloadHandler:
lazy = False
CODE_MAPPING = {
"550": 404,
"default": 503,
}
def __init__(self, settings):
self.default_user = settings['FTP_USER']
self.default_password = settings['FTP_PASSWORD']
self.passive_mode = settings['FTP_PASSIVE_MODE']
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def download_request(self, request, spider):
from twisted.internet import reactor
parsed_url = urlparse_cached(request)
user = request.meta.get("ftp_user", self.default_user)
password = request.meta.get("ftp_password", self.default_password)
passive_mode = 1 if bool(request.meta.get("ftp_passive",
self.passive_mode)) else 0
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
def gotClient(self, client, request, filepath):
self.client = client
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
return client.retrieveFile(filepath, protocol).addCallbacks(
callback=self._build_response,
callbackArgs=(request, protocol),
errback=self._failed,
errbackArgs=(request,),
)
def _build_response(self, result, request, protocol):
self.result = result
respcls = responsetypes.from_args(url=request.url)
protocol.close()
body = protocol.filename or protocol.body.read()
headers = {"local filename": protocol.filename or '', "size": protocol.size}
return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
def _failed(self, result, request):
message = result.getErrorMessage()
if result.type == CommandFailed:
m = _CODE_RE.search(message)
if m:
ftpcode = m.group()
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
return Response(url=request.url, status=httpcode, body=to_bytes(message))
raise result.type(result.value)
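
Per the module docstring above, the FTP parameters travel in request.meta; an illustrative request in which host, credentials and paths are placeholders.

import scrapy

request = scrapy.Request(
    url='ftp://ftp.example.com/pub/file.txt',
    meta={
        'ftp_user': 'anonymous',                    # required
        'ftp_password': 'guest@example.com',        # required
        'ftp_passive': True,                        # default behaviour
        # 'ftp_local_filename': '/tmp/file.txt',    # stream to disk instead of response.body
    },
)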


@@ -0,0 +1,4 @@
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import (
HTTP11DownloadHandler as HTTPDownloadHandler,
)


@@ -0,0 +1,37 @@
"""Download handlers for http and https schemes
"""
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_unicode
class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
self._settings = settings
self._crawler = crawler
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings, crawler)
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
factory = self.HTTPClientFactory(request)
self._connect(factory)
return factory.deferred
def _connect(self, factory):
from twisted.internet import reactor
host, port = to_unicode(factory.host), factory.port
if factory.scheme == b'https':
client_context_factory = create_instance(
objcls=self.ClientContextFactory,
settings=self._settings,
crawler=self._crawler,
)
return reactor.connectSSL(host, port, factory, client_context_factory)
else:
return reactor.connectTCP(host, port, factory)


@@ -0,0 +1,568 @@
"""Download handlers for http and https schemes"""
import ipaddress
import logging
import re
import warnings
from contextlib import suppress
from io import BytesIO
from time import time
from urllib.parse import urldefrag
from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from zope.interface import implementer
from scrapy import signals
from scrapy.core.downloader.tls import openssl_methods
from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_bytes, to_unicode
logger = logging.getLogger(__name__)
class HTTP11DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
self._crawler = crawler
from twisted.internet import reactor
self._pool = HTTPConnectionPool(reactor, persistent=True)
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self._pool._factory.noisy = False
self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
# try method-aware context factory
try:
self._contextFactory = create_instance(
objcls=self._contextFactoryClass,
settings=settings,
crawler=crawler,
method=self._sslMethod,
)
except TypeError:
# use context factory defaults
self._contextFactory = create_instance(
objcls=self._contextFactoryClass,
settings=settings,
crawler=crawler,
)
msg = f"""
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
Please upgrade your context factory class to handle them or ignore them."""
warnings.warn(msg)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
self._disconnect_timeout = 1
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings, crawler)
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
agent = ScrapyAgent(
contextFactory=self._contextFactory,
pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss,
crawler=self._crawler,
)
return agent.download_request(request)
def close(self):
from twisted.internet import reactor
d = self._pool.closeCachedConnections()
# closeCachedConnections will hang on network or server issues, so
# we'll manually timeout the deferred.
#
# Twisted issue addressing this problem can be found here:
# https://twistedmatrix.com/trac/ticket/7738.
#
# closeCachedConnections doesn't handle external errbacks, so we'll
# issue a callback after `_disconnect_timeout` seconds.
delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])
def cancel_delayed_call(result):
if delayed_call.active():
delayed_call.cancel()
return result
d.addBoth(cancel_delayed_call)
return d
class TunnelError(Exception):
"""An HTTP CONNECT tunnel could not be established by the proxy."""
class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
"""An endpoint that tunnels through proxies to allow HTTPS downloads. To
accomplish that, this endpoint sends an HTTP CONNECT to the proxy.
The HTTP CONNECT is always sent when using this endpoint; this could be
improved, since the CONNECT is redundant if the connection associated
with this endpoint comes from the pool and a CONNECT has already been
issued for it.
"""
_responseMatcher = re.compile(br'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,32})')
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
self._tunnelReadyDeferred = defer.Deferred()
self._tunneledHost = host
self._tunneledPort = port
self._contextFactory = contextFactory
self._connectBuffer = bytearray()
def requestTunnel(self, protocol):
"""Asks the proxy to open a tunnel."""
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
protocol.transport.write(tunnelReq)
self._protocolDataReceived = protocol.dataReceived
protocol.dataReceived = self.processProxyResponse
self._protocol = protocol
return protocol
def processProxyResponse(self, rcvd_bytes):
"""Processes the response from the proxy. If the tunnel is successfully
created, notifies the client that we are ready to send requests. If not
raises a TunnelError.
"""
self._connectBuffer += rcvd_bytes
# make sure that enough (all) bytes are consumed
# and that we've got all HTTP headers (ending with a blank line)
# from the proxy so that we don't send those bytes to the TLS layer
#
# see https://github.com/scrapy/scrapy/issues/2491
if b'\r\n\r\n' not in self._connectBuffer:
return
self._protocol.dataReceived = self._protocolDataReceived
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
if respm and int(respm.group('status')) == 200:
# set proper Server Name Indication extension
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
self._tunnelReadyDeferred.callback(self._protocol)
else:
if respm:
extra = {'status': int(respm.group('status')),
'reason': respm.group('reason').strip()}
else:
extra = rcvd_bytes[:32]
self._tunnelReadyDeferred.errback(
TunnelError('Could not open CONNECT tunnel with proxy '
f'{self._host}:{self._port} [{extra!r}]')
)
def connectFailed(self, reason):
"""Propagates the errback to the appropriate deferred."""
self._tunnelReadyDeferred.errback(reason)
def connect(self, protocolFactory):
self._protocolFactory = protocolFactory
connectDeferred = super().connect(protocolFactory)
connectDeferred.addCallback(self.requestTunnel)
connectDeferred.addErrback(self.connectFailed)
return self._tunnelReadyDeferred
def tunnel_request_data(host, port, proxy_auth_header=None):
r"""
Return binary content of a CONNECT request.
>>> from scrapy.utils.python import to_unicode as s
>>> s(tunnel_request_data("example.com", 8080))
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
>>> s(tunnel_request_data("example.com", 8080, b"123"))
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
>>> s(tunnel_request_data(b"example.com", "8090"))
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
"""
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
tunnel_req += b'Host: ' + host_value + b'\r\n'
if proxy_auth_header:
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
tunnel_req += b'\r\n'
return tunnel_req
class TunnelingAgent(Agent):
"""An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
downloads. It may look strange that we have chosen to subclass Agent and not
ProxyAgent but consider that after the tunnel is opened the proxy is
transparent to the client; thus the agent should behave like there is no
proxy involved.
"""
def __init__(self, reactor, proxyConf, contextFactory=None,
connectTimeout=None, bindAddress=None, pool=None):
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf = proxyConf
self._contextFactory = contextFactory
def _getEndpoint(self, uri):
return TunnelingTCP4ClientEndpoint(
reactor=self._reactor,
host=uri.host,
port=uri.port,
proxyConf=self._proxyConf,
contextFactory=self._contextFactory,
timeout=self._endpointFactory._connectTimeout,
bindAddress=self._endpointFactory._bindAddress,
)
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
# proxy host and port are required for HTTP pool `key`
# otherwise, same remote host connection request could reuse
# a cached tunneled connection to a different proxy
key = key + self._proxyConf
return super()._requestWithEndpoint(
key=key,
endpoint=endpoint,
method=method,
parsedURI=parsedURI,
headers=headers,
bodyProducer=bodyProducer,
requestPath=requestPath,
)
class ScrapyProxyAgent(Agent):
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
super().__init__(
reactor=reactor,
connectTimeout=connectTimeout,
bindAddress=bindAddress,
pool=pool,
)
self._proxyURI = URI.fromBytes(proxyURI)
def request(self, method, uri, headers=None, bodyProducer=None):
"""
Issue a new request via the configured proxy.
"""
# Cache *all* connections under the same key, since we are only
# connecting to a single destination, the proxy:
return self._requestWithEndpoint(
key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
endpoint=self._getEndpoint(self._proxyURI),
method=method,
parsedURI=URI.fromBytes(uri),
headers=headers,
bodyProducer=bodyProducer,
requestPath=uri,
)
class ScrapyAgent:
_Agent = Agent
_ProxyAgent = ScrapyProxyAgent
_TunnelingAgent = TunnelingAgent
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None
self._crawler = crawler
def _get_agent(self, request, timeout):
from twisted.internet import reactor
bindaddress = request.meta.get('bindaddress') or self._bindAddress
proxy = request.meta.get('proxy')
if proxy:
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
scheme = _parse(request.url)[0]
proxyHost = to_unicode(proxyHost)
omitConnectTunnel = b'noconnect' in proxyParams
if omitConnectTunnel:
warnings.warn("Using HTTPS proxies in the noconnect mode is deprecated. "
"If you use Crawlera, it doesn't require this mode anymore, "
"so you should update scrapy-crawlera to 1.3.0+ "
"and remove '?noconnect' from the Crawlera URL.",
ScrapyDeprecationWarning)
if scheme == b'https' and not omitConnectTunnel:
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
proxyConf = (proxyHost, proxyPort, proxyAuth)
return self._TunnelingAgent(
reactor=reactor,
proxyConf=proxyConf,
contextFactory=self._contextFactory,
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
else:
return self._ProxyAgent(
reactor=reactor,
proxyURI=to_bytes(proxy, encoding='ascii'),
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
return self._Agent(
reactor=reactor,
contextFactory=self._contextFactory,
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
def download_request(self, request):
from twisted.internet import reactor
timeout = request.meta.get('download_timeout') or self._connectTimeout
agent = self._get_agent(request, timeout)
# request details
url = urldefrag(request.url)[0]
method = to_bytes(request.method)
headers = TxHeaders(request.headers)
if isinstance(agent, self._TunnelingAgent):
headers.removeHeader(b'Proxy-Authorization')
if request.body:
bodyproducer = _RequestBodyProducer(request.body)
else:
bodyproducer = None
start_time = time()
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
# set download latency
d.addCallback(self._cb_latency, request, start_time)
# response body is ready to be consumed
d.addCallback(self._cb_bodyready, request)
d.addCallback(self._cb_bodydone, request, url)
# check download timeout
self._timeout_cl = reactor.callLater(timeout, d.cancel)
d.addBoth(self._cb_timeout, request, url, timeout)
return d
def _cb_timeout(self, result, request, url, timeout):
if self._timeout_cl.active():
self._timeout_cl.cancel()
return result
# needed for HTTPS requests, otherwise _ResponseReader doesn't
# receive connectionLost()
if self._txresponse:
self._txresponse._transport.stopProducing()
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
def _cb_latency(self, result, request, start_time):
request.meta['download_latency'] = time() - start_time
return result
def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body
if txresponse.length == 0:
return {
"txresponse": txresponse,
"body": b"",
"flags": None,
"certificate": None,
"ip_address": None,
}
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
if maxsize and expected_size > maxsize:
warning_msg = ("Cancelling download of %(url)s: expected response "
"size (%(size)s) larger than download max size (%(maxsize)s).")
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
logger.warning(warning_msg, warning_args)
txresponse._transport._producer.loseConnection()
raise defer.CancelledError(warning_msg % warning_args)
if warnsize and expected_size > warnsize:
logger.warning("Expected response size (%(size)s) larger than "
"download warn size (%(warnsize)s) in request %(request)s.",
{'size': expected_size, 'warnsize': warnsize, 'request': request})
def _cancel(_):
# Abort connection immediately.
txresponse._transport._producer.abortConnection()
d = defer.Deferred(_cancel)
txresponse.deliverBody(
_ResponseReader(
finished=d,
txresponse=txresponse,
request=request,
maxsize=maxsize,
warnsize=warnsize,
fail_on_dataloss=fail_on_dataloss,
crawler=self._crawler,
)
)
# save response for timeouts
self._txresponse = txresponse
return d
def _cb_bodydone(self, result, request, url):
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
response = respcls(
url=url,
status=int(result["txresponse"].code),
headers=headers,
body=result["body"],
flags=result["flags"],
certificate=result["certificate"],
ip_address=result["ip_address"],
)
if result.get("failure"):
result["failure"].value.response = response
return result["failure"]
return response
@implementer(IBodyProducer)
class _RequestBodyProducer:
def __init__(self, body):
self.body = body
self.length = len(body)
def startProducing(self, consumer):
consumer.write(self.body)
return defer.succeed(None)
def pauseProducing(self):
pass
def stopProducing(self):
pass
class _ResponseReader(protocol.Protocol):
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
self._finished = finished
self._txresponse = txresponse
self._request = request
self._bodybuf = BytesIO()
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._fail_on_dataloss_warned = False
self._reached_warnsize = False
self._bytes_received = 0
self._certificate = None
self._ip_address = None
self._crawler = crawler
def _finish_response(self, flags=None, failure=None):
self._finished.callback({
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
})
def connectionMade(self):
if self._certificate is None:
with suppress(AttributeError):
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
if self._ip_address is None:
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
def dataReceived(self, bodyBytes):
# This may be called several times after cancel was called with buffered data.
if self._finished.called:
return
self._bodybuf.write(bodyBytes)
self._bytes_received += len(bodyBytes)
bytes_received_result = self._crawler.signals.send_catch_log(
signal=signals.bytes_received,
data=bodyBytes,
request=self._request,
spider=self._crawler.spider,
)
for handler, result in bytes_received_result:
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__})
self.transport._producer.loseConnection()
failure = result if result.value.fail else None
self._finish_response(flags=["download_stopped"], failure=failure)
return  # stop processing once the download has been stopped
if self._maxsize and self._bytes_received > self._maxsize:
logger.warning("Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s) in request %(request)s.",
{'bytes': self._bytes_received,
'maxsize': self._maxsize,
'request': self._request})
# Clear buffer earlier to avoid keeping data in memory for a long time.
self._bodybuf.truncate(0)
self._finished.cancel()
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
self._reached_warnsize = True
logger.warning("Received more bytes than download "
"warn size (%(warnsize)s) in request %(request)s.",
{'warnsize': self._warnsize,
'request': self._request})
def connectionLost(self, reason):
if self._finished.called:
return
if reason.check(ResponseDone):
self._finish_response()
return
if reason.check(PotentialDataLoss):
self._finish_response(flags=["partial"])
return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
self._finish_response(flags=["dataloss"])
return
elif not self._fail_on_dataloss_warned:
logger.warning("Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode())
self._fail_on_dataloss_warned = True
self._finished.errback(reason)
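
ScrapyAgent._get_agent() above picks the agent from request.meta['proxy'] and the URL scheme; a sketch in which the proxy address and credentials are placeholders.

import base64
import scrapy

proxy_auth = b'Basic ' + base64.b64encode(b'user:pass')    # placeholder credentials
request = scrapy.Request(
    url='https://example.com/page',                         # https + proxy -> TunnelingAgent (CONNECT)
    meta={'proxy': 'http://proxy.example.com:8080'},
    headers={'Proxy-Authorization': proxy_auth},            # used for the CONNECT request,
)                                                           # then removed from the tunneled request
# An http:// URL with the same meta would go through ScrapyProxyAgent instead.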


@@ -0,0 +1,82 @@
from urllib.parse import unquote
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.exceptions import NotConfigured
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import create_instance
class S3DownloadHandler:
def __init__(self, settings, *,
crawler=None,
aws_access_key_id=None, aws_secret_access_key=None,
httpdownloadhandler=HTTPDownloadHandler, **kw):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
if not aws_access_key_id:
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
if not aws_secret_access_key:
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
# If no credentials could be found anywhere,
# consider this an anonymous connection request by default,
# unless 'anon' was set explicitly (True/False).
anon = kw.get('anon')
if anon is None and not aws_access_key_id and not aws_secret_access_key:
kw['anon'] = True
self.anon = kw.get('anon')
self._signer = None
import botocore.auth
import botocore.credentials
kw.pop('anon', None)
if kw:
raise TypeError(f'Unexpected keyword arguments: {kw}')
if not self.anon:
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
self._signer = SignerCls(botocore.credentials.Credentials(
aws_access_key_id, aws_secret_access_key))
_http_handler = create_instance(
objcls=httpdownloadhandler,
settings=settings,
crawler=crawler,
)
self._download_http = _http_handler.download_request
@classmethod
def from_crawler(cls, crawler, **kwargs):
return cls(crawler.settings, crawler=crawler, **kwargs)
def download_request(self, request, spider):
p = urlparse_cached(request)
scheme = 'https' if request.meta.get('is_secure') else 'http'
bucket = p.hostname
path = p.path + '?' + p.query if p.query else p.path
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
if self.anon:
request = request.replace(url=url)
elif self._signer is not None:
import botocore.awsrequest
awsrequest = botocore.awsrequest.AWSRequest(
method=request.method,
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
headers=request.headers.to_unicode_dict(),
data=request.body)
self._signer.add_auth(awsrequest)
request = request.replace(
url=url, headers=awsrequest.headers.items())
else:
signed_headers = self.conn.make_request(
method=request.method,
bucket=bucket,
key=unquote(p.path),
query_args=unquote(p.query),
headers=request.headers,
data=request.body,
)
request = request.replace(url=url, headers=signed_headers)
return self._download_http(request, spider)
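
The S3 handler reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the settings, or falls back to an anonymous request when both are empty; a sketch with placeholder values.

# settings.py sketch (placeholder credentials)
AWS_ACCESS_KEY_ID = 'EXAMPLEKEYID'
AWS_SECRET_ACCESS_KEY = 'EXAMPLESECRET'

import scrapy

# The handler rewrites s3:// URLs to http(s)://<bucket>.s3.amazonaws.com/<path>
# and signs the request with botocore before handing it to the HTTP handler.
request = scrapy.Request('s3://my-bucket/some/key.json', meta={'is_secure': True})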


@@ -0,0 +1,84 @@
"""
Downloader Middleware manager
See documentation in docs/topics/downloader-middleware.rst
"""
from twisted.internet import defer
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro
from scrapy.utils.conf import build_component_list
class DownloaderMiddlewareManager(MiddlewareManager):
component_name = 'downloader middleware'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
def _add_middleware(self, mw):
if hasattr(mw, 'process_request'):
self.methods['process_request'].append(mw.process_request)
if hasattr(mw, 'process_response'):
self.methods['process_response'].appendleft(mw.process_response)
if hasattr(mw, 'process_exception'):
self.methods['process_exception'].appendleft(mw.process_exception)
def download(self, download_func, request, spider):
@defer.inlineCallbacks
def process_request(request):
for method in self.methods['process_request']:
response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_request must return None, Response or "
f"Request, got {response.__class__.__name__}"
)
if response:
return response
return (yield download_func(request=request, spider=spider))
@defer.inlineCallbacks
def process_response(response):
if response is None:
raise TypeError("Received None in process_response")
elif isinstance(response, Request):
return response
for method in self.methods['process_response']:
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
if not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_response must return Response or Request, "
f"got {type(response)}"
)
if isinstance(response, Request):
return response
return response
@defer.inlineCallbacks
def process_exception(failure):
exception = failure.value
for method in self.methods['process_exception']:
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_exception must return None, Response or "
f"Request, got {type(response)}"
)
if response:
return response
return failure
deferred = mustbe_deferred(process_request, request)
deferred.addErrback(process_exception)
deferred.addCallback(process_response)
return deferred
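
The manager above calls process_request, process_response and process_exception in order and enforces their return types via _InvalidOutput; a minimal hypothetical middleware that satisfies that contract.

class ExampleDownloaderMiddleware:
    """Hypothetical middleware illustrating the contract checked above."""

    def process_request(self, request, spider):
        # None continues down the chain; a Response short-circuits the
        # download; a Request is returned to the engine for rescheduling.
        return None

    def process_response(self, request, response, spider):
        # Must return a Response or a Request, never None.
        return response

    def process_exception(self, request, exception, spider):
        # None lets later middlewares (and the errback) handle the failure.
        return None

# settings.py sketch:
# DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.ExampleDownloaderMiddleware': 543}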


@@ -0,0 +1,78 @@
import logging
from OpenSSL import SSL
from service_identity.exceptions import CertificateError
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
from twisted.internet.ssl import AcceptableCiphers
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
logger = logging.getLogger(__name__)
METHOD_SSLv3 = 'SSLv3'
METHOD_TLS = 'TLS'
METHOD_TLSv10 = 'TLSv1.0'
METHOD_TLSv11 = 'TLSv1.1'
METHOD_TLSv12 = 'TLSv1.2'
openssl_methods = {
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
}
class ScrapyClientTLSOptions(ClientTLSOptions):
"""
SSL Client connection creator ignoring certificate verification errors
(for genuinely invalid certificates or bugs in verification code).
Same as Twisted's private _sslverify.ClientTLSOptions,
except that VerificationError, CertificateError and ValueError
exceptions are caught, so the connection is not closed and only a
warning is logged. Logging of HTTPS connection parameters is also added.
"""
def __init__(self, hostname, ctx, verbose_logging=False):
super().__init__(hostname, ctx)
self.verbose_logging = verbose_logging
def _identityVerifyingInfoCallback(self, connection, where, ret):
if where & SSL.SSL_CB_HANDSHAKE_START:
connection.set_tlsext_host_name(self._hostnameBytes)
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
if self.verbose_logging:
logger.debug('SSL connection to %s using protocol %s, cipher %s',
self._hostnameASCII,
connection.get_protocol_version_name(),
connection.get_cipher_name(),
)
server_cert = connection.get_peer_certificate()
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
x509name_to_string(server_cert.get_issuer()),
x509name_to_string(server_cert.get_subject()),
)
key_info = get_temp_key_info(connection._ssl)
if key_info:
logger.debug('SSL temp key: %s', key_info)
try:
verifyHostname(connection, self._hostnameASCII)
except (CertificateError, VerificationError) as e:
logger.warning(
'Remote certificate is not valid for hostname "{}"; {}'.format(
self._hostnameASCII, e))
except ValueError as e:
logger.warning(
'Ignoring error while verifying certificate '
'from host "{}" (exception: {})'.format(
self._hostnameASCII, repr(e)))
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')


@@ -0,0 +1,212 @@
from time import time
from urllib.parse import urlparse, urlunparse, urldefrag
from twisted.web.http import HTTPClient
from twisted.internet import defer, reactor
from twisted.internet.protocol import ClientFactory
from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
from scrapy.responsetypes import responsetypes
def _parsed_url_args(parsed):
# Assume parsed is urlparse-d from Request.url,
# which was passed via safe_url_string and is ascii-only.
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
path = to_bytes(path, encoding="ascii")
host = to_bytes(parsed.hostname, encoding="ascii")
port = parsed.port
scheme = to_bytes(parsed.scheme, encoding="ascii")
netloc = to_bytes(parsed.netloc, encoding="ascii")
if port is None:
port = 443 if scheme == b'https' else 80
return scheme, netloc, host, port, path
def _parse(url):
""" Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string
and is ascii-only.
"""
url = url.strip()
parsed = urlparse(url)
return _parsed_url_args(parsed)
class ScrapyHTTPPageGetter(HTTPClient):
delimiter = b'\n'
def connectionMade(self):
self.headers = Headers() # bucket for response headers
# Method command
self.sendCommand(self.factory.method, self.factory.path)
# Headers
for key, values in self.factory.headers.items():
for value in values:
self.sendHeader(key, value)
self.endHeaders()
# Body
if self.factory.body is not None:
self.transport.write(self.factory.body)
def lineReceived(self, line):
return HTTPClient.lineReceived(self, line.rstrip())
def handleHeader(self, key, value):
self.headers.appendlist(key, value)
def handleStatus(self, version, status, message):
self.factory.gotStatus(version, status, message)
def handleEndHeaders(self):
self.factory.gotHeaders(self.headers)
def connectionLost(self, reason):
self._connection_lost_reason = reason
HTTPClient.connectionLost(self, reason)
self.factory.noPage(reason)
def handleResponse(self, response):
if self.factory.method.upper() == b'HEAD':
self.factory.page(b'')
elif self.length is not None and self.length > 0:
self.factory.noPage(self._connection_lost_reason)
else:
self.factory.page(response)
self.transport.loseConnection()
def timeout(self):
self.transport.loseConnection()
# transport cleanup needed for HTTPS connections
if self.factory.url.startswith(b'https'):
self.transport.stopProducing()
self.factory.noPage(
defer.TimeoutError(f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."))
# This class used to inherit from Twisted's
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
protocol = ScrapyHTTPPageGetter
waiting = 1
noisy = False
followRedirect = False
afterFoundGet = False
def _build_response(self, body, request):
request.meta['download_latency'] = self.headers_time - self.start_time
status = int(self.status)
headers = Headers(self.response_headers)
respcls = responsetypes.from_args(headers=headers, url=self._url)
return respcls(url=self._url, status=status, headers=headers, body=body)
def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
proxy = request.meta.get('proxy')
if proxy:
self.scheme, _, self.host, self.port, _ = _parse(proxy)
self.path = self.url
def __init__(self, request, timeout=180):
self._url = urldefrag(request.url)[0]
# converting to bytes to comply with the Twisted interface
self.url = to_bytes(self._url, encoding='ascii')
self.method = to_bytes(request.method, encoding='ascii')
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = request.meta.get('download_timeout') or timeout
self.start_time = time()
self.deferred = defer.Deferred().addCallback(self._build_response, request)
# Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
# to have _disconnectedDeferred. See Twisted r32329.
# As Scrapy implements its own logic to handle redirects, there is
# no need to add the _waitForDisconnect callback.
# Specifically this avoids the AttributeError exception when
# clientConnectionFailed method is called.
self._disconnectedDeferred = defer.Deferred()
self._set_connection_attributes(request)
# set Host header based on url
self.headers.setdefault('Host', self.netloc)
# set Content-Length based on the length of the body
if self.body is not None:
self.headers['Content-Length'] = len(self.body)
# just in case a broken http/1.1 decides to keep connection alive
self.headers.setdefault("Connection", "close")
# Content-Length must be specified in POST method even with no body
elif self.method == b'POST':
self.headers['Content-Length'] = 0
def __repr__(self):
return f"<{self.__class__.__name__}: {self.url}>"
def _cancelTimeout(self, result, timeoutCall):
if timeoutCall.active():
timeoutCall.cancel()
return result
def buildProtocol(self, addr):
p = ClientFactory.buildProtocol(self, addr)
p.followRedirect = self.followRedirect
p.afterFoundGet = self.afterFoundGet
if self.timeout:
timeoutCall = reactor.callLater(self.timeout, p.timeout)
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
return p
def gotHeaders(self, headers):
self.headers_time = time()
self.response_headers = headers
def gotStatus(self, version, status, message):
"""
Set the status of the request on us.
@param version: The HTTP version.
@type version: L{bytes}
@param status: The HTTP status code, an integer represented as a
bytestring.
@type status: L{bytes}
@param message: The HTTP status message.
@type message: L{bytes}
"""
self.version, self.status, self.message = version, status, message
def page(self, page):
if self.waiting:
self.waiting = 0
self.deferred.callback(page)
def noPage(self, reason):
if self.waiting:
self.waiting = 0
self.deferred.errback(reason)
def clientConnectionFailed(self, _, reason):
"""
When a connection attempt fails, the request cannot be issued. If no
result has yet been provided to the result Deferred, provide the
connection failure reason as an error result.
"""
if self.waiting:
self.waiting = 0
# If the connection attempt failed, there is nothing more to
# disconnect, so just fire that Deferred now.
self._disconnectedDeferred.callback(None)
self.deferred.errback(reason)
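
An illustrative evaluation of _parse(), following directly from _parsed_url_args() above; the import assumes the usual scrapy.core.downloader.webclient module path.

from scrapy.core.downloader.webclient import _parse

scheme, netloc, host, port, path = _parse('http://example.com:8080/a?b=1')
# scheme == b'http'
# netloc == b'example.com:8080'
# host   == b'example.com'
# port   == 8080
# path   == b'/a?b=1'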


@@ -0,0 +1,360 @@
"""
This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
For more information see docs/topics/architecture.rst
"""
import logging
from time import time
from twisted.internet import defer, task
from twisted.python.failure import Failure
from scrapy import signals
from scrapy.core.scraper import Scraper
from scrapy.exceptions import DontCloseSpider
from scrapy.http import Response, Request
from scrapy.utils.misc import load_object
from scrapy.utils.reactor import CallLaterOnce
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
logger = logging.getLogger(__name__)
class Slot:
def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
self.closing = False
self.inprogress = set() # requests in progress
self.start_requests = iter(start_requests)
self.close_if_idle = close_if_idle
self.nextcall = nextcall
self.scheduler = scheduler
self.heartbeat = task.LoopingCall(nextcall.schedule)
def add_request(self, request):
self.inprogress.add(request)
def remove_request(self, request):
self.inprogress.remove(request)
self._maybe_fire_closing()
def close(self):
self.closing = defer.Deferred()
self._maybe_fire_closing()
return self.closing
def _maybe_fire_closing(self):
if self.closing and not self.inprogress:
if self.nextcall:
self.nextcall.cancel()
if self.heartbeat.running:
self.heartbeat.stop()
self.closing.callback(None)
class ExecutionEngine:
def __init__(self, crawler, spider_closed_callback):
self.crawler = crawler
self.settings = crawler.settings
self.signals = crawler.signals
self.logformatter = crawler.logformatter
self.slot = None
self.spider = None
self.running = False
self.paused = False
self.scheduler_cls = load_object(self.settings['SCHEDULER'])
downloader_cls = load_object(self.settings['DOWNLOADER'])
self.downloader = downloader_cls(crawler)
self.scraper = Scraper(crawler)
self._spider_closed_callback = spider_closed_callback
@defer.inlineCallbacks
def start(self):
"""Start the execution engine"""
if self.running:
raise RuntimeError("Engine already running")
self.start_time = time()
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
self.running = True
self._closewait = defer.Deferred()
yield self._closewait
def stop(self):
"""Stop the execution engine gracefully"""
if not self.running:
raise RuntimeError("Engine not running")
self.running = False
dfd = self._close_all_spiders()
return dfd.addBoth(lambda _: self._finish_stopping_engine())
def close(self):
"""Close the execution engine gracefully.
If it has already been started, stop it. In all cases, close all spiders
and the downloader.
"""
if self.running:
# Will also close spiders and downloader
return self.stop()
elif self.open_spiders:
# Will also close downloader
return self._close_all_spiders()
else:
return defer.succeed(self.downloader.close())
def pause(self):
"""Pause the execution engine"""
self.paused = True
def unpause(self):
"""Resume the execution engine"""
self.paused = False
def _next_request(self, spider):
slot = self.slot
if not slot:
return
if self.paused:
return
while not self._needs_backout(spider):
if not self._next_request_from_scheduler(spider):
break
if slot.start_requests and not self._needs_backout(spider):
try:
request = next(slot.start_requests)
except StopIteration:
slot.start_requests = None
except Exception:
slot.start_requests = None
logger.error('Error while obtaining start requests',
exc_info=True, extra={'spider': spider})
else:
self.crawl(request, spider)
if self.spider_is_idle(spider) and slot.close_if_idle:
self._spider_idle(spider)
def _needs_backout(self, spider):
slot = self.slot
return (
not self.running
or slot.closing
or self.downloader.needs_backout()
or self.scraper.slot.needs_backout()
)
def _next_request_from_scheduler(self, spider):
slot = self.slot
request = slot.scheduler.next_request()
if not request:
return
d = self._download(request, spider)
d.addBoth(self._handle_downloader_output, request, spider)
d.addErrback(lambda f: logger.info('Error while handling downloader output',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
d.addBoth(lambda _: slot.remove_request(request))
d.addErrback(lambda f: logger.info('Error while removing request from slot',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
d.addBoth(lambda _: slot.nextcall.schedule())
d.addErrback(lambda f: logger.info('Error while scheduling new request',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
return d
def _handle_downloader_output(self, response, request, spider):
if not isinstance(response, (Request, Response, Failure)):
raise TypeError(
"Incorrect type: expected Request, Response or Failure, got "
f"{type(response)}: {response!r}"
)
# downloader middleware can return requests (for example, redirects)
if isinstance(response, Request):
self.crawl(response, spider)
return
# response is a Response or Failure
d = self.scraper.enqueue_scrape(response, request, spider)
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
return d
def spider_is_idle(self, spider):
if not self.scraper.slot.is_idle():
# scraper is not idle
return False
if self.downloader.active:
# downloader has pending requests
return False
if self.slot.start_requests is not None:
# not all start requests are handled
return False
if self.slot.scheduler.has_pending_requests():
# scheduler has pending requests
return False
return True
@property
def open_spiders(self):
return [self.spider] if self.spider else []
def has_capacity(self):
"""Does the engine have capacity to handle more spiders"""
return not bool(self.slot)
def crawl(self, request, spider):
if spider not in self.open_spiders:
raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
self.schedule(request, spider)
self.slot.nextcall.schedule()
def schedule(self, request, spider):
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
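        # enqueue_request() returns False when the scheduler (dupefilter)
        # rejects the request; signal request_dropped in that case.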
if not self.slot.scheduler.enqueue_request(request):
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
def download(self, request, spider):
d = self._download(request, spider)
d.addBoth(self._downloaded, self.slot, request, spider)
return d
def _downloaded(self, response, slot, request, spider):
slot.remove_request(request)
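        # Downloader middleware may return a new Request (e.g. a redirect);
        # download it as well instead of returning it as the result.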
return self.download(response, spider) if isinstance(response, Request) else response
def _download(self, request, spider):
slot = self.slot
slot.add_request(request)
def _on_success(response):
if not isinstance(response, (Response, Request)):
raise TypeError(
"Incorrect type: expected Response or Request, got "
f"{type(response)}: {response!r}"
)
if isinstance(response, Response):
if response.request is None:
response.request = request
logkws = self.logformatter.crawled(response.request, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
self.signals.send_catch_log(
signal=signals.response_received,
response=response,
request=response.request,
spider=spider,
)
return response
def _on_complete(_):
slot.nextcall.schedule()
return _
dwld = self.downloader.fetch(request, spider)
dwld.addCallbacks(_on_success)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True):
if not self.has_capacity():
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
self.slot = slot
self.spider = spider
yield scheduler.open(spider)
yield self.scraper.open_spider(spider)
self.crawler.stats.open_spider(spider)
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
slot.nextcall.schedule()
slot.heartbeat.start(5)
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slot
if slot.closing:
return slot.closing
logger.info("Closing spider (%(reason)s)",
{'reason': reason},
extra={'spider': spider})
dfd = slot.close()
def log_failure(msg):
def errback(failure):
logger.error(
msg,
exc_info=failure_to_exc_info(failure),
extra={'spider': spider}
)
return errback
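        # Every addBoth below runs regardless of earlier failures, so all
        # resources get released; the paired errbacks only log the failure.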
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log_failure('Scraper close failure'))
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log_failure('Scheduler close failure'))
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed, spider=spider, reason=reason))
dfd.addErrback(log_failure('Error while sending spider_close signal'))
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log_failure('Stats close failure'))
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
{'reason': reason},
extra={'spider': spider}))
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log_failure('Error while unassigning slot'))
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log_failure('Error while unassigning spider'))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self._closewait.callback(None)

View file

@@ -0,0 +1,182 @@
import os
import json
import logging
import warnings
from os.path import join, exists
from queuelib import PriorityQueue
from scrapy.utils.misc import load_object, create_instance
from scrapy.utils.job import job_dir
from scrapy.utils.deprecate import ScrapyDeprecationWarning
logger = logging.getLogger(__name__)
class Scheduler:
"""
    Scrapy Scheduler. It enqueues requests and hands out the next request
    to download. The Scheduler also handles duplicate filtering via the
    dupefilter.
    Prioritization and queueing are not performed by the Scheduler itself.
    The user sets a ``priority`` field on each Request, and a PriorityQueue
    (defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
    to dequeue requests in the desired order.
    The Scheduler uses two PriorityQueue instances, one configured to work
    in memory and one on disk (optional). When the on-disk queue is present
    it is used by default, and the in-memory queue serves as a fallback for
    requests the disk queue cannot handle (i.e. cannot serialize).
    :setting:`SCHEDULER_MEMORY_QUEUE` and :setting:`SCHEDULER_DISK_QUEUE`
    specify the lower-level queue classes that the PriorityQueue instances
    are instantiated with, to keep requests in memory and on disk
    respectively.
    Overall, the Scheduler is an object that holds several PriorityQueue
    instances (in-memory and on-disk), implements the fallback logic
    between them, and drives the dupefilter.
"""
def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
logunser=False, stats=None, pqclass=None, crawler=None):
self.df = dupefilter
self.dqdir = self._dqdir(jobdir)
self.pqclass = pqclass
self.dqclass = dqclass
self.mqclass = mqclass
self.logunser = logunser
self.stats = stats
self.crawler = crawler
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
dupefilter = create_instance(dupefilter_cls, settings, crawler)
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
if pqclass is PriorityQueue:
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
" is no longer supported because of API changes; "
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
ScrapyDeprecationWarning)
from scrapy.pqueues import ScrapyPriorityQueue
pqclass = ScrapyPriorityQueue
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
logunser = settings.getbool('SCHEDULER_DEBUG')
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
mqclass=mqclass, crawler=crawler)
def has_pending_requests(self):
return len(self) > 0
def open(self, spider):
self.spider = spider
self.mqs = self._mq()
self.dqs = self._dq() if self.dqdir else None
return self.df.open()
def close(self, reason):
if self.dqs:
state = self.dqs.close()
self._write_dqs_state(self.dqdir, state)
return self.df.close(reason)
def enqueue_request(self, request):
if not request.dont_filter and self.df.request_seen(request):
self.df.log(request, self.spider)
return False
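        # Try the disk queue first and fall back to the memory queue when the
        # request cannot be serialized to disk.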
dqok = self._dqpush(request)
if dqok:
self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
else:
self._mqpush(request)
self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
self.stats.inc_value('scheduler/enqueued', spider=self.spider)
return True
def next_request(self):
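        # The memory queue has priority on dequeue; the disk queue is only
        # consulted when the memory queue is empty.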
request = self.mqs.pop()
if request:
self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
else:
request = self._dqpop()
if request:
self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
if request:
self.stats.inc_value('scheduler/dequeued', spider=self.spider)
return request
def __len__(self):
return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)
def _dqpush(self, request):
if self.dqs is None:
return
try:
self.dqs.push(request)
except ValueError as e: # non serializable request
if self.logunser:
msg = ("Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)")
logger.warning(msg, {'request': request, 'reason': e},
exc_info=True, extra={'spider': self.spider})
self.logunser = False
self.stats.inc_value('scheduler/unserializable',
spider=self.spider)
return
else:
return True
def _mqpush(self, request):
self.mqs.push(request)
def _dqpop(self):
if self.dqs:
return self.dqs.pop()
def _mq(self):
""" Create a new priority queue instance, with in-memory storage """
return create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key='')
def _dq(self):
""" Create a new priority queue instance, with disk storage """
state = self._read_dqs_state(self.dqdir)
q = create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state)
if q:
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
{'queuesize': len(q)}, extra={'spider': self.spider})
return q
def _dqdir(self, jobdir):
""" Return a folder name to keep disk queue state at """
if jobdir:
dqdir = join(jobdir, 'requests.queue')
if not exists(dqdir):
os.makedirs(dqdir)
return dqdir
def _read_dqs_state(self, dqdir):
path = join(dqdir, 'active.json')
if not exists(path):
return ()
with open(path) as f:
return json.load(f)
def _write_dqs_state(self, dqdir, state):
with open(join(dqdir, 'active.json'), 'w') as f:
json.dump(state, f)

View file

@@ -0,0 +1,260 @@
"""This module implements the Scraper component which parses responses and
extracts information from them"""
import logging
from collections import deque
from itemadapter import is_item
from twisted.internet import defer
from twisted.python.failure import Failure
from scrapy import signals
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.http import Request, Response
from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output
logger = logging.getLogger(__name__)
class Slot:
"""Scraper slot (one per running spider)"""
MIN_RESPONSE_SIZE = 1024
def __init__(self, max_active_size=5000000):
self.max_active_size = max_active_size
self.queue = deque()
self.active = set()
self.active_size = 0
self.itemproc_size = 0
self.closing = None
def add_response_request(self, response, request):
deferred = defer.Deferred()
self.queue.append((response, request, deferred))
if isinstance(response, Response):
self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
else:
self.active_size += self.MIN_RESPONSE_SIZE
return deferred
def next_response_request_deferred(self):
response, request, deferred = self.queue.popleft()
self.active.add(request)
return response, request, deferred
def finish_response(self, response, request):
self.active.remove(request)
if isinstance(response, Response):
self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)
else:
self.active_size -= self.MIN_RESPONSE_SIZE
def is_idle(self):
return not (self.queue or self.active)
def needs_backout(self):
return self.active_size > self.max_active_size
class Scraper:
def __init__(self, crawler):
self.slot = None
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
self.itemproc = itemproc_cls.from_crawler(crawler)
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
self.crawler = crawler
self.signals = crawler.signals
self.logformatter = crawler.logformatter
@defer.inlineCallbacks
def open_spider(self, spider):
"""Open the given spider for scraping and allocate resources for it"""
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
yield self.itemproc.open_spider(spider)
def close_spider(self, spider):
"""Close a spider being scraped and release its resources"""
slot = self.slot
slot.closing = defer.Deferred()
slot.closing.addCallback(self.itemproc.close_spider)
self._check_if_closing(spider, slot)
return slot.closing
def is_idle(self):
"""Return True if there isn't any more spiders to process"""
return not self.slot
def _check_if_closing(self, spider, slot):
if slot.closing and slot.is_idle():
slot.closing.callback(spider)
def enqueue_scrape(self, response, request, spider):
slot = self.slot
dfd = slot.add_response_request(response, request)
def finish_scraping(_):
slot.finish_response(response, request)
self._check_if_closing(spider, slot)
self._scrape_next(spider, slot)
return _
dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
self._scrape_next(spider, slot)
return dfd
def _scrape_next(self, spider, slot):
while slot.queue:
response, request, deferred = slot.next_response_request_deferred()
self._scrape(response, request, spider).chainDeferred(deferred)
def _scrape(self, result, request, spider):
"""
Handle the downloaded response or failure through the spider callback/errback
"""
if not isinstance(result, (Response, Failure)):
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
dfd = self._scrape2(result, request, spider) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
dfd.addCallback(self.handle_spider_output, request, result, spider)
return dfd
def _scrape2(self, result, request, spider):
"""
        Handle the different cases of a request's result being a Response or a Failure
"""
if isinstance(result, Response):
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
else: # result is a Failure
dfd = self.call_spider(result, request, spider)
return dfd.addErrback(self._log_download_errors, result, request, spider)
def call_spider(self, result, request, spider):
if isinstance(result, Response):
if getattr(result, "request", None) is None:
result.request = request
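            # Fall back to the spider's default callback (_parse) when the
            # request does not define an explicit callback.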
callback = result.request.callback or spider._parse
warn_on_generator_with_return_value(spider, callback)
dfd = defer_succeed(result)
dfd.addCallback(callback, **result.request.cb_kwargs)
else: # result is a Failure
result.request = request
warn_on_generator_with_return_value(spider, request.errback)
dfd = defer_fail(result)
dfd.addErrback(request.errback)
return dfd.addCallback(iterate_spider_output)
def handle_spider_error(self, _failure, request, response, spider):
exc = _failure.value
if isinstance(exc, CloseSpider):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
return
logkws = self.logformatter.spider_error(_failure, request, response, spider)
logger.log(
*logformatter_adapter(logkws),
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
)
self.signals.send_catch_log(
signal=signals.spider_error,
failure=_failure, response=response,
spider=spider
)
self.crawler.stats.inc_value(
f"spider_exceptions/{_failure.value.__class__.__name__}",
spider=spider
)
def handle_spider_output(self, result, request, response, spider):
if not result:
return defer_succeed(None)
it = iter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
return dfd
def _process_spidermw_output(self, output, request, response, spider):
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""
if isinstance(output, Request):
self.crawler.engine.crawl(request=output, spider=spider)
elif is_item(output):
self.slot.itemproc_size += 1
dfd = self.itemproc.process_item(output, spider)
dfd.addBoth(self._itemproc_finished, output, response, spider)
return dfd
elif output is None:
pass
else:
typename = type(output).__name__
logger.error(
'Spider must return request, item, or None, got %(typename)r in %(request)s',
{'request': request, 'typename': typename},
extra={'spider': spider},
)
def _log_download_errors(self, spider_failure, download_failure, request, spider):
"""Log and silence errors that come from the engine (typically download
        errors that were propagated through here)
"""
if isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest):
if download_failure.frames:
logkws = self.logformatter.download_error(download_failure, request, spider)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
exc_info=failure_to_exc_info(download_failure),
)
else:
errmsg = download_failure.getErrorMessage()
if errmsg:
logkws = self.logformatter.download_error(
download_failure, request, spider, errmsg)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
)
if spider_failure is not download_failure:
return spider_failure
def _itemproc_finished(self, output, item, response, spider):
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
self.slot.itemproc_size -= 1
if isinstance(output, Failure):
ex = output.value
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
else:
logkws = self.logformatter.item_error(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
exc_info=failure_to_exc_info(output))
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
else:
logkws = self.logformatter.scraped(output, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)

View file

@@ -0,0 +1,128 @@
"""
Spider Middleware manager
See documentation in docs/topics/spider-middleware.rst
"""
from itertools import islice
from twisted.python.failure import Failure
from scrapy.exceptions import _InvalidOutput
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.python import MutableChain
def _isiterable(possible_iterator):
return hasattr(possible_iterator, '__iter__')
def _fname(f):
return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"
class SpiderMiddlewareManager(MiddlewareManager):
component_name = 'spider middleware'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
def _add_middleware(self, mw):
super()._add_middleware(mw)
if hasattr(mw, 'process_spider_input'):
self.methods['process_spider_input'].append(mw.process_spider_input)
if hasattr(mw, 'process_start_requests'):
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
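        # process_spider_output/process_spider_exception are registered even
        # when missing (as None) so that both chains keep one slot per
        # middleware and index-based exception recovery stays aligned.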
process_spider_output = getattr(mw, 'process_spider_output', None)
self.methods['process_spider_output'].appendleft(process_spider_output)
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
def scrape_response(self, scrape_func, response, request, spider):
def process_spider_input(response):
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
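        # Wrap a middleware's output iterable so that exceptions raised while
        # it is being consumed are routed through process_spider_exception;
        # any recovered output is collected into recover_to.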
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
def process_spider_exception(_failure, start_index=0):
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
result = method(response=response, exception=exception, spider=spider)
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
return process_spider_output(result, method_index + 1)
elif result is None:
continue
else:
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
raise _InvalidOutput(msg)
return _failure
def process_spider_output(result, start_index=0):
            # items in this iterable do not need to go through the process_spider_output
            # chain; they already went through it from the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = _evaluate_iterable(result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
return MutableChain(result, recovered)
def process_callback_output(result):
recovered = MutableChain()
result = _evaluate_iterable(result, 0, recovered)
return MutableChain(process_spider_output(result), recovered)
dfd = mustbe_deferred(process_spider_input, response)
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
return dfd
def process_start_requests(self, start_requests, spider):
return self._process_chain('process_start_requests', start_requests, spider)