Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,97 @@
"""
An extension to retry failed requests that are potentially caused by temporary
problems such as a connection timeout or HTTP 500 error.

You can change the behaviour of this middleware by modifying the scraping settings:
RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry

Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
"""
import logging

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name

logger = logging.getLogger(__name__)


class RetryMiddleware:

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def __init__(self, settings):
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured
        self.max_retry_times = settings.getint('RETRY_TIMES')
        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if (
            isinstance(exception, self.EXCEPTIONS_TO_RETRY)
            and not request.meta.get('dont_retry', False)
        ):
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value(f'retry/reason_count/{reason}')
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
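For reference, the middleware added in this file is driven entirely by four project settings and two request.meta keys. Below is a minimal, illustrative sketch of how it could be wired up; the setting values and the spider are hypothetical and not part of this commit, but the names are exactly the ones read in __init__, process_response and _retry above.

# settings.py -- illustrative values only; the names are the standard
# Scrapy settings read in RetryMiddleware.__init__ above.
RETRY_ENABLED = True                      # False makes __init__ raise NotConfigured
RETRY_TIMES = 3                           # project-wide retry cap used by _retry()
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]
RETRY_PRIORITY_ADJUST = -1                # retried requests are deprioritised


# Hypothetical spider showing the per-request meta keys the middleware reads.
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # 'dont_retry' short-circuits both process_response and process_exception:
        yield scrapy.Request(
            "https://example.com/flaky",
            meta={"dont_retry": True},
        )
        # 'max_retry_times' overrides RETRY_TIMES for this request only:
        yield scrapy.Request(
            "https://example.com/important",
            meta={"max_retry_times": 10},
        )

Each retry increments the retry/count and retry/reason_count/<reason> stats, and giving up increments retry/max_reached, so the outcome of the retry logic can be inspected in the crawl stats after a run.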