import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class AutoThrottle:
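    """Dynamically adjust download delays based on observed latencies.

    Keeps each downloader slot's delay close to
    ``latency / AUTOTHROTTLE_TARGET_CONCURRENCY``, clamped between the
    configured minimum and maximum delays (see ``_adjust_delay`` below).
    """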

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool('AUTOTHROTTLE_DEBUG')
        self.target_concurrency = crawler.settings.getfloat(
            'AUTOTHROTTLE_TARGET_CONCURRENCY')
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded,
                                signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
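        """Standard Scrapy extension entry point."""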
        return cls(crawler)

    def _spider_opened(self, spider):
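        # Resolve the delay bounds once per spider and seed the spider's
        # download_delay with the configured start delay.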
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
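        # A spider-level download_delay attribute takes precedence over the
        # project-wide DOWNLOAD_DELAY setting.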
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay,
                   self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
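        # Signal handler: re-tune the slot's delay after every download and,
        # in debug mode, log the throttling state for the slot.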
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
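        # Look up the downloader slot this request was assigned to; the slot
        # may be missing, in which case the caller skips adjustment.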
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define the delay adjustment policy."""
        # If a server needs `latency` seconds to respond, then we should send
        # a request every `latency/N` seconds to have N requests processed in
        # parallel.
        target_delay = latency / self.target_concurrency
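        # For example (illustrative numbers): a 0.5 s latency with a target
        # concurrency of 2.0 gives a 0.25 s target delay.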

        # Adjust the delay to make it closer to target_delay.
        new_delay = (slot.delay + target_delay) / 2.0

        # If the target delay is bigger than the old delay, use it instead of
        # the mean; this works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay.
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirects)
        # are usually small and so tend to reduce latency, provoking a
        # positive feedback loop that lowers the delay instead of raising it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
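

# Minimal usage sketch. The values below are illustrative (they mirror
# common defaults, but check your Scrapy version's documentation); set them
# in your project's settings.py to enable this extension:
#
#   AUTOTHROTTLE_ENABLED = True
#   AUTOTHROTTLE_START_DELAY = 5.0
#   AUTOTHROTTLE_MAX_DELAY = 60.0
#   AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#   AUTOTHROTTLE_DEBUG = False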