Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,93 @@
import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle:

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""

        # If a server needs `latency` seconds to respond, then we should send
        # a request every `latency/N` seconds to have N requests processed in
        # parallel.
        target_delay = latency / self.target_concurrency

        # Adjust the delay to bring it closer to target_delay.
        new_delay = (slot.delay + target_delay) / 2.0

        # If the target delay is bigger than the old delay, use it instead of
        # the mean; this works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay.
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, provoking a positive
        # feedback loop that reduces the delay instead of increasing it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
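For reference, a small standalone sketch of the adjustment math in _adjust_delay above. The numbers (latency, current slot delay, target concurrency, min/max bounds) are illustrative assumptions, not values taken from this commit:

# Illustrative walk-through of the _adjust_delay policy with made-up numbers.
latency = 0.6               # seconds the server took to respond
target_concurrency = 2.0    # AUTOTHROTTLE_TARGET_CONCURRENCY
slot_delay = 2.0            # current delay of the download slot
mindelay, maxdelay = 0.25, 10.0

target_delay = latency / target_concurrency           # 0.3 s
new_delay = (slot_delay + target_delay) / 2.0          # 1.15 s, halfway towards the target
new_delay = max(target_delay, new_delay)               # still 1.15 s
new_delay = min(max(mindelay, new_delay), maxdelay)    # clamped to [0.25, 10.0]
print(new_delay)                                       # 1.15

Repeated over many responses, this moving average pulls the per-slot delay towards latency / target_concurrency, only ever jumping upwards immediately.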
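Assuming this class is wired up the same way as Scrapy's built-in AutoThrottle extension, the settings it reads could be configured in a project's settings.py roughly as follows. Only the setting names come from the code above; the values and the module path are placeholders:

# settings.py -- values are placeholders
AUTOTHROTTLE_ENABLED = True            # without this the extension raises NotConfigured
AUTOTHROTTLE_START_DELAY = 5.0         # initial spider.download_delay
AUTOTHROTTLE_MAX_DELAY = 60.0          # upper bound for slot.delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # desired number of parallel requests per slot
AUTOTHROTTLE_DEBUG = False             # True logs every delay adjustment
DOWNLOAD_DELAY = 0.0                   # lower bound if the spider sets no download_delay

# If this copy of the class lives inside the project (path is hypothetical),
# it would also need to be registered as an extension:
# EXTENSIONS = {'myproject.extensions.AutoThrottle': 0}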