Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

@@ -0,0 +1,200 @@
import hashlib
import logging

from scrapy.utils.misc import create_instance

logger = logging.getLogger(__name__)

def _path_safe(text):
    """
    Return a filesystem-safe version of a string ``text``

    >>> _path_safe('simple.org').startswith('simple.org')
    True
    >>> _path_safe('dash-underscore_.org').startswith('dash-underscore_.org')
    True
    >>> _path_safe('some@symbol?').startswith('some_symbol_')
    True
    """
    pathable_slot = "".join([c if c.isalnum() or c in '-._' else '_'
                             for c in text])
    # as we replace some characters we can get collisions for different
    # slots, so we append a hash of the original text to keep names unique
    unique_slot = hashlib.md5(text.encode('utf8')).hexdigest()
    return '-'.join([pathable_slot, unique_slot])
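
# Illustration (hedged, not part of the original module): the md5 suffix is
# what keeps two inputs that sanitize to the same name distinct:
#
#     _path_safe('a@b') == _path_safe('a?b')  # -> False: same 'a_b' prefix,
#                                             #    different digest suffixes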


class ScrapyPriorityQueue:
    """A priority queue implemented using multiple internal queues (typically,
    FIFO queues). It uses one internal queue for each priority value. The
    internal queues must implement the following methods:

    * push(obj)
    * pop()
    * close()
    * __len__()

    The ``__init__`` method of ScrapyPriorityQueue receives a
    ``downstream_queue_cls`` argument, which is a class used to instantiate
    a new (internal) queue when a new priority is allocated.

    Only integer priorities should be used. Lower numbers are higher
    priorities.

    ``startprios`` is a sequence of priorities to start with. If the queue
    was previously closed leaving some priority buckets non-empty, those
    priorities should be passed in ``startprios``.
    """
    @classmethod
    def from_crawler(cls, crawler, downstream_queue_cls, key, startprios=()):
        return cls(crawler, downstream_queue_cls, key, startprios)

    def __init__(self, crawler, downstream_queue_cls, key, startprios=()):
        self.crawler = crawler
        self.downstream_queue_cls = downstream_queue_cls
        self.key = key
        self.queues = {}  # priority -> internal queue
        self.curprio = None
        self.init_prios(startprios)
    def init_prios(self, startprios):
        if not startprios:
            return

        for priority in startprios:
            self.queues[priority] = self.qfactory(priority)

        self.curprio = min(startprios)
    def qfactory(self, key):
        return create_instance(self.downstream_queue_cls,
                               None,
                               self.crawler,
                               self.key + '/' + str(key))
    def priority(self, request):
        # Scrapy requests use higher numbers for higher priority; this queue
        # pops the smallest bucket first, hence the negation
        return -request.priority
    def push(self, request):
        priority = self.priority(request)
        if priority not in self.queues:
            self.queues[priority] = self.qfactory(priority)
        q = self.queues[priority]
        q.push(request)  # this may fail (eg. serialization error)
        if self.curprio is None or priority < self.curprio:
            self.curprio = priority
    def pop(self):
        if self.curprio is None:
            return
        q = self.queues[self.curprio]
        m = q.pop()
        if not q:
            # the current bucket is empty: drop it and move ``curprio``
            # to the next non-empty bucket, if any remains
            del self.queues[self.curprio]
            q.close()
            prios = [p for p, q in self.queues.items() if q]
            self.curprio = min(prios) if prios else None
        return m
    def close(self):
        active = []
        for p, q in self.queues.items():
            active.append(p)
            q.close()
        return active

    def __len__(self):
        return sum(len(x) for x in self.queues.values()) if self.queues else 0
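
# Ordering illustration (hedged sketch, not part of the original module):
# Request priorities are negated on push, so requests with a higher
# ``Request.priority`` are popped first:
#
#     q.push(Request('https://example.com/a', priority=0))   # bucket 0
#     q.push(Request('https://example.com/b', priority=10))  # bucket -10
#     q.pop()  # returns /b: bucket -10 is the current minimum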


class DownloaderInterface:
    """Small helper exposing per-slot activity of the crawler's Downloader."""

    def __init__(self, crawler):
        self.downloader = crawler.engine.downloader

    def stats(self, possible_slots):
        # (active download count, slot name) pairs, suitable for min()
        return [(self._active_downloads(slot), slot)
                for slot in possible_slots]

    def get_slot_key(self, request):
        return self.downloader._get_slot_key(request, None)

    def _active_downloads(self, slot):
        """ Return the number of requests in the Downloader for a given slot """
        if slot not in self.downloader.slots:
            return 0
        return len(self.downloader.slots[slot].active)


class DownloaderAwarePriorityQueue:
    """ PriorityQueue which takes Downloader activity into account:
    domains (slots) with the least amount of active downloads are dequeued
    first.
    """
    @classmethod
    def from_crawler(cls, crawler, downstream_queue_cls, key, startprios=()):
        return cls(crawler, downstream_queue_cls, key, startprios)

    def __init__(self, crawler, downstream_queue_cls, key, slot_startprios=()):
        if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
            raise ValueError(f'"{self.__class__}" does not support CONCURRENT_REQUESTS_PER_IP')

        if slot_startprios and not isinstance(slot_startprios, dict):
            raise ValueError("DownloaderAwarePriorityQueue accepts "
                             "``slot_startprios`` as a dict; "
                             f"{slot_startprios.__class__!r} instance "
                             "is passed. Most likely, it means the state is "
                             "created by an incompatible priority queue. "
                             "Only a crawl started with the same priority "
                             "queue class can be resumed.")

        self._downloader_interface = DownloaderInterface(crawler)
        self.downstream_queue_cls = downstream_queue_cls
        self.key = key
        self.crawler = crawler

        self.pqueues = {}  # slot -> priority queue
        for slot, startprios in (slot_startprios or {}).items():
            self.pqueues[slot] = self.pqfactory(slot, startprios)
    def pqfactory(self, slot, startprios=()):
        return ScrapyPriorityQueue(self.crawler,
                                   self.downstream_queue_cls,
                                   self.key + '/' + _path_safe(slot),
                                   startprios)
    def pop(self):
        stats = self._downloader_interface.stats(self.pqueues)
        if not stats:
            return

        slot = min(stats)[1]  # slot with the fewest active downloads
        queue = self.pqueues[slot]
        request = queue.pop()
        if len(queue) == 0:
            del self.pqueues[slot]
        return request
    def push(self, request):
        slot = self._downloader_interface.get_slot_key(request)
        if slot not in self.pqueues:
            self.pqueues[slot] = self.pqfactory(slot)
        queue = self.pqueues[slot]
        queue.push(request)
    def close(self):
        active = {slot: queue.close()
                  for slot, queue in self.pqueues.items()}
        self.pqueues.clear()
        return active

    def __len__(self):
        return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0

    def __contains__(self, slot):
        return slot in self.pqueues
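

# How this module is typically selected (hedged sketch based on Scrapy's
# documented SCHEDULER_PRIORITY_QUEUE setting; not part of this file):
#
#     # settings.py
#     SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
#
# The default is 'scrapy.pqueues.ScrapyPriorityQueue'; the downloader-aware
# variant is the one Scrapy's docs suggest for broad crawls over many domains.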