Ausgabe der neuen DB Einträge
This commit is contained in:
parent
bad48e1627
commit
cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
|
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Url Length Spider Middleware
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UrlLengthMiddleware:
    """Spider middleware that drops requests whose URL exceeds a length limit.

    Only ``Request`` objects are inspected; any other object yielded by the
    spider is passed through unchanged.
    """

    def __init__(self, maxlength):
        """Store the maximum URL length, compared against ``len(request.url)``."""
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        """Build the middleware from the ``URLLENGTH_LIMIT`` setting.

        Raises:
            NotConfigured: if the setting resolves to a falsy value (e.g. 0).
        """
        limit = settings.getint('URLLENGTH_LIMIT')
        if limit:
            return cls(limit)
        raise NotConfigured

    def process_spider_output(self, response, result, spider):
        """Return a generator over ``result`` without over-long requests.

        ``response`` is not used here. A falsy ``result`` (such as ``None``)
        is treated as an empty iterable. Each dropped request is reported at
        DEBUG level, with the spider attached via the logging ``extra`` dict.
        """
        def _is_allowed(item):
            # Non-request output is never filtered.
            if not isinstance(item, Request):
                return True

            over_limit = len(item.url) > self.maxlength
            if not over_limit:
                return True

            logger.debug(
                "Ignoring link (url length > %(maxlength)d): %(url)s ",
                {'maxlength': self.maxlength, 'url': item.url},
                extra={'spider': spider},
            )
            return False

        return (item for item in result or () if _is_allowed(item))
|
||||
Loading…
Add table
Add a link
Reference in a new issue