Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,58 @@
"""
Depth Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""

import logging

from scrapy.http import Request

logger = logging.getLogger(__name__)


class DepthMiddleware:

    def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request):
                depth = response.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.prio:
                    request.priority -= depth * self.prio
                if self.maxdepth and depth > self.maxdepth:
                    logger.debug(
                        "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                        {'maxdepth': self.maxdepth, 'requrl': request.url},
                        extra={'spider': spider}
                    )
                    return False
                else:
                    if self.verbose_stats:
                        self.stats.inc_value(f'request_depth_count/{depth}',
                                             spider=spider)
                    self.stats.max_value('request_depth_max', depth,
                                         spider=spider)
            return True

        # base case (depth=0)
        if 'depth' not in response.meta:
            response.meta['depth'] = 0
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
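
For context, a minimal sketch of the settings this middleware reads in from_crawler(); the values below are illustrative, not part of this commit:

# settings.py (illustrative values)
DEPTH_LIMIT = 3             # drop requests nested more than 3 levels below the start URLs
DEPTH_PRIORITY = 1          # each extra level subtracts 1 from request.priority (breadth-first bias)
DEPTH_STATS_VERBOSE = True  # additionally record request_depth_count/<depth> stats per level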
@@ -0,0 +1,57 @@
"""
HttpError Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""
import logging

from scrapy.exceptions import IgnoreRequest

logger = logging.getLogger(__name__)


class HttpError(IgnoreRequest):
    """A non-200 response was filtered"""

    def __init__(self, response, *args, **kwargs):
        self.response = response
        super().__init__(*args, **kwargs)


class HttpErrorMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        if 200 <= response.status < 300:  # common case
            return
        meta = response.meta
        if 'handle_httpstatus_all' in meta:
            return
        if 'handle_httpstatus_list' in meta:
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, HttpError):
            spider.crawler.stats.inc_value('httperror/response_ignored_count')
            spider.crawler.stats.inc_value(
                f'httperror/response_ignored_status_count/{response.status}'
            )
            logger.info(
                "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
                {'response': response}, extra={'spider': spider},
            )
            return []
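
For context, a hedged sketch of the three places this middleware looks for allowed status codes; the spider and values are illustrative, not part of this commit:

# settings.py (illustrative)
HTTPERROR_ALLOWED_CODES = [404]   # let 404 responses reach spider callbacks project-wide
# HTTPERROR_ALLOW_ALL = True      # or pass every non-2xx response through

# spider-level and request-level overrides (illustrative)
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    handle_httpstatus_list = [404, 500]  # per-spider allow list, read via getattr() above

    def start_requests(self):
        # per-request override via meta, checked before the spider attribute
        yield scrapy.Request('https://example.com/missing',
                             meta={'handle_httpstatus_list': [404]})

    def parse(self, response):
        pass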
@@ -0,0 +1,85 @@
"""
Offsite Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""
import re
import logging
import warnings

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

logger = logging.getLogger(__name__)


class OffsiteMiddleware:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def process_spider_output(self, response, result, spider):
        for x in result:
            if isinstance(x, Request):
                if x.dont_filter or self.should_follow(x, spider):
                    yield x
                else:
                    domain = urlparse_cached(x).hostname
                    if domain and domain not in self.domains_seen:
                        self.domains_seen.add(domain)
                        logger.debug(
                            "Filtered offsite request to %(domain)r: %(request)s",
                            {'domain': domain, 'request': x}, extra={'spider': spider})
                        self.stats.inc_value('offsite/domains', spider=spider)
                    self.stats.inc_value('offsite/filtered', spider=spider)
            else:
                yield x

    def should_follow(self, request, spider):
        regex = self.host_regex
        # hostname can be None for wrong urls (like javascript links)
        host = urlparse_cached(request).hostname or ''
        return bool(regex.search(host))

    def get_host_regex(self, spider):
        """Override this method to implement a different offsite policy"""
        allowed_domains = getattr(spider, 'allowed_domains', None)
        if not allowed_domains:
            return re.compile('')  # allow all by default
        url_pattern = re.compile(r"^https?://.*$")
        port_pattern = re.compile(r":\d+$")
        domains = []
        for domain in allowed_domains:
            if domain is None:
                continue
            elif url_pattern.match(domain):
                message = ("allowed_domains accepts only domains, not URLs. "
                           f"Ignoring URL entry {domain} in allowed_domains.")
                warnings.warn(message, URLWarning)
            elif port_pattern.search(domain):
                message = ("allowed_domains accepts only domains without ports. "
                           f"Ignoring entry {domain} in allowed_domains.")
                warnings.warn(message, PortWarning)
            else:
                domains.append(re.escape(domain))
        regex = fr'^(.*\.)?({"|".join(domains)})$'
        return re.compile(regex)

    def spider_opened(self, spider):
        self.host_regex = self.get_host_regex(spider)
        self.domains_seen = set()


class URLWarning(Warning):
    pass


class PortWarning(Warning):
    pass
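
For context, a minimal sketch of the spider attributes get_host_regex() expects; the domains and spider are illustrative, not part of this commit:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']          # bare domains only; subdomains match via the generated regex
    # 'https://example.com' or 'example.com:8080' here would be skipped with a warning
    start_urls = ['https://www.example.com/']

    def parse(self, response):
        # offsite requests yielded here are dropped unless dont_filter=True is set
        yield scrapy.Request('https://other-site.org/page', dont_filter=True)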
@@ -0,0 +1,361 @@
"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
import warnings
from urllib.parse import urlparse

from w3lib.url import safe_url_string

from scrapy.http import Request, Response
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.python import to_unicode
from scrapy.utils.misc import load_object
from scrapy.utils.url import strip_url


LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)

POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
POLICY_SCRAPY_DEFAULT = "scrapy-default"


class ReferrerPolicy:

    NOREFERRER_SCHEMES = LOCAL_SCHEMES

    def referrer(self, response_url, request_url):
        raise NotImplementedError()

    def stripped_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.strip_url(url)

    def origin_referrer(self, url):
        if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
            return self.origin(url)

    def strip_url(self, url, origin_only=False):
        """
        https://www.w3.org/TR/referrer-policy/#strip-url

        If url is null, return no referrer.
        If url's scheme is a local scheme, then return no referrer.
        Set url's username to the empty string.
        Set url's password to null.
        Set url's fragment to null.
        If the origin-only flag is true, then:
            Set url's path to null.
            Set url's query to null.
        Return url.
        """
        if not url:
            return None
        return strip_url(url,
                         strip_credentials=True,
                         strip_fragment=True,
                         strip_default_port=True,
                         origin_only=origin_only)

    def origin(self, url):
        """Return serialized origin (scheme, host, path) for a request or response URL."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url):
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        parsed_url = urlparse(url)
        if parsed_url.scheme in ('data',):
            return False
        return self.tls_protected(url)

    def tls_protected(self, url):
        return urlparse(url).scheme in ('https', 'ftps')


class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    The simplest policy is "no-referrer", which specifies that no referrer information
    is to be sent along with requests made from a particular request client to any origin.
    The header will be omitted entirely.
    """
    name = POLICY_NO_REFERRER

    def referrer(self, response_url, request_url):
        return None


class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    The "no-referrer-when-downgrade" policy sends a full URL along with requests
    from a TLS-protected environment settings object to a potentially trustworthy URL,
    and requests from clients which are not TLS-protected to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.

    This is a user agent's default behavior, if no policy is otherwise specified.
    """
    name = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url, request_url):
        if not self.tls_protected(response_url) or self.tls_protected(request_url):
            return self.stripped_referrer(response_url)


class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    The "same-origin" policy specifies that a full URL, stripped for use as a referrer,
    is sent as referrer information when making same-origin requests from a particular request client.

    Cross-origin requests, on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_SAME_ORIGIN

    def referrer(self, response_url, request_url):
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)


class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    The "origin" policy specifies that only the ASCII serialization
    of the origin of the request client is sent as referrer information
    when making both same-origin requests and cross-origin requests
    from a particular request client.
    """
    name = POLICY_ORIGIN

    def referrer(self, response_url, request_url):
        return self.origin_referrer(response_url)


class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    The "strict-origin" policy sends the ASCII serialization
    of the origin of the request client when making requests:
    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected request clients to non-potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_STRICT_ORIGIN

    def referrer(self, response_url, request_url):
        if (
            self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
            or not self.tls_protected(response_url)
        ):
            return self.origin_referrer(response_url)


class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    is sent as referrer information when making cross-origin requests
    from a particular request client.
    """
    name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        origin = self.origin(response_url)
        if origin == self.origin(request_url):
            return self.stripped_referrer(response_url)
        else:
            return origin


class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    The "strict-origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    when making cross-origin requests:

    - from a TLS-protected environment settings object to a potentially trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs,
    on the other hand, will contain no referrer information.
    A Referer HTTP header will not be sent.
    """
    name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        origin = self.origin(response_url)
        if origin == self.origin(request_url):
            return self.stripped_referrer(response_url)
        elif (
            self.tls_protected(response_url) and self.potentially_trustworthy(request_url)
            or not self.tls_protected(response_url)
        ):
            return self.origin_referrer(response_url)


class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer,
    is sent along with both cross-origin requests
    and same-origin requests made from a particular request client.

    Note: The policy's name doesn't lie; it is unsafe.
    This policy will leak origins and paths from TLS-protected resources
    to insecure origins.
    Carefully consider the impact of setting such a policy for potentially sensitive documents.
    """
    name = POLICY_UNSAFE_URL

    def referrer(self, response_url, request_url):
        return self.stripped_referrer(response_url)


class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    A variant of "no-referrer-when-downgrade",
    with the addition that "Referer" is not sent if the parent request was
    using ``file://`` or ``s3://`` scheme.
    """
    NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
    name = POLICY_SCRAPY_DEFAULT


_policy_classes = {p.name: p for p in (
    NoReferrerPolicy,
    NoReferrerWhenDowngradePolicy,
    SameOriginPolicy,
    OriginPolicy,
    StrictOriginPolicy,
    OriginWhenCrossOriginPolicy,
    StrictOriginWhenCrossOriginPolicy,
    UnsafeUrlPolicy,
    DefaultReferrerPolicy,
)}

# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
_policy_classes[''] = NoReferrerWhenDowngradePolicy


def _load_policy_class(policy, warning_only=False):
    """
    Expect a string for the path to the policy class,
    otherwise try to interpret the string as a standard value
    from https://www.w3.org/TR/referrer-policy/#referrer-policies
    """
    try:
        return load_object(policy)
    except ValueError:
        try:
            return _policy_classes[policy.lower()]
        except KeyError:
            msg = f"Could not load referrer policy {policy!r}"
            if not warning_only:
                raise RuntimeError(msg)
            else:
                warnings.warn(msg, RuntimeWarning)
                return None


class RefererMiddleware:

    def __init__(self, settings=None):
        self.default_policy = DefaultReferrerPolicy
        if settings is not None:
            self.default_policy = _load_policy_class(
                settings.get('REFERRER_POLICY'))

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('REFERER_ENABLED'):
            raise NotConfigured
        mw = cls(crawler.settings)

        # Note: this hook is a bit of a hack to intercept redirections
        crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

        return mw

    def policy(self, resp_or_url, request):
        """
        Determine Referrer-Policy to use from a parent Response (or URL),
        and a Request to be sent.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error),
          the policy from settings is used
        - if the policy is not set in Request meta,
          but there is a Referrer-policy header in the parent response,
          it is used if valid
        - otherwise, the policy from settings is used.
        """
        policy_name = request.meta.get('referrer_policy')
        if policy_name is None:
            if isinstance(resp_or_url, Response):
                policy_header = resp_or_url.headers.get('Referrer-Policy')
                if policy_header is not None:
                    policy_name = to_unicode(policy_header.decode('latin1'))
        if policy_name is None:
            return self.default_policy()

        cls = _load_policy_class(policy_name, warning_only=True)
        return cls() if cls else self.default_policy()

    def process_spider_output(self, response, result, spider):
        def _set_referer(r):
            if isinstance(r, Request):
                referrer = self.policy(response, r).referrer(response.url, r.url)
                if referrer is not None:
                    r.headers.setdefault('Referer', referrer)
            return r
        return (_set_referer(r) for r in result or ())

    def request_scheduled(self, request, spider):
        # check redirected request to patch "Referer" header if necessary
        redirected_urls = request.meta.get('redirect_urls', [])
        if redirected_urls:
            request_referrer = request.headers.get('Referer')
            # we don't patch the referrer value if there is none
            if request_referrer is not None:
                # the request's referrer header value acts as a surrogate
                # for the parent response URL
                #
                # Note: if the 3xx response contained a Referrer-Policy header,
                # the information is not available using this hook
                parent_url = safe_url_string(request_referrer)
                policy_referrer = self.policy(parent_url, request).referrer(
                    parent_url, request.url)
                if policy_referrer != request_referrer:
                    if policy_referrer is None:
                        request.headers.pop('Referer')
                    else:
                        request.headers['Referer'] = policy_referrer
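
For context, a hedged sketch of how a policy is usually selected; the values below are illustrative, not part of this commit:

# settings.py (illustrative): a standard policy name or a dotted path to a ReferrerPolicy subclass
REFERRER_POLICY = 'same-origin'
# REFERER_ENABLED = False    # would disable the middleware via from_crawler() above

# per-request override via meta; unknown names fall back to the configured default with a warning
import scrapy
request = scrapy.Request('https://example.com/', meta={'referrer_policy': 'no-referrer'})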
@@ -0,0 +1,37 @@
"""
Url Length Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""

import logging

from scrapy.http import Request
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class UrlLengthMiddleware:

    def __init__(self, maxlength):
        self.maxlength = maxlength

    @classmethod
    def from_settings(cls, settings):
        maxlength = settings.getint('URLLENGTH_LIMIT')
        if not maxlength:
            raise NotConfigured
        return cls(maxlength)

    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request) and len(request.url) > self.maxlength:
                logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
                             {'maxlength': self.maxlength, 'url': request.url},
                             extra={'spider': spider})
                return False
            else:
                return True

        return (r for r in result or () if _filter(r))
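
For context, the single setting this middleware reads; the value below is illustrative, not part of this commit:

# settings.py (illustrative)
URLLENGTH_LIMIT = 2083   # requests with longer URLs are dropped; 0 disables the middleware (NotConfigured)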