Output of the new DB entries

hubobel committed 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@@ -0,0 +1,93 @@
import re
import logging

from w3lib import html

from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class AjaxCrawlMiddleware:
    """
    Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
    For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
    """

    def __init__(self, settings):
        if not settings.getbool('AJAXCRAWL_ENABLED'):
            raise NotConfigured

        # XXX: Google parses at least first 100k bytes; scrapy's redirect
        # middleware parses first 4k. 4k turns out to be insufficient
        # for this middleware, and parsing 100k could be slow.
        # We use something in between (32K) by default.
        self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        if not isinstance(response, HtmlResponse) or response.status != 200:
            return response

        if request.method != 'GET':
            # other HTTP methods are either not safe or don't have a body
            return response

        if 'ajax_crawlable' in request.meta:  # prevent loops
            return response

        if not self._has_ajax_crawlable_variant(response):
            return response

        # scrapy already handles #! links properly
        ajax_crawl_request = request.replace(url=request.url + '#!')
        logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                     {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                     extra={'spider': spider})

        ajax_crawl_request.meta['ajax_crawlable'] = True
        return ajax_crawl_request

    def _has_ajax_crawlable_variant(self, response):
        """
        Return True if a page without hash fragment could be "AJAX crawlable"
        according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
        """
        body = response.text[:self.lookup_bytes]
        return _has_ajaxcrawlable_meta(body)


# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')


def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
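
Not part of the commit, a minimal usage sketch: AJAXCRAWL_ENABLED and AJAXCRAWL_MAXSIZE are the settings the constructor above actually reads; the values are only examples.

    # settings.py
    AJAXCRAWL_ENABLED = True   # the middleware raises NotConfigured otherwise
    AJAXCRAWL_MAXSIZE = 32768  # bytes scanned for the <meta name="fragment"> tag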


@@ -0,0 +1,110 @@
import logging
from collections import defaultdict

from scrapy.exceptions import NotConfigured
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.utils.python import to_unicode

logger = logging.getLogger(__name__)


class CookiesMiddleware:
    """This middleware enables working with sites that need cookies"""

    def __init__(self, debug=False):
        self.jars = defaultdict(CookieJar)
        self.debug = debug

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings.getbool('COOKIES_DEBUG'))

    def process_request(self, request, spider):
        if request.meta.get('dont_merge_cookies', False):
            return

        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        for cookie in self._get_request_cookies(jar, request):
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_merge_cookies', False):
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)

        return response

    def _debug_cookie(self, request, spider):
        if self.debug:
            cl = [to_unicode(c, errors='replace')
                  for c in request.headers.getlist('Cookie')]
            if cl:
                cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
                msg = f"Sending cookies to: {request}\n{cookies}"
                logger.debug(msg, extra={'spider': spider})

    def _debug_set_cookie(self, response, spider):
        if self.debug:
            cl = [to_unicode(c, errors='replace')
                  for c in response.headers.getlist('Set-Cookie')]
            if cl:
                cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
                msg = f"Received cookies from: {response}\n{cookies}"
                logger.debug(msg, extra={'spider': spider})

    def _format_cookie(self, cookie, request):
        """
        Given a dict consisting of cookie components, return its string representation.
        Decode from bytes if necessary.
        """
        decoded = {}
        for key in ("name", "value", "path", "domain"):
            if cookie.get(key) is None:
                if key in ("name", "value"):
                    msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
                    logger.warning(msg.format(request, cookie, key))
                    return
                continue
            if isinstance(cookie[key], str):
                decoded[key] = cookie[key]
            else:
                try:
                    decoded[key] = cookie[key].decode("utf8")
                except UnicodeDecodeError:
                    logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
                                   request, cookie)
                    decoded[key] = cookie[key].decode("latin1", errors="replace")

        cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
        for key, value in decoded.items():  # path, domain
            cookie_str += f"; {key.capitalize()}={value}"
        return cookie_str

    def _get_request_cookies(self, jar, request):
        """
        Extract cookies from the Request.cookies attribute
        """
        if not request.cookies:
            return []
        elif isinstance(request.cookies, dict):
            cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
        else:
            cookies = request.cookies
        formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
        response = Response(request.url, headers={"Set-Cookie": formatted})
        return jar.make_cookies(response, request)
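
A hedged sketch of the 'cookiejar' meta key that process_request uses to pick a jar; the URL and the jar key are placeholders, and the surrounding spider callback is assumed.

    # Each distinct 'cookiejar' value gets its own CookieJar in self.jars,
    # so separate sessions can be crawled side by side.
    yield scrapy.Request('https://example.com/login', meta={'cookiejar': 'session-1'})
    # COOKIES_DEBUG = True in settings.py makes _debug_cookie log the headers.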


@@ -0,0 +1,83 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
import bz2
import gzip
import logging
import tarfile
import zipfile
from io import BytesIO
from tempfile import mktemp

from scrapy.responsetypes import responsetypes

logger = logging.getLogger(__name__)


class DecompressionMiddleware:
    """ This middleware tries to recognise and extract the possibly compressed
    responses that may arrive. """

    def __init__(self):
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2
        }

    def _is_tar(self, response):
        archive = BytesIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def process_response(self, request, response, spider):
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            if new_response:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt}, extra={'spider': spider})
                return new_response
        return response
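
This middleware is not in Scrapy's default stack, so it has to be wired in explicitly; a sketch assuming the upstream module path (in this vendored copy the path may differ) and a guessed priority value.

    # settings.py
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.decompression.DecompressionMiddleware': 580,
    }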


@@ -0,0 +1,22 @@
"""
DefaultHeaders downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""
from scrapy.utils.python import without_none_values


class DefaultHeadersMiddleware:

    def __init__(self, headers):
        self._headers = headers

    @classmethod
    def from_crawler(cls, crawler):
        headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
        return cls(headers.items())

    def process_request(self, request, spider):
        for k, v in self._headers:
            request.headers.setdefault(k, v)
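
The headers come from the DEFAULT_REQUEST_HEADERS setting; a sketch with example values (None values are dropped by without_none_values).

    # settings.py
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml',
        'Accept-Language': 'en',
    }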


@@ -0,0 +1,26 @@
"""
Download timeout middleware

See documentation in docs/topics/downloader-middleware.rst
"""
from scrapy import signals


class DownloadTimeoutMiddleware:

    def __init__(self, timeout=180):
        self._timeout = timeout

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self._timeout = getattr(spider, 'download_timeout', self._timeout)

    def process_request(self, request, spider):
        if self._timeout:
            request.meta.setdefault('download_timeout', self._timeout)
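
The per-spider override is read at spider_opened; the spider below is a hypothetical example.

    class SlowSiteSpider(scrapy.Spider):
        name = 'slowsite'
        download_timeout = 60  # seconds; falls back to DOWNLOAD_TIMEOUT (default 180)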


@@ -0,0 +1,31 @@
"""
HTTP basic auth downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""
from w3lib.http import basic_auth_header

from scrapy import signals


class HttpAuthMiddleware:
    """Set Basic HTTP Authorization header
    (http_user and http_pass spider class attributes)"""

    @classmethod
    def from_crawler(cls, crawler):
        o = cls()
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        usr = getattr(spider, 'http_user', '')
        pwd = getattr(spider, 'http_pass', '')
        if usr or pwd:
            self.auth = basic_auth_header(usr, pwd)

    def process_request(self, request, spider):
        auth = getattr(self, 'auth', None)
        if auth and b'Authorization' not in request.headers:
            request.headers[b'Authorization'] = auth
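
Credentials are taken from spider class attributes, as the docstring says; the names and values below are dummies.

    class PrivateSpider(scrapy.Spider):
        name = 'private'
        http_user = 'someuser'  # combined into a Basic Authorization header
        http_pass = 'somepass'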


@@ -0,0 +1,133 @@
from email.utils import formatdate
from typing import Optional, Type, TypeVar

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from scrapy.utils.misc import load_object

HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")


class HttpCacheMiddleware:

    DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError)

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.stats = stats

    @classmethod
    def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider: Spider) -> None:
        self.storage.open_spider(spider)

    def spider_closed(self, spider: Spider) -> None:
        self.storage.close_spider(spider)

    def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
        if request.meta.get('dont_cache', False):
            return None

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return None

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return None  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse

        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
        if request.meta.get('dont_cache', False):
            return response

        # Skip cached responses and uncacheable requests
        if 'cached' in response.flags or '_dont_cache' in request.meta:
            request.meta.pop('_dont_cache', None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if 'Date' not in response.headers:
            response.headers['Date'] = formatdate(usegmt=True)

        # Do not validate first-hand responses
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cachedresponse

        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(
        self, request: Request, exception: Exception, spider: Spider
    ) -> Optional[Response]:
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
            self.stats.inc_value('httpcache/errorrecovery', spider=spider)
            return cachedresponse
        return None

    def _cache_response(
        self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
    ) -> None:
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value('httpcache/uncacheable', spider=spider)
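
A sketch of the settings this middleware consumes; the policy and storage paths shown are Scrapy's shipped defaults, not something this commit defines.

    # settings.py
    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    HTTPCACHE_IGNORE_MISSING = False  # True makes uncached requests raise IgnoreRequest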


@@ -0,0 +1,82 @@
import io
import zlib

from scrapy.utils.gz import gunzip
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.exceptions import NotConfigured

ACCEPTED_ENCODINGS = [b'gzip', b'deflate']

try:
    import brotli
    ACCEPTED_ENCODINGS.append(b'br')
except ImportError:
    pass

try:
    import zstandard
    ACCEPTED_ENCODINGS.append(b'zstd')
except ImportError:
    pass


class HttpCompressionMiddleware:
    """This middleware allows compressed (gzip, deflate) traffic to be
    sent/received from web sites"""

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COMPRESSION_ENABLED'):
            raise NotConfigured
        return cls()

    def process_request(self, request, spider):
        request.headers.setdefault('Accept-Encoding',
                                   b", ".join(ACCEPTED_ENCODINGS))

    def process_response(self, request, response, spider):
        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                respcls = responsetypes.from_args(
                    headers=response.headers, url=response.url, body=decoded_body
                )
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response

    def _decode(self, body, encoding):
        if encoding == b'gzip' or encoding == b'x-gzip':
            body = gunzip(body)

        if encoding == b'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # ugly hack to work with raw deflate content that may
                # be sent by microsoft servers. For more information, see:
                # http://carsten.codimi.de/gzip.yaws/
                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                # http://www.gzip.org/zlib/zlib_faq.html#faq38
                body = zlib.decompress(body, -15)
        if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
            body = brotli.decompress(body)
        if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
            # Using its streaming API since its simple API could handle only cases
            # where there is content size data embedded in the frame
            reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
            body = reader.read()
        return body
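
Brotli and zstd support above is conditional on the imports succeeding; assuming the usual PyPI package names for this project's environment:

    pip install brotli zstandard  # optional; gzip and deflate work out of the box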


@@ -0,0 +1,75 @@
import base64
from urllib.parse import unquote, urlunparse
from urllib.request import getproxies, proxy_bypass, _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware:

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type_, url in getproxies().items():
            try:
                self.proxies[type_] = self._get_proxy(url, type_)
            # some values such as '/var/run/docker.sock' can't be parsed
            # by _parse_proxy and as such should be skipped
            except ValueError:
                continue

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            f'{unquote(username)}:{unquote(password)}',
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
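
A per-request proxy sketch using the 'proxy' meta key handled above; the proxy URL and credentials are placeholders.

    yield scrapy.Request(
        'https://example.com',
        meta={'proxy': 'http://user:pass@proxy.example.com:8080'},
    )
    # meta={'proxy': None} opts a single request out of environment proxies.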


@@ -0,0 +1,113 @@
import logging
from urllib.parse import urljoin, urlparse

from w3lib.url import safe_url_string

from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured

logger = logging.getLogger(__name__)


class BaseRedirectMiddleware:

    enabled_setting = 'REDIRECT_ENABLED'

    def __init__(self, settings):
        if not settings.getbool(self.enabled_setting):
            raise NotConfigured

        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):
        redirected = request.replace(url=redirect_url, method='GET', body='')
        redirected.headers.pop('Content-Type', None)
        redirected.headers.pop('Content-Length', None)
        return redirected


class RedirectMiddleware(BaseRedirectMiddleware):
    """
    Handle redirection of requests based on response status
    and meta-refresh html tag.
    """

    def process_response(self, request, response, spider):
        if (
            request.meta.get('dont_redirect', False)
            or response.status in getattr(spider, 'handle_httpstatus_list', [])
            or response.status in request.meta.get('handle_httpstatus_list', [])
            or request.meta.get('handle_httpstatus_all', False)
        ):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['Location'])
        if response.headers['Location'].startswith(b'//'):
            request_scheme = urlparse(request.url).scheme
            location = request_scheme + '://' + location.lstrip('/')

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)


class MetaRefreshMiddleware(BaseRedirectMiddleware):

    enabled_setting = 'METAREFRESH_ENABLED'

    def __init__(self, settings):
        super().__init__(settings)
        self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
        self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')

    def process_response(self, request, response, spider):
        if (
            request.meta.get('dont_redirect', False)
            or request.method == 'HEAD'
            or not isinstance(response, HtmlResponse)
        ):
            return response

        interval, url = get_meta_refresh(response,
                                         ignore_tags=self._ignore_tags)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response
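
The knobs read in BaseRedirectMiddleware.__init__ and MetaRefreshMiddleware.__init__, shown as a sketch with Scrapy's default values:

    # settings.py
    REDIRECT_ENABLED = True
    REDIRECT_MAX_TIMES = 20       # per-request cap, also seeds redirect_ttl
    REDIRECT_PRIORITY_ADJUST = 2  # redirected requests get higher priority
    METAREFRESH_ENABLED = True
    METAREFRESH_MAXDELAY = 100    # ignore meta refreshes slower than this (seconds)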


@@ -0,0 +1,97 @@
"""
An extension to retry failed requests that are potentially caused by temporary
problems such as a connection timeout or HTTP 500 error.

You can change the behaviour of this middleware by modifying the scraping settings:
RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry

Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non-failed) pages.
"""
import logging

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name

logger = logging.getLogger(__name__)


class RetryMiddleware:

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def __init__(self, settings):
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured
        self.max_retry_times = settings.getint('RETRY_TIMES')
        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if (
            isinstance(exception, self.EXCEPTIONS_TO_RETRY)
            and not request.meta.get('dont_retry', False)
        ):
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times
        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value(f'retry/reason_count/{reason}')
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
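
The settings referenced above, sketched with Scrapy's default values:

    # settings.py
    RETRY_ENABLED = True
    RETRY_TIMES = 2  # retries beyond the first attempt; per-request override via
                     # request.meta['max_retry_times']
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
    RETRY_PRIORITY_ADJUST = -1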


@@ -0,0 +1,109 @@
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
import logging

from twisted.internet.defer import Deferred, maybeDeferred

from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class RobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
        self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b'')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get('dont_obey_robotstxt'):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return

        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b'User-Agent', self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={'dont_obey_robotstxt': True}
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value('robotstxt/request_count')

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value('robotstxt/response_count')
        self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f'robotstxt/exception_count/{failure.type}'
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
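
A sketch of enabling the middleware; the parser path shown is Scrapy's default implementation, not part of this commit.

    # settings.py
    ROBOTSTXT_OBEY = True
    ROBOTSTXT_PARSER = 'scrapy.robotstxt.ProtegoRobotParser'
    ROBOTSTXT_USER_AGENT = None  # fall back to USER_AGENT for rule matching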


@@ -0,0 +1,34 @@
from scrapy.exceptions import NotConfigured
from scrapy.utils.request import request_httprepr
from scrapy.utils.response import response_httprepr
from scrapy.utils.python import global_object_name


class DownloaderStats:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('DOWNLOADER_STATS'):
            raise NotConfigured
        return cls(crawler.stats)

    def process_request(self, request, spider):
        self.stats.inc_value('downloader/request_count', spider=spider)
        self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
        reqlen = len(request_httprepr(request))
        self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)

    def process_response(self, request, response, spider):
        self.stats.inc_value('downloader/response_count', spider=spider)
        self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
        reslen = len(response_httprepr(response))
        self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
        return response

    def process_exception(self, request, exception, spider):
        ex_class = global_object_name(exception.__class__)
        self.stats.inc_value('downloader/exception_count', spider=spider)
        self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
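
Gated by a single boolean, on by default; shown here only for completeness.

    # settings.py
    DOWNLOADER_STATS = True  # set False to drop the per-request bookkeeping above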


@@ -0,0 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals


class UserAgentMiddleware:
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
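
A spider-level override, read at spider_opened; the spider and UA string below are only examples.

    class MySpider(scrapy.Spider):
        name = 'myspider'
        user_agent = 'Mozilla/5.0 (compatible; MyBot/1.0)'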