Output of the new DB entries

This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d

2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,93 @@
import re
import logging

from w3lib import html

from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse


logger = logging.getLogger(__name__)


class AjaxCrawlMiddleware:
    """
    Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
    For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
    """

    def __init__(self, settings):
        if not settings.getbool('AJAXCRAWL_ENABLED'):
            raise NotConfigured

        # XXX: Google parses at least first 100k bytes; scrapy's redirect
        # middleware parses first 4k. 4k turns out to be insufficient
        # for this middleware, and parsing 100k could be slow.
        # We use something in between (32K) by default.
        self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):

        if not isinstance(response, HtmlResponse) or response.status != 200:
            return response

        if request.method != 'GET':
            # other HTTP methods are either not safe or don't have a body
            return response

        if 'ajax_crawlable' in request.meta:  # prevent loops
            return response

        if not self._has_ajax_crawlable_variant(response):
            return response

        # scrapy already handles #! links properly
        ajax_crawl_request = request.replace(url=request.url + '#!')
        logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                     {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                     extra={'spider': spider})

        ajax_crawl_request.meta['ajax_crawlable'] = True
        return ajax_crawl_request

    def _has_ajax_crawlable_variant(self, response):
        """
        Return True if a page without hash fragment could be "AJAX crawlable"
        according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
        """
        body = response.text[:self.lookup_bytes]
        return _has_ajaxcrawlable_meta(body)


# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')


def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
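A minimal sketch of driving the middleware above from a project's settings.py, using only the two settings read in __init__ (the size shown repeats the default from the code, purely for illustration):

# settings.py
AJAXCRAWL_ENABLED = True     # required, otherwise __init__ raises NotConfigured
AJAXCRAWL_MAXSIZE = 32768    # bytes of response.text scanned for the fragment meta tag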
@@ -0,0 +1,110 @@
import logging
from collections import defaultdict

from scrapy.exceptions import NotConfigured
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.utils.python import to_unicode


logger = logging.getLogger(__name__)


class CookiesMiddleware:
    """This middleware enables working with sites that need cookies"""

    def __init__(self, debug=False):
        self.jars = defaultdict(CookieJar)
        self.debug = debug

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COOKIES_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings.getbool('COOKIES_DEBUG'))

    def process_request(self, request, spider):
        if request.meta.get('dont_merge_cookies', False):
            return

        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        for cookie in self._get_request_cookies(jar, request):
            jar.set_cookie_if_ok(cookie, request)

        # set Cookie header
        request.headers.pop('Cookie', None)
        jar.add_cookie_header(request)
        self._debug_cookie(request, spider)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_merge_cookies', False):
            return response

        # extract cookies from Set-Cookie and drop invalid/expired cookies
        cookiejarkey = request.meta.get("cookiejar")
        jar = self.jars[cookiejarkey]
        jar.extract_cookies(response, request)
        self._debug_set_cookie(response, spider)

        return response

    def _debug_cookie(self, request, spider):
        if self.debug:
            cl = [to_unicode(c, errors='replace')
                  for c in request.headers.getlist('Cookie')]
            if cl:
                cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
                msg = f"Sending cookies to: {request}\n{cookies}"
                logger.debug(msg, extra={'spider': spider})

    def _debug_set_cookie(self, response, spider):
        if self.debug:
            cl = [to_unicode(c, errors='replace')
                  for c in response.headers.getlist('Set-Cookie')]
            if cl:
                cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
                msg = f"Received cookies from: {response}\n{cookies}"
                logger.debug(msg, extra={'spider': spider})

    def _format_cookie(self, cookie, request):
        """
        Given a dict consisting of cookie components, return its string representation.
        Decode from bytes if necessary.
        """
        decoded = {}
        for key in ("name", "value", "path", "domain"):
            if cookie.get(key) is None:
                if key in ("name", "value"):
                    msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
                    logger.warning(msg.format(request, cookie, key))
                    return
                continue
            if isinstance(cookie[key], str):
                decoded[key] = cookie[key]
            else:
                try:
                    decoded[key] = cookie[key].decode("utf8")
                except UnicodeDecodeError:
                    logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
                                   request, cookie)
                    decoded[key] = cookie[key].decode("latin1", errors="replace")

        cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
        for key, value in decoded.items():  # path, domain
            cookie_str += f"; {key.capitalize()}={value}"
        return cookie_str

    def _get_request_cookies(self, jar, request):
        """
        Extract cookies from the Request.cookies attribute
        """
        if not request.cookies:
            return []
        elif isinstance(request.cookies, dict):
            cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
        else:
            cookies = request.cookies
        formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
        response = Response(request.url, headers={"Set-Cookie": formatted})
        return jar.make_cookies(response, request)
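A hedged usage sketch for the settings and request.meta keys the cookie middleware reacts to; the URLs and jar key are placeholders:

import scrapy

# settings.py values read by from_crawler
COOKIES_ENABLED = True   # required, otherwise NotConfigured is raised
COOKIES_DEBUG = True     # log Cookie / Set-Cookie headers via the _debug_* helpers

# requests as a spider callback could yield them
login = scrapy.Request("https://example.com/login", meta={"cookiejar": "account-1"})
raw = scrapy.Request("https://example.com/raw", meta={"dont_merge_cookies": True})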
@@ -0,0 +1,83 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""

import bz2
import gzip
import logging
import tarfile
import zipfile
from io import BytesIO
from tempfile import mktemp

from scrapy.responsetypes import responsetypes


logger = logging.getLogger(__name__)


class DecompressionMiddleware:
    """ This middleware tries to recognise and extract the possibly compressed
    responses that may arrive. """

    def __init__(self):
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2
        }

    def _is_tar(self, response):
        archive = BytesIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def process_response(self, request, response, spider):
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            if new_response:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt}, extra={'spider': spider})
                return new_response
        return response
@@ -0,0 +1,22 @@
"""
DefaultHeaders downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from scrapy.utils.python import without_none_values


class DefaultHeadersMiddleware:

    def __init__(self, headers):
        self._headers = headers

    @classmethod
    def from_crawler(cls, crawler):
        headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
        return cls(headers.items())

    def process_request(self, request, spider):
        for k, v in self._headers:
            request.headers.setdefault(k, v)
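The middleware above only applies DEFAULT_REQUEST_HEADERS with setdefault, so headers set on an individual request still win. A settings sketch with illustrative values:

# settings.py
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "en",
}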
@@ -0,0 +1,26 @@
"""
Download timeout middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from scrapy import signals


class DownloadTimeoutMiddleware:

    def __init__(self, timeout=180):
        self._timeout = timeout

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self._timeout = getattr(spider, 'download_timeout', self._timeout)

    def process_request(self, request, spider):
        if self._timeout:
            request.meta.setdefault('download_timeout', self._timeout)
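The timeout can come from the DOWNLOAD_TIMEOUT setting or from a download_timeout attribute on the spider, and a per-request meta value wins because process_request only calls setdefault. An illustrative sketch (the spider name is hypothetical):

import scrapy

# settings.py
DOWNLOAD_TIMEOUT = 30  # seconds, read via settings.getfloat in from_crawler

class SlowSiteSpider(scrapy.Spider):
    name = "slow_site"
    download_timeout = 120  # picked up in spider_opened, overrides the setting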
@@ -0,0 +1,31 @@
"""
HTTP basic auth downloader middleware

See documentation in docs/topics/downloader-middleware.rst
"""

from w3lib.http import basic_auth_header

from scrapy import signals


class HttpAuthMiddleware:
    """Set Basic HTTP Authorization header
    (http_user and http_pass spider class attributes)"""

    @classmethod
    def from_crawler(cls, crawler):
        o = cls()
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        usr = getattr(spider, 'http_user', '')
        pwd = getattr(spider, 'http_pass', '')
        if usr or pwd:
            self.auth = basic_auth_header(usr, pwd)

    def process_request(self, request, spider):
        auth = getattr(self, 'auth', None)
        if auth and b'Authorization' not in request.headers:
            request.headers[b'Authorization'] = auth
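As the class docstring says, credentials come from http_user and http_pass attributes on the spider; a minimal hypothetical sketch:

import scrapy

class IntranetSpider(scrapy.Spider):
    name = "intranet"          # hypothetical spider
    http_user = "someuser"
    http_pass = "somepass"     # combined into an Authorization header by HttpAuthMiddleware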
@@ -0,0 +1,133 @@
from email.utils import formatdate
from typing import Optional, Type, TypeVar

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from scrapy.utils.misc import load_object


HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")


class HttpCacheMiddleware:

    DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError)

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        if not settings.getbool('HTTPCACHE_ENABLED'):
            raise NotConfigured
        self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        self.stats = stats

    @classmethod
    def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
        o = cls(crawler.settings, crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider: Spider) -> None:
        self.storage.open_spider(spider)

    def spider_closed(self, spider: Spider) -> None:
        self.storage.close_spider(spider)

    def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
        if request.meta.get('dont_cache', False):
            return None

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return None

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return None  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse

        return None

    def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
        if request.meta.get('dont_cache', False):
            return response

        # Skip cached responses and uncacheable requests
        if 'cached' in response.flags or '_dont_cache' in request.meta:
            request.meta.pop('_dont_cache', None)
            return response

        # RFC2616 requires origin server to set Date header,
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
        if 'Date' not in response.headers:
            response.headers['Date'] = formatdate(usegmt=True)

        # Do not validate first-hand responses
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/firsthand', spider=spider)
            self._cache_response(spider, response, request, cachedresponse)
            return response

        if self.policy.is_cached_response_valid(cachedresponse, response, request):
            self.stats.inc_value('httpcache/revalidate', spider=spider)
            return cachedresponse

        self.stats.inc_value('httpcache/invalidate', spider=spider)
        self._cache_response(spider, response, request, cachedresponse)
        return response

    def process_exception(
        self, request: Request, exception: Exception, spider: Spider
    ) -> Optional[Response]:
        cachedresponse = request.meta.pop('cached_response', None)
        if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
            self.stats.inc_value('httpcache/errorrecovery', spider=spider)
            return cachedresponse
        return None

    def _cache_response(
        self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
    ) -> None:
        if self.policy.should_cache_response(response, request):
            self.stats.inc_value('httpcache/store', spider=spider)
            self.storage.store_response(spider, request, response)
        else:
            self.stats.inc_value('httpcache/uncacheable', spider=spider)
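The cache is configured entirely through settings resolved in __init__; a sketch with the policy and storage given as dotted paths for load_object (the paths shown are Scrapy's stock classes, stated here as an assumption rather than something this commit defines):

import scrapy

# settings.py
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"              # assumed default policy
HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"  # assumed default storage
HTTPCACHE_IGNORE_MISSING = False  # True raises IgnoreRequest for requests not in the cache

# a single request can opt out via the meta key checked in process_request/process_response
no_cache = scrapy.Request("https://example.com/live", meta={"dont_cache": True})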
@@ -0,0 +1,82 @@
import io
import zlib

from scrapy.utils.gz import gunzip
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.exceptions import NotConfigured


ACCEPTED_ENCODINGS = [b'gzip', b'deflate']

try:
    import brotli
    ACCEPTED_ENCODINGS.append(b'br')
except ImportError:
    pass

try:
    import zstandard
    ACCEPTED_ENCODINGS.append(b'zstd')
except ImportError:
    pass


class HttpCompressionMiddleware:
    """This middleware allows compressed (gzip, deflate) traffic to be
    sent/received from web sites"""
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('COMPRESSION_ENABLED'):
            raise NotConfigured
        return cls()

    def process_request(self, request, spider):
        request.headers.setdefault('Accept-Encoding',
                                   b", ".join(ACCEPTED_ENCODINGS))

    def process_response(self, request, response, spider):

        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                respcls = responsetypes.from_args(
                    headers=response.headers, url=response.url, body=decoded_body
                )
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response

    def _decode(self, body, encoding):
        if encoding == b'gzip' or encoding == b'x-gzip':
            body = gunzip(body)

        if encoding == b'deflate':
            try:
                body = zlib.decompress(body)
            except zlib.error:
                # ugly hack to work with raw deflate content that may
                # be sent by microsoft servers. For more information, see:
                # http://carsten.codimi.de/gzip.yaws/
                # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                # http://www.gzip.org/zlib/zlib_faq.html#faq38
                body = zlib.decompress(body, -15)
        if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
            body = brotli.decompress(body)
        if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
            # Using its streaming API since its simple API could handle only cases
            # where there is content size data embedded in the frame
            reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
            body = reader.read()
        return body
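The middleware always advertises gzip and deflate, and adds br or zstd to Accept-Encoding only when the optional brotli / zstandard imports above succeed; a brief settings sketch:

# settings.py
COMPRESSION_ENABLED = True  # required, otherwise from_crawler raises NotConfigured
# optional extras (assumption: installed separately) extend ACCEPTED_ENCODINGS:
#   pip install brotli     -> adds b'br'
#   pip install zstandard  -> adds b'zstd'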
@@ -0,0 +1,75 @@
import base64
from urllib.parse import unquote, urlunparse
from urllib.request import getproxies, proxy_bypass, _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware:

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type_, url in getproxies().items():
            try:
                self.proxies[type_] = self._get_proxy(url, type_)
            # some values such as '/var/run/docker.sock' can't be parsed
            # by _parse_proxy and as such should be skipped
            except ValueError:
                continue

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            f'{unquote(username)}:{unquote(password)}',
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
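Proxies are taken either from the environment via getproxies() or from request.meta['proxy'] set explicitly; a sketch of both paths, with a placeholder proxy address:

import scrapy

# settings.py
HTTPPROXY_ENABLED = True
HTTPPROXY_AUTH_ENCODING = "latin-1"  # matches the default auth_encoding in __init__

# per-request proxy; credentials in the URL are moved into Proxy-Authorization
via_proxy = scrapy.Request("https://example.com/",
                           meta={"proxy": "http://user:pass@proxy.example.com:8080"})
direct = scrapy.Request("https://example.com/", meta={"proxy": None})  # skip proxying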
@@ -0,0 +1,113 @@
import logging
from urllib.parse import urljoin, urlparse

from w3lib.url import safe_url_string

from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured


logger = logging.getLogger(__name__)


class BaseRedirectMiddleware:

    enabled_setting = 'REDIRECT_ENABLED'

    def __init__(self, settings):
        if not settings.getbool(self.enabled_setting):
            raise NotConfigured

        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached")

    def _redirect_request_using_get(self, request, redirect_url):
        redirected = request.replace(url=redirect_url, method='GET', body='')
        redirected.headers.pop('Content-Type', None)
        redirected.headers.pop('Content-Length', None)
        return redirected


class RedirectMiddleware(BaseRedirectMiddleware):
    """
    Handle redirection of requests based on response status
    and meta-refresh html tag.
    """

    def process_response(self, request, response, spider):
        if (
            request.meta.get('dont_redirect', False)
            or response.status in getattr(spider, 'handle_httpstatus_list', [])
            or response.status in request.meta.get('handle_httpstatus_list', [])
            or request.meta.get('handle_httpstatus_all', False)
        ):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['Location'])
        if response.headers['Location'].startswith(b'//'):
            request_scheme = urlparse(request.url).scheme
            location = request_scheme + '://' + location.lstrip('/')

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)


class MetaRefreshMiddleware(BaseRedirectMiddleware):

    enabled_setting = 'METAREFRESH_ENABLED'

    def __init__(self, settings):
        super().__init__(settings)
        self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
        self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')

    def process_response(self, request, response, spider):
        if (
            request.meta.get('dont_redirect', False)
            or request.method == 'HEAD'
            or not isinstance(response, HtmlResponse)
        ):
            return response

        interval, url = get_meta_refresh(response,
                                         ignore_tags=self._ignore_tags)
        if url and interval < self._maxdelay:
            redirected = self._redirect_request_using_get(request, url)
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response
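Both middlewares above are driven by settings and by per-request meta keys checked in process_response; an illustrative sketch (the numeric values are examples, not taken from this commit):

import scrapy

# settings.py
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20        # bound enforced in BaseRedirectMiddleware._redirect
REDIRECT_PRIORITY_ADJUST = 2
METAREFRESH_ENABLED = True
METAREFRESH_MAXDELAY = 100     # ignore meta refresh tags with a longer delay
METAREFRESH_IGNORE_TAGS = []

# handle a redirect status yourself instead of letting the middleware follow it
manual = scrapy.Request("https://example.com/old",
                        meta={"dont_redirect": True, "handle_httpstatus_list": [301, 302]})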
@@ -0,0 +1,97 @@
"""
An extension to retry failed requests that are potentially caused by temporary
problems such as a connection timeout or HTTP 500 error.

You can change the behaviour of this middleware by modifying the scraping settings:
RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry

Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
"""
import logging

from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed

from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name

logger = logging.getLogger(__name__)


class RetryMiddleware:

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def __init__(self, settings):
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured
        self.max_retry_times = settings.getint('RETRY_TIMES')
        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if (
            isinstance(exception, self.EXCEPTIONS_TO_RETRY)
            and not request.meta.get('dont_retry', False)
        ):
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value(f'retry/reason_count/{reason}')
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
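The module docstring already names the main knobs; a sketch of the settings read in __init__ together with the per-request meta keys used by process_response and _retry (values are illustrative):

import scrapy

# settings.py
RETRY_ENABLED = True
RETRY_TIMES = 2                  # retries on top of the first attempt
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
RETRY_PRIORITY_ADJUST = -1

# per-request overrides
more_retries = scrapy.Request("https://example.com/flaky", meta={"max_retry_times": 5})
no_retries = scrapy.Request("https://example.com/once", meta={"dont_retry": True})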
@@ -0,0 +1,109 @@
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.

"""

import logging

from twisted.internet.defer import Deferred, maybeDeferred
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object

logger = logging.getLogger(__name__)


class RobotsTxtMiddleware:
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
        self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))

        # check if parser dependencies are met, this should throw an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b'')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        if request.meta.get('dont_obey_robotstxt'):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

    def process_request_2(self, rp, request, spider):
        if rp is None:
            return

        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b'User-Agent', self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt")

    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={'dont_obey_robotstxt': True}
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value('robotstxt/request_count')

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result
            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure

    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value('robotstxt/response_count')
        self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = f'robotstxt/exception_count/{failure.type}'
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
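As the module docstring states, the middleware only activates when ROBOTSTXT_OBEY is enabled; a settings sketch (the parser path is Scrapy's stock Protego-based parser, given here as an assumption):

import scrapy

# settings.py
ROBOTSTXT_OBEY = True
ROBOTSTXT_USER_AGENT = None   # fall back to the request header or USER_AGENT
ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"  # assumed default parser path

# fetch one URL regardless of robots.txt via the meta key checked in process_request
unchecked = scrapy.Request("https://example.com/health", meta={"dont_obey_robotstxt": True})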
@@ -0,0 +1,34 @@
from scrapy.exceptions import NotConfigured
from scrapy.utils.request import request_httprepr
from scrapy.utils.response import response_httprepr
from scrapy.utils.python import global_object_name


class DownloaderStats:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('DOWNLOADER_STATS'):
            raise NotConfigured
        return cls(crawler.stats)

    def process_request(self, request, spider):
        self.stats.inc_value('downloader/request_count', spider=spider)
        self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
        reqlen = len(request_httprepr(request))
        self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)

    def process_response(self, request, response, spider):
        self.stats.inc_value('downloader/response_count', spider=spider)
        self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
        reslen = len(response_httprepr(response))
        self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
        return response

    def process_exception(self, request, exception, spider):
        ex_class = global_object_name(exception.__class__)
        self.stats.inc_value('downloader/exception_count', spider=spider)
        self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
@@ -0,0 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""

from scrapy import signals


class UserAgentMiddleware:
    """This middleware allows spiders to override the user_agent"""

    def __init__(self, user_agent='Scrapy'):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        return o

    def spider_opened(self, spider):
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
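The default comes from the USER_AGENT setting and can be overridden per spider through a user_agent attribute, as spider_opened shows; a brief hypothetical sketch:

import scrapy

# settings.py
USER_AGENT = "mybot/1.0 (+https://example.com/bot)"  # placeholder identification string

class ApiSpider(scrapy.Spider):
    name = "api"                   # hypothetical spider
    user_agent = "mybot-api/1.0"   # picked up in spider_opened, overrides the setting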