Ausgabe der neuen DB Einträge

This commit is contained in:
hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@ -0,0 +1,118 @@
"""
Base class for Scrapy spiders
See documentation in docs/topics/spiders.rst
"""
import logging
import warnings
from typing import Optional
from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import method_is_overridden
class Spider(object_ref):
"""Base class for scrapy spiders. All spiders must inherit from this
class.
"""
name: Optional[str] = None
custom_settings: Optional[dict] = None
def __init__(self, name=None, **kwargs):
if name is not None:
self.name = name
elif not getattr(self, 'name', None):
raise ValueError(f"{type(self).__name__} must have a name")
self.__dict__.update(kwargs)
if not hasattr(self, 'start_urls'):
self.start_urls = []
@property
def logger(self):
logger = logging.getLogger(self.name)
return logging.LoggerAdapter(logger, {'spider': self})
def log(self, message, level=logging.DEBUG, **kw):
"""Log the given message at the given log level
This helper wraps a log call to the logger within the spider, but you
can use it directly (e.g. Spider.logger.info('msg')) or use any other
Python logger too.
"""
self.logger.log(level, message, **kw)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = cls(*args, **kwargs)
spider._set_crawler(crawler)
return spider
def _set_crawler(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
crawler.signals.connect(self.close, signals.spider_closed)
def start_requests(self):
cls = self.__class__
if not self.start_urls and hasattr(self, 'start_url'):
raise AttributeError(
"Crawling could not start: 'start_urls' not found "
"or empty (but found 'start_url' attribute instead, "
"did you miss an 's'?)")
if method_is_overridden(cls, Spider, 'make_requests_from_url'):
warnings.warn(
"Spider.make_requests_from_url method is deprecated; it "
"won't be called in future Scrapy releases. Please "
"override Spider.start_requests method instead "
f"(see {cls.__module__}.{cls.__name__}).",
)
for url in self.start_urls:
yield self.make_requests_from_url(url)
else:
for url in self.start_urls:
yield Request(url, dont_filter=True)
def make_requests_from_url(self, url):
""" This method is deprecated. """
warnings.warn(
"Spider.make_requests_from_url method is deprecated: "
"it will be removed and not be called by the default "
"Spider.start_requests method in future Scrapy releases. "
"Please override Spider.start_requests method instead."
)
return Request(url, dont_filter=True)
def _parse(self, response, **kwargs):
return self.parse(response, **kwargs)
def parse(self, response, **kwargs):
raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
@classmethod
def update_settings(cls, settings):
settings.setdict(cls.custom_settings or {}, priority='spider')
@classmethod
def handles_request(cls, request):
return url_is_from_spider(request.url, cls)
@staticmethod
def close(spider, reason):
closed = getattr(spider, 'closed', None)
if callable(closed):
return closed(reason)
def __str__(self):
return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
__repr__ = __str__
# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider

View file

@ -0,0 +1,139 @@
"""
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.
See documentation in docs/topics/spiders.rst
"""
import copy
from typing import Sequence
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
def _identity(x):
return x
def _identity_process_request(request, response):
return request
def _get_method(method, spider):
if callable(method):
return method
elif isinstance(method, str):
return getattr(spider, method, None)
_default_link_extractor = LinkExtractor()
class Rule:
def __init__(
self,
link_extractor=None,
callback=None,
cb_kwargs=None,
follow=None,
process_links=None,
process_request=None,
errback=None,
):
self.link_extractor = link_extractor or _default_link_extractor
self.callback = callback
self.errback = errback
self.cb_kwargs = cb_kwargs or {}
self.process_links = process_links or _identity
self.process_request = process_request or _identity_process_request
self.follow = follow if follow is not None else not callback
def _compile(self, spider):
self.callback = _get_method(self.callback, spider)
self.errback = _get_method(self.errback, spider)
self.process_links = _get_method(self.process_links, spider)
self.process_request = _get_method(self.process_request, spider)
class CrawlSpider(Spider):
rules: Sequence[Rule] = ()
def __init__(self, *a, **kw):
super().__init__(*a, **kw)
self._compile_rules()
def _parse(self, response, **kwargs):
return self._parse_response(
response=response,
callback=self.parse_start_url,
cb_kwargs=kwargs,
follow=True,
)
def parse_start_url(self, response, **kwargs):
return []
def process_results(self, response, results):
return results
def _build_request(self, rule_index, link):
return Request(
url=link.url,
callback=self._callback,
errback=self._errback,
meta=dict(rule=rule_index, link_text=link.text),
)
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
for rule_index, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)
if lnk not in seen]
for link in rule.process_links(links):
seen.add(link)
request = self._build_request(rule_index, link)
yield rule.process_request(request, response)
def _callback(self, response):
rule = self._rules[response.meta['rule']]
return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)
def _errback(self, failure):
rule = self._rules[failure.request.meta['rule']]
return self._handle_failure(failure, rule.errback)
def _parse_response(self, response, callback, cb_kwargs, follow=True):
if callback:
cb_res = callback(response, **cb_kwargs) or ()
cb_res = self.process_results(response, cb_res)
for request_or_item in iterate_spider_output(cb_res):
yield request_or_item
if follow and self._follow_links:
for request_or_item in self._requests_to_follow(response):
yield request_or_item
def _handle_failure(self, failure, errback):
if errback:
results = errback(failure) or ()
for request_or_item in iterate_spider_output(results):
yield request_or_item
def _compile_rules(self):
self._rules = []
for rule in self.rules:
self._rules.append(copy.copy(rule))
self._rules[-1]._compile(self)
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
spider._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
return spider

View file

@ -0,0 +1,135 @@
"""
This module implements the XMLFeedSpider which is the recommended spider to use
for scraping from an XML feed.
See documentation in docs/topics/spiders.rst
"""
from scrapy.spiders import Spider
from scrapy.utils.iterators import xmliter, csviter
from scrapy.utils.spider import iterate_spider_output
from scrapy.selector import Selector
from scrapy.exceptions import NotConfigured, NotSupported
class XMLFeedSpider(Spider):
"""
This class intends to be the base class for spiders that scrape
from XML feeds.
You can choose whether to parse the file using the 'iternodes' iterator, an
'xml' selector, or an 'html' selector. In most cases, it's convenient to
use iternodes, since it's a faster and cleaner.
"""
iterator = 'iternodes'
itertag = 'item'
namespaces = ()
def process_results(self, response, results):
"""This overridable method is called for each result (item or request)
returned by the spider, and it's intended to perform any last time
processing required before returning the results to the framework core,
for example setting the item GUIDs. It receives a list of results and
the response which originated that results. It must return a list of
results (items or requests).
"""
return results
def adapt_response(self, response):
"""You can override this function in order to make any changes you want
to into the feed before parsing it. This function must return a
response.
"""
return response
def parse_node(self, response, selector):
"""This method must be overriden with your custom spider functionality"""
if hasattr(self, 'parse_item'): # backward compatibility
return self.parse_item(response, selector)
raise NotImplementedError
def parse_nodes(self, response, nodes):
"""This method is called for the nodes matching the provided tag name
(itertag). Receives the response and an Selector for each node.
Overriding this method is mandatory. Otherwise, you spider won't work.
This method must return either an item, a request, or a list
containing any of them.
"""
for selector in nodes:
ret = iterate_spider_output(self.parse_node(response, selector))
for result_item in self.process_results(response, ret):
yield result_item
def _parse(self, response, **kwargs):
if not hasattr(self, 'parse_node'):
raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
response = self.adapt_response(response)
if self.iterator == 'iternodes':
nodes = self._iternodes(response)
elif self.iterator == 'xml':
selector = Selector(response, type='xml')
self._register_namespaces(selector)
nodes = selector.xpath(f'//{self.itertag}')
elif self.iterator == 'html':
selector = Selector(response, type='html')
self._register_namespaces(selector)
nodes = selector.xpath(f'//{self.itertag}')
else:
raise NotSupported('Unsupported node iterator')
return self.parse_nodes(response, nodes)
def _iternodes(self, response):
for node in xmliter(response, self.itertag):
self._register_namespaces(node)
yield node
def _register_namespaces(self, selector):
for (prefix, uri) in self.namespaces:
selector.register_namespace(prefix, uri)
class CSVFeedSpider(Spider):
"""Spider for parsing CSV feeds.
It receives a CSV file in a response; iterates through each of its rows,
and calls parse_row with a dict containing each field's data.
You can set some options regarding the CSV file, such as the delimiter, quotechar
and the file's headers.
"""
delimiter = None # When this is None, python's csv module's default delimiter is used
quotechar = None # When this is None, python's csv module's default quotechar is used
headers = None
def process_results(self, response, results):
"""This method has the same purpose as the one in XMLFeedSpider"""
return results
def adapt_response(self, response):
"""This method has the same purpose as the one in XMLFeedSpider"""
return response
def parse_row(self, response, row):
"""This method must be overriden with your custom spider functionality"""
raise NotImplementedError
def parse_rows(self, response):
"""Receives a response and a dict (representing each row) with a key for
each provided (or detected) header of the CSV file. This spider also
gives the opportunity to override adapt_response and
process_results methods for pre and post-processing purposes.
"""
for row in csviter(response, self.delimiter, self.headers, self.quotechar):
ret = iterate_spider_output(self.parse_row(response, row))
for result_item in self.process_results(response, ret):
yield result_item
def _parse(self, response, **kwargs):
if not hasattr(self, 'parse_row'):
raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
response = self.adapt_response(response)
return self.parse_rows(response)

View file

@ -0,0 +1,31 @@
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output
class InitSpider(Spider):
"""Base Spider with initialization facilities"""
def start_requests(self):
self._postinit_reqs = super().start_requests()
return iterate_spider_output(self.init_request())
def initialized(self, response=None):
"""This method must be set as the callback of your last initialization
request. See self.init_request() docstring for more info.
"""
return self.__dict__.pop('_postinit_reqs')
def init_request(self):
"""This function should return one initialization request, with the
self.initialized method as callback. When the self.initialized method
is called this spider is considered initialized. If you need to perform
several requests for initializing your spider, you can do so by using
different callbacks. The only requirement is that the final callback
(of the last initialization request) must be self.initialized.
The default implementation calls self.initialized immediately, and
means that no initialization is needed. This method should be
overridden only when you need to perform requests to initialize your
spider
"""
return self.initialized()

View file

@ -0,0 +1,99 @@
import re
import logging
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, gzip_magic_number
logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
sitemap_urls = ()
sitemap_rules = [('', 'parse')]
sitemap_follow = ['']
sitemap_alternate_links = False
def __init__(self, *a, **kw):
super().__init__(*a, **kw)
self._cbs = []
for r, c in self.sitemap_rules:
if isinstance(c, str):
c = getattr(self, c)
self._cbs.append((regex(r), c))
self._follow = [regex(x) for x in self.sitemap_follow]
def start_requests(self):
for url in self.sitemap_urls:
yield Request(url, self._parse_sitemap)
def sitemap_filter(self, entries):
"""This method can be used to filter sitemap entries by their
attributes, for example, you can filter locs with lastmod greater
than a given date (see docs).
"""
for entry in entries:
yield entry
def _parse_sitemap(self, response):
if response.url.endswith('/robots.txt'):
for url in sitemap_urls_from_robots(response.text, base_url=response.url):
yield Request(url, callback=self._parse_sitemap)
else:
body = self._get_sitemap_body(response)
if body is None:
logger.warning("Ignoring invalid sitemap: %(response)s",
{'response': response}, extra={'spider': self})
return
s = Sitemap(body)
it = self.sitemap_filter(s)
if s.type == 'sitemapindex':
for loc in iterloc(it, self.sitemap_alternate_links):
if any(x.search(loc) for x in self._follow):
yield Request(loc, callback=self._parse_sitemap)
elif s.type == 'urlset':
for loc in iterloc(it, self.sitemap_alternate_links):
for r, c in self._cbs:
if r.search(loc):
yield Request(loc, callback=c)
break
def _get_sitemap_body(self, response):
"""Return the sitemap body contained in the given response,
or None if the response is not a sitemap.
"""
if isinstance(response, XmlResponse):
return response.body
elif gzip_magic_number(response):
return gunzip(response.body)
# actual gzipped sitemap files are decompressed above ;
# if we are here (response body is not gzipped)
# and have a response for .xml.gz,
# it usually means that it was already gunzipped
# by HttpCompression middleware,
# the HTTP response being sent with "Content-Encoding: gzip"
# without actually being a .xml.gz file in the first place,
# merely XML gzip-compressed on the fly,
# in other word, here, we have plain XML
elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
return response.body
def regex(x):
if isinstance(x, str):
return re.compile(x)
return x
def iterloc(it, alt=False):
for d in it:
yield d['loc']
# Also consider alternate URLs (xhtml:link rel="alternate")
if alt and 'alternate' in d:
yield from d['alternate']