Output of the new DB entries
parent bad48e1627 · commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/scrapy/spiders/__init__.py  (Normal file, 118 additions)
@@ -0,0 +1,118 @@
"""
Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
"""
import logging
import warnings
from typing import Optional

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import method_is_overridden


class Spider(object_ref):
    """Base class for scrapy spiders. All spiders must inherit from this
    class.
    """

    name: Optional[str] = None
    custom_settings: Optional[dict] = None

    def __init__(self, name=None, **kwargs):
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError(f"{type(self).__name__} must have a name")
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        """Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def _set_crawler(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        cls = self.__class__
        if not self.start_urls and hasattr(self, 'start_url'):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)")
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead "
                f"(see {cls.__module__}.{cls.__name__}).",
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

    def make_requests_from_url(self, url):
        """ This method is deprecated. """
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated: "
            "it will be removed and not be called by the default "
            "Spider.start_requests method in future Scrapy releases. "
            "Please override Spider.start_requests method instead."
        )
        return Request(url, dont_filter=True)

    def _parse(self, response, **kwargs):
        return self.parse(response, **kwargs)

    def parse(self, response, **kwargs):
        raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)

    def __str__(self):
        return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"

    __repr__ = __str__


# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider
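For orientation only, not part of the committed diff: a minimal sketch of how the Spider base class above is typically subclassed. The spider name, start URL and CSS selector are illustrative assumptions, not taken from this repository.

import scrapy


class QuotesSpider(scrapy.Spider):
    # 'name' is mandatory: Spider.__init__ raises ValueError without it
    name = "quotes"
    # consumed by the default start_requests() implementation above
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response, **kwargs):
        # parse() is the default callback; the base class only raises
        # NotImplementedError here, so subclasses must override it
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}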
venv/lib/python3.9/site-packages/scrapy/spiders/crawl.py  (Normal file, 139 additions)
@@ -0,0 +1,139 @@
"""
This module implements the CrawlSpider, which is the recommended spider to use
for scraping typical websites that require crawling pages.

See documentation in docs/topics/spiders.rst
"""

import copy
from typing import Sequence

from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output


def _identity(x):
    return x


def _identity_process_request(request, response):
    return request


def _get_method(method, spider):
    if callable(method):
        return method
    elif isinstance(method, str):
        return getattr(spider, method, None)


_default_link_extractor = LinkExtractor()


class Rule:

    def __init__(
        self,
        link_extractor=None,
        callback=None,
        cb_kwargs=None,
        follow=None,
        process_links=None,
        process_request=None,
        errback=None,
    ):
        self.link_extractor = link_extractor or _default_link_extractor
        self.callback = callback
        self.errback = errback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links or _identity
        self.process_request = process_request or _identity_process_request
        self.follow = follow if follow is not None else not callback

    def _compile(self, spider):
        self.callback = _get_method(self.callback, spider)
        self.errback = _get_method(self.errback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)


class CrawlSpider(Spider):

    rules: Sequence[Rule] = ()

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._compile_rules()

    def _parse(self, response, **kwargs):
        return self._parse_response(
            response=response,
            callback=self.parse_start_url,
            cb_kwargs=kwargs,
            follow=True,
        )

    def parse_start_url(self, response, **kwargs):
        return []

    def process_results(self, response, results):
        return results

    def _build_request(self, rule_index, link):
        return Request(
            url=link.url,
            callback=self._callback,
            errback=self._errback,
            meta=dict(rule=rule_index, link_text=link.text),
        )

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)

    def _callback(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _errback(self, failure):
        rule = self._rules[failure.request.meta['rule']]
        return self._handle_failure(failure, rule.errback)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for request_or_item in iterate_spider_output(cb_res):
                yield request_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _handle_failure(self, failure, errback):
        if errback:
            results = errback(failure) or ()
            for request_or_item in iterate_spider_output(results):
                yield request_or_item

    def _compile_rules(self):
        self._rules = []
        for rule in self.rules:
            self._rules.append(copy.copy(rule))
            self._rules[-1]._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider
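Again not part of the commit itself: a sketch of how CrawlSpider and Rule are wired together. The site, URL patterns and selectors are made up for illustration.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CatalogueSpider(CrawlSpider):
    name = "catalogue"
    start_urls = ["https://books.toscrape.com/"]

    rules = (
        # No callback: follow defaults to True, so pagination links are crawled
        Rule(LinkExtractor(allow=r"/page-\d+\.html")),
        # With a callback (given as a string and resolved by Rule._compile),
        # follow defaults to False
        Rule(LinkExtractor(allow=r"/catalogue/.+/index\.html"), callback="parse_item"),
    )

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("h1::text").get()}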
venv/lib/python3.9/site-packages/scrapy/spiders/feed.py  (Normal file, 135 additions)
@@ -0,0 +1,135 @@
"""
This module implements the XMLFeedSpider, which is the recommended spider to use
for scraping from an XML feed.

See documentation in docs/topics/spiders.rst
"""
from scrapy.spiders import Spider
from scrapy.utils.iterators import xmliter, csviter
from scrapy.utils.spider import iterate_spider_output
from scrapy.selector import Selector
from scrapy.exceptions import NotConfigured, NotSupported


class XMLFeedSpider(Spider):
    """
    This class intends to be the base class for spiders that scrape
    from XML feeds.

    You can choose whether to parse the file using the 'iternodes' iterator, an
    'xml' selector, or an 'html' selector. In most cases, it's convenient to
    use iternodes, since it's faster and cleaner.
    """

    iterator = 'iternodes'
    itertag = 'item'
    namespaces = ()

    def process_results(self, response, results):
        """This overridable method is called for each result (item or request)
        returned by the spider, and it's intended to perform any last-minute
        processing required before returning the results to the framework core,
        for example setting the item GUIDs. It receives a list of results and
        the response which originated those results. It must return a list of
        results (items or requests).
        """
        return results

    def adapt_response(self, response):
        """You can override this function in order to make any changes you want
        to the feed before parsing it. This function must return a
        response.
        """
        return response

    def parse_node(self, response, selector):
        """This method must be overridden with your custom spider functionality"""
        if hasattr(self, 'parse_item'):  # backward compatibility
            return self.parse_item(response, selector)
        raise NotImplementedError

    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and a Selector for each node.
        Overriding this method is mandatory. Otherwise, your spider won't work.
        This method must return either an item, a request, or a list
        containing any of them.
        """

        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item

    def _parse(self, response, **kwargs):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath(f'//{self.itertag}')
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)

    def _iternodes(self, response):
        for node in xmliter(response, self.itertag):
            self._register_namespaces(node)
            yield node

    def _register_namespaces(self, selector):
        for (prefix, uri) in self.namespaces:
            selector.register_namespace(prefix, uri)


class CSVFeedSpider(Spider):
    """Spider for parsing CSV feeds.
    It receives a CSV file in a response; iterates through each of its rows,
    and calls parse_row with a dict containing each field's data.

    You can set some options regarding the CSV file, such as the delimiter, quotechar
    and the file's headers.
    """

    delimiter = None  # When this is None, python's csv module's default delimiter is used
    quotechar = None  # When this is None, python's csv module's default quotechar is used
    headers = None

    def process_results(self, response, results):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return results

    def adapt_response(self, response):
        """This method has the same purpose as the one in XMLFeedSpider"""
        return response

    def parse_row(self, response, row):
        """This method must be overridden with your custom spider functionality"""
        raise NotImplementedError

    def parse_rows(self, response):
        """Receives a response and a dict (representing each row) with a key for
        each provided (or detected) header of the CSV file. This spider also
        gives the opportunity to override the adapt_response and
        process_results methods for pre- and post-processing purposes.
        """

        for row in csviter(response, self.delimiter, self.headers, self.quotechar):
            ret = iterate_spider_output(self.parse_row(response, row))
            for result_item in self.process_results(response, ret):
                yield result_item

    def _parse(self, response, **kwargs):
        if not hasattr(self, 'parse_row'):
            raise NotConfigured('You must define parse_row method in order to scrape this CSV feed')
        response = self.adapt_response(response)
        return self.parse_rows(response)
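A usage sketch for the feed spiders above (not part of the diff): an XMLFeedSpider that iterates over <item> nodes. The feed URL and field names are assumptions.

from scrapy.spiders import XMLFeedSpider


class NewsFeedSpider(XMLFeedSpider):
    name = "news_feed"
    start_urls = ["https://example.com/feed.xml"]
    iterator = "iternodes"  # default; fastest of the three iterators
    itertag = "item"        # tag name handed to parse_node() for each node

    def parse_node(self, response, node):
        # node is a Selector scoped to a single <item> element
        yield {
            "title": node.xpath("title/text()").get(),
            "link": node.xpath("link/text()").get(),
        }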
venv/lib/python3.9/site-packages/scrapy/spiders/init.py  (Normal file, 31 additions)
@@ -0,0 +1,31 @@
from scrapy.spiders import Spider
from scrapy.utils.spider import iterate_spider_output


class InitSpider(Spider):
    """Base Spider with initialization facilities"""

    def start_requests(self):
        self._postinit_reqs = super().start_requests()
        return iterate_spider_output(self.init_request())

    def initialized(self, response=None):
        """This method must be set as the callback of your last initialization
        request. See self.init_request() docstring for more info.
        """
        return self.__dict__.pop('_postinit_reqs')

    def init_request(self):
        """This function should return one initialization request, with the
        self.initialized method as callback. When the self.initialized method
        is called this spider is considered initialized. If you need to perform
        several requests for initializing your spider, you can do so by using
        different callbacks. The only requirement is that the final callback
        (of the last initialization request) must be self.initialized.

        The default implementation calls self.initialized immediately, and
        means that no initialization is needed. This method should be
        overridden only when you need to perform requests to initialize your
        spider.
        """
        return self.initialized()
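Illustrative only (not in the commit): an InitSpider that performs a login request before the regular crawl starts. The login URL, form fields and credentials are placeholders.

from scrapy import FormRequest
from scrapy.spiders import InitSpider


class LoginFirstSpider(InitSpider):
    name = "login_first"
    start_urls = ["https://example.com/members"]

    def init_request(self):
        # The last initialization request must call back into self.initialized
        return FormRequest(
            "https://example.com/login",
            formdata={"user": "demo", "pass": "demo"},
            callback=self.initialized,
        )

    def parse(self, response, **kwargs):
        yield {"url": response.url}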
venv/lib/python3.9/site-packages/scrapy/spiders/sitemap.py  (Normal file, 99 additions)
@@ -0,0 +1,99 @@
import re
import logging

from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, gzip_magic_number


logger = logging.getLogger(__name__)


class SitemapSpider(Spider):

    sitemap_urls = ()
    sitemap_rules = [('', 'parse')]
    sitemap_follow = ['']
    sitemap_alternate_links = False

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self._cbs = []
        for r, c in self.sitemap_rules:
            if isinstance(c, str):
                c = getattr(self, c)
            self._cbs.append((regex(r), c))
        self._follow = [regex(x) for x in self.sitemap_follow]

    def start_requests(self):
        for url in self.sitemap_urls:
            yield Request(url, self._parse_sitemap)

    def sitemap_filter(self, entries):
        """This method can be used to filter sitemap entries by their
        attributes, for example, you can filter locs with lastmod greater
        than a given date (see docs).
        """
        for entry in entries:
            yield entry

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                logger.warning("Ignoring invalid sitemap: %(response)s",
                               {'response': response}, extra={'spider': self})
                return

            s = Sitemap(body)
            it = self.sitemap_filter(s)

            if s.type == 'sitemapindex':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(it, self.sitemap_alternate_links):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break

    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif gzip_magic_number(response):
            return gunzip(response.body)
        # actual gzipped sitemap files are decompressed above;
        # if we are here (response body is not gzipped)
        # and have a response for .xml.gz,
        # it usually means that it was already gunzipped
        # by HttpCompression middleware,
        # the HTTP response being sent with "Content-Encoding: gzip"
        # without actually being a .xml.gz file in the first place,
        # merely XML gzip-compressed on the fly,
        # in other words, here we have plain XML
        elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
            return response.body


def regex(x):
    if isinstance(x, str):
        return re.compile(x)
    return x


def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            yield from d['alternate']
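Finally, a sketch (not part of the diff) of a SitemapSpider configured from robots.txt; the domain and the rule patterns are assumptions.

from scrapy.spiders import SitemapSpider


class ShopSitemapSpider(SitemapSpider):
    name = "shop_sitemap"
    # robots.txt entries are expanded via sitemap_urls_from_robots() above
    sitemap_urls = ["https://example.com/robots.txt"]
    # only follow nested sitemaps whose URL matches this pattern
    sitemap_follow = [r"product"]
    # the first matching rule wins (note the break in _parse_sitemap above)
    sitemap_rules = [
        (r"/product/", "parse_product"),
        ("", "parse"),
    ]

    def parse_product(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}

    def parse(self, response):
        yield {"url": response.url}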