Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
189 venv/lib/python3.9/site-packages/scrapy/shell.py (Normal file)
@@ -0,0 +1,189 @@
"""Scrapy Shell
|
||||
|
||||
See documentation in docs/topics/shell.rst
|
||||
|
||||
"""
|
||||
import os
|
||||
import signal
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet import threads, defer
|
||||
from twisted.python import threadable
|
||||
from w3lib.url import any_to_uri
|
||||
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.exceptions import IgnoreRequest
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.conf import get_config
|
||||
from scrapy.utils.console import DEFAULT_PYTHON_SHELLS, start_python_console
|
||||
from scrapy.utils.datatypes import SequenceExclude
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.response import open_in_browser
|
||||
|
||||
|
||||
class Shell:
|
||||
|
||||
relevant_classes = (Crawler, Spider, Request, Response, Settings)
|
||||
|
||||
def __init__(self, crawler, update_vars=None, code=None):
|
||||
self.crawler = crawler
|
||||
self.update_vars = update_vars or (lambda x: None)
|
||||
self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
|
||||
self.spider = None
|
||||
self.inthread = not threadable.isInIOThread()
|
||||
self.code = code
|
||||
self.vars = {}
|
||||
|
||||
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
|
||||
# disable accidental Ctrl-C key press from shutting down the engine
|
||||
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
||||
if url:
|
||||
self.fetch(url, spider, redirect=redirect)
|
||||
elif request:
|
||||
self.fetch(request, spider)
|
||||
elif response:
|
||||
request = response.request
|
||||
self.populate_vars(response, request, spider)
|
||||
else:
|
||||
self.populate_vars()
|
||||
if self.code:
|
||||
print(eval(self.code, globals(), self.vars))
|
||||
else:
|
||||
"""
|
||||
Detect interactive shell setting in scrapy.cfg
|
||||
e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
|
||||
[settings]
|
||||
# shell can be one of ipython, bpython or python;
|
||||
# to be used as the interactive python console, if available.
|
||||
# (default is ipython, fallbacks in the order listed above)
|
||||
shell = python
|
||||
"""
|
||||
cfg = get_config()
|
||||
section, option = 'settings', 'shell'
|
||||
env = os.environ.get('SCRAPY_PYTHON_SHELL')
|
||||
shells = []
|
||||
if env:
|
||||
shells += env.strip().lower().split(',')
|
||||
elif cfg.has_option(section, option):
|
||||
shells += [cfg.get(section, option).strip().lower()]
|
||||
else: # try all by default
|
||||
shells += DEFAULT_PYTHON_SHELLS.keys()
|
||||
# always add standard shell as fallback
|
||||
shells += ['python']
|
||||
start_python_console(self.vars, shells=shells,
|
||||
banner=self.vars.pop('banner', ''))
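
    # Example of the selection order implemented above (illustrative only;
    # the URL is an arbitrary placeholder): the SCRAPY_PYTHON_SHELL
    # environment variable takes precedence over scrapy.cfg, and both fall
    # back to plain python. Running
    #
    #   $ SCRAPY_PYTHON_SHELL=bpython scrapy shell "https://example.com"
    #
    # overrides a scrapy.cfg containing:
    #
    #   [settings]
    #   shell = ipython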

    def _schedule(self, request, spider):
        spider = self._open_spider(request, spider)
        d = _request_deferred(request)
        d.addCallback(lambda x: (x, spider))
        self.crawler.engine.crawl(request, spider)
        return d

    def _open_spider(self, request, spider):
        if self.spider:
            return self.spider

        if spider is None:
            spider = self.crawler.spider or self.crawler._create_spider()

        self.crawler.spider = spider
        self.crawler.engine.open_spider(spider, close_if_idle=False)
        self.spider = spider
        return spider

    def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        from twisted.internet import reactor
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider)

    def populate_vars(self, response=None, request=None, spider=None):
        import scrapy

        self.vars['scrapy'] = scrapy
        self.vars['crawler'] = self.crawler
        self.vars['item'] = self.item_class()
        self.vars['settings'] = self.crawler.settings
        self.vars['spider'] = spider
        self.vars['request'] = request
        self.vars['response'] = response
        if self.inthread:
            self.vars['fetch'] = self.fetch
        self.vars['view'] = open_in_browser
        self.vars['shelp'] = self.print_help
        self.update_vars(self.vars)
        if not self.code:
            self.vars['banner'] = self.get_help()

    def print_help(self):
        print(self.get_help())

    def get_help(self):
        b = []
        b.append("Available Scrapy objects:")
        b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append(f"  {k:<10} {v}")
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append("  fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects (by default, redirects are followed)")
            b.append("  fetch(req)                  "
                     "Fetch a scrapy.Request and update local objects ")
        b.append("  shelp()           Shell help (print this help)")
        b.append("  view(response)    View response in a browser")

        return "\n".join(f"[s] {line}" for line in b)

    def _is_relevant(self, value):
        return isinstance(value, self.relevant_classes) or is_item(value)


def inspect_response(response, spider):
    """Open a shell to inspect the given response"""
    Shell(spider.crawler).start(response=response, spider=spider)
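
# Typical use (per docs/topics/shell.rst): call inspect_response() from a
# spider callback to pause the crawl and examine one response interactively.
# A sketch; the guard condition is an arbitrary example:
#
#   def parse(self, response):
#       if ".error" in response.url:
#           from scrapy.shell import inspect_response
#           inspect_response(response, self)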


def _request_deferred(request):
    """Wrap a request inside a Deferred.

    This function is harmful, do not use it until you know what you are doing.

    This returns a Deferred whose first pair of callbacks are the request
    callback and errback. The Deferred also triggers when the request
    callback/errback is executed (i.e. when the request is downloaded)

    WARNING: Do not call request.replace() until after the deferred is called.
    """
    request_callback = request.callback
    request_errback = request.errback

    def _restore_callbacks(result):
        request.callback = request_callback
        request.errback = request_errback
        return result

    d = defer.Deferred()
    d.addBoth(_restore_callbacks)
    if request.callback:
        d.addCallbacks(request.callback, request.errback)

    request.callback, request.errback = d.callback, d.errback
    return d
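
# For reference, this is how the helper is consumed by Shell._schedule above:
# the Deferred fires once the request is downloaded, and the caller chains
# its own callback onto it:
#
#   d = _request_deferred(request)
#   d.addCallback(lambda x: (x, spider))  # x is the downloaded response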