Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
137 venv/lib/python3.9/site-packages/scrapy/commands/__init__.py Normal file
@@ -0,0 +1,137 @@
"""
Base class for Scrapy commands
"""
import os
from optparse import OptionGroup
from twisted.python import failure

from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class ScrapyCommand:

    requires_project = False
    crawler_process = None

    # default settings to be used for this command instead of global defaults
    default_settings = {}

    exitcode = 0

    def __init__(self):
        self.settings = None  # set in scrapy.cmdline

    def set_crawler(self, crawler):
        if hasattr(self, '_crawler'):
            raise RuntimeError("crawler already set")
        self._crawler = crawler

    def syntax(self):
        """
        Command syntax (preferably one-line). Do not include command name.
        """
        return ""

    def short_desc(self):
        """
        A short description of the command
        """
        return ""

    def long_desc(self):
        """A long description of the command. Return short description when not
        available. It cannot contain newlines, since contents will be formatted
        by optparser which removes newlines and wraps text.
        """
        return self.short_desc()

    def help(self):
        """An extensive help for the command. It will be shown when using the
        "help" command. It can contain newlines, since no post-formatting will
        be applied to its contents.
        """
        return self.long_desc()

    def add_options(self, parser):
        """
        Populate option parse with options available for this command
        """
        group = OptionGroup(parser, "Global Options")
        group.add_option("--logfile", metavar="FILE",
                         help="log file. if omitted stderr will be used")
        group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
                         help=f"log level (default: {self.settings['LOG_LEVEL']})")
        group.add_option("--nolog", action="store_true",
                         help="disable logging completely")
        group.add_option("--profile", metavar="FILE", default=None,
                         help="write python cProfile stats to FILE")
        group.add_option("--pidfile", metavar="FILE",
                         help="write process ID to FILE")
        group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
                         help="set/override setting (may be repeated)")
        group.add_option("--pdb", action="store_true", help="enable pdb on failure")

        parser.add_option_group(group)

    def process_options(self, args, opts):
        try:
            self.settings.setdict(arglist_to_dict(opts.set),
                                  priority='cmdline')
        except ValueError:
            raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

        if opts.logfile:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

        if opts.loglevel:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

        if opts.nolog:
            self.settings.set('LOG_ENABLED', False, priority='cmdline')

        if opts.pidfile:
            with open(opts.pidfile, "w") as f:
                f.write(str(os.getpid()) + os.linesep)

        if opts.pdb:
            failure.startDebugMode()

    def run(self, args, opts):
        """
        Entry point for running commands
        """
        raise NotImplementedError


class BaseRunSpiderCommand(ScrapyCommand):
    """
    Common class used to share functionality between the crawl, parse and runspider commands
    """
    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE", action="append",
                          help="append scraped items to the end of FILE (use - for stdout)")
        parser.add_option("-O", "--overwrite-output", metavar="FILE", action="append",
                          help="dump scraped items into FILE, overwriting any existing file")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output or opts.overwrite_output:
            feeds = feed_process_params_from_cli(
                self.settings,
                opts.output,
                opts.output_format,
                opts.overwrite_output,
            )
            self.settings.set('FEEDS', feeds, priority='cmdline')
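Note: ScrapyCommand above is the extension point for project-specific commands. A minimal sketch of a hypothetical command built on the API shown here (the file path, class contents and the printed setting are illustrative, not part of this commit):

# hypothetical file: myproject/commands/botname.py
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the configured bot name"

    def run(self, args, opts):
        # self.settings is populated by scrapy.cmdline before run() is called
        print(self.settings.get('BOT_NAME'))

Scrapy picks up such a class when the project's COMMANDS_MODULE setting points at the package that contains it.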
58 venv/lib/python3.9/site-packages/scrapy/commands/bench.py Normal file
@@ -0,0 +1,58 @@
import sys
import time
import subprocess
from urllib.parse import urlencode

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor


class Command(ScrapyCommand):

    default_settings = {
        'LOG_LEVEL': 'INFO',
        'LOGSTATS_INTERVAL': 1,
        'CLOSESPIDER_TIMEOUT': 10,
    }

    def short_desc(self):
        return "Run quick benchmark test"

    def run(self, args, opts):
        with _BenchServer():
            self.crawler_process.crawl(_BenchSpider, total=100000)
            self.crawler_process.start()


class _BenchServer:

    def __enter__(self):
        from scrapy.utils.test import get_testenv
        pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
        self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
                                     env=get_testenv())
        self.proc.stdout.readline()

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)


class _BenchSpider(scrapy.Spider):
    """A spider that follows all links"""
    name = 'follow'
    total = 10000
    show = 20
    baseurl = 'http://localhost:8998'
    link_extractor = LinkExtractor()

    def start_requests(self):
        qargs = {'total': self.total, 'show': self.show}
        url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
        return [scrapy.Request(url, dont_filter=True)]

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
96 venv/lib/python3.9/site-packages/scrapy/commands/check.py Normal file
@@ -0,0 +1,96 @@
import time
from collections import defaultdict
from unittest import TextTestRunner, TextTestResult as _TextTestResult

from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.misc import load_object, set_environ
from scrapy.utils.conf import build_component_list


class TextTestResult(_TextTestResult):
    def printSummary(self, start, stop):
        write = self.stream.write
        writeln = self.stream.writeln

        run = self.testsRun
        plural = "s" if run != 1 else ""

        writeln(self.separator2)
        writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
        writeln()

        infos = []
        if not self.wasSuccessful():
            write("FAILED")
            failed, errored = map(len, (self.failures, self.errors))
            if failed:
                infos.append(f"failures={failed}")
            if errored:
                infos.append(f"errors={errored}")
        else:
            write("OK")

        if infos:
            writeln(f" ({', '.join(infos)})")
        else:
            write("\n")


class Command(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Check spider contracts"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="only list contracts, without checking them")
        parser.add_option("-v", "--verbose", dest="verbose", default=False, action='store_true',
                          help="print contract tests for all spiders")

    def run(self, args, opts):
        # load contracts
        contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
        conman = ContractsManager(load_object(c) for c in contracts)
        runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
        result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

        # contract requests
        contract_reqs = defaultdict(list)

        spider_loader = self.crawler_process.spider_loader

        with set_environ(SCRAPY_CHECK='true'):
            for spidername in args or spider_loader.list():
                spidercls = spider_loader.load(spidername)
                spidercls.start_requests = lambda s: conman.from_spider(s, result)

                tested_methods = conman.tested_methods_from_spidercls(spidercls)
                if opts.list:
                    for method in tested_methods:
                        contract_reqs[spidercls.name].append(method)
                elif tested_methods:
                    self.crawler_process.crawl(spidercls)

            # start checks
            if opts.list:
                for spider, methods in sorted(contract_reqs.items()):
                    if not methods and not opts.verbose:
                        continue
                    print(spider)
                    for method in sorted(methods):
                        print(f' * {method}')
            else:
                start = time.time()
                self.crawler_process.start()
                stop = time.time()

                result.printErrors()
                result.printSummary(start, stop)
                self.exitcode = int(not result.wasSuccessful())
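Note: the check command runs spider contracts, which are declared in callback docstrings. A hedged sketch of a callback that such a check could exercise (URL, spider name and field names are placeholders; @url, @returns and @scrapes are the stock Scrapy contracts):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'

    def parse(self, response):
        """This docstring is what `scrapy check` evaluates.

        @url http://www.example.com/some-page
        @returns items 1 16
        @returns requests 0 0
        @scrapes title price
        """
        yield {
            'title': response.css('h1::text').get(),
            'price': response.css('.price::text').get(),
        }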
33 venv/lib/python3.9/site-packages/scrapy/commands/crawl.py Normal file
@@ -0,0 +1,33 @@
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError


class Command(BaseRunSpiderCommand):

    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run a spider"

    def run(self, args, opts):
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

        if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
            self.exitcode = 1
        else:
            self.crawler_process.start()

            if (
                self.crawler_process.bootstrap_failed
                or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
            ):
                self.exitcode = 1
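Note: crawl forwards every -a NAME=VALUE pair (collected into opts.spargs by BaseRunSpiderCommand.process_options) as keyword arguments to crawler_process.crawl(), and Scrapy hands them to the spider constructor. A sketch of a spider that consumes such an argument (spider name, argument name and site are illustrative):

import scrapy


class TagSpider(scrapy.Spider):
    name = 'tagged_quotes'

    def __init__(self, tag=None, *args, **kwargs):
        # `scrapy crawl tagged_quotes -a tag=humor` arrives here as tag='humor'
        super().__init__(*args, **kwargs)
        self.start_urls = [f'http://quotes.toscrape.com/tag/{tag}/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}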
39 venv/lib/python3.9/site-packages/scrapy/commands/edit.py Normal file
@@ -0,0 +1,39 @@
import sys
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "<spider>"

    def short_desc(self):
        return "Edit spider"

    def long_desc(self):
        return ("Edit a spider using the editor defined in the EDITOR environment"
                " variable or else the EDITOR setting")

    def _err(self, msg):
        sys.stderr.write(msg + os.linesep)
        self.exitcode = 1

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()

        editor = self.settings['EDITOR']
        try:
            spidercls = self.crawler_process.spider_loader.load(args[0])
        except KeyError:
            return self._err(f"Spider not found: {args[0]}")

        sfile = sys.modules[spidercls.__module__].__file__
        sfile = sfile.replace('.pyc', '.py')
        self.exitcode = os.system(f'{editor} "{sfile}"')
70 venv/lib/python3.9/site-packages/scrapy/commands/fetch.py Normal file
@@ -0,0 +1,70 @@
import sys
from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider


class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Fetch a URL using the Scrapy downloader"

    def long_desc(self):
        return (
            "Fetch a URL using the Scrapy downloader and print its content"
            " to stdout. You may want to use --nolog to disable logging"
        )

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", help="use this spider")
        parser.add_option("--headers", dest="headers", action="store_true",
                          help="print response HTTP headers instead of body")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def _print_headers(self, headers, prefix):
        for key, values in headers.items():
            for value in values:
                self._print_bytes(prefix + b' ' + key + b': ' + value)

    def _print_response(self, response, opts):
        if opts.headers:
            self._print_headers(response.request.headers, b'>')
            print('>')
            self._print_headers(response.headers, b'<')
        else:
            self._print_bytes(response.body)

    def _print_bytes(self, bytes_):
        sys.stdout.buffer.write(bytes_ + b'\n')

    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(args[0], callback=self._print_response,
                          cb_kwargs={"opts": opts}, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. command handles all codes expect 3xx
        if not opts.no_redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
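Note: the redirect handling above leans on SequenceExclude, a small helper from scrapy.utils.datatypes whose membership test is inverted: `x in SequenceExclude(seq)` is true exactly when x is not in seq. A quick sketch of why that marks every non-3xx status as handled by the callback while 3xx stays with the redirect middleware:

from scrapy.utils.datatypes import SequenceExclude

handled = SequenceExclude(range(300, 400))
print(200 in handled)  # True  -> 200 responses reach _print_response
print(302 in handled)  # False -> 302 is left to the redirect middleware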
149 venv/lib/python3.9/site-packages/scrapy/commands/genspider.py Normal file
@@ -0,0 +1,149 @@
import os
import shutil
import string

from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <name> <domain>"

    def short_desc(self):
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="List available templates")
        parser.add_option("-e", "--edit", dest="edit", action="store_true",
                          help="Edit spider after creating it")
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                          help="Dump template to standard output")
        parser.add_option("-t", "--template", dest="template", default="basic",
                          help="Uses a custom template.")
        parser.add_option("--force", dest="force", action="store_true",
                          help="If the spider already exists, overwrite it with the template")

    def run(self, args, opts):
        if opts.list:
            self._list_templates()
            return
        if opts.dump:
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:
            print("Cannot create a spider with the same name as your project")
            return

        if not opts.force and self._spider_exists(name):
            return

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)
            if opts.edit:
                self.exitcode = os.system(f'scrapy edit "{name}"')

    def _genspider(self, module, name, domain, template_name, template_file):
        """Generate the spider module, based on the given template"""
        capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
        tvars = {
            'project_name': self.settings.get('BOT_NAME'),
            'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
            'module': module,
            'name': name,
            'domain': domain,
            'classname': f'{capitalized_module}Spider'
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            spiders_module = None
            spiders_dir = "."
        spider_file = f"{join(spiders_dir, module)}.py"
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print(f"Created spider {name!r} using template {template_name!r} ",
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print(f"in module:\n {spiders_module.__name__}.{module}")

    def _find_template(self, template):
        template_file = join(self.templates_dir, f'{template}.tmpl')
        if exists(template_file):
            return template_file
        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')

    def _list_templates(self):
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print(f" {splitext(filename)[0]}")

    def _spider_exists(self, name):
        if not self.settings.get('NEWSPIDER_MODULE'):
            # if run as a standalone command and file with same filename already exists
            if exists(name + ".py"):
                print(f"{abspath(name + '.py')} already exists")
                return True
            return False

        try:
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            pass
        else:
            # if spider with same name exists
            print(f"Spider {name!r} already exists in module:")
            print(f" {spidercls.__module__}")
            return True

        # a file with the same name exists in the target directory
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = dirname(spiders_module.__file__)
        spiders_dir_abs = abspath(spiders_dir)
        if exists(join(spiders_dir_abs, name + ".py")):
            print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
            return True

        return False

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'spiders'
        )
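Note: a quick illustration of what sanitize_module_name above does with awkward spider names (the expected values follow directly from the function; the names themselves are made up):

from scrapy.commands.genspider import sanitize_module_name

print(sanitize_module_name('my-shop.co.uk'))  # my_shop_co_uk (dashes and dots become underscores)
print(sanitize_module_name('1st-spider'))     # a1st_spider  (leading non-letter gets an 'a' prefix)
print(sanitize_module_name('plain_name'))     # plain_name   (already valid, unchanged)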
14 venv/lib/python3.9/site-packages/scrapy/commands/list.py Normal file
@@ -0,0 +1,14 @@
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "List available spiders"

    def run(self, args, opts):
        for s in sorted(self.crawler_process.spider_loader.list()):
            print(s)
256 venv/lib/python3.9/site-packages/scrapy/commands/parse.py Normal file
@@ -0,0 +1,256 @@
import json
import logging

from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url

from scrapy.commands import BaseRunSpiderCommand
from scrapy.http import Request
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)


class Command(BaseRunSpiderCommand):
    requires_project = True

    spider = None
    items = {}
    requests = {}

    first_response = None

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        BaseRunSpiderCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
                          help="use this spider without looking for one")
        parser.add_option("--pipelines", action="store_true",
                          help="process items through pipelines")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
                          help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
                          help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
                          help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
                          help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
                          help="use this callback for parsing, instead looking for a callback")
        parser.add_option("-m", "--meta", dest="meta",
                          help="inject extra meta into the Request, it must be a valid raw json string")
        parser.add_option("--cbkwargs", dest="cbkwargs",
                          help="inject extra callback kwargs into the Request, it must be a valid raw json string")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                          help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                          help="print each depth level one by one")

    @property
    def max_level(self):
        max_items, max_requests = 0, 0
        if self.items:
            max_items = max(self.items)
        if self.requests:
            max_requests = max(self.requests)
        return max(max_items, max_requests)

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])

        print("# Scraped Items ", "-" * 60)
        display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)

    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            if self.requests:
                requests = self.requests[max(self.requests)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])

        print("# Requests ", "-" * 65)
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in range(1, self.max_level + 1):
                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)

    def run_callback(self, response, callback, cb_kwargs=None):
        cb_kwargs = cb_kwargs or {}
        items, requests = [], []

        for x in iterate_spider_output(callback(response, **cb_kwargs)):
            if is_item(x):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})

    def prepare_request(self, spider, request, opts):
        def callback(response, **cb_kwargs):
            # memorize first request
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.callback:
                    cb = opts.callback
                elif opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        return
                else:
                    cb = 'parse'

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    return

            # parse items and requests
            depth = response.meta['_depth']

            items, requests = self.run_callback(response, cb, cb_kwargs)
            if opts.pipelines:
                itemproc = self.pcrawler.engine.scraper.itemproc
                for item in items:
                    itemproc.process_item(item, spider)
            self.add_items(depth, items)
            self.add_requests(depth, requests)

            scraped_data = items if opts.output else []
            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
                scraped_data += requests

            return scraped_data

        # update request meta if any extra meta was passed through the --meta/-m opts.
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were was passed through the --cbkwargs option.
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def process_options(self, args, opts):
        BaseRunSpiderCommand.process_options(self, args, opts)

        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_request_meta(self, opts):
        if opts.meta:
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)

    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
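Note: the -m/--meta and --cbkwargs options above take a raw JSON string, which process_options() turns into a dict and prepare_request() merges into the request. A minimal sketch of that round trip (the URL and the key are placeholders):

import json
from scrapy import Request

meta = json.loads('{"foo": "bar"}')       # what --meta='{"foo" : "bar"}' parses to
request = Request('http://www.example.com')
request.meta.update(meta)                 # mirrors prepare_request()
print(request.meta['foo'])                # bar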
59 venv/lib/python3.9/site-packages/scrapy/commands/runspider.py Normal file
@@ -0,0 +1,59 @@
import sys
import os
from importlib import import_module

from scrapy.utils.spider import iter_spider_classes
from scrapy.exceptions import UsageError
from scrapy.commands import BaseRunSpiderCommand


def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext not in ('.py', '.pyw'):
        raise ValueError(f"Not a Python source file: {abspath}")
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


class Command(BaseRunSpiderCommand):

    requires_project = False
    default_settings = {'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError(f"File not found: {filename}\n")
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError(f"Unable to load {filename!r}: {e}\n")
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError(f"No spider found in file: {filename}\n")
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()

        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1
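Note: this command executes a single .py file that defines a spider class, with no project around it. A minimal self-contained file it could run (filename, spider name and URL are illustrative):

# hypothetical standalone file: headline_spider.py, run with `scrapy runspider headline_spider.py`
import scrapy


class HeadlineSpider(scrapy.Spider):
    name = 'headlines'
    start_urls = ['http://www.example.com/']

    def parse(self, response):
        # yield one item per page: the document title
        yield {'title': response.css('title::text').get()}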
47 venv/lib/python3.9/site-packages/scrapy/commands/settings.py Normal file
@@ -0,0 +1,47 @@
import json

from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Get settings values"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--get", dest="get", metavar="SETTING",
                          help="print raw setting value")
        parser.add_option("--getbool", dest="getbool", metavar="SETTING",
                          help="print setting value, interpreted as a boolean")
        parser.add_option("--getint", dest="getint", metavar="SETTING",
                          help="print setting value, interpreted as an integer")
        parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
                          help="print setting value, interpreted as a float")
        parser.add_option("--getlist", dest="getlist", metavar="SETTING",
                          help="print setting value, interpreted as a list")

    def run(self, args, opts):
        settings = self.crawler_process.settings
        if opts.get:
            s = settings.get(opts.get)
            if isinstance(s, BaseSettings):
                print(json.dumps(s.copy_to_dict()))
            else:
                print(s)
        elif opts.getbool:
            print(settings.getbool(opts.getbool))
        elif opts.getint:
            print(settings.getint(opts.getint))
        elif opts.getfloat:
            print(settings.getfloat(opts.getfloat))
        elif opts.getlist:
            print(settings.getlist(opts.getlist))
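Note: the settings command only reads values; which value wins is decided by the priority attached when it was set, as the base command does with priority='cmdline'. A small sketch of that precedence using scrapy.settings.Settings (the setting name and values are arbitrary):

from scrapy.settings import Settings

settings = Settings()
settings.set('LOG_LEVEL', 'INFO', priority='project')
settings.set('LOG_LEVEL', 'DEBUG', priority='cmdline')  # cmdline outranks project
print(settings['LOG_LEVEL'])  # DEBUG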
80 venv/lib/python3.9/site-packages/scrapy/commands/shell.py Normal file
@@ -0,0 +1,80 @@
"""
Scrapy Shell

See documentation in docs/topics/shell.rst
"""
from threading import Thread

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.shell import Shell
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import guess_scheme


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {
        'KEEP_ALIVE': True,
        'LOGSTATS_INTERVAL': 0,
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def syntax(self):
        return "[url|file]"

    def short_desc(self):
        return "Interactive scraping console"

    def long_desc(self):
        return ("Interactive console for scraping the given url or file. "
                "Use ./file.html syntax or full path for local file.")

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-c", dest="code",
                          help="evaluate the code in the shell, print the result and exit")
        parser.add_option("--spider", dest="spider",
                          help="use this spider")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def update_vars(self, vars):
        """You can use this function to update the Scrapy objects that will be
        available in the shell
        """
        pass

    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)

    def _start_crawler_thread(self):
        t = Thread(target=self.crawler_process.start,
                   kwargs={'stop_after_crawl': False})
        t.daemon = True
        t.start()
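Note: _start_crawler_thread() keeps the blocking crawl loop running in a background daemon thread so the interactive console can own the main thread. The same stdlib pattern in isolation (the worker function here is only a stand-in for crawler_process.start):

import time
from threading import Thread

def _background_loop():
    while True:          # stand-in for the blocking crawl/reactor loop
        time.sleep(1)

t = Thread(target=_background_loop)
t.daemon = True          # dies with the main thread, like the shell's crawler thread
t.start()
print("main thread stays free for interactive work")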
128 venv/lib/python3.9/site-packages/scrapy/commands/startproject.py Normal file
@@ -0,0 +1,128 @@
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat
from stat import S_IWUSR as OWNER_WRITE_PERMISSION

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')


def _make_writable(path):
    current_permissions = os.stat(path).st_mode
    os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "<project_name> [project_dir]"

    def short_desc(self):
        return "Create new project"

    def _is_valid_name(self, project_name):
        def _module_exists(module_name):
            try:
                import_module(module_name)
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):
            print(f'Error: Module {project_name!r} already exists')
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """
        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.

        More info at:
        https://github.com/scrapy/scrapy/pull/2005
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                self._copytree(srcname, dstname)
            else:
                copy2(srcname, dstname)
                _make_writable(dstname)

        copystat(src, dst)
        _make_writable(dst)

    def run(self, args, opts):
        if len(args) not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        self._copytree(self.templates_dir, abspath(project_dir))
        move(join(project_dir, 'module'), join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
            render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
        print(f"New Scrapy project '{project_name}', using template directory "
              f"'{self.templates_dir}', created in:")
        print(f" {abspath(project_dir)}\n")
        print("You can start your first spider with:")
        print(f" cd {project_dir}")
        print(" scrapy genspider example example.com")

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'project'
        )
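Note: TEMPLATES_TO_RENDER above uses ${project_name} placeholders that run() resolves with string.Template before rendering each template file. The substitution step on its own (the project name is an example value):

import string
from os.path import join

path = join('${project_name}', 'settings.py.tmpl')
print(string.Template(path).substitute(project_name='myproject'))
# myproject/settings.py.tmpl  (on POSIX path separators)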
29 venv/lib/python3.9/site-packages/scrapy/commands/version.py Normal file
@@ -0,0 +1,29 @@
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.versions import scrapy_components_versions


class Command(ScrapyCommand):

    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[-v]"

    def short_desc(self):
        return "Print Scrapy version"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
                          help="also display twisted/python/platform info (useful for bug reports)")

    def run(self, args, opts):
        if opts.verbose:
            versions = scrapy_components_versions()
            width = max(len(n) for (n, _) in versions)
            for name, version in versions:
                print(f"{name:<{width}} : {version}")
        else:
            print(f"Scrapy {scrapy.__version__}")
18 venv/lib/python3.9/site-packages/scrapy/commands/view.py Normal file
@@ -0,0 +1,18 @@
from scrapy.commands import fetch
from scrapy.utils.response import open_in_browser


class Command(fetch.Command):

    def short_desc(self):
        return "Open URL in browser, as seen by Scrapy"

    def long_desc(self):
        return "Fetch a URL using the Scrapy downloader and show its contents in a browser"

    def add_options(self, parser):
        super().add_options(parser)
        parser.remove_option("--headers")

    def _print_response(self, response, opts):
        open_in_browser(response)