Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@@ -0,0 +1,137 @@
"""
Base class for Scrapy commands
"""
import os
from optparse import OptionGroup

from twisted.python import failure

from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class ScrapyCommand:

    requires_project = False
    crawler_process = None

    # default settings to be used for this command instead of global defaults
    default_settings = {}

    exitcode = 0

    def __init__(self):
        self.settings = None  # set in scrapy.cmdline

    def set_crawler(self, crawler):
        if hasattr(self, '_crawler'):
            raise RuntimeError("crawler already set")
        self._crawler = crawler

    def syntax(self):
        """
        Command syntax (preferably one-line). Do not include command name.
        """
        return ""

    def short_desc(self):
        """
        A short description of the command
        """
        return ""

    def long_desc(self):
        """A long description of the command. Return short description when not
        available. It cannot contain newlines, since contents will be formatted
        by optparse which removes newlines and wraps text.
        """
        return self.short_desc()

    def help(self):
        """An extensive help for the command. It will be shown when using the
        "help" command. It can contain newlines, since no post-formatting will
        be applied to its contents.
        """
        return self.long_desc()

    def add_options(self, parser):
        """
        Populate the option parser with the options available for this command
        """
        group = OptionGroup(parser, "Global Options")
        group.add_option("--logfile", metavar="FILE",
                         help="log file. if omitted stderr will be used")
        group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
                         help=f"log level (default: {self.settings['LOG_LEVEL']})")
        group.add_option("--nolog", action="store_true",
                         help="disable logging completely")
        group.add_option("--profile", metavar="FILE", default=None,
                         help="write python cProfile stats to FILE")
        group.add_option("--pidfile", metavar="FILE",
                         help="write process ID to FILE")
        group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
                         help="set/override setting (may be repeated)")
        group.add_option("--pdb", action="store_true", help="enable pdb on failure")

        parser.add_option_group(group)

    def process_options(self, args, opts):
        try:
            self.settings.setdict(arglist_to_dict(opts.set),
                                  priority='cmdline')
        except ValueError:
            raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

        if opts.logfile:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

        if opts.loglevel:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

        if opts.nolog:
            self.settings.set('LOG_ENABLED', False, priority='cmdline')

        if opts.pidfile:
            with open(opts.pidfile, "w") as f:
                f.write(str(os.getpid()) + os.linesep)

        if opts.pdb:
            failure.startDebugMode()

    def run(self, args, opts):
        """
        Entry point for running commands
        """
        raise NotImplementedError


class BaseRunSpiderCommand(ScrapyCommand):
    """
    Common class used to share functionality between the crawl, parse and runspider commands
    """
    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE", action="append",
                          help="append scraped items to the end of FILE (use - for stdout)")
        parser.add_option("-O", "--overwrite-output", metavar="FILE", action="append",
                          help="dump scraped items into FILE, overwriting any existing file")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output or opts.overwrite_output:
            feeds = feed_process_params_from_cli(
                self.settings,
                opts.output,
                opts.output_format,
                opts.overwrite_output,
            )
            self.settings.set('FEEDS', feeds, priority='cmdline')
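
A quick illustration of how this base class is meant to be used (not part of this commit): a custom command only needs to subclass ScrapyCommand, describe itself, and implement run(). The module name below is a placeholder; such a module would typically live in a package registered through the COMMANDS_MODULE setting, and Scrapy derives the command name from the module name, so botname.py would be invoked as "scrapy botname".

# botname.py -- hypothetical custom command, shown only as a sketch
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the configured bot name"

    def run(self, args, opts):
        # self.settings is filled in by scrapy.cmdline before run() is called
        print(self.settings.get('BOT_NAME'))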

View file

@@ -0,0 +1,58 @@
import sys
import time
import subprocess
from urllib.parse import urlencode

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor


class Command(ScrapyCommand):

    default_settings = {
        'LOG_LEVEL': 'INFO',
        'LOGSTATS_INTERVAL': 1,
        'CLOSESPIDER_TIMEOUT': 10,
    }

    def short_desc(self):
        return "Run quick benchmark test"

    def run(self, args, opts):
        with _BenchServer():
            self.crawler_process.crawl(_BenchSpider, total=100000)
            self.crawler_process.start()


class _BenchServer:

    def __enter__(self):
        from scrapy.utils.test import get_testenv
        pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
        self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
                                     env=get_testenv())
        self.proc.stdout.readline()

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)


class _BenchSpider(scrapy.Spider):
    """A spider that follows all links"""
    name = 'follow'
    total = 10000
    show = 20
    baseurl = 'http://localhost:8998'
    link_extractor = LinkExtractor()

    def start_requests(self):
        qargs = {'total': self.total, 'show': self.show}
        url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
        return [scrapy.Request(url, dont_filter=True)]

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

View file

@@ -0,0 +1,96 @@
import time
from collections import defaultdict
from unittest import TextTestRunner, TextTestResult as _TextTestResult

from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.misc import load_object, set_environ
from scrapy.utils.conf import build_component_list


class TextTestResult(_TextTestResult):
    def printSummary(self, start, stop):
        write = self.stream.write
        writeln = self.stream.writeln

        run = self.testsRun
        plural = "s" if run != 1 else ""

        writeln(self.separator2)
        writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
        writeln()

        infos = []
        if not self.wasSuccessful():
            write("FAILED")
            failed, errored = map(len, (self.failures, self.errors))
            if failed:
                infos.append(f"failures={failed}")
            if errored:
                infos.append(f"errors={errored}")
        else:
            write("OK")

        if infos:
            writeln(f" ({', '.join(infos)})")
        else:
            write("\n")


class Command(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Check spider contracts"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="only list contracts, without checking them")
        parser.add_option("-v", "--verbose", dest="verbose", default=False, action='store_true',
                          help="print contract tests for all spiders")

    def run(self, args, opts):
        # load contracts
        contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
        conman = ContractsManager(load_object(c) for c in contracts)
        runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
        result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

        # contract requests
        contract_reqs = defaultdict(list)

        spider_loader = self.crawler_process.spider_loader

        with set_environ(SCRAPY_CHECK='true'):
            for spidername in args or spider_loader.list():
                spidercls = spider_loader.load(spidername)
                spidercls.start_requests = lambda s: conman.from_spider(s, result)

                tested_methods = conman.tested_methods_from_spidercls(spidercls)
                if opts.list:
                    for method in tested_methods:
                        contract_reqs[spidercls.name].append(method)
                elif tested_methods:
                    self.crawler_process.crawl(spidercls)

            # start checks
            if opts.list:
                for spider, methods in sorted(contract_reqs.items()):
                    if not methods and not opts.verbose:
                        continue
                    print(spider)
                    for method in sorted(methods):
                        print(f' * {method}')
            else:
                start = time.time()
                self.crawler_process.start()
                stop = time.time()

                result.printErrors()
                result.printSummary(start, stop)
                self.exitcode = int(not result.wasSuccessful())
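
For orientation, the contracts this command evaluates are declared as annotations in a spider callback's docstring and resolved by ContractsManager. A hedged sketch of such a callback follows; the URL, method and field names are placeholders and not part of this commit.

# Illustrative spider callback: `scrapy check` reads the @url/@returns/@scrapes
# annotations from the docstring and runs the method against the fetched page.
def parse_item(self, response):
    """
    @url http://www.example.com/some-page
    @returns items 1 1
    @scrapes title
    """
    yield {'title': response.css('title::text').get()}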

View file

@@ -0,0 +1,33 @@
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError


class Command(BaseRunSpiderCommand):

    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run a spider"

    def run(self, args, opts):
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

        if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
            self.exitcode = 1
        else:
            self.crawler_process.start()

            if (
                self.crawler_process.bootstrap_failed
                or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
            ):
                self.exitcode = 1

View file

@@ -0,0 +1,39 @@
import sys
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "<spider>"

    def short_desc(self):
        return "Edit spider"

    def long_desc(self):
        return ("Edit a spider using the editor defined in the EDITOR environment"
                " variable or else the EDITOR setting")

    def _err(self, msg):
        sys.stderr.write(msg + os.linesep)
        self.exitcode = 1

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()

        editor = self.settings['EDITOR']
        try:
            spidercls = self.crawler_process.spider_loader.load(args[0])
        except KeyError:
            return self._err(f"Spider not found: {args[0]}")

        sfile = sys.modules[spidercls.__module__].__file__
        sfile = sfile.replace('.pyc', '.py')
        self.exitcode = os.system(f'{editor} "{sfile}"')

View file

@@ -0,0 +1,70 @@
import sys

from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider


class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Fetch a URL using the Scrapy downloader"

    def long_desc(self):
        return (
            "Fetch a URL using the Scrapy downloader and print its content"
            " to stdout. You may want to use --nolog to disable logging"
        )

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", help="use this spider")
        parser.add_option("--headers", dest="headers", action="store_true",
                          help="print response HTTP headers instead of body")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def _print_headers(self, headers, prefix):
        for key, values in headers.items():
            for value in values:
                self._print_bytes(prefix + b' ' + key + b': ' + value)

    def _print_response(self, response, opts):
        if opts.headers:
            self._print_headers(response.request.headers, b'>')
            print('>')
            self._print_headers(response.headers, b'<')
        else:
            self._print_bytes(response.body)

    def _print_bytes(self, bytes_):
        sys.stdout.buffer.write(bytes_ + b'\n')

    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(args[0], callback=self._print_response,
                          cb_kwargs={"opts": opts}, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. the command handles all status codes except 3xx
        if not opts.no_redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()

View file

@@ -0,0 +1,149 @@
import os
import shutil
import string
from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and dots
    with underscores and prefixing it with a letter if it doesn't start
    with one
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <name> <domain>"

    def short_desc(self):
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="List available templates")
        parser.add_option("-e", "--edit", dest="edit", action="store_true",
                          help="Edit spider after creating it")
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                          help="Dump template to standard output")
        parser.add_option("-t", "--template", dest="template", default="basic",
                          help="Uses a custom template.")
        parser.add_option("--force", dest="force", action="store_true",
                          help="If the spider already exists, overwrite it with the template")

    def run(self, args, opts):
        if opts.list:
            self._list_templates()
            return
        if opts.dump:
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:
            print("Cannot create a spider with the same name as your project")
            return

        if not opts.force and self._spider_exists(name):
            return

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)
            if opts.edit:
                self.exitcode = os.system(f'scrapy edit "{name}"')

    def _genspider(self, module, name, domain, template_name, template_file):
        """Generate the spider module, based on the given template"""
        capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
        tvars = {
            'project_name': self.settings.get('BOT_NAME'),
            'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
            'module': module,
            'name': name,
            'domain': domain,
            'classname': f'{capitalized_module}Spider'
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            spiders_module = None
            spiders_dir = "."
        spider_file = f"{join(spiders_dir, module)}.py"
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print(f"Created spider {name!r} using template {template_name!r} ",
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print(f"in module:\n {spiders_module.__name__}.{module}")

    def _find_template(self, template):
        template_file = join(self.templates_dir, f'{template}.tmpl')
        if exists(template_file):
            return template_file
        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')

    def _list_templates(self):
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print(f" {splitext(filename)[0]}")

    def _spider_exists(self, name):
        if not self.settings.get('NEWSPIDER_MODULE'):
            # if run as a standalone command and a file with the same name already exists
            if exists(name + ".py"):
                print(f"{abspath(name + '.py')} already exists")
                return True
            return False

        try:
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            pass
        else:
            # if a spider with the same name exists
            print(f"Spider {name!r} already exists in module:")
            print(f" {spidercls.__module__}")
            return True

        # a file with the same name exists in the target directory
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = dirname(spiders_module.__file__)
        spiders_dir_abs = abspath(spiders_dir)
        if exists(join(spiders_dir_abs, name + ".py")):
            print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
            return True

        return False

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'spiders'
        )

View file

@@ -0,0 +1,14 @@
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "List available spiders"

    def run(self, args, opts):
        for s in sorted(self.crawler_process.spider_loader.list()):
            print(s)

View file

@@ -0,0 +1,256 @@
import json
import logging

from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url

from scrapy.commands import BaseRunSpiderCommand
from scrapy.http import Request
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)


class Command(BaseRunSpiderCommand):

    requires_project = True

    spider = None
    items = {}
    requests = {}

    first_response = None

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        BaseRunSpiderCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
                          help="use this spider without looking for one")
        parser.add_option("--pipelines", action="store_true",
                          help="process items through pipelines")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
                          help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
                          help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
                          help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
                          help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
                          help="use this callback for parsing, instead of looking for a callback")
        parser.add_option("-m", "--meta", dest="meta",
                          help="inject extra meta into the Request, it must be a valid raw json string")
        parser.add_option("--cbkwargs", dest="cbkwargs",
                          help="inject extra callback kwargs into the Request, it must be a valid raw json string")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                          help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                          help="print each depth level one by one")

    @property
    def max_level(self):
        max_items, max_requests = 0, 0
        if self.items:
            max_items = max(self.items)
        if self.requests:
            max_requests = max(self.requests)
        return max(max_items, max_requests)

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])

        print("# Scraped Items ", "-" * 60)
        display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)

    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            if self.requests:
                requests = self.requests[max(self.requests)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])

        print("# Requests ", "-" * 65)
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in range(1, self.max_level + 1):
                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)

    def run_callback(self, response, callback, cb_kwargs=None):
        cb_kwargs = cb_kwargs or {}
        items, requests = [], []

        for x in iterate_spider_output(callback(response, **cb_kwargs)):
            if is_item(x):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})

    def prepare_request(self, spider, request, opts):
        def callback(response, **cb_kwargs):
            # memorize first response
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.callback:
                    cb = opts.callback
                elif opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        return
                else:
                    cb = 'parse'

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    return

            # parse items and requests
            depth = response.meta['_depth']

            items, requests = self.run_callback(response, cb, cb_kwargs)
            if opts.pipelines:
                itemproc = self.pcrawler.engine.scraper.itemproc
                for item in items:
                    itemproc.process_item(item, spider)
            self.add_items(depth, items)
            self.add_requests(depth, requests)

            scraped_data = items if opts.output else []
            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
                scraped_data += requests

            return scraped_data

        # update request meta if any extra meta was passed through the --meta/-m opts.
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were passed through the --cbkwargs option.
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def process_options(self, args, opts):
        BaseRunSpiderCommand.process_options(self, args, opts)

        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_request_meta(self, opts):
        if opts.meta:
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)

    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)

View file

@@ -0,0 +1,59 @@
import sys
import os
from importlib import import_module

from scrapy.utils.spider import iter_spider_classes
from scrapy.exceptions import UsageError
from scrapy.commands import BaseRunSpiderCommand


def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext not in ('.py', '.pyw'):
        raise ValueError(f"Not a Python source file: {abspath}")
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


class Command(BaseRunSpiderCommand):

    requires_project = False
    default_settings = {'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError(f"File not found: {filename}\n")
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError(f"Unable to load {filename!r}: {e}\n")
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError(f"No spider found in file: {filename}\n")
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()

        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1
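
As a reference point, the kind of self-contained spider file this command loads is sketched below; the filename, spider name and URL are placeholders, not part of this commit.

# quotes_spider.py -- run with: scrapy runspider quotes_spider.py
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # iter_spider_classes() above discovers this Spider subclass in the module
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}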

View file

@@ -0,0 +1,47 @@
import json

from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Get settings values"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--get", dest="get", metavar="SETTING",
                          help="print raw setting value")
        parser.add_option("--getbool", dest="getbool", metavar="SETTING",
                          help="print setting value, interpreted as a boolean")
        parser.add_option("--getint", dest="getint", metavar="SETTING",
                          help="print setting value, interpreted as an integer")
        parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
                          help="print setting value, interpreted as a float")
        parser.add_option("--getlist", dest="getlist", metavar="SETTING",
                          help="print setting value, interpreted as a list")

    def run(self, args, opts):
        settings = self.crawler_process.settings
        if opts.get:
            s = settings.get(opts.get)
            if isinstance(s, BaseSettings):
                print(json.dumps(s.copy_to_dict()))
            else:
                print(s)
        elif opts.getbool:
            print(settings.getbool(opts.getbool))
        elif opts.getint:
            print(settings.getint(opts.getint))
        elif opts.getfloat:
            print(settings.getfloat(opts.getfloat))
        elif opts.getlist:
            print(settings.getlist(opts.getlist))

View file

@@ -0,0 +1,80 @@
"""
Scrapy Shell

See documentation in docs/topics/shell.rst
"""
from threading import Thread

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.shell import Shell
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import guess_scheme


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {
        'KEEP_ALIVE': True,
        'LOGSTATS_INTERVAL': 0,
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def syntax(self):
        return "[url|file]"

    def short_desc(self):
        return "Interactive scraping console"

    def long_desc(self):
        return ("Interactive console for scraping the given url or file. "
                "Use ./file.html syntax or full path for local file.")

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-c", dest="code",
                          help="evaluate the code in the shell, print the result and exit")
        parser.add_option("--spider", dest="spider",
                          help="use this spider")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def update_vars(self, vars):
        """You can use this function to update the Scrapy objects that will be
        available in the shell
        """
        pass

    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)

    def _start_crawler_thread(self):
        t = Thread(target=self.crawler_process.start,
                   kwargs={'stop_after_crawl': False})
        t.daemon = True
        t.start()
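
The update_vars() hook above is the intended extension point for exposing extra objects in the shell namespace; a minimal sketch of overriding it follows (the injected helper is purely illustrative, not part of this commit).

# Illustrative only: a subclass of the shell command that adds an object to
# the namespace dict passed to update_vars().
import json


class JsonShellCommand(Command):

    def update_vars(self, vars):
        # `vars` holds the objects that will be available inside the shell
        vars['json_loads'] = json.loads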

View file

@@ -0,0 +1,128 @@
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat
from stat import S_IWUSR as OWNER_WRITE_PERMISSION

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')


def _make_writable(path):
    current_permissions = os.stat(path).st_mode
    os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "<project_name> [project_dir]"

    def short_desc(self):
        return "Create new project"

    def _is_valid_name(self, project_name):
        def _module_exists(module_name):
            try:
                import_module(module_name)
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):
            print(f'Error: Module {project_name!r} already exists')
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """
        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.
        More info at:
        https://github.com/scrapy/scrapy/pull/2005
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                self._copytree(srcname, dstname)
            else:
                copy2(srcname, dstname)
                _make_writable(dstname)

        copystat(src, dst)
        _make_writable(dst)

    def run(self, args, opts):
        if len(args) not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        self._copytree(self.templates_dir, abspath(project_dir))
        move(join(project_dir, 'module'), join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
            render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
        print(f"New Scrapy project '{project_name}', using template directory "
              f"'{self.templates_dir}', created in:")
        print(f" {abspath(project_dir)}\n")
        print("You can start your first spider with:")
        print(f" cd {project_dir}")
        print(" scrapy genspider example example.com")

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'project'
        )

View file

@@ -0,0 +1,29 @@
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.versions import scrapy_components_versions


class Command(ScrapyCommand):

    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[-v]"

    def short_desc(self):
        return "Print Scrapy version"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
                          help="also display twisted/python/platform info (useful for bug reports)")

    def run(self, args, opts):
        if opts.verbose:
            versions = scrapy_components_versions()
            width = max(len(n) for (n, _) in versions)
            for name, version in versions:
                print(f"{name:<{width}} : {version}")
        else:
            print(f"Scrapy {scrapy.__version__}")

View file

@@ -0,0 +1,18 @@
from scrapy.commands import fetch
from scrapy.utils.response import open_in_browser


class Command(fetch.Command):

    def short_desc(self):
        return "Open URL in browser, as seen by Scrapy"

    def long_desc(self):
        return "Fetch a URL using the Scrapy downloader and show its contents in a browser"

    def add_options(self, parser):
        super().add_options(parser)
        parser.remove_option("--headers")

    def _print_response(self, response, opts):
        open_in_browser(response)