Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d

2399 changed files with 843193 additions and 43 deletions

venv/lib/python3.9/site-packages/scrapy/VERSION (normal file, 1 line added)
@@ -0,0 +1 @@
2.4.1

venv/lib/python3.9/site-packages/scrapy/__init__.py (normal file, 42 lines added)
@@ -0,0 +1,42 @@
"""
Scrapy - a web crawling and web scraping framework written for Python
"""

import pkgutil
import sys
import warnings

from twisted import version as _txv

# Declare top-level shortcuts
from scrapy.spiders import Spider
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from scrapy.item import Item, Field


__all__ = [
    '__version__', 'version_info', 'twisted_version', 'Spider',
    'Request', 'FormRequest', 'Selector', 'Item', 'Field',
]


# Scrapy and Twisted versions
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
twisted_version = (_txv.major, _txv.minor, _txv.micro)


# Check minimum required Python version
if sys.version_info < (3, 6):
    print("Scrapy %s requires Python 3.6+" % __version__)
    sys.exit(1)


# Ignore noisy twisted deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')


del pkgutil
del sys
del warnings
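
For context, the names re-exported above (Spider, Request, Selector, Item, Field) are the usual entry points for user code. A minimal sketch of how they are typically used follows; the spider name, URL and CSS selectors are placeholders and are not part of this commit:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                                 # placeholder spider name
    start_urls = ["http://quotes.toscrape.com"]      # placeholder URL

    def parse(self, response):
        # Selector-backed response API; yielded dicts are treated as items
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}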

venv/lib/python3.9/site-packages/scrapy/__main__.py (normal file, 4 lines added)
@@ -0,0 +1,4 @@
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute()

venv/lib/python3.9/site-packages/scrapy/cmdline.py (normal file, 173 lines added)
@@ -0,0 +1,173 @@
import sys
import os
import optparse
import cProfile
import inspect
import pkg_resources

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.misc import walk_modules
from scrapy.utils.project import inside_project, get_project_settings
from scrapy.utils.python import garbage_collect


def _iter_command_classes(module_name):
    # TODO: add `name` attribute to commands and and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for obj in vars(module).values():
            if (
                inspect.isclass(obj)
                and issubclass(obj, ScrapyCommand)
                and obj.__module__ == module.__name__
                and not obj == ScrapyCommand
            ):
                yield obj


def _get_commands_from_module(module, inproject):
    d = {}
    for cmd in _iter_command_classes(module):
        if inproject or not cmd.requires_project:
            cmdname = cmd.__module__.split('.')[-1]
            d[cmdname] = cmd()
    return d


def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
    cmds = {}
    for entry_point in pkg_resources.iter_entry_points(group):
        obj = entry_point.load()
        if inspect.isclass(obj):
            cmds[entry_point.name] = obj()
        else:
            raise Exception(f"Invalid entry point {entry_point.name}")
    return cmds


def _get_commands_dict(settings, inproject):
    cmds = _get_commands_from_module('scrapy.commands', inproject)
    cmds.update(_get_commands_from_entry_points(inproject))
    cmds_module = settings['COMMANDS_MODULE']
    if cmds_module:
        cmds.update(_get_commands_from_module(cmds_module, inproject))
    return cmds


def _pop_command_name(argv):
    i = 0
    for arg in argv[1:]:
        if not arg.startswith('-'):
            del argv[i]
            return arg
        i += 1


def _print_header(settings, inproject):
    version = scrapy.__version__
    if inproject:
        print(f"Scrapy {version} - project: {settings['BOT_NAME']}\n")
    else:
        print(f"Scrapy {version} - no active project\n")


def _print_commands(settings, inproject):
    _print_header(settings, inproject)
    print("Usage:")
    print("  scrapy <command> [options] [args]\n")
    print("Available commands:")
    cmds = _get_commands_dict(settings, inproject)
    for cmdname, cmdclass in sorted(cmds.items()):
        print(f"  {cmdname:<13} {cmdclass.short_desc()}")
    if not inproject:
        print()
        print("  [ more ]      More commands available when run from project directory")
    print()
    print('Use "scrapy <command> -h" to see more info about a command')


def _print_unknown_command(settings, cmdname, inproject):
    _print_header(settings, inproject)
    print(f"Unknown command: {cmdname}\n")
    print('Use "scrapy" to see available commands')


def _run_print_help(parser, func, *a, **kw):
    try:
        func(*a, **kw)
    except UsageError as e:
        if str(e):
            parser.error(str(e))
        if e.print_help:
            parser.print_help()
        sys.exit(2)


def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
    loc = locals()
    p = cProfile.Profile()
    p.runctx('cmd.run(args, opts)', globals(), loc)
    if opts.profile:
        p.dump_stats(opts.profile)


if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
        # http://doc.pypy.org/en/latest/cpython_differences.html
        # ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
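
For context, execute() above is what the "scrapy" console script calls: it resolves the command class, parses options with optparse, runs the command and exits with its exitcode. A small sketch of driving the same path programmatically; it assumes a project directory with a spider named "example", which is not part of this commit:

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl example" from inside a project.
# Note that execute() calls sys.exit() when the command finishes.
execute(["scrapy", "crawl", "example"])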

venv/lib/python3.9/site-packages/scrapy/commands/__init__.py (normal file, 137 lines added)
@@ -0,0 +1,137 @@
"""
Base class for Scrapy commands
"""
import os
from optparse import OptionGroup
from twisted.python import failure

from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError


class ScrapyCommand:

    requires_project = False
    crawler_process = None

    # default settings to be used for this command instead of global defaults
    default_settings = {}

    exitcode = 0

    def __init__(self):
        self.settings = None  # set in scrapy.cmdline

    def set_crawler(self, crawler):
        if hasattr(self, '_crawler'):
            raise RuntimeError("crawler already set")
        self._crawler = crawler

    def syntax(self):
        """
        Command syntax (preferably one-line). Do not include command name.
        """
        return ""

    def short_desc(self):
        """
        A short description of the command
        """
        return ""

    def long_desc(self):
        """A long description of the command. Return short description when not
        available. It cannot contain newlines, since contents will be formatted
        by optparser which removes newlines and wraps text.
        """
        return self.short_desc()

    def help(self):
        """An extensive help for the command. It will be shown when using the
        "help" command. It can contain newlines, since no post-formatting will
        be applied to its contents.
        """
        return self.long_desc()

    def add_options(self, parser):
        """
        Populate option parse with options available for this command
        """
        group = OptionGroup(parser, "Global Options")
        group.add_option("--logfile", metavar="FILE",
                         help="log file. if omitted stderr will be used")
        group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
                         help=f"log level (default: {self.settings['LOG_LEVEL']})")
        group.add_option("--nolog", action="store_true",
                         help="disable logging completely")
        group.add_option("--profile", metavar="FILE", default=None,
                         help="write python cProfile stats to FILE")
        group.add_option("--pidfile", metavar="FILE",
                         help="write process ID to FILE")
        group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
                         help="set/override setting (may be repeated)")
        group.add_option("--pdb", action="store_true", help="enable pdb on failure")

        parser.add_option_group(group)

    def process_options(self, args, opts):
        try:
            self.settings.setdict(arglist_to_dict(opts.set),
                                  priority='cmdline')
        except ValueError:
            raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

        if opts.logfile:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

        if opts.loglevel:
            self.settings.set('LOG_ENABLED', True, priority='cmdline')
            self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

        if opts.nolog:
            self.settings.set('LOG_ENABLED', False, priority='cmdline')

        if opts.pidfile:
            with open(opts.pidfile, "w") as f:
                f.write(str(os.getpid()) + os.linesep)

        if opts.pdb:
            failure.startDebugMode()

    def run(self, args, opts):
        """
        Entry point for running commands
        """
        raise NotImplementedError


class BaseRunSpiderCommand(ScrapyCommand):
    """
    Common class used to share functionality between the crawl, parse and runspider commands
    """
    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE", action="append",
                          help="append scraped items to the end of FILE (use - for stdout)")
        parser.add_option("-O", "--overwrite-output", metavar="FILE", action="append",
                          help="dump scraped items into FILE, overwriting any existing file")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output or opts.overwrite_output:
            feeds = feed_process_params_from_cli(
                self.settings,
                opts.output,
                opts.output_format,
                opts.overwrite_output,
            )
            self.settings.set('FEEDS', feeds, priority='cmdline')
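
For context, third-party commands plug into the dispatch in cmdline.py either through the "scrapy.commands" entry point group or through the COMMANDS_MODULE setting. A minimal sketch of a custom command; the module path "myproject.commands" is hypothetical and not part of this commit:

# myproject/commands/hello.py, picked up via COMMANDS_MODULE = "myproject.commands"
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    requires_project = True

    def short_desc(self):
        return "Print the configured bot name (demo command)"

    def run(self, args, opts):
        # self.settings is assigned by scrapy.cmdline.execute() before run()
        print(self.settings["BOT_NAME"])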

venv/lib/python3.9/site-packages/scrapy/commands/bench.py (normal file, 58 lines added)
@@ -0,0 +1,58 @@
import sys
import time
import subprocess
from urllib.parse import urlencode

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor


class Command(ScrapyCommand):

    default_settings = {
        'LOG_LEVEL': 'INFO',
        'LOGSTATS_INTERVAL': 1,
        'CLOSESPIDER_TIMEOUT': 10,
    }

    def short_desc(self):
        return "Run quick benchmark test"

    def run(self, args, opts):
        with _BenchServer():
            self.crawler_process.crawl(_BenchSpider, total=100000)
            self.crawler_process.start()


class _BenchServer:

    def __enter__(self):
        from scrapy.utils.test import get_testenv
        pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
        self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
                                     env=get_testenv())
        self.proc.stdout.readline()

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()
        self.proc.wait()
        time.sleep(0.2)


class _BenchSpider(scrapy.Spider):
    """A spider that follows all links"""
    name = 'follow'
    total = 10000
    show = 20
    baseurl = 'http://localhost:8998'
    link_extractor = LinkExtractor()

    def start_requests(self):
        qargs = {'total': self.total, 'show': self.show}
        url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
        return [scrapy.Request(url, dont_filter=True)]

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

venv/lib/python3.9/site-packages/scrapy/commands/check.py (normal file, 96 lines added)
@@ -0,0 +1,96 @@
import time
from collections import defaultdict
from unittest import TextTestRunner, TextTestResult as _TextTestResult

from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.misc import load_object, set_environ
from scrapy.utils.conf import build_component_list


class TextTestResult(_TextTestResult):
    def printSummary(self, start, stop):
        write = self.stream.write
        writeln = self.stream.writeln

        run = self.testsRun
        plural = "s" if run != 1 else ""

        writeln(self.separator2)
        writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
        writeln()

        infos = []
        if not self.wasSuccessful():
            write("FAILED")
            failed, errored = map(len, (self.failures, self.errors))
            if failed:
                infos.append(f"failures={failed}")
            if errored:
                infos.append(f"errors={errored}")
        else:
            write("OK")

        if infos:
            writeln(f" ({', '.join(infos)})")
        else:
            write("\n")


class Command(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Check spider contracts"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="only list contracts, without checking them")
        parser.add_option("-v", "--verbose", dest="verbose", default=False, action='store_true',
                          help="print contract tests for all spiders")

    def run(self, args, opts):
        # load contracts
        contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
        conman = ContractsManager(load_object(c) for c in contracts)
        runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
        result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)

        # contract requests
        contract_reqs = defaultdict(list)

        spider_loader = self.crawler_process.spider_loader

        with set_environ(SCRAPY_CHECK='true'):
            for spidername in args or spider_loader.list():
                spidercls = spider_loader.load(spidername)
                spidercls.start_requests = lambda s: conman.from_spider(s, result)

                tested_methods = conman.tested_methods_from_spidercls(spidercls)
                if opts.list:
                    for method in tested_methods:
                        contract_reqs[spidercls.name].append(method)
                elif tested_methods:
                    self.crawler_process.crawl(spidercls)

            # start checks
            if opts.list:
                for spider, methods in sorted(contract_reqs.items()):
                    if not methods and not opts.verbose:
                        continue
                    print(spider)
                    for method in sorted(methods):
                        print(f'  * {method}')
            else:
                start = time.time()
                self.crawler_process.start()
                stop = time.time()

                result.printErrors()
                result.printSummary(start, stop)
                self.exitcode = int(not result.wasSuccessful())

venv/lib/python3.9/site-packages/scrapy/commands/crawl.py (normal file, 33 lines added)
@@ -0,0 +1,33 @@
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError


class Command(BaseRunSpiderCommand):

    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run a spider"

    def run(self, args, opts):
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

        if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
            self.exitcode = 1
        else:
            self.crawler_process.start()

            if (
                self.crawler_process.bootstrap_failed
                or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
            ):
                self.exitcode = 1

venv/lib/python3.9/site-packages/scrapy/commands/edit.py (normal file, 39 lines added)
@@ -0,0 +1,39 @@
import sys
import os

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "<spider>"

    def short_desc(self):
        return "Edit spider"

    def long_desc(self):
        return ("Edit a spider using the editor defined in the EDITOR environment"
                " variable or else the EDITOR setting")

    def _err(self, msg):
        sys.stderr.write(msg + os.linesep)
        self.exitcode = 1

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()

        editor = self.settings['EDITOR']
        try:
            spidercls = self.crawler_process.spider_loader.load(args[0])
        except KeyError:
            return self._err(f"Spider not found: {args[0]}")

        sfile = sys.modules[spidercls.__module__].__file__
        sfile = sfile.replace('.pyc', '.py')
        self.exitcode = os.system(f'{editor} "{sfile}"')

venv/lib/python3.9/site-packages/scrapy/commands/fetch.py (normal file, 70 lines added)
@@ -0,0 +1,70 @@
import sys
from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider


class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Fetch a URL using the Scrapy downloader"

    def long_desc(self):
        return (
            "Fetch a URL using the Scrapy downloader and print its content"
            " to stdout. You may want to use --nolog to disable logging"
        )

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", help="use this spider")
        parser.add_option("--headers", dest="headers", action="store_true",
                          help="print response HTTP headers instead of body")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def _print_headers(self, headers, prefix):
        for key, values in headers.items():
            for value in values:
                self._print_bytes(prefix + b' ' + key + b': ' + value)

    def _print_response(self, response, opts):
        if opts.headers:
            self._print_headers(response.request.headers, b'>')
            print('>')
            self._print_headers(response.headers, b'<')
        else:
            self._print_bytes(response.body)

    def _print_bytes(self, bytes_):
        sys.stdout.buffer.write(bytes_ + b'\n')

    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(args[0], callback=self._print_response,
                          cb_kwargs={"opts": opts}, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. command handles all codes expect 3xx
        if not opts.no_redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()

venv/lib/python3.9/site-packages/scrapy/commands/genspider.py (normal file, 149 lines added)
@@ -0,0 +1,149 @@
import os
import shutil
import string

from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


def sanitize_module_name(module_name):
    """Sanitize the given module name, by replacing dashes and points
    with underscores and prefixing it with a letter if it doesn't start
    with one
    """
    module_name = module_name.replace('-', '_').replace('.', '_')
    if module_name[0] not in string.ascii_letters:
        module_name = "a" + module_name
    return module_name


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options] <name> <domain>"

    def short_desc(self):
        return "Generate new spider using pre-defined templates"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-l", "--list", dest="list", action="store_true",
                          help="List available templates")
        parser.add_option("-e", "--edit", dest="edit", action="store_true",
                          help="Edit spider after creating it")
        parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
                          help="Dump template to standard output")
        parser.add_option("-t", "--template", dest="template", default="basic",
                          help="Uses a custom template.")
        parser.add_option("--force", dest="force", action="store_true",
                          help="If the spider already exists, overwrite it with the template")

    def run(self, args, opts):
        if opts.list:
            self._list_templates()
            return
        if opts.dump:
            template_file = self._find_template(opts.dump)
            if template_file:
                with open(template_file, "r") as f:
                    print(f.read())
            return
        if len(args) != 2:
            raise UsageError()

        name, domain = args[0:2]
        module = sanitize_module_name(name)

        if self.settings.get('BOT_NAME') == module:
            print("Cannot create a spider with the same name as your project")
            return

        if not opts.force and self._spider_exists(name):
            return

        template_file = self._find_template(opts.template)
        if template_file:
            self._genspider(module, name, domain, opts.template, template_file)
            if opts.edit:
                self.exitcode = os.system(f'scrapy edit "{name}"')

    def _genspider(self, module, name, domain, template_name, template_file):
        """Generate the spider module, based on the given template"""
        capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
        tvars = {
            'project_name': self.settings.get('BOT_NAME'),
            'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
            'module': module,
            'name': name,
            'domain': domain,
            'classname': f'{capitalized_module}Spider'
        }
        if self.settings.get('NEWSPIDER_MODULE'):
            spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
            spiders_dir = abspath(dirname(spiders_module.__file__))
        else:
            spiders_module = None
            spiders_dir = "."
        spider_file = f"{join(spiders_dir, module)}.py"
        shutil.copyfile(template_file, spider_file)
        render_templatefile(spider_file, **tvars)
        print(f"Created spider {name!r} using template {template_name!r} ",
              end=('' if spiders_module else '\n'))
        if spiders_module:
            print(f"in module:\n  {spiders_module.__name__}.{module}")

    def _find_template(self, template):
        template_file = join(self.templates_dir, f'{template}.tmpl')
        if exists(template_file):
            return template_file
        print(f"Unable to find template: {template}\n")
        print('Use "scrapy genspider --list" to see all available templates.')

    def _list_templates(self):
        print("Available templates:")
        for filename in sorted(os.listdir(self.templates_dir)):
            if filename.endswith('.tmpl'):
                print(f"  {splitext(filename)[0]}")

    def _spider_exists(self, name):
        if not self.settings.get('NEWSPIDER_MODULE'):
            # if run as a standalone command and file with same filename already exists
            if exists(name + ".py"):
                print(f"{abspath(name + '.py')} already exists")
                return True
            return False

        try:
            spidercls = self.crawler_process.spider_loader.load(name)
        except KeyError:
            pass
        else:
            # if spider with same name exists
            print(f"Spider {name!r} already exists in module:")
            print(f"  {spidercls.__module__}")
            return True

        # a file with the same name exists in the target directory
        spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
        spiders_dir = dirname(spiders_module.__file__)
        spiders_dir_abs = abspath(spiders_dir)
        if exists(join(spiders_dir_abs, name + ".py")):
            print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
            return True

        return False

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'spiders'
        )
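
For context, the .tmpl spider templates rendered by _genspider() above are not included in this part of the diff. Roughly, "scrapy genspider example example.com" with the default "basic" template produces a module along these lines (illustrative only; the exact template output may differ):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        pass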

venv/lib/python3.9/site-packages/scrapy/commands/list.py (normal file, 14 lines added)
@@ -0,0 +1,14 @@
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):

    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "List available spiders"

    def run(self, args, opts):
        for s in sorted(self.crawler_process.spider_loader.list()):
            print(s)

venv/lib/python3.9/site-packages/scrapy/commands/parse.py (normal file, 256 lines added)
@@ -0,0 +1,256 @@
import json
import logging

from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url

from scrapy.commands import BaseRunSpiderCommand
from scrapy.http import Request
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)


class Command(BaseRunSpiderCommand):
    requires_project = True

    spider = None
    items = {}
    requests = {}

    first_response = None

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        BaseRunSpiderCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
                          help="use this spider without looking for one")
        parser.add_option("--pipelines", action="store_true",
                          help="process items through pipelines")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
                          help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
                          help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
                          help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
                          help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
                          help="use this callback for parsing, instead looking for a callback")
        parser.add_option("-m", "--meta", dest="meta",
                          help="inject extra meta into the Request, it must be a valid raw json string")
        parser.add_option("--cbkwargs", dest="cbkwargs",
                          help="inject extra callback kwargs into the Request, it must be a valid raw json string")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                          help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                          help="print each depth level one by one")

    @property
    def max_level(self):
        max_items, max_requests = 0, 0
        if self.items:
            max_items = max(self.items)
        if self.requests:
            max_requests = max(self.requests)
        return max(max_items, max_requests)

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])

        print("# Scraped Items ", "-" * 60)
        display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)

    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            if self.requests:
                requests = self.requests[max(self.requests)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])

        print("# Requests ", "-" * 65)
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in range(1, self.max_level + 1):
                print(f'\n>>> DEPTH LEVEL: {level} <<<')
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)

    def run_callback(self, response, callback, cb_kwargs=None):
        cb_kwargs = cb_kwargs or {}
        items, requests = [], []

        for x in iterate_spider_output(callback(response, **cb_kwargs)):
            if is_item(x):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})

    def prepare_request(self, spider, request, opts):
        def callback(response, **cb_kwargs):
            # memorize first request
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.callback:
                    cb = opts.callback
                elif opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        return
                else:
                    cb = 'parse'

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    return

            # parse items and requests
            depth = response.meta['_depth']

            items, requests = self.run_callback(response, cb, cb_kwargs)
            if opts.pipelines:
                itemproc = self.pcrawler.engine.scraper.itemproc
                for item in items:
                    itemproc.process_item(item, spider)
            self.add_items(depth, items)
            self.add_requests(depth, requests)

            scraped_data = items if opts.output else []
            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
                scraped_data += requests

            return scraped_data

        # update request meta if any extra meta was passed through the --meta/-m opts.
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were was passed through the --cbkwargs option.
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def process_options(self, args, opts):
        BaseRunSpiderCommand.process_options(self, args, opts)

        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_request_meta(self, opts):
        if opts.meta:
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)

    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)

venv/lib/python3.9/site-packages/scrapy/commands/runspider.py (normal file, 59 lines added)
@@ -0,0 +1,59 @@
import sys
import os
from importlib import import_module

from scrapy.utils.spider import iter_spider_classes
from scrapy.exceptions import UsageError
from scrapy.commands import BaseRunSpiderCommand


def _import_file(filepath):
    abspath = os.path.abspath(filepath)
    dirname, file = os.path.split(abspath)
    fname, fext = os.path.splitext(file)
    if fext not in ('.py', '.pyw'):
        raise ValueError(f"Not a Python source file: {abspath}")
    if dirname:
        sys.path = [dirname] + sys.path
    try:
        module = import_module(fname)
    finally:
        if dirname:
            sys.path.pop(0)
    return module


class Command(BaseRunSpiderCommand):

    requires_project = False
    default_settings = {'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError(f"File not found: {filename}\n")
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError(f"Unable to load {filename!r}: {e}\n")
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError(f"No spider found in file: {filename}\n")
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()

        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1

venv/lib/python3.9/site-packages/scrapy/commands/settings.py (normal file, 47 lines added)
@@ -0,0 +1,47 @@
import json

from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Get settings values"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--get", dest="get", metavar="SETTING",
                          help="print raw setting value")
        parser.add_option("--getbool", dest="getbool", metavar="SETTING",
                          help="print setting value, interpreted as a boolean")
        parser.add_option("--getint", dest="getint", metavar="SETTING",
                          help="print setting value, interpreted as an integer")
        parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
                          help="print setting value, interpreted as a float")
        parser.add_option("--getlist", dest="getlist", metavar="SETTING",
                          help="print setting value, interpreted as a list")

    def run(self, args, opts):
        settings = self.crawler_process.settings
        if opts.get:
            s = settings.get(opts.get)
            if isinstance(s, BaseSettings):
                print(json.dumps(s.copy_to_dict()))
            else:
                print(s)
        elif opts.getbool:
            print(settings.getbool(opts.getbool))
        elif opts.getint:
            print(settings.getint(opts.getint))
        elif opts.getfloat:
            print(settings.getfloat(opts.getfloat))
        elif opts.getlist:
            print(settings.getlist(opts.getlist))

venv/lib/python3.9/site-packages/scrapy/commands/shell.py (normal file, 80 lines added)
@@ -0,0 +1,80 @@
"""
Scrapy Shell

See documentation in docs/topics/shell.rst
"""
from threading import Thread

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.shell import Shell
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import guess_scheme


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {
        'KEEP_ALIVE': True,
        'LOGSTATS_INTERVAL': 0,
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def syntax(self):
        return "[url|file]"

    def short_desc(self):
        return "Interactive scraping console"

    def long_desc(self):
        return ("Interactive console for scraping the given url or file. "
                "Use ./file.html syntax or full path for local file.")

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-c", dest="code",
                          help="evaluate the code in the shell, print the result and exit")
        parser.add_option("--spider", dest="spider",
                          help="use this spider")
        parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
                          help="do not handle HTTP 3xx status codes and print response as-is")

    def update_vars(self, vars):
        """You can use this function to update the Scrapy objects that will be
        available in the shell
        """
        pass

    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)

    def _start_crawler_thread(self):
        t = Thread(target=self.crawler_process.start,
                   kwargs={'stop_after_crawl': False})
        t.daemon = True
        t.start()

venv/lib/python3.9/site-packages/scrapy/commands/startproject.py (normal file, 128 lines added)
@@ -0,0 +1,128 @@
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat
from stat import S_IWUSR as OWNER_WRITE_PERMISSION

import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError


TEMPLATES_TO_RENDER = (
    ('scrapy.cfg',),
    ('${project_name}', 'settings.py.tmpl'),
    ('${project_name}', 'items.py.tmpl'),
    ('${project_name}', 'pipelines.py.tmpl'),
    ('${project_name}', 'middlewares.py.tmpl'),
)

IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')


def _make_writable(path):
    current_permissions = os.stat(path).st_mode
    os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)


class Command(ScrapyCommand):

    requires_project = False
    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "<project_name> [project_dir]"

    def short_desc(self):
        return "Create new project"

    def _is_valid_name(self, project_name):
        def _module_exists(module_name):
            try:
                import_module(module_name)
                return True
            except ImportError:
                return False

        if not re.search(r'^[_a-zA-Z]\w*$', project_name):
            print('Error: Project names must begin with a letter and contain'
                  ' only\nletters, numbers and underscores')
        elif _module_exists(project_name):
            print(f'Error: Module {project_name!r} already exists')
        else:
            return True
        return False

    def _copytree(self, src, dst):
        """
        Since the original function always creates the directory, to resolve
        the issue a new function had to be created. It's a simple copy and
        was reduced for this case.

        More info at:
        https://github.com/scrapy/scrapy/pull/2005
        """
        ignore = IGNORE
        names = os.listdir(src)
        ignored_names = ignore(src, names)

        if not os.path.exists(dst):
            os.makedirs(dst)

        for name in names:
            if name in ignored_names:
                continue

            srcname = os.path.join(src, name)
            dstname = os.path.join(dst, name)
            if os.path.isdir(srcname):
                self._copytree(srcname, dstname)
            else:
                copy2(srcname, dstname)
                _make_writable(dstname)

        copystat(src, dst)
        _make_writable(dst)

    def run(self, args, opts):
        if len(args) not in (1, 2):
            raise UsageError()

        project_name = args[0]
        project_dir = args[0]

        if len(args) == 2:
            project_dir = args[1]

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        self._copytree(self.templates_dir, abspath(project_dir))
        move(join(project_dir, 'module'), join(project_dir, project_name))
        for paths in TEMPLATES_TO_RENDER:
            path = join(*paths)
            tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
            render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
        print(f"New Scrapy project '{project_name}', using template directory "
              f"'{self.templates_dir}', created in:")
        print(f"    {abspath(project_dir)}\n")
        print("You can start your first spider with:")
        print(f"    cd {project_dir}")
        print("    scrapy genspider example example.com")

    @property
    def templates_dir(self):
        return join(
            self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
            'project'
        )

venv/lib/python3.9/site-packages/scrapy/commands/version.py (normal file, 29 lines added)
@@ -0,0 +1,29 @@
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.versions import scrapy_components_versions


class Command(ScrapyCommand):

    default_settings = {'LOG_ENABLED': False,
                        'SPIDER_LOADER_WARN_ONLY': True}

    def syntax(self):
        return "[-v]"

    def short_desc(self):
        return "Print Scrapy version"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
                          help="also display twisted/python/platform info (useful for bug reports)")

    def run(self, args, opts):
        if opts.verbose:
            versions = scrapy_components_versions()
            width = max(len(n) for (n, _) in versions)
            for name, version in versions:
                print(f"{name:<{width}} : {version}")
        else:
            print(f"Scrapy {scrapy.__version__}")

venv/lib/python3.9/site-packages/scrapy/commands/view.py (normal file, 18 lines added)
@@ -0,0 +1,18 @@
from scrapy.commands import fetch
from scrapy.utils.response import open_in_browser


class Command(fetch.Command):

    def short_desc(self):
        return "Open URL in browser, as seen by Scrapy"

    def long_desc(self):
        return "Fetch a URL using the Scrapy downloader and show its contents in a browser"

    def add_options(self, parser):
        super().add_options(parser)
        parser.remove_option("--headers")

    def _print_response(self, response, opts):
        open_in_browser(response)
179
venv/lib/python3.9/site-packages/scrapy/contracts/__init__.py
Normal file
179
venv/lib/python3.9/site-packages/scrapy/contracts/__init__.py
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
import sys
|
||||
import re
|
||||
from functools import wraps
|
||||
from inspect import getmembers
|
||||
from unittest import TestCase
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
from scrapy.utils.python import get_spec
|
||||
|
||||
|
||||
class ContractsManager:
|
||||
contracts = {}
|
||||
|
||||
def __init__(self, contracts):
|
||||
for contract in contracts:
|
||||
self.contracts[contract.name] = contract
|
||||
|
||||
def tested_methods_from_spidercls(self, spidercls):
|
||||
        is_method = re.compile(r"^\s*@", re.MULTILINE).search
        methods = []
        for key, value in getmembers(spidercls):
            if callable(value) and value.__doc__ and is_method(value.__doc__):
                methods.append(key)

        return methods

    def extract_contracts(self, method):
        contracts = []
        for line in method.__doc__.split('\n'):
            line = line.strip()

            if line.startswith('@'):
                name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
                args = re.split(r'\s+', args)

                contracts.append(self.contracts[name](method, *args))

        return contracts

    def from_spider(self, spider, results):
        requests = []
        for method in self.tested_methods_from_spidercls(type(spider)):
            bound_method = spider.__getattribute__(method)
            try:
                requests.append(self.from_method(bound_method, results))
            except Exception:
                case = _create_testcase(bound_method, 'contract')
                results.addError(case, sys.exc_info())

        return requests

    def from_method(self, method, results):
        contracts = self.extract_contracts(method)
        if contracts:
            request_cls = Request
            for contract in contracts:
                if contract.request_cls is not None:
                    request_cls = contract.request_cls

            # calculate request args
            args, kwargs = get_spec(request_cls.__init__)

            # Don't filter requests to allow
            # testing different callbacks on the same URL.
            kwargs['dont_filter'] = True
            kwargs['callback'] = method

            for contract in contracts:
                kwargs = contract.adjust_request_args(kwargs)

            args.remove('self')

            # check if all positional arguments are defined in kwargs
            if set(args).issubset(set(kwargs)):
                request = request_cls(**kwargs)

                # execute pre and post hooks in order
                for contract in reversed(contracts):
                    request = contract.add_pre_hook(request, results)
                for contract in contracts:
                    request = contract.add_post_hook(request, results)

                self._clean_req(request, method, results)
                return request

    def _clean_req(self, request, method, results):
        """ stop the request from returning objects and records any errors """

        cb = request.callback

        @wraps(cb)
        def cb_wrapper(response, **cb_kwargs):
            try:
                output = cb(response, **cb_kwargs)
                output = list(iterate_spider_output(output))
            except Exception:
                case = _create_testcase(method, 'callback')
                results.addError(case, sys.exc_info())

        def eb_wrapper(failure):
            case = _create_testcase(method, 'errback')
            exc_info = failure.type, failure.value, failure.getTracebackObject()
            results.addError(case, exc_info)

        request.callback = cb_wrapper
        request.errback = eb_wrapper


class Contract:
    """ Abstract class for contracts """
    request_cls = None

    def __init__(self, method, *args):
        self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
        self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
        self.args = args

    def add_pre_hook(self, request, results):
        if hasattr(self, 'pre_process'):
            cb = request.callback

            @wraps(cb)
            def wrapper(response, **cb_kwargs):
                try:
                    results.startTest(self.testcase_pre)
                    self.pre_process(response)
                    results.stopTest(self.testcase_pre)
                except AssertionError:
                    results.addFailure(self.testcase_pre, sys.exc_info())
                except Exception:
                    results.addError(self.testcase_pre, sys.exc_info())
                else:
                    results.addSuccess(self.testcase_pre)
                finally:
                    return list(iterate_spider_output(cb(response, **cb_kwargs)))

            request.callback = wrapper

        return request

    def add_post_hook(self, request, results):
        if hasattr(self, 'post_process'):
            cb = request.callback

            @wraps(cb)
            def wrapper(response, **cb_kwargs):
                output = list(iterate_spider_output(cb(response, **cb_kwargs)))
                try:
                    results.startTest(self.testcase_post)
                    self.post_process(output)
                    results.stopTest(self.testcase_post)
                except AssertionError:
                    results.addFailure(self.testcase_post, sys.exc_info())
                except Exception:
                    results.addError(self.testcase_post, sys.exc_info())
                else:
                    results.addSuccess(self.testcase_post)
                finally:
                    return output

            request.callback = wrapper

        return request

    def adjust_request_args(self, args):
        return args


def _create_testcase(method, desc):
    spider = method.__self__.name

    class ContractTestCase(TestCase):
        def __str__(_self):
            return f"[{spider}] {method.__name__} ({desc})"

    name = f'{spider}_{method.__name__}'
    setattr(ContractTestCase, name, lambda x: x)
    return ContractTestCase(name)
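The docstring scan in extract_contracts is just the two regexes shown above. As a minimal standalone sketch (the spider docstring below is made up for the example), the same parsing can be exercised directly:

import re

docstring = """Parse a product page.

@url http://www.example.com/product/1
@returns items 1 1
@scrapes name price
"""

for line in docstring.split('\n'):
    line = line.strip()
    if line.startswith('@'):
        name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
        print(name, re.split(r'\s+', args))
# prints:
#   url ['http://www.example.com/product/1']
#   returns ['1', '1']
#   scrapes ['name', 'price']

Each (name, args) pair is then looked up in self.contracts to build a Contract instance bound to the callback.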
108
venv/lib/python3.9/site-packages/scrapy/contracts/default.py
Normal file
@@ -0,0 +1,108 @@
import json

from itemadapter import is_item, ItemAdapter

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request


# contracts
class UrlContract(Contract):
    """ Contract to set the url of the request (mandatory)
        @url http://scrapy.org
    """

    name = 'url'

    def adjust_request_args(self, args):
        args['url'] = self.args[0]
        return args


class CallbackKeywordArgumentsContract(Contract):
    """ Contract to set the keyword arguments for the request.
        The value should be a JSON-encoded dictionary, e.g.:

        @cb_kwargs {"arg1": "some value"}
    """

    name = 'cb_kwargs'

    def adjust_request_args(self, args):
        args['cb_kwargs'] = json.loads(' '.join(self.args))
        return args


class ReturnsContract(Contract):
    """ Contract to check the output of a callback

    general form:
    @returns request(s)/item(s) [min=1 [max]]

    e.g.:
    @returns request
    @returns request 2
    @returns request 2 10
    @returns request 0 10
    """

    name = 'returns'
    object_type_verifiers = {
        'request': lambda x: isinstance(x, Request),
        'requests': lambda x: isinstance(x, Request),
        'item': is_item,
        'items': is_item,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if len(self.args) not in [1, 2, 3]:
            raise ValueError(
                f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
            )
        self.obj_name = self.args[0] or None
        self.obj_type_verifier = self.object_type_verifiers[self.obj_name]

        try:
            self.min_bound = int(self.args[1])
        except IndexError:
            self.min_bound = 1

        try:
            self.max_bound = int(self.args[2])
        except IndexError:
            self.max_bound = float('inf')

    def post_process(self, output):
        occurrences = 0
        for x in output:
            if self.obj_type_verifier(x):
                occurrences += 1

        assertion = (self.min_bound <= occurrences <= self.max_bound)

        if not assertion:
            if self.min_bound == self.max_bound:
                expected = self.min_bound
            else:
                expected = f'{self.min_bound}..{self.max_bound}'

            raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")


class ScrapesContract(Contract):
    """ Contract to check presence of fields in scraped items
        @scrapes page_name page_body
    """

    name = 'scrapes'

    def post_process(self, output):
        for x in output:
            if is_item(x):
                missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
                if missing:
                    missing_fields = ", ".join(missing)
                    raise ContractFail(f"Missing fields: {missing_fields}")
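These built-in contracts are declared in spider callback docstrings and are typically run with the `scrapy check` command. A hedged example (spider name, URL, field names and selectors are invented for illustration):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def parse(self, response, category):
        """ Contracts double as a lightweight test fixture for this callback.

        @url http://www.example.com/products
        @cb_kwargs {"category": "books"}
        @returns items 1 16
        @returns requests 0 0
        @scrapes name price
        """
        for product in response.css('div.product'):
            yield {
                'name': product.css('h2::text').get(),
                'price': product.css('span.price::text').get(),
            }

Running `scrapy check example` would build a request from @url / @cb_kwargs via adjust_request_args, call parse, and apply the @returns / @scrapes post-hooks to its output.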
3
venv/lib/python3.9/site-packages/scrapy/core/__init__.py
Normal file
@@ -0,0 +1,3 @@
"""
|
||||
Scrapy core library classes and functions.
|
||||
"""
|
||||
@@ -0,0 +1,201 @@
import random
|
||||
from time import time
|
||||
from datetime import datetime
|
||||
from collections import deque
|
||||
|
||||
from twisted.internet import defer, task
|
||||
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.resolver import dnscache
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
|
||||
from scrapy.core.downloader.handlers import DownloadHandlers
|
||||
|
||||
|
||||
class Slot:
|
||||
"""Downloader slot"""
|
||||
|
||||
def __init__(self, concurrency, delay, randomize_delay):
|
||||
self.concurrency = concurrency
|
||||
self.delay = delay
|
||||
self.randomize_delay = randomize_delay
|
||||
|
||||
self.active = set()
|
||||
self.queue = deque()
|
||||
self.transferring = set()
|
||||
self.lastseen = 0
|
||||
self.latercall = None
|
||||
|
||||
def free_transfer_slots(self):
|
||||
return self.concurrency - len(self.transferring)
|
||||
|
||||
def download_delay(self):
|
||||
if self.randomize_delay:
|
||||
return random.uniform(0.5 * self.delay, 1.5 * self.delay)
|
||||
return self.delay
|
||||
|
||||
def close(self):
|
||||
if self.latercall and self.latercall.active():
|
||||
self.latercall.cancel()
|
||||
|
||||
def __repr__(self):
|
||||
cls_name = self.__class__.__name__
|
||||
return (f"{cls_name}(concurrency={self.concurrency!r}, "
|
||||
f"delay={self.delay:.2f}, "
|
||||
f"randomize_delay={self.randomize_delay!r})")
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f"<downloader.Slot concurrency={self.concurrency!r} "
|
||||
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
|
||||
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
|
||||
f"len(transferring)={len(self.transferring)} "
|
||||
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
|
||||
)
|
||||
|
||||
|
||||
def _get_concurrency_delay(concurrency, spider, settings):
|
||||
delay = settings.getfloat('DOWNLOAD_DELAY')
|
||||
if hasattr(spider, 'download_delay'):
|
||||
delay = spider.download_delay
|
||||
|
||||
if hasattr(spider, 'max_concurrent_requests'):
|
||||
concurrency = spider.max_concurrent_requests
|
||||
|
||||
return concurrency, delay
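# Hedged illustration (spider name invented) of the precedence implemented in
# _get_concurrency_delay above: settings provide the defaults, spider attributes
# win when present. With RANDOMIZE_DOWNLOAD_DELAY enabled, Slot.download_delay()
# then turns the base delay into uniform(0.5 * delay, 1.5 * delay).
import scrapy

class ThrottledSpider(scrapy.Spider):
    name = 'throttled'
    download_delay = 2.5           # overrides the DOWNLOAD_DELAY setting for this spider
    max_concurrent_requests = 2    # overrides the per-slot concurrency for this spider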
|
||||
|
||||
|
||||
class Downloader:
|
||||
|
||||
DOWNLOAD_SLOT = 'download_slot'
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
self.slots = {}
|
||||
self.active = set()
|
||||
self.handlers = DownloadHandlers(crawler)
|
||||
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
|
||||
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
|
||||
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
|
||||
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
|
||||
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
|
||||
self._slot_gc_loop.start(60)
|
||||
|
||||
def fetch(self, request, spider):
|
||||
def _deactivate(response):
|
||||
self.active.remove(request)
|
||||
return response
|
||||
|
||||
self.active.add(request)
|
||||
dfd = self.middleware.download(self._enqueue_request, request, spider)
|
||||
return dfd.addBoth(_deactivate)
|
||||
|
||||
def needs_backout(self):
|
||||
return len(self.active) >= self.total_concurrency
|
||||
|
||||
def _get_slot(self, request, spider):
|
||||
key = self._get_slot_key(request, spider)
|
||||
if key not in self.slots:
|
||||
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
|
||||
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
|
||||
self.slots[key] = Slot(conc, delay, self.randomize_delay)
|
||||
|
||||
return key, self.slots[key]
|
||||
|
||||
def _get_slot_key(self, request, spider):
|
||||
if self.DOWNLOAD_SLOT in request.meta:
|
||||
return request.meta[self.DOWNLOAD_SLOT]
|
||||
|
||||
key = urlparse_cached(request).hostname or ''
|
||||
if self.ip_concurrency:
|
||||
key = dnscache.get(key, key)
|
||||
|
||||
return key
|
||||
|
||||
def _enqueue_request(self, request, spider):
|
||||
key, slot = self._get_slot(request, spider)
|
||||
request.meta[self.DOWNLOAD_SLOT] = key
|
||||
|
||||
def _deactivate(response):
|
||||
slot.active.remove(request)
|
||||
return response
|
||||
|
||||
slot.active.add(request)
|
||||
self.signals.send_catch_log(signal=signals.request_reached_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
deferred = defer.Deferred().addBoth(_deactivate)
|
||||
slot.queue.append((request, deferred))
|
||||
self._process_queue(spider, slot)
|
||||
return deferred
|
||||
|
||||
def _process_queue(self, spider, slot):
|
||||
from twisted.internet import reactor
|
||||
if slot.latercall and slot.latercall.active():
|
||||
return
|
||||
|
||||
# Delay queue processing if a download_delay is configured
|
||||
now = time()
|
||||
delay = slot.download_delay()
|
||||
if delay:
|
||||
penalty = delay - now + slot.lastseen
|
||||
if penalty > 0:
|
||||
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
|
||||
return
|
||||
|
||||
# Process enqueued requests if there are free slots to transfer for this slot
|
||||
while slot.queue and slot.free_transfer_slots() > 0:
|
||||
slot.lastseen = now
|
||||
request, deferred = slot.queue.popleft()
|
||||
dfd = self._download(slot, request, spider)
|
||||
dfd.chainDeferred(deferred)
|
||||
# prevent burst if inter-request delays were configured
|
||||
if delay:
|
||||
self._process_queue(spider, slot)
|
||||
break
|
||||
|
||||
def _download(self, slot, request, spider):
|
||||
# The order is very important for the following deferreds. Do not change!
|
||||
|
||||
# 1. Create the download deferred
|
||||
dfd = mustbe_deferred(self.handlers.download_request, request, spider)
|
||||
|
||||
# 2. Notify response_downloaded listeners about the recent download
|
||||
# before querying queue for next request
|
||||
def _downloaded(response):
|
||||
self.signals.send_catch_log(signal=signals.response_downloaded,
|
||||
response=response,
|
||||
request=request,
|
||||
spider=spider)
|
||||
return response
|
||||
dfd.addCallback(_downloaded)
|
||||
|
||||
# 3. After response arrives, remove the request from transferring
|
||||
# state to free up the transferring slot so it can be used by the
|
||||
# following requests (perhaps those which came from the downloader
|
||||
# middleware itself)
|
||||
slot.transferring.add(request)
|
||||
|
||||
def finish_transferring(_):
|
||||
slot.transferring.remove(request)
|
||||
self._process_queue(spider, slot)
|
||||
self.signals.send_catch_log(signal=signals.request_left_downloader,
|
||||
request=request,
|
||||
spider=spider)
|
||||
return _
|
||||
|
||||
return dfd.addBoth(finish_transferring)
|
||||
|
||||
def close(self):
|
||||
self._slot_gc_loop.stop()
|
||||
for slot in self.slots.values():
|
||||
slot.close()
|
||||
|
||||
def _slot_gc(self, age=60):
|
||||
mintime = time() - age
|
||||
for key, slot in list(self.slots.items()):
|
||||
if not slot.active and slot.lastseen + slot.delay < mintime:
|
||||
self.slots.pop(key).close()
@@ -0,0 +1,94 @@
from OpenSSL import SSL
|
||||
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
|
||||
from twisted.web.client import BrowserLikePolicyForHTTPS
|
||||
from twisted.web.iweb import IPolicyForHTTPS
|
||||
from zope.interface.declarations import implementer
|
||||
|
||||
from scrapy.core.downloader.tls import ScrapyClientTLSOptions, DEFAULT_CIPHERS
|
||||
|
||||
|
||||
@implementer(IPolicyForHTTPS)
|
||||
class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
|
||||
"""
|
||||
Non-peer-certificate verifying HTTPS context factory
|
||||
|
||||
Default OpenSSL method is TLS_METHOD (also called SSLv23_METHOD)
|
||||
which allows TLS protocol negotiation
|
||||
|
||||
'A TLS/SSL connection established with [this method] may
|
||||
understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.'
|
||||
"""
|
||||
|
||||
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._ssl_method = method
|
||||
self.tls_verbose_logging = tls_verbose_logging
|
||||
if tls_ciphers:
|
||||
self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
|
||||
else:
|
||||
self.tls_ciphers = DEFAULT_CIPHERS
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
|
||||
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
|
||||
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
|
||||
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
|
||||
|
||||
def getCertificateOptions(self):
|
||||
# setting verify=True will require you to provide CAs
|
||||
# to verify against; in other words: it's not that simple
|
||||
|
||||
# backward-compatible SSL/TLS method:
|
||||
#
|
||||
# * this will respect `method` attribute in often recommended
|
||||
# `ScrapyClientContextFactory` subclass
|
||||
# (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
|
||||
#
|
||||
# * getattr() for `_ssl_method` attribute for context factories
|
||||
# not calling super().__init__
|
||||
return CertificateOptions(
|
||||
verify=False,
|
||||
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
|
||||
fixBrokenPeers=True,
|
||||
acceptableCiphers=self.tls_ciphers,
|
||||
)
|
||||
|
||||
# kept for old-style HTTP/1.0 downloader context twisted calls,
|
||||
# e.g. connectSSL()
|
||||
def getContext(self, hostname=None, port=None):
|
||||
return self.getCertificateOptions().getContext()
|
||||
|
||||
def creatorForNetloc(self, hostname, port):
|
||||
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
|
||||
verbose_logging=self.tls_verbose_logging)
|
||||
|
||||
|
||||
@implementer(IPolicyForHTTPS)
|
||||
class BrowserLikeContextFactory(ScrapyClientContextFactory):
|
||||
"""
|
||||
Twisted-recommended context factory for web clients.
|
||||
|
||||
Quoting the documentation of the :class:`~twisted.web.client.Agent` class:
|
||||
|
||||
The default is to use a
|
||||
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS`, so unless you
|
||||
have special requirements you can leave this as-is.
|
||||
|
||||
:meth:`creatorForNetloc` is the same as
|
||||
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS` except this context
|
||||
factory allows setting the TLS/SSL method to use.
|
||||
|
||||
The default OpenSSL method is ``TLS_METHOD`` (also called
|
||||
``SSLv23_METHOD``) which allows TLS protocol negotiation.
|
||||
"""
|
||||
def creatorForNetloc(self, hostname, port):
|
||||
|
||||
# trustRoot set to platformTrust() will use the platform's root CAs.
|
||||
#
|
||||
# This means that a website like https://www.cacert.org will be rejected
|
||||
# by default, since CAcert.org CA certificate is seldom shipped.
|
||||
return optionsForClientTLS(
|
||||
hostname=hostname.decode("ascii"),
|
||||
trustRoot=platformTrust(),
|
||||
extraCertificateOptions={'method': self._ssl_method},
|
||||
)
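Because the default factory above deliberately skips peer-certificate verification, switching to platform-trust verification is a settings change. A hedged settings.py sketch (the setting names are real Scrapy settings; the values are examples):

DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'
DOWNLOADER_CLIENT_TLS_METHOD = 'TLS'    # negotiate the protocol version (the default value)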
@@ -0,0 +1,81 @@
"""Download handlers for different schemes"""
|
||||
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.exceptions import NotConfigured, NotSupported
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DownloadHandlers:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self._crawler = crawler
|
||||
self._schemes = {} # stores acceptable schemes on instancing
|
||||
self._handlers = {} # stores instanced handlers for schemes
|
||||
self._notconfigured = {} # remembers failed handlers
|
||||
handlers = without_none_values(
|
||||
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
|
||||
for scheme, clspath in handlers.items():
|
||||
self._schemes[scheme] = clspath
|
||||
self._load_handler(scheme, skip_lazy=True)
|
||||
|
||||
crawler.signals.connect(self._close, signals.engine_stopped)
|
||||
|
||||
def _get_handler(self, scheme):
|
||||
"""Lazy-load the downloadhandler for a scheme
|
||||
only on the first request for that scheme.
|
||||
"""
|
||||
if scheme in self._handlers:
|
||||
return self._handlers[scheme]
|
||||
if scheme in self._notconfigured:
|
||||
return None
|
||||
if scheme not in self._schemes:
|
||||
self._notconfigured[scheme] = 'no handler available for that scheme'
|
||||
return None
|
||||
|
||||
return self._load_handler(scheme)
|
||||
|
||||
def _load_handler(self, scheme, skip_lazy=False):
|
||||
path = self._schemes[scheme]
|
||||
try:
|
||||
dhcls = load_object(path)
|
||||
if skip_lazy and getattr(dhcls, 'lazy', True):
|
||||
return None
|
||||
dh = create_instance(
|
||||
objcls=dhcls,
|
||||
settings=self._crawler.settings,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
except NotConfigured as ex:
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
except Exception as ex:
|
||||
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
|
||||
{"clspath": path, "scheme": scheme},
|
||||
exc_info=True, extra={'crawler': self._crawler})
|
||||
self._notconfigured[scheme] = str(ex)
|
||||
return None
|
||||
else:
|
||||
self._handlers[scheme] = dh
|
||||
return dh
|
||||
|
||||
def download_request(self, request, spider):
|
||||
scheme = urlparse_cached(request).scheme
|
||||
handler = self._get_handler(scheme)
|
||||
if not handler:
|
||||
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
|
||||
return handler.download_request(request, spider)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _close(self, *_a, **_kw):
|
||||
for dh in self._handlers.values():
|
||||
if hasattr(dh, 'close'):
|
||||
yield dh.close()
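A hedged settings.py sketch of how these handlers are wired up: DOWNLOAD_HANDLERS entries are merged with the built-in ones via getwithbase(), and mapping a scheme to None removes it, which is exactly what without_none_values() filters out above.

DOWNLOAD_HANDLERS = {
    'ftp': None,   # disable FTP downloads entirely
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',   # spell out the default S3 handler
}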
@@ -0,0 +1,22 @@
from w3lib.url import parse_data_uri

from scrapy.http import TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers


class DataURIDownloadHandler:
    lazy = False

    @defers
    def download_request(self, request, spider):
        uri = parse_data_uri(request.url)
        respcls = responsetypes.from_mimetype(uri.media_type)

        resp_kwargs = {}
        if (issubclass(respcls, TextResponse)
                and uri.media_type.split('/')[0] == 'text'):
            charset = uri.media_type_parameters.get('charset')
            resp_kwargs['encoding'] = charset

        return respcls(url=request.url, body=uri.data, **resp_kwargs)
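A hedged usage sketch: a data: URI is decoded locally by this handler, so the response body comes straight from the URI payload with no network round-trip (the example URI and spider name are made up).

import scrapy

class DataUriSpider(scrapy.Spider):
    name = 'datauri-demo'
    start_urls = ['data:text/html;charset=utf-8,<p>hello</p>']

    def parse(self, response):
        yield {'text': response.css('p::text').get()}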
@@ -0,0 +1,16 @@
from w3lib.url import file_uri_to_path

from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers


class FileDownloadHandler:
    lazy = False

    @defers
    def download_request(self, request, spider):
        filepath = file_uri_to_path(request.url)
        with open(filepath, 'rb') as fo:
            body = fo.read()
        respcls = responsetypes.from_args(filename=filepath, body=body)
        return respcls(url=request.url, body=body)
@@ -0,0 +1,119 @@
"""
|
||||
An asynchronous FTP file download handler for scrapy which somehow emulates an http response.
|
||||
|
||||
FTP connection parameters are passed using the request meta field:
|
||||
- ftp_user (required)
|
||||
- ftp_password (required)
|
||||
- ftp_passive (by default, enabled) sets FTP connection passive mode
|
||||
- ftp_local_filename
|
||||
- If not given, file data will come in the response.body, as a normal scrapy Response,
|
||||
which will imply that the entire file will be on memory.
|
||||
- if given, file data will be saved in a local file with the given name
|
||||
This helps when downloading very big files to avoid memory issues. In addition, for
|
||||
convenience the local file name will also be given in the response body.
|
||||
|
||||
The status of the built html response will be, by default
|
||||
- 200 in case of success
|
||||
- 404 in case specified file was not found in the server (ftp code 550)
|
||||
|
||||
or raise corresponding ftp exception otherwise
|
||||
|
||||
The matching from server ftp command return codes to html response codes is defined in the
|
||||
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
|
||||
that is not explicitly present among the map keys. You may need to overwrite this
|
||||
mapping if want a different behaviour than default.
|
||||
|
||||
In case of status 200 request, response.headers will come with two keys:
|
||||
'Local Filename' - with the value of the local filename if given
|
||||
'Size' - with size of the downloaded data
|
||||
"""
|
||||
|
||||
import re
|
||||
from io import BytesIO
|
||||
from urllib.parse import unquote
|
||||
|
||||
from twisted.internet.protocol import ClientCreator, Protocol
|
||||
from twisted.protocols.ftp import CommandFailed, FTPClient
|
||||
|
||||
from scrapy.http import Response
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
||||
class ReceivedDataProtocol(Protocol):
|
||||
def __init__(self, filename=None):
|
||||
self.__filename = filename
|
||||
self.body = open(filename, "wb") if filename else BytesIO()
|
||||
self.size = 0
|
||||
|
||||
def dataReceived(self, data):
|
||||
self.body.write(data)
|
||||
self.size += len(data)
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
return self.__filename
|
||||
|
||||
def close(self):
|
||||
self.body.close() if self.filename else self.body.seek(0)
|
||||
|
||||
|
||||
_CODE_RE = re.compile(r"\d+")
|
||||
|
||||
|
||||
class FTPDownloadHandler:
|
||||
lazy = False
|
||||
|
||||
CODE_MAPPING = {
|
||||
"550": 404,
|
||||
"default": 503,
|
||||
}
|
||||
|
||||
def __init__(self, settings):
|
||||
self.default_user = settings['FTP_USER']
|
||||
self.default_password = settings['FTP_PASSWORD']
|
||||
self.passive_mode = settings['FTP_PASSIVE_MODE']
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
from twisted.internet import reactor
|
||||
parsed_url = urlparse_cached(request)
|
||||
user = request.meta.get("ftp_user", self.default_user)
|
||||
password = request.meta.get("ftp_password", self.default_password)
|
||||
passive_mode = 1 if bool(request.meta.get("ftp_passive",
|
||||
self.passive_mode)) else 0
|
||||
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
|
||||
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
|
||||
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
|
||||
|
||||
def gotClient(self, client, request, filepath):
|
||||
self.client = client
|
||||
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
|
||||
return client.retrieveFile(filepath, protocol).addCallbacks(
|
||||
callback=self._build_response,
|
||||
callbackArgs=(request, protocol),
|
||||
errback=self._failed,
|
||||
errbackArgs=(request,),
|
||||
)
|
||||
|
||||
def _build_response(self, result, request, protocol):
|
||||
self.result = result
|
||||
respcls = responsetypes.from_args(url=request.url)
|
||||
protocol.close()
|
||||
body = protocol.filename or protocol.body.read()
|
||||
headers = {"local filename": protocol.filename or '', "size": protocol.size}
|
||||
return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
|
||||
|
||||
def _failed(self, result, request):
|
||||
message = result.getErrorMessage()
|
||||
if result.type == CommandFailed:
|
||||
m = _CODE_RE.search(message)
|
||||
if m:
|
||||
ftpcode = m.group()
|
||||
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
|
||||
return Response(url=request.url, status=httpcode, body=to_bytes(message))
|
||||
raise result.type(result.value)
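A hedged sketch of the meta keys described in the module docstring above (host, credentials and filename are invented for the example):

from scrapy import Request

ftp_request = Request(
    url='ftp://ftp.example.com/pub/data.csv',
    meta={
        'ftp_user': 'anonymous',
        'ftp_password': 'guest@example.com',
        'ftp_local_filename': '/tmp/data.csv',  # stream to disk instead of response.body
    },
)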
@@ -0,0 +1,4 @@
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import (
    HTTP11DownloadHandler as HTTPDownloadHandler,
)
@@ -0,0 +1,37 @@
"""Download handlers for http and https schemes
|
||||
"""
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
|
||||
class HTTP10DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings, crawler=None):
|
||||
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
|
||||
self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
|
||||
self._settings = settings
|
||||
self._crawler = crawler
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings, crawler)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
"""Return a deferred for the HTTP download"""
|
||||
factory = self.HTTPClientFactory(request)
|
||||
self._connect(factory)
|
||||
return factory.deferred
|
||||
|
||||
def _connect(self, factory):
|
||||
from twisted.internet import reactor
|
||||
host, port = to_unicode(factory.host), factory.port
|
||||
if factory.scheme == b'https':
|
||||
client_context_factory = create_instance(
|
||||
objcls=self.ClientContextFactory,
|
||||
settings=self._settings,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
return reactor.connectSSL(host, port, factory, client_context_factory)
|
||||
else:
|
||||
return reactor.connectTCP(host, port, factory)
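HTTP/1.0 is rarely needed; as a hedged settings.py sketch, forcing this legacy handler for http/https is a matter of overriding the download handlers (the HTTP/1.1 handler in the next file is the default):

DOWNLOAD_HANDLERS = {
    'http': 'scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler',
}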
@@ -0,0 +1,568 @@
"""Download handlers for http and https schemes"""
|
||||
|
||||
import ipaddress
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
from time import time
|
||||
from urllib.parse import urldefrag
|
||||
|
||||
from twisted.internet import defer, protocol, ssl
|
||||
from twisted.internet.endpoints import TCP4ClientEndpoint
|
||||
from twisted.internet.error import TimeoutError
|
||||
from twisted.python.failure import Failure
|
||||
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
|
||||
from twisted.web.http import _DataLoss, PotentialDataLoss
|
||||
from twisted.web.http_headers import Headers as TxHeaders
|
||||
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
|
||||
from zope.interface import implementer
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.downloader.tls import openssl_methods
|
||||
from scrapy.core.downloader.webclient import _parse
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
|
||||
from scrapy.http import Headers
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.python import to_bytes, to_unicode
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTTP11DownloadHandler:
|
||||
lazy = False
|
||||
|
||||
def __init__(self, settings, crawler=None):
|
||||
self._crawler = crawler
|
||||
|
||||
from twisted.internet import reactor
|
||||
self._pool = HTTPConnectionPool(reactor, persistent=True)
|
||||
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
|
||||
self._pool._factory.noisy = False
|
||||
|
||||
self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
|
||||
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
|
||||
# try method-aware context factory
|
||||
try:
|
||||
self._contextFactory = create_instance(
|
||||
objcls=self._contextFactoryClass,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
method=self._sslMethod,
|
||||
)
|
||||
except TypeError:
|
||||
# use context factory defaults
|
||||
self._contextFactory = create_instance(
|
||||
objcls=self._contextFactoryClass,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
)
|
||||
msg = f"""
|
||||
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
|
||||
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
|
||||
`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
|
||||
Please upgrade your context factory class to handle them or ignore them."""
|
||||
warnings.warn(msg)
|
||||
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
|
||||
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
|
||||
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
|
||||
self._disconnect_timeout = 1
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings, crawler)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
"""Return a deferred for the HTTP download"""
|
||||
agent = ScrapyAgent(
|
||||
contextFactory=self._contextFactory,
|
||||
pool=self._pool,
|
||||
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
|
||||
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
|
||||
fail_on_dataloss=self._fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
return agent.download_request(request)
|
||||
|
||||
def close(self):
|
||||
from twisted.internet import reactor
|
||||
d = self._pool.closeCachedConnections()
|
||||
# closeCachedConnections will hang on network or server issues, so
|
||||
# we'll manually timeout the deferred.
|
||||
#
|
||||
# Twisted issue addressing this problem can be found here:
|
||||
# https://twistedmatrix.com/trac/ticket/7738.
|
||||
#
|
||||
# closeCachedConnections doesn't handle external errbacks, so we'll
|
||||
# issue a callback after `_disconnect_timeout` seconds.
|
||||
delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])
|
||||
|
||||
def cancel_delayed_call(result):
|
||||
if delayed_call.active():
|
||||
delayed_call.cancel()
|
||||
return result
|
||||
|
||||
d.addBoth(cancel_delayed_call)
|
||||
return d
|
||||
|
||||
|
||||
class TunnelError(Exception):
|
||||
"""An HTTP CONNECT tunnel could not be established by the proxy."""
|
||||
|
||||
|
||||
class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
|
||||
"""An endpoint that tunnels through proxies to allow HTTPS downloads. To
|
||||
accomplish that, this endpoint sends an HTTP CONNECT to the proxy.
|
||||
The HTTP CONNECT is always sent when using this endpoint, I think this could
|
||||
be improved as the CONNECT will be redundant if the connection associated
|
||||
with this endpoint comes from the pool and a CONNECT has already been issued
|
||||
for it.
|
||||
"""
|
||||
|
||||
_responseMatcher = re.compile(br'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,32})')
|
||||
|
||||
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
|
||||
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
|
||||
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
|
||||
self._tunnelReadyDeferred = defer.Deferred()
|
||||
self._tunneledHost = host
|
||||
self._tunneledPort = port
|
||||
self._contextFactory = contextFactory
|
||||
self._connectBuffer = bytearray()
|
||||
|
||||
def requestTunnel(self, protocol):
|
||||
"""Asks the proxy to open a tunnel."""
|
||||
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
|
||||
protocol.transport.write(tunnelReq)
|
||||
self._protocolDataReceived = protocol.dataReceived
|
||||
protocol.dataReceived = self.processProxyResponse
|
||||
self._protocol = protocol
|
||||
return protocol
|
||||
|
||||
def processProxyResponse(self, rcvd_bytes):
|
||||
"""Processes the response from the proxy. If the tunnel is successfully
|
||||
created, notifies the client that we are ready to send requests. If not
|
||||
raises a TunnelError.
|
||||
"""
|
||||
self._connectBuffer += rcvd_bytes
|
||||
# make sure that enough (all) bytes are consumed
|
||||
# and that we've got all HTTP headers (ending with a blank line)
|
||||
# from the proxy so that we don't send those bytes to the TLS layer
|
||||
#
|
||||
# see https://github.com/scrapy/scrapy/issues/2491
|
||||
if b'\r\n\r\n' not in self._connectBuffer:
|
||||
return
|
||||
self._protocol.dataReceived = self._protocolDataReceived
|
||||
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
|
||||
if respm and int(respm.group('status')) == 200:
|
||||
# set proper Server Name Indication extension
|
||||
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
|
||||
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
|
||||
self._tunnelReadyDeferred.callback(self._protocol)
|
||||
else:
|
||||
if respm:
|
||||
extra = {'status': int(respm.group('status')),
|
||||
'reason': respm.group('reason').strip()}
|
||||
else:
|
||||
extra = rcvd_bytes[:32]
|
||||
self._tunnelReadyDeferred.errback(
|
||||
TunnelError('Could not open CONNECT tunnel with proxy '
|
||||
f'{self._host}:{self._port} [{extra!r}]')
|
||||
)
|
||||
|
||||
def connectFailed(self, reason):
|
||||
"""Propagates the errback to the appropriate deferred."""
|
||||
self._tunnelReadyDeferred.errback(reason)
|
||||
|
||||
def connect(self, protocolFactory):
|
||||
self._protocolFactory = protocolFactory
|
||||
connectDeferred = super().connect(protocolFactory)
|
||||
connectDeferred.addCallback(self.requestTunnel)
|
||||
connectDeferred.addErrback(self.connectFailed)
|
||||
return self._tunnelReadyDeferred
|
||||
|
||||
|
||||
def tunnel_request_data(host, port, proxy_auth_header=None):
|
||||
r"""
|
||||
Return binary content of a CONNECT request.
|
||||
|
||||
>>> from scrapy.utils.python import to_unicode as s
|
||||
>>> s(tunnel_request_data("example.com", 8080))
|
||||
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
|
||||
>>> s(tunnel_request_data("example.com", 8080, b"123"))
|
||||
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
|
||||
>>> s(tunnel_request_data(b"example.com", "8090"))
|
||||
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
|
||||
"""
|
||||
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
|
||||
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
|
||||
tunnel_req += b'Host: ' + host_value + b'\r\n'
|
||||
if proxy_auth_header:
|
||||
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
|
||||
tunnel_req += b'\r\n'
|
||||
return tunnel_req
|
||||
|
||||
|
||||
class TunnelingAgent(Agent):
|
||||
"""An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
|
||||
downloads. It may look strange that we have chosen to subclass Agent and not
|
||||
ProxyAgent but consider that after the tunnel is opened the proxy is
|
||||
transparent to the client; thus the agent should behave like there is no
|
||||
proxy involved.
|
||||
"""
|
||||
|
||||
def __init__(self, reactor, proxyConf, contextFactory=None,
|
||||
connectTimeout=None, bindAddress=None, pool=None):
|
||||
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
|
||||
self._proxyConf = proxyConf
|
||||
self._contextFactory = contextFactory
|
||||
|
||||
def _getEndpoint(self, uri):
|
||||
return TunnelingTCP4ClientEndpoint(
|
||||
reactor=self._reactor,
|
||||
host=uri.host,
|
||||
port=uri.port,
|
||||
proxyConf=self._proxyConf,
|
||||
contextFactory=self._contextFactory,
|
||||
timeout=self._endpointFactory._connectTimeout,
|
||||
bindAddress=self._endpointFactory._bindAddress,
|
||||
)
|
||||
|
||||
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
|
||||
# proxy host and port are required for HTTP pool `key`
|
||||
# otherwise, same remote host connection request could reuse
|
||||
# a cached tunneled connection to a different proxy
|
||||
key = key + self._proxyConf
|
||||
return super()._requestWithEndpoint(
|
||||
key=key,
|
||||
endpoint=endpoint,
|
||||
method=method,
|
||||
parsedURI=parsedURI,
|
||||
headers=headers,
|
||||
bodyProducer=bodyProducer,
|
||||
requestPath=requestPath,
|
||||
)
|
||||
|
||||
|
||||
class ScrapyProxyAgent(Agent):
|
||||
|
||||
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
|
||||
super().__init__(
|
||||
reactor=reactor,
|
||||
connectTimeout=connectTimeout,
|
||||
bindAddress=bindAddress,
|
||||
pool=pool,
|
||||
)
|
||||
self._proxyURI = URI.fromBytes(proxyURI)
|
||||
|
||||
def request(self, method, uri, headers=None, bodyProducer=None):
|
||||
"""
|
||||
Issue a new request via the configured proxy.
|
||||
"""
|
||||
# Cache *all* connections under the same key, since we are only
|
||||
# connecting to a single destination, the proxy:
|
||||
return self._requestWithEndpoint(
|
||||
key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
|
||||
endpoint=self._getEndpoint(self._proxyURI),
|
||||
method=method,
|
||||
parsedURI=URI.fromBytes(uri),
|
||||
headers=headers,
|
||||
bodyProducer=bodyProducer,
|
||||
requestPath=uri,
|
||||
)
|
||||
|
||||
|
||||
class ScrapyAgent:
|
||||
|
||||
_Agent = Agent
|
||||
_ProxyAgent = ScrapyProxyAgent
|
||||
_TunnelingAgent = TunnelingAgent
|
||||
|
||||
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
|
||||
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
|
||||
self._contextFactory = contextFactory
|
||||
self._connectTimeout = connectTimeout
|
||||
self._bindAddress = bindAddress
|
||||
self._pool = pool
|
||||
self._maxsize = maxsize
|
||||
self._warnsize = warnsize
|
||||
self._fail_on_dataloss = fail_on_dataloss
|
||||
self._txresponse = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _get_agent(self, request, timeout):
|
||||
from twisted.internet import reactor
|
||||
bindaddress = request.meta.get('bindaddress') or self._bindAddress
|
||||
proxy = request.meta.get('proxy')
|
||||
if proxy:
|
||||
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
|
||||
scheme = _parse(request.url)[0]
|
||||
proxyHost = to_unicode(proxyHost)
|
||||
omitConnectTunnel = b'noconnect' in proxyParams
|
||||
if omitConnectTunnel:
|
||||
warnings.warn("Using HTTPS proxies in the noconnect mode is deprecated. "
|
||||
"If you use Crawlera, it doesn't require this mode anymore, "
|
||||
"so you should update scrapy-crawlera to 1.3.0+ "
|
||||
"and remove '?noconnect' from the Crawlera URL.",
|
||||
ScrapyDeprecationWarning)
|
||||
if scheme == b'https' and not omitConnectTunnel:
|
||||
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
|
||||
proxyConf = (proxyHost, proxyPort, proxyAuth)
|
||||
return self._TunnelingAgent(
|
||||
reactor=reactor,
|
||||
proxyConf=proxyConf,
|
||||
contextFactory=self._contextFactory,
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
else:
|
||||
return self._ProxyAgent(
|
||||
reactor=reactor,
|
||||
proxyURI=to_bytes(proxy, encoding='ascii'),
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
|
||||
return self._Agent(
|
||||
reactor=reactor,
|
||||
contextFactory=self._contextFactory,
|
||||
connectTimeout=timeout,
|
||||
bindAddress=bindaddress,
|
||||
pool=self._pool,
|
||||
)
|
||||
|
||||
def download_request(self, request):
|
||||
from twisted.internet import reactor
|
||||
timeout = request.meta.get('download_timeout') or self._connectTimeout
|
||||
agent = self._get_agent(request, timeout)
|
||||
|
||||
# request details
|
||||
url = urldefrag(request.url)[0]
|
||||
method = to_bytes(request.method)
|
||||
headers = TxHeaders(request.headers)
|
||||
if isinstance(agent, self._TunnelingAgent):
|
||||
headers.removeHeader(b'Proxy-Authorization')
|
||||
if request.body:
|
||||
bodyproducer = _RequestBodyProducer(request.body)
|
||||
else:
|
||||
bodyproducer = None
|
||||
start_time = time()
|
||||
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
|
||||
# set download latency
|
||||
d.addCallback(self._cb_latency, request, start_time)
|
||||
# response body is ready to be consumed
|
||||
d.addCallback(self._cb_bodyready, request)
|
||||
d.addCallback(self._cb_bodydone, request, url)
|
||||
# check download timeout
|
||||
self._timeout_cl = reactor.callLater(timeout, d.cancel)
|
||||
d.addBoth(self._cb_timeout, request, url, timeout)
|
||||
return d
|
||||
|
||||
def _cb_timeout(self, result, request, url, timeout):
|
||||
if self._timeout_cl.active():
|
||||
self._timeout_cl.cancel()
|
||||
return result
|
||||
# needed for HTTPS requests, otherwise _ResponseReader doesn't
|
||||
# receive connectionLost()
|
||||
if self._txresponse:
|
||||
self._txresponse._transport.stopProducing()
|
||||
|
||||
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
|
||||
|
||||
def _cb_latency(self, result, request, start_time):
|
||||
request.meta['download_latency'] = time() - start_time
|
||||
return result
|
||||
|
||||
def _cb_bodyready(self, txresponse, request):
|
||||
# deliverBody hangs for responses without body
|
||||
if txresponse.length == 0:
|
||||
return {
|
||||
"txresponse": txresponse,
|
||||
"body": b"",
|
||||
"flags": None,
|
||||
"certificate": None,
|
||||
"ip_address": None,
|
||||
}
|
||||
|
||||
maxsize = request.meta.get('download_maxsize', self._maxsize)
|
||||
warnsize = request.meta.get('download_warnsize', self._warnsize)
|
||||
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
|
||||
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
|
||||
|
||||
if maxsize and expected_size > maxsize:
|
||||
warning_msg = ("Cancelling download of %(url)s: expected response "
|
||||
"size (%(size)s) larger than download max size (%(maxsize)s).")
|
||||
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
|
||||
|
||||
logger.warning(warning_msg, warning_args)
|
||||
|
||||
txresponse._transport._producer.loseConnection()
|
||||
raise defer.CancelledError(warning_msg % warning_args)
|
||||
|
||||
if warnsize and expected_size > warnsize:
|
||||
logger.warning("Expected response size (%(size)s) larger than "
|
||||
"download warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'size': expected_size, 'warnsize': warnsize, 'request': request})
|
||||
|
||||
def _cancel(_):
|
||||
# Abort connection immediately.
|
||||
txresponse._transport._producer.abortConnection()
|
||||
|
||||
d = defer.Deferred(_cancel)
|
||||
txresponse.deliverBody(
|
||||
_ResponseReader(
|
||||
finished=d,
|
||||
txresponse=txresponse,
|
||||
request=request,
|
||||
maxsize=maxsize,
|
||||
warnsize=warnsize,
|
||||
fail_on_dataloss=fail_on_dataloss,
|
||||
crawler=self._crawler,
|
||||
)
|
||||
)
|
||||
|
||||
# save response for timeouts
|
||||
self._txresponse = txresponse
|
||||
|
||||
return d
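# settings.py sketch (values are examples, not the defaults): the limits enforced
# in _cb_bodyready and _ResponseReader come from these settings, or per-request
# from the download_maxsize / download_warnsize / download_fail_on_dataloss meta keys.
DOWNLOAD_MAXSIZE = 64 * 1024 * 1024    # cancel responses expected or received above 64 MiB
DOWNLOAD_WARNSIZE = 32 * 1024 * 1024   # only log a warning above 32 MiB
DOWNLOAD_FAIL_ON_DATALOSS = False      # keep truncated responses, flagged as 'dataloss'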
|
||||
|
||||
def _cb_bodydone(self, result, request, url):
|
||||
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
|
||||
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
|
||||
response = respcls(
|
||||
url=url,
|
||||
status=int(result["txresponse"].code),
|
||||
headers=headers,
|
||||
body=result["body"],
|
||||
flags=result["flags"],
|
||||
certificate=result["certificate"],
|
||||
ip_address=result["ip_address"],
|
||||
)
|
||||
if result.get("failure"):
|
||||
result["failure"].value.response = response
|
||||
return result["failure"]
|
||||
return response
|
||||
|
||||
|
||||
@implementer(IBodyProducer)
|
||||
class _RequestBodyProducer:
|
||||
|
||||
def __init__(self, body):
|
||||
self.body = body
|
||||
self.length = len(body)
|
||||
|
||||
def startProducing(self, consumer):
|
||||
consumer.write(self.body)
|
||||
return defer.succeed(None)
|
||||
|
||||
def pauseProducing(self):
|
||||
pass
|
||||
|
||||
def stopProducing(self):
|
||||
pass
|
||||
|
||||
|
||||
class _ResponseReader(protocol.Protocol):
|
||||
|
||||
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
|
||||
self._finished = finished
|
||||
self._txresponse = txresponse
|
||||
self._request = request
|
||||
self._bodybuf = BytesIO()
|
||||
self._maxsize = maxsize
|
||||
self._warnsize = warnsize
|
||||
self._fail_on_dataloss = fail_on_dataloss
|
||||
self._fail_on_dataloss_warned = False
|
||||
self._reached_warnsize = False
|
||||
self._bytes_received = 0
|
||||
self._certificate = None
|
||||
self._ip_address = None
|
||||
self._crawler = crawler
|
||||
|
||||
def _finish_response(self, flags=None, failure=None):
|
||||
self._finished.callback({
|
||||
"txresponse": self._txresponse,
|
||||
"body": self._bodybuf.getvalue(),
|
||||
"flags": flags,
|
||||
"certificate": self._certificate,
|
||||
"ip_address": self._ip_address,
|
||||
"failure": failure,
|
||||
})
|
||||
|
||||
def connectionMade(self):
|
||||
if self._certificate is None:
|
||||
with suppress(AttributeError):
|
||||
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
|
||||
|
||||
if self._ip_address is None:
|
||||
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
|
||||
|
||||
def dataReceived(self, bodyBytes):
|
||||
# This maybe called several times after cancel was called with buffered data.
|
||||
if self._finished.called:
|
||||
return
|
||||
|
||||
self._bodybuf.write(bodyBytes)
|
||||
self._bytes_received += len(bodyBytes)
|
||||
|
||||
bytes_received_result = self._crawler.signals.send_catch_log(
|
||||
signal=signals.bytes_received,
|
||||
data=bodyBytes,
|
||||
request=self._request,
|
||||
spider=self._crawler.spider,
|
||||
)
|
||||
for handler, result in bytes_received_result:
|
||||
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
|
||||
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
|
||||
{"request": self._request, "handler": handler.__qualname__})
|
||||
self.transport._producer.loseConnection()
|
||||
failure = result if result.value.fail else None
|
||||
self._finish_response(flags=["download_stopped"], failure=failure)
|
||||
|
||||
if self._maxsize and self._bytes_received > self._maxsize:
|
||||
logger.warning("Received (%(bytes)s) bytes larger than download "
|
||||
"max size (%(maxsize)s) in request %(request)s.",
|
||||
{'bytes': self._bytes_received,
|
||||
'maxsize': self._maxsize,
|
||||
'request': self._request})
|
||||
# Clear buffer earlier to avoid keeping data in memory for a long time.
|
||||
self._bodybuf.truncate(0)
|
||||
self._finished.cancel()
|
||||
|
||||
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
|
||||
self._reached_warnsize = True
|
||||
logger.warning("Received more bytes than download "
|
||||
"warn size (%(warnsize)s) in request %(request)s.",
|
||||
{'warnsize': self._warnsize,
|
||||
'request': self._request})
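# Hedged sketch (spider name invented): the bytes_received signal sent above lets
# a handler stop a download early by raising StopDownload; fail=False delivers the
# partial body to the callback with the 'download_stopped' flag set.
import scrapy
from scrapy import signals
from scrapy.exceptions import StopDownload

class HeadOnlySpider(scrapy.Spider):
    name = 'head-only'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.on_bytes_received, signal=signals.bytes_received)
        return spider

    def on_bytes_received(self, data, request, spider):
        raise StopDownload(fail=False)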
|
||||
|
||||
def connectionLost(self, reason):
|
||||
if self._finished.called:
|
||||
return
|
||||
|
||||
if reason.check(ResponseDone):
|
||||
self._finish_response()
|
||||
return
|
||||
|
||||
if reason.check(PotentialDataLoss):
|
||||
self._finish_response(flags=["partial"])
|
||||
return
|
||||
|
||||
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
|
||||
if not self._fail_on_dataloss:
|
||||
self._finish_response(flags=["dataloss"])
|
||||
return
|
||||
|
||||
elif not self._fail_on_dataloss_warned:
|
||||
logger.warning("Got data loss in %s. If you want to process broken "
|
||||
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
|
||||
" -- This message won't be shown in further requests",
|
||||
self._txresponse.request.absoluteURI.decode())
|
||||
self._fail_on_dataloss_warned = True
|
||||
|
||||
self._finished.errback(reason)
@@ -0,0 +1,82 @@
from urllib.parse import unquote
|
||||
|
||||
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.boto import is_botocore_available
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.misc import create_instance
|
||||
|
||||
|
||||
class S3DownloadHandler:
|
||||
|
||||
def __init__(self, settings, *,
|
||||
crawler=None,
|
||||
aws_access_key_id=None, aws_secret_access_key=None,
|
||||
httpdownloadhandler=HTTPDownloadHandler, **kw):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured('missing botocore library')
|
||||
|
||||
if not aws_access_key_id:
|
||||
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
|
||||
if not aws_secret_access_key:
|
||||
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
|
||||
|
||||
# If no credentials could be found anywhere,
|
||||
# consider this an anonymous connection request by default;
|
||||
# unless 'anon' was set explicitly (True/False).
|
||||
anon = kw.get('anon')
|
||||
if anon is None and not aws_access_key_id and not aws_secret_access_key:
|
||||
kw['anon'] = True
|
||||
self.anon = kw.get('anon')
|
||||
|
||||
self._signer = None
|
||||
import botocore.auth
|
||||
import botocore.credentials
|
||||
kw.pop('anon', None)
|
||||
if kw:
|
||||
raise TypeError(f'Unexpected keyword arguments: {kw}')
|
||||
if not self.anon:
|
||||
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
|
||||
self._signer = SignerCls(botocore.credentials.Credentials(
|
||||
aws_access_key_id, aws_secret_access_key))
|
||||
|
||||
_http_handler = create_instance(
|
||||
objcls=httpdownloadhandler,
|
||||
settings=settings,
|
||||
crawler=crawler,
|
||||
)
|
||||
self._download_http = _http_handler.download_request
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler, **kwargs):
|
||||
return cls(crawler.settings, crawler=crawler, **kwargs)
|
||||
|
||||
def download_request(self, request, spider):
|
||||
p = urlparse_cached(request)
|
||||
scheme = 'https' if request.meta.get('is_secure') else 'http'
|
||||
bucket = p.hostname
|
||||
path = p.path + '?' + p.query if p.query else p.path
|
||||
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
|
||||
if self.anon:
|
||||
request = request.replace(url=url)
|
||||
elif self._signer is not None:
|
||||
import botocore.awsrequest
|
||||
awsrequest = botocore.awsrequest.AWSRequest(
|
||||
method=request.method,
|
||||
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
|
||||
headers=request.headers.to_unicode_dict(),
|
||||
data=request.body)
|
||||
self._signer.add_auth(awsrequest)
|
||||
request = request.replace(
|
||||
url=url, headers=awsrequest.headers.items())
|
||||
else:
|
||||
signed_headers = self.conn.make_request(
|
||||
method=request.method,
|
||||
bucket=bucket,
|
||||
key=unquote(p.path),
|
||||
query_args=unquote(p.query),
|
||||
headers=request.headers,
|
||||
data=request.body,
|
||||
)
|
||||
request = request.replace(url=url, headers=signed_headers)
|
||||
return self._download_http(request, spider)
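A hedged usage sketch (bucket, key and credentials are placeholders): the handler signs s3:// requests with botocore when AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are set, and falls back to an anonymous request when they are not, rewriting the URL to its https form either way.

import scrapy

class S3Spider(scrapy.Spider):
    name = 's3-demo'
    custom_settings = {
        'AWS_ACCESS_KEY_ID': 'AKIA...',       # placeholder
        'AWS_SECRET_ACCESS_KEY': 'secret',    # placeholder
    }

    def start_requests(self):
        yield scrapy.Request('s3://my-bucket/exports/latest.json', callback=self.parse)

    def parse(self, response):
        yield {'size': len(response.body)}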
@@ -0,0 +1,84 @@
"""
|
||||
Downloader Middleware manager
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
from twisted.internet import defer
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro
|
||||
from scrapy.utils.conf import build_component_list
|
||||
|
||||
|
||||
class DownloaderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'downloader middleware'
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(
|
||||
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
|
||||
|
||||
def _add_middleware(self, mw):
|
||||
if hasattr(mw, 'process_request'):
|
||||
self.methods['process_request'].append(mw.process_request)
|
||||
if hasattr(mw, 'process_response'):
|
||||
self.methods['process_response'].appendleft(mw.process_response)
|
||||
if hasattr(mw, 'process_exception'):
|
||||
self.methods['process_exception'].appendleft(mw.process_exception)
|
||||
|
||||
def download(self, download_func, request, spider):
|
||||
@defer.inlineCallbacks
|
||||
def process_request(request):
|
||||
for method in self.methods['process_request']:
|
||||
response = yield deferred_from_coro(method(request=request, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_request must return None, Response or "
|
||||
f"Request, got {response.__class__.__name__}"
|
||||
)
|
||||
if response:
|
||||
return response
|
||||
return (yield download_func(request=request, spider=spider))
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_response(response):
|
||||
if response is None:
|
||||
raise TypeError("Received None in process_response")
|
||||
elif isinstance(response, Request):
|
||||
return response
|
||||
|
||||
for method in self.methods['process_response']:
|
||||
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_response must return Response or Request, "
|
||||
f"got {type(response)}"
|
||||
)
|
||||
if isinstance(response, Request):
|
||||
return response
|
||||
return response
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def process_exception(failure):
|
||||
exception = failure.value
|
||||
for method in self.methods['process_exception']:
|
||||
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
|
||||
if response is not None and not isinstance(response, (Response, Request)):
|
||||
raise _InvalidOutput(
|
||||
f"Middleware {method.__self__.__class__.__name__}"
|
||||
".process_exception must return None, Response or "
|
||||
f"Request, got {type(response)}"
|
||||
)
|
||||
if response:
|
||||
return response
|
||||
return failure
|
||||
|
||||
deferred = mustbe_deferred(process_request, request)
|
||||
deferred.addErrback(process_exception)
|
||||
deferred.addCallback(process_response)
|
||||
return deferred
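The hooks validated above follow the standard downloader-middleware interface; a hedged minimal example (class name, header name and priority are invented):

class StampHeaderMiddleware:

    def process_request(self, request, spider):
        request.headers.setdefault(b'X-Crawled-By', spider.name.encode())
        return None      # continue down the chain to the download

    def process_response(self, request, response, spider):
        return response  # must return a Response or a Request

# settings.py:
#   DOWNLOADER_MIDDLEWARES = {'myproject.middlewares.StampHeaderMiddleware': 543}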
@@ -0,0 +1,78 @@
import logging
|
||||
|
||||
from OpenSSL import SSL
|
||||
from service_identity.exceptions import CertificateError
|
||||
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
|
||||
from twisted.internet.ssl import AcceptableCiphers
|
||||
|
||||
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
METHOD_SSLv3 = 'SSLv3'
|
||||
METHOD_TLS = 'TLS'
|
||||
METHOD_TLSv10 = 'TLSv1.0'
|
||||
METHOD_TLSv11 = 'TLSv1.1'
|
||||
METHOD_TLSv12 = 'TLSv1.2'
|
||||
|
||||
|
||||
openssl_methods = {
|
||||
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
|
||||
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
|
||||
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
|
||||
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
|
||||
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
|
||||
}
|
||||
|
||||
|
||||
class ScrapyClientTLSOptions(ClientTLSOptions):
|
||||
"""
|
||||
SSL Client connection creator ignoring certificate verification errors
|
||||
(for genuinely invalid certificates or bugs in verification code).
|
||||
|
||||
Same as Twisted's private _sslverify.ClientTLSOptions,
|
||||
except that VerificationError, CertificateError and ValueError
|
||||
exceptions are caught, so that the connection is not closed, only
|
||||
logging warnings. Also, HTTPS connection parameters logging is added.
|
||||
"""
|
||||
|
||||
def __init__(self, hostname, ctx, verbose_logging=False):
|
||||
super().__init__(hostname, ctx)
|
||||
self.verbose_logging = verbose_logging
|
||||
|
||||
def _identityVerifyingInfoCallback(self, connection, where, ret):
|
||||
if where & SSL.SSL_CB_HANDSHAKE_START:
|
||||
connection.set_tlsext_host_name(self._hostnameBytes)
|
||||
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
|
||||
if self.verbose_logging:
|
||||
logger.debug('SSL connection to %s using protocol %s, cipher %s',
|
||||
self._hostnameASCII,
|
||||
connection.get_protocol_version_name(),
|
||||
connection.get_cipher_name(),
|
||||
)
|
||||
server_cert = connection.get_peer_certificate()
|
||||
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
|
||||
x509name_to_string(server_cert.get_issuer()),
|
||||
x509name_to_string(server_cert.get_subject()),
|
||||
)
|
||||
key_info = get_temp_key_info(connection._ssl)
|
||||
if key_info:
|
||||
logger.debug('SSL temp key: %s', key_info)
|
||||
|
||||
try:
|
||||
verifyHostname(connection, self._hostnameASCII)
|
||||
except (CertificateError, VerificationError) as e:
|
||||
logger.warning(
|
||||
'Remote certificate is not valid for hostname "{}"; {}'.format(
|
||||
self._hostnameASCII, e))
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning(
|
||||
'Ignoring error while verifying certificate '
|
||||
'from host "{}" (exception: {})'.format(
|
||||
self._hostnameASCII, repr(e)))
|
||||
|
||||
|
||||
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
from time import time
|
||||
from urllib.parse import urlparse, urlunparse, urldefrag
|
||||
|
||||
from twisted.web.http import HTTPClient
|
||||
from twisted.internet import defer, reactor
|
||||
from twisted.internet.protocol import ClientFactory
|
||||
|
||||
from scrapy.http import Headers
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
|
||||
def _parsed_url_args(parsed):
|
||||
# Assume parsed is urlparse-d from Request.url,
|
||||
# which was passed via safe_url_string and is ascii-only.
|
||||
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
||||
path = to_bytes(path, encoding="ascii")
|
||||
host = to_bytes(parsed.hostname, encoding="ascii")
|
||||
port = parsed.port
|
||||
scheme = to_bytes(parsed.scheme, encoding="ascii")
|
||||
netloc = to_bytes(parsed.netloc, encoding="ascii")
|
||||
if port is None:
|
||||
port = 443 if scheme == b'https' else 80
|
||||
return scheme, netloc, host, port, path
|
||||
|
||||
|
||||
def _parse(url):
|
||||
""" Return tuple of (scheme, netloc, host, port, path),
|
||||
all in bytes except for port which is int.
|
||||
Assume url is from Request.url, which was passed via safe_url_string
|
||||
and is ascii-only.
|
||||
"""
|
||||
url = url.strip()
|
||||
parsed = urlparse(url)
|
||||
return _parsed_url_args(parsed)
|
||||
|
||||
|
||||
class ScrapyHTTPPageGetter(HTTPClient):
|
||||
|
||||
delimiter = b'\n'
|
||||
|
||||
def connectionMade(self):
|
||||
self.headers = Headers() # bucket for response headers
|
||||
|
||||
# Method command
|
||||
self.sendCommand(self.factory.method, self.factory.path)
|
||||
# Headers
|
||||
for key, values in self.factory.headers.items():
|
||||
for value in values:
|
||||
self.sendHeader(key, value)
|
||||
self.endHeaders()
|
||||
# Body
|
||||
if self.factory.body is not None:
|
||||
self.transport.write(self.factory.body)
|
||||
|
||||
def lineReceived(self, line):
|
||||
return HTTPClient.lineReceived(self, line.rstrip())
|
||||
|
||||
def handleHeader(self, key, value):
|
||||
self.headers.appendlist(key, value)
|
||||
|
||||
def handleStatus(self, version, status, message):
|
||||
self.factory.gotStatus(version, status, message)
|
||||
|
||||
def handleEndHeaders(self):
|
||||
self.factory.gotHeaders(self.headers)
|
||||
|
||||
def connectionLost(self, reason):
|
||||
self._connection_lost_reason = reason
|
||||
HTTPClient.connectionLost(self, reason)
|
||||
self.factory.noPage(reason)
|
||||
|
||||
def handleResponse(self, response):
|
||||
if self.factory.method.upper() == b'HEAD':
|
||||
self.factory.page(b'')
|
||||
elif self.length is not None and self.length > 0:
|
||||
self.factory.noPage(self._connection_lost_reason)
|
||||
else:
|
||||
self.factory.page(response)
|
||||
self.transport.loseConnection()
|
||||
|
||||
def timeout(self):
|
||||
self.transport.loseConnection()
|
||||
|
||||
# transport cleanup needed for HTTPS connections
|
||||
if self.factory.url.startswith(b'https'):
|
||||
self.transport.stopProducing()
|
||||
|
||||
self.factory.noPage(
|
||||
defer.TimeoutError(f"Getting {self.factory.url} took longer "
|
||||
f"than {self.factory.timeout} seconds."))
|
||||
|
||||
|
||||
# This class used to inherit from Twisted’s
|
||||
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
|
||||
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
|
||||
# non-overriden code into this class.
|
||||
class ScrapyHTTPClientFactory(ClientFactory):
|
||||
|
||||
protocol = ScrapyHTTPPageGetter
|
||||
|
||||
waiting = 1
|
||||
noisy = False
|
||||
followRedirect = False
|
||||
afterFoundGet = False
|
||||
|
||||
def _build_response(self, body, request):
|
||||
request.meta['download_latency'] = self.headers_time - self.start_time
|
||||
status = int(self.status)
|
||||
headers = Headers(self.response_headers)
|
||||
respcls = responsetypes.from_args(headers=headers, url=self._url)
|
||||
return respcls(url=self._url, status=status, headers=headers, body=body)
|
||||
|
||||
def _set_connection_attributes(self, request):
|
||||
parsed = urlparse_cached(request)
|
||||
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
|
||||
proxy = request.meta.get('proxy')
|
||||
if proxy:
|
||||
self.scheme, _, self.host, self.port, _ = _parse(proxy)
|
||||
self.path = self.url
|
||||
|
||||
def __init__(self, request, timeout=180):
|
||||
self._url = urldefrag(request.url)[0]
|
||||
# converting to bytes to comply to Twisted interface
|
||||
self.url = to_bytes(self._url, encoding='ascii')
|
||||
self.method = to_bytes(request.method, encoding='ascii')
|
||||
self.body = request.body or None
|
||||
self.headers = Headers(request.headers)
|
||||
self.response_headers = None
|
||||
self.timeout = request.meta.get('download_timeout') or timeout
|
||||
self.start_time = time()
|
||||
self.deferred = defer.Deferred().addCallback(self._build_response, request)
|
||||
|
||||
# Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
|
||||
# to have _disconnectedDeferred. See Twisted r32329.
|
||||
# As Scrapy implements it's own logic to handle redirects is not
|
||||
# needed to add the callback _waitForDisconnect.
|
||||
# Specifically this avoids the AttributeError exception when
|
||||
# clientConnectionFailed method is called.
|
||||
self._disconnectedDeferred = defer.Deferred()
|
||||
|
||||
self._set_connection_attributes(request)
|
||||
|
||||
# set Host header based on url
|
||||
self.headers.setdefault('Host', self.netloc)
|
||||
|
||||
# set Content-Length based len of body
|
||||
if self.body is not None:
|
||||
self.headers['Content-Length'] = len(self.body)
|
||||
# just in case a broken http/1.1 decides to keep connection alive
|
||||
self.headers.setdefault("Connection", "close")
|
||||
# Content-Length must be specified in POST method even with no body
|
||||
elif self.method == b'POST':
|
||||
self.headers['Content-Length'] = 0
|
||||
|
||||
def __repr__(self):
|
||||
return f"<{self.__class__.__name__}: {self.url}>"
|
||||
|
||||
def _cancelTimeout(self, result, timeoutCall):
|
||||
if timeoutCall.active():
|
||||
timeoutCall.cancel()
|
||||
return result
|
||||
|
||||
def buildProtocol(self, addr):
|
||||
p = ClientFactory.buildProtocol(self, addr)
|
||||
p.followRedirect = self.followRedirect
|
||||
p.afterFoundGet = self.afterFoundGet
|
||||
if self.timeout:
|
||||
timeoutCall = reactor.callLater(self.timeout, p.timeout)
|
||||
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
|
||||
return p
|
||||
|
||||
def gotHeaders(self, headers):
|
||||
self.headers_time = time()
|
||||
self.response_headers = headers
|
||||
|
||||
def gotStatus(self, version, status, message):
|
||||
"""
|
||||
Set the status of the request on us.
|
||||
@param version: The HTTP version.
|
||||
@type version: L{bytes}
|
||||
@param status: The HTTP status code, an integer represented as a
|
||||
bytestring.
|
||||
@type status: L{bytes}
|
||||
@param message: The HTTP status message.
|
||||
@type message: L{bytes}
|
||||
"""
|
||||
self.version, self.status, self.message = version, status, message
|
||||
|
||||
def page(self, page):
|
||||
if self.waiting:
|
||||
self.waiting = 0
|
||||
self.deferred.callback(page)
|
||||
|
||||
def noPage(self, reason):
|
||||
if self.waiting:
|
||||
self.waiting = 0
|
||||
self.deferred.errback(reason)
|
||||
|
||||
def clientConnectionFailed(self, _, reason):
|
||||
"""
|
||||
When a connection attempt fails, the request cannot be issued. If no
|
||||
result has yet been provided to the result Deferred, provide the
|
||||
connection failure reason as an error result.
|
||||
"""
|
||||
if self.waiting:
|
||||
self.waiting = 0
|
||||
# If the connection attempt failed, there is nothing more to
|
||||
# disconnect, so just fire that Deferred now.
|
||||
self._disconnectedDeferred.callback(None)
|
||||
self.deferred.errback(reason)
|
||||
360
venv/lib/python3.9/site-packages/scrapy/core/engine.py
Normal file
360
venv/lib/python3.9/site-packages/scrapy/core/engine.py
Normal file
|
|
@ -0,0 +1,360 @@
|
|||
"""
|
||||
This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
|
||||
|
||||
For more information see docs/topics/architecture.rst
|
||||
|
||||
"""
|
||||
import logging
|
||||
from time import time
|
||||
|
||||
from twisted.internet import defer, task
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.scraper import Scraper
|
||||
from scrapy.exceptions import DontCloseSpider
|
||||
from scrapy.http import Response, Request
|
||||
from scrapy.utils.misc import load_object
|
||||
from scrapy.utils.reactor import CallLaterOnce
|
||||
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot:
|
||||
|
||||
def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
|
||||
self.closing = False
|
||||
self.inprogress = set() # requests in progress
|
||||
self.start_requests = iter(start_requests)
|
||||
self.close_if_idle = close_if_idle
|
||||
self.nextcall = nextcall
|
||||
self.scheduler = scheduler
|
||||
self.heartbeat = task.LoopingCall(nextcall.schedule)
|
||||
|
||||
def add_request(self, request):
|
||||
self.inprogress.add(request)
|
||||
|
||||
def remove_request(self, request):
|
||||
self.inprogress.remove(request)
|
||||
self._maybe_fire_closing()
|
||||
|
||||
def close(self):
|
||||
self.closing = defer.Deferred()
|
||||
self._maybe_fire_closing()
|
||||
return self.closing
|
||||
|
||||
def _maybe_fire_closing(self):
|
||||
if self.closing and not self.inprogress:
|
||||
if self.nextcall:
|
||||
self.nextcall.cancel()
|
||||
if self.heartbeat.running:
|
||||
self.heartbeat.stop()
|
||||
self.closing.callback(None)
|
||||
|
||||
|
||||
class ExecutionEngine:
|
||||
|
||||
def __init__(self, crawler, spider_closed_callback):
|
||||
self.crawler = crawler
|
||||
self.settings = crawler.settings
|
||||
self.signals = crawler.signals
|
||||
self.logformatter = crawler.logformatter
|
||||
self.slot = None
|
||||
self.spider = None
|
||||
self.running = False
|
||||
self.paused = False
|
||||
self.scheduler_cls = load_object(self.settings['SCHEDULER'])
|
||||
downloader_cls = load_object(self.settings['DOWNLOADER'])
|
||||
self.downloader = downloader_cls(crawler)
|
||||
self.scraper = Scraper(crawler)
|
||||
self._spider_closed_callback = spider_closed_callback
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def start(self):
|
||||
"""Start the execution engine"""
|
||||
if self.running:
|
||||
raise RuntimeError("Engine already running")
|
||||
self.start_time = time()
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
|
||||
self.running = True
|
||||
self._closewait = defer.Deferred()
|
||||
yield self._closewait
|
||||
|
||||
def stop(self):
|
||||
"""Stop the execution engine gracefully"""
|
||||
if not self.running:
|
||||
raise RuntimeError("Engine not running")
|
||||
self.running = False
|
||||
dfd = self._close_all_spiders()
|
||||
return dfd.addBoth(lambda _: self._finish_stopping_engine())
|
||||
|
||||
def close(self):
|
||||
"""Close the execution engine gracefully.
|
||||
|
||||
If it has already been started, stop it. In all cases, close all spiders
|
||||
and the downloader.
|
||||
"""
|
||||
if self.running:
|
||||
# Will also close spiders and downloader
|
||||
return self.stop()
|
||||
elif self.open_spiders:
|
||||
# Will also close downloader
|
||||
return self._close_all_spiders()
|
||||
else:
|
||||
return defer.succeed(self.downloader.close())
|
||||
|
||||
def pause(self):
|
||||
"""Pause the execution engine"""
|
||||
self.paused = True
|
||||
|
||||
def unpause(self):
|
||||
"""Resume the execution engine"""
|
||||
self.paused = False
|
||||
|
||||
def _next_request(self, spider):
|
||||
slot = self.slot
|
||||
if not slot:
|
||||
return
|
||||
|
||||
if self.paused:
|
||||
return
|
||||
|
||||
while not self._needs_backout(spider):
|
||||
if not self._next_request_from_scheduler(spider):
|
||||
break
|
||||
|
||||
if slot.start_requests and not self._needs_backout(spider):
|
||||
try:
|
||||
request = next(slot.start_requests)
|
||||
except StopIteration:
|
||||
slot.start_requests = None
|
||||
except Exception:
|
||||
slot.start_requests = None
|
||||
logger.error('Error while obtaining start requests',
|
||||
exc_info=True, extra={'spider': spider})
|
||||
else:
|
||||
self.crawl(request, spider)
|
||||
|
||||
if self.spider_is_idle(spider) and slot.close_if_idle:
|
||||
self._spider_idle(spider)
|
||||
|
||||
def _needs_backout(self, spider):
|
||||
slot = self.slot
|
||||
return (
|
||||
not self.running
|
||||
or slot.closing
|
||||
or self.downloader.needs_backout()
|
||||
or self.scraper.slot.needs_backout()
|
||||
)
|
||||
|
||||
def _next_request_from_scheduler(self, spider):
|
||||
slot = self.slot
|
||||
request = slot.scheduler.next_request()
|
||||
if not request:
|
||||
return
|
||||
d = self._download(request, spider)
|
||||
d.addBoth(self._handle_downloader_output, request, spider)
|
||||
d.addErrback(lambda f: logger.info('Error while handling downloader output',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
d.addBoth(lambda _: slot.remove_request(request))
|
||||
d.addErrback(lambda f: logger.info('Error while removing request from slot',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
d.addBoth(lambda _: slot.nextcall.schedule())
|
||||
d.addErrback(lambda f: logger.info('Error while scheduling new request',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
return d
|
||||
|
||||
def _handle_downloader_output(self, response, request, spider):
|
||||
if not isinstance(response, (Request, Response, Failure)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Request, Response or Failure, got "
|
||||
f"{type(response)}: {response!r}"
|
||||
)
|
||||
# downloader middleware can return requests (for example, redirects)
|
||||
if isinstance(response, Request):
|
||||
self.crawl(response, spider)
|
||||
return
|
||||
# response is a Response or Failure
|
||||
d = self.scraper.enqueue_scrape(response, request, spider)
|
||||
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
return d
|
||||
|
||||
def spider_is_idle(self, spider):
|
||||
if not self.scraper.slot.is_idle():
|
||||
# scraper is not idle
|
||||
return False
|
||||
|
||||
if self.downloader.active:
|
||||
# downloader has pending requests
|
||||
return False
|
||||
|
||||
if self.slot.start_requests is not None:
|
||||
# not all start requests are handled
|
||||
return False
|
||||
|
||||
if self.slot.scheduler.has_pending_requests():
|
||||
# scheduler has pending requests
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@property
|
||||
def open_spiders(self):
|
||||
return [self.spider] if self.spider else []
|
||||
|
||||
def has_capacity(self):
|
||||
"""Does the engine have capacity to handle more spiders"""
|
||||
return not bool(self.slot)
|
||||
|
||||
def crawl(self, request, spider):
|
||||
if spider not in self.open_spiders:
|
||||
raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
|
||||
self.schedule(request, spider)
|
||||
self.slot.nextcall.schedule()
|
||||
|
||||
def schedule(self, request, spider):
|
||||
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
|
||||
if not self.slot.scheduler.enqueue_request(request):
|
||||
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
|
||||
|
||||
def download(self, request, spider):
|
||||
d = self._download(request, spider)
|
||||
d.addBoth(self._downloaded, self.slot, request, spider)
|
||||
return d
|
||||
|
||||
def _downloaded(self, response, slot, request, spider):
|
||||
slot.remove_request(request)
|
||||
return self.download(response, spider) if isinstance(response, Request) else response
|
||||
|
||||
def _download(self, request, spider):
|
||||
slot = self.slot
|
||||
slot.add_request(request)
|
||||
|
||||
def _on_success(response):
|
||||
if not isinstance(response, (Response, Request)):
|
||||
raise TypeError(
|
||||
"Incorrect type: expected Response or Request, got "
|
||||
f"{type(response)}: {response!r}"
|
||||
)
|
||||
if isinstance(response, Response):
|
||||
if response.request is None:
|
||||
response.request = request
|
||||
logkws = self.logformatter.crawled(response.request, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.response_received,
|
||||
response=response,
|
||||
request=response.request,
|
||||
spider=spider,
|
||||
)
|
||||
return response
|
||||
|
||||
def _on_complete(_):
|
||||
slot.nextcall.schedule()
|
||||
return _
|
||||
|
||||
dwld = self.downloader.fetch(request, spider)
|
||||
dwld.addCallbacks(_on_success)
|
||||
dwld.addBoth(_on_complete)
|
||||
return dwld
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def open_spider(self, spider, start_requests=(), close_if_idle=True):
|
||||
if not self.has_capacity():
|
||||
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
|
||||
logger.info("Spider opened", extra={'spider': spider})
|
||||
nextcall = CallLaterOnce(self._next_request, spider)
|
||||
scheduler = self.scheduler_cls.from_crawler(self.crawler)
|
||||
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
|
||||
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
|
||||
self.slot = slot
|
||||
self.spider = spider
|
||||
yield scheduler.open(spider)
|
||||
yield self.scraper.open_spider(spider)
|
||||
self.crawler.stats.open_spider(spider)
|
||||
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
|
||||
slot.nextcall.schedule()
|
||||
slot.heartbeat.start(5)
|
||||
|
||||
def _spider_idle(self, spider):
|
||||
"""Called when a spider gets idle. This function is called when there
|
||||
are no remaining pages to download or schedule. It can be called
|
||||
multiple times. If some extension raises a DontCloseSpider exception
|
||||
(in the spider_idle signal handler) the spider is not closed until the
|
||||
next loop and this function is guaranteed to be called (at least) once
|
||||
again for this spider.
|
||||
"""
|
||||
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
|
||||
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
|
||||
return
|
||||
|
||||
if self.spider_is_idle(spider):
|
||||
self.close_spider(spider, reason='finished')
|
||||
|
||||
def close_spider(self, spider, reason='cancelled'):
|
||||
"""Close (cancel) spider and clear all its outstanding requests"""
|
||||
|
||||
slot = self.slot
|
||||
if slot.closing:
|
||||
return slot.closing
|
||||
logger.info("Closing spider (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider})
|
||||
|
||||
dfd = slot.close()
|
||||
|
||||
def log_failure(msg):
|
||||
def errback(failure):
|
||||
logger.error(
|
||||
msg,
|
||||
exc_info=failure_to_exc_info(failure),
|
||||
extra={'spider': spider}
|
||||
)
|
||||
return errback
|
||||
|
||||
dfd.addBoth(lambda _: self.downloader.close())
|
||||
dfd.addErrback(log_failure('Downloader close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
|
||||
dfd.addErrback(log_failure('Scraper close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: slot.scheduler.close(reason))
|
||||
dfd.addErrback(log_failure('Scheduler close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
|
||||
signal=signals.spider_closed, spider=spider, reason=reason))
|
||||
dfd.addErrback(log_failure('Error while sending spider_close signal'))
|
||||
|
||||
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
|
||||
dfd.addErrback(log_failure('Stats close failure'))
|
||||
|
||||
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
|
||||
{'reason': reason},
|
||||
extra={'spider': spider}))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'slot', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning slot'))
|
||||
|
||||
dfd.addBoth(lambda _: setattr(self, 'spider', None))
|
||||
dfd.addErrback(log_failure('Error while unassigning spider'))
|
||||
|
||||
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
|
||||
|
||||
return dfd
|
||||
|
||||
def _close_all_spiders(self):
|
||||
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
|
||||
dlist = defer.DeferredList(dfds)
|
||||
return dlist
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _finish_stopping_engine(self):
|
||||
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
|
||||
self._closewait.callback(None)
|
||||
182
venv/lib/python3.9/site-packages/scrapy/core/scheduler.py
Normal file
182
venv/lib/python3.9/site-packages/scrapy/core/scheduler.py
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
import os
|
||||
import json
|
||||
import logging
|
||||
import warnings
|
||||
from os.path import join, exists
|
||||
|
||||
from queuelib import PriorityQueue
|
||||
|
||||
from scrapy.utils.misc import load_object, create_instance
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Scheduler:
|
||||
"""
|
||||
Scrapy Scheduler. It allows to enqueue requests and then get
|
||||
a next request to download. Scheduler is also handling duplication
|
||||
filtering, via dupefilter.
|
||||
|
||||
Prioritization and queueing is not performed by the Scheduler.
|
||||
User sets ``priority`` field for each Request, and a PriorityQueue
|
||||
(defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
|
||||
to dequeue requests in a desired order.
|
||||
|
||||
Scheduler uses two PriorityQueue instances, configured to work in-memory
|
||||
and on-disk (optional). When on-disk queue is present, it is used by
|
||||
default, and an in-memory queue is used as a fallback for cases where
|
||||
a disk queue can't handle a request (can't serialize it).
|
||||
|
||||
:setting:`SCHEDULER_MEMORY_QUEUE` and
|
||||
:setting:`SCHEDULER_DISK_QUEUE` allow to specify lower-level queue classes
|
||||
which PriorityQueue instances would be instantiated with, to keep requests
|
||||
on disk and in memory respectively.
|
||||
|
||||
Overall, Scheduler is an object which holds several PriorityQueue instances
|
||||
(in-memory and on-disk) and implements fallback logic for them.
|
||||
Also, it handles dupefilters.
|
||||
"""
|
||||
def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
|
||||
logunser=False, stats=None, pqclass=None, crawler=None):
|
||||
self.df = dupefilter
|
||||
self.dqdir = self._dqdir(jobdir)
|
||||
self.pqclass = pqclass
|
||||
self.dqclass = dqclass
|
||||
self.mqclass = mqclass
|
||||
self.logunser = logunser
|
||||
self.stats = stats
|
||||
self.crawler = crawler
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
settings = crawler.settings
|
||||
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
|
||||
dupefilter = create_instance(dupefilter_cls, settings, crawler)
|
||||
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
|
||||
if pqclass is PriorityQueue:
|
||||
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
|
||||
" is no longer supported because of API changes; "
|
||||
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
|
||||
ScrapyDeprecationWarning)
|
||||
from scrapy.pqueues import ScrapyPriorityQueue
|
||||
pqclass = ScrapyPriorityQueue
|
||||
|
||||
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
|
||||
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
|
||||
logunser = settings.getbool('SCHEDULER_DEBUG')
|
||||
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
|
||||
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
|
||||
mqclass=mqclass, crawler=crawler)
|
||||
|
||||
def has_pending_requests(self):
|
||||
return len(self) > 0
|
||||
|
||||
def open(self, spider):
|
||||
self.spider = spider
|
||||
self.mqs = self._mq()
|
||||
self.dqs = self._dq() if self.dqdir else None
|
||||
return self.df.open()
|
||||
|
||||
def close(self, reason):
|
||||
if self.dqs:
|
||||
state = self.dqs.close()
|
||||
self._write_dqs_state(self.dqdir, state)
|
||||
return self.df.close(reason)
|
||||
|
||||
def enqueue_request(self, request):
|
||||
if not request.dont_filter and self.df.request_seen(request):
|
||||
self.df.log(request, self.spider)
|
||||
return False
|
||||
dqok = self._dqpush(request)
|
||||
if dqok:
|
||||
self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
|
||||
else:
|
||||
self._mqpush(request)
|
||||
self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
|
||||
self.stats.inc_value('scheduler/enqueued', spider=self.spider)
|
||||
return True
|
||||
|
||||
def next_request(self):
|
||||
request = self.mqs.pop()
|
||||
if request:
|
||||
self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
|
||||
else:
|
||||
request = self._dqpop()
|
||||
if request:
|
||||
self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
|
||||
if request:
|
||||
self.stats.inc_value('scheduler/dequeued', spider=self.spider)
|
||||
return request
|
||||
|
||||
def __len__(self):
|
||||
return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)
|
||||
|
||||
def _dqpush(self, request):
|
||||
if self.dqs is None:
|
||||
return
|
||||
try:
|
||||
self.dqs.push(request)
|
||||
except ValueError as e: # non serializable request
|
||||
if self.logunser:
|
||||
msg = ("Unable to serialize request: %(request)s - reason:"
|
||||
" %(reason)s - no more unserializable requests will be"
|
||||
" logged (stats being collected)")
|
||||
logger.warning(msg, {'request': request, 'reason': e},
|
||||
exc_info=True, extra={'spider': self.spider})
|
||||
self.logunser = False
|
||||
self.stats.inc_value('scheduler/unserializable',
|
||||
spider=self.spider)
|
||||
return
|
||||
else:
|
||||
return True
|
||||
|
||||
def _mqpush(self, request):
|
||||
self.mqs.push(request)
|
||||
|
||||
def _dqpop(self):
|
||||
if self.dqs:
|
||||
return self.dqs.pop()
|
||||
|
||||
def _mq(self):
|
||||
""" Create a new priority queue instance, with in-memory storage """
|
||||
return create_instance(self.pqclass,
|
||||
settings=None,
|
||||
crawler=self.crawler,
|
||||
downstream_queue_cls=self.mqclass,
|
||||
key='')
|
||||
|
||||
def _dq(self):
|
||||
""" Create a new priority queue instance, with disk storage """
|
||||
state = self._read_dqs_state(self.dqdir)
|
||||
q = create_instance(self.pqclass,
|
||||
settings=None,
|
||||
crawler=self.crawler,
|
||||
downstream_queue_cls=self.dqclass,
|
||||
key=self.dqdir,
|
||||
startprios=state)
|
||||
if q:
|
||||
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
|
||||
{'queuesize': len(q)}, extra={'spider': self.spider})
|
||||
return q
|
||||
|
||||
def _dqdir(self, jobdir):
|
||||
""" Return a folder name to keep disk queue state at """
|
||||
if jobdir:
|
||||
dqdir = join(jobdir, 'requests.queue')
|
||||
if not exists(dqdir):
|
||||
os.makedirs(dqdir)
|
||||
return dqdir
|
||||
|
||||
def _read_dqs_state(self, dqdir):
|
||||
path = join(dqdir, 'active.json')
|
||||
if not exists(path):
|
||||
return ()
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
def _write_dqs_state(self, dqdir, state):
|
||||
with open(join(dqdir, 'active.json'), 'w') as f:
|
||||
json.dump(state, f)
|
||||
260
venv/lib/python3.9/site-packages/scrapy/core/scraper.py
Normal file
260
venv/lib/python3.9/site-packages/scrapy/core/scraper.py
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
"""This module implements the Scraper component which parses responses and
|
||||
extracts information from them"""
|
||||
|
||||
import logging
|
||||
from collections import deque
|
||||
|
||||
from itemadapter import is_item
|
||||
from twisted.internet import defer
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.core.spidermw import SpiderMiddlewareManager
|
||||
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
|
||||
from scrapy.http import Request, Response
|
||||
from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
|
||||
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
|
||||
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
|
||||
from scrapy.utils.spider import iterate_spider_output
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Slot:
|
||||
"""Scraper slot (one per running spider)"""
|
||||
|
||||
MIN_RESPONSE_SIZE = 1024
|
||||
|
||||
def __init__(self, max_active_size=5000000):
|
||||
self.max_active_size = max_active_size
|
||||
self.queue = deque()
|
||||
self.active = set()
|
||||
self.active_size = 0
|
||||
self.itemproc_size = 0
|
||||
self.closing = None
|
||||
|
||||
def add_response_request(self, response, request):
|
||||
deferred = defer.Deferred()
|
||||
self.queue.append((response, request, deferred))
|
||||
if isinstance(response, Response):
|
||||
self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
|
||||
else:
|
||||
self.active_size += self.MIN_RESPONSE_SIZE
|
||||
return deferred
|
||||
|
||||
def next_response_request_deferred(self):
|
||||
response, request, deferred = self.queue.popleft()
|
||||
self.active.add(request)
|
||||
return response, request, deferred
|
||||
|
||||
def finish_response(self, response, request):
|
||||
self.active.remove(request)
|
||||
if isinstance(response, Response):
|
||||
self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)
|
||||
else:
|
||||
self.active_size -= self.MIN_RESPONSE_SIZE
|
||||
|
||||
def is_idle(self):
|
||||
return not (self.queue or self.active)
|
||||
|
||||
def needs_backout(self):
|
||||
return self.active_size > self.max_active_size
|
||||
|
||||
|
||||
class Scraper:
|
||||
|
||||
def __init__(self, crawler):
|
||||
self.slot = None
|
||||
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
|
||||
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
|
||||
self.itemproc = itemproc_cls.from_crawler(crawler)
|
||||
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
|
||||
self.crawler = crawler
|
||||
self.signals = crawler.signals
|
||||
self.logformatter = crawler.logformatter
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def open_spider(self, spider):
|
||||
"""Open the given spider for scraping and allocate resources for it"""
|
||||
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
|
||||
yield self.itemproc.open_spider(spider)
|
||||
|
||||
def close_spider(self, spider):
|
||||
"""Close a spider being scraped and release its resources"""
|
||||
slot = self.slot
|
||||
slot.closing = defer.Deferred()
|
||||
slot.closing.addCallback(self.itemproc.close_spider)
|
||||
self._check_if_closing(spider, slot)
|
||||
return slot.closing
|
||||
|
||||
def is_idle(self):
|
||||
"""Return True if there isn't any more spiders to process"""
|
||||
return not self.slot
|
||||
|
||||
def _check_if_closing(self, spider, slot):
|
||||
if slot.closing and slot.is_idle():
|
||||
slot.closing.callback(spider)
|
||||
|
||||
def enqueue_scrape(self, response, request, spider):
|
||||
slot = self.slot
|
||||
dfd = slot.add_response_request(response, request)
|
||||
|
||||
def finish_scraping(_):
|
||||
slot.finish_response(response, request)
|
||||
self._check_if_closing(spider, slot)
|
||||
self._scrape_next(spider, slot)
|
||||
return _
|
||||
|
||||
dfd.addBoth(finish_scraping)
|
||||
dfd.addErrback(
|
||||
lambda f: logger.error('Scraper bug processing %(request)s',
|
||||
{'request': request},
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': spider}))
|
||||
self._scrape_next(spider, slot)
|
||||
return dfd
|
||||
|
||||
def _scrape_next(self, spider, slot):
|
||||
while slot.queue:
|
||||
response, request, deferred = slot.next_response_request_deferred()
|
||||
self._scrape(response, request, spider).chainDeferred(deferred)
|
||||
|
||||
def _scrape(self, result, request, spider):
|
||||
"""
|
||||
Handle the downloaded response or failure through the spider callback/errback
|
||||
"""
|
||||
if not isinstance(result, (Response, Failure)):
|
||||
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
|
||||
dfd = self._scrape2(result, request, spider) # returns spider's processed output
|
||||
dfd.addErrback(self.handle_spider_error, request, result, spider)
|
||||
dfd.addCallback(self.handle_spider_output, request, result, spider)
|
||||
return dfd
|
||||
|
||||
def _scrape2(self, result, request, spider):
|
||||
"""
|
||||
Handle the different cases of request's result been a Response or a Failure
|
||||
"""
|
||||
if isinstance(result, Response):
|
||||
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
|
||||
else: # result is a Failure
|
||||
dfd = self.call_spider(result, request, spider)
|
||||
return dfd.addErrback(self._log_download_errors, result, request, spider)
|
||||
|
||||
def call_spider(self, result, request, spider):
|
||||
if isinstance(result, Response):
|
||||
if getattr(result, "request", None) is None:
|
||||
result.request = request
|
||||
callback = result.request.callback or spider._parse
|
||||
warn_on_generator_with_return_value(spider, callback)
|
||||
dfd = defer_succeed(result)
|
||||
dfd.addCallback(callback, **result.request.cb_kwargs)
|
||||
else: # result is a Failure
|
||||
result.request = request
|
||||
warn_on_generator_with_return_value(spider, request.errback)
|
||||
dfd = defer_fail(result)
|
||||
dfd.addErrback(request.errback)
|
||||
return dfd.addCallback(iterate_spider_output)
|
||||
|
||||
def handle_spider_error(self, _failure, request, response, spider):
|
||||
exc = _failure.value
|
||||
if isinstance(exc, CloseSpider):
|
||||
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
|
||||
return
|
||||
logkws = self.logformatter.spider_error(_failure, request, response, spider)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
exc_info=failure_to_exc_info(_failure),
|
||||
extra={'spider': spider}
|
||||
)
|
||||
self.signals.send_catch_log(
|
||||
signal=signals.spider_error,
|
||||
failure=_failure, response=response,
|
||||
spider=spider
|
||||
)
|
||||
self.crawler.stats.inc_value(
|
||||
f"spider_exceptions/{_failure.value.__class__.__name__}",
|
||||
spider=spider
|
||||
)
|
||||
|
||||
def handle_spider_output(self, result, request, response, spider):
|
||||
if not result:
|
||||
return defer_succeed(None)
|
||||
it = iter_errback(result, self.handle_spider_error, request, response, spider)
|
||||
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
|
||||
request, response, spider)
|
||||
return dfd
|
||||
|
||||
def _process_spidermw_output(self, output, request, response, spider):
|
||||
"""Process each Request/Item (given in the output parameter) returned
|
||||
from the given spider
|
||||
"""
|
||||
if isinstance(output, Request):
|
||||
self.crawler.engine.crawl(request=output, spider=spider)
|
||||
elif is_item(output):
|
||||
self.slot.itemproc_size += 1
|
||||
dfd = self.itemproc.process_item(output, spider)
|
||||
dfd.addBoth(self._itemproc_finished, output, response, spider)
|
||||
return dfd
|
||||
elif output is None:
|
||||
pass
|
||||
else:
|
||||
typename = type(output).__name__
|
||||
logger.error(
|
||||
'Spider must return request, item, or None, got %(typename)r in %(request)s',
|
||||
{'request': request, 'typename': typename},
|
||||
extra={'spider': spider},
|
||||
)
|
||||
|
||||
def _log_download_errors(self, spider_failure, download_failure, request, spider):
|
||||
"""Log and silence errors that come from the engine (typically download
|
||||
errors that got propagated thru here)
|
||||
"""
|
||||
if isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest):
|
||||
if download_failure.frames:
|
||||
logkws = self.logformatter.download_error(download_failure, request, spider)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
extra={'spider': spider},
|
||||
exc_info=failure_to_exc_info(download_failure),
|
||||
)
|
||||
else:
|
||||
errmsg = download_failure.getErrorMessage()
|
||||
if errmsg:
|
||||
logkws = self.logformatter.download_error(
|
||||
download_failure, request, spider, errmsg)
|
||||
logger.log(
|
||||
*logformatter_adapter(logkws),
|
||||
extra={'spider': spider},
|
||||
)
|
||||
|
||||
if spider_failure is not download_failure:
|
||||
return spider_failure
|
||||
|
||||
def _itemproc_finished(self, output, item, response, spider):
|
||||
"""ItemProcessor finished for the given ``item`` and returned ``output``
|
||||
"""
|
||||
self.slot.itemproc_size -= 1
|
||||
if isinstance(output, Failure):
|
||||
ex = output.value
|
||||
if isinstance(ex, DropItem):
|
||||
logkws = self.logformatter.dropped(item, ex, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_dropped, item=item, response=response,
|
||||
spider=spider, exception=output.value)
|
||||
else:
|
||||
logkws = self.logformatter.item_error(item, ex, response, spider)
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
|
||||
exc_info=failure_to_exc_info(output))
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_error, item=item, response=response,
|
||||
spider=spider, failure=output)
|
||||
else:
|
||||
logkws = self.logformatter.scraped(output, response, spider)
|
||||
if logkws is not None:
|
||||
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
|
||||
return self.signals.send_catch_log_deferred(
|
||||
signal=signals.item_scraped, item=output, response=response,
|
||||
spider=spider)
|
||||
128
venv/lib/python3.9/site-packages/scrapy/core/spidermw.py
Normal file
128
venv/lib/python3.9/site-packages/scrapy/core/spidermw.py
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
"""
|
||||
Spider Middleware manager
|
||||
|
||||
See documentation in docs/topics/spider-middleware.rst
|
||||
"""
|
||||
from itertools import islice
|
||||
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.exceptions import _InvalidOutput
|
||||
from scrapy.middleware import MiddlewareManager
|
||||
from scrapy.utils.conf import build_component_list
|
||||
from scrapy.utils.defer import mustbe_deferred
|
||||
from scrapy.utils.python import MutableChain
|
||||
|
||||
|
||||
def _isiterable(possible_iterator):
|
||||
return hasattr(possible_iterator, '__iter__')
|
||||
|
||||
|
||||
def _fname(f):
|
||||
return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"
|
||||
|
||||
|
||||
class SpiderMiddlewareManager(MiddlewareManager):
|
||||
|
||||
component_name = 'spider middleware'
|
||||
|
||||
@classmethod
|
||||
def _get_mwlist_from_settings(cls, settings):
|
||||
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
|
||||
|
||||
def _add_middleware(self, mw):
|
||||
super()._add_middleware(mw)
|
||||
if hasattr(mw, 'process_spider_input'):
|
||||
self.methods['process_spider_input'].append(mw.process_spider_input)
|
||||
if hasattr(mw, 'process_start_requests'):
|
||||
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
|
||||
process_spider_output = getattr(mw, 'process_spider_output', None)
|
||||
self.methods['process_spider_output'].appendleft(process_spider_output)
|
||||
process_spider_exception = getattr(mw, 'process_spider_exception', None)
|
||||
self.methods['process_spider_exception'].appendleft(process_spider_exception)
|
||||
|
||||
def scrape_response(self, scrape_func, response, request, spider):
|
||||
|
||||
def process_spider_input(response):
|
||||
for method in self.methods['process_spider_input']:
|
||||
try:
|
||||
result = method(response=response, spider=spider)
|
||||
if result is not None:
|
||||
msg = (f"Middleware {_fname(method)} must return None "
|
||||
f"or raise an exception, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
except _InvalidOutput:
|
||||
raise
|
||||
except Exception:
|
||||
return scrape_func(Failure(), request, spider)
|
||||
return scrape_func(response, request, spider)
|
||||
|
||||
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
|
||||
try:
|
||||
for r in iterable:
|
||||
yield r
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
recover_to.extend(exception_result)
|
||||
|
||||
def process_spider_exception(_failure, start_index=0):
|
||||
exception = _failure.value
|
||||
# don't handle _InvalidOutput exception
|
||||
if isinstance(exception, _InvalidOutput):
|
||||
return _failure
|
||||
method_list = islice(self.methods['process_spider_exception'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
result = method(response=response, exception=exception, spider=spider)
|
||||
if _isiterable(result):
|
||||
# stop exception handling by handing control over to the
|
||||
# process_spider_output chain if an iterable has been returned
|
||||
return process_spider_output(result, method_index + 1)
|
||||
elif result is None:
|
||||
continue
|
||||
else:
|
||||
msg = (f"Middleware {_fname(method)} must return None "
|
||||
f"or an iterable, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
return _failure
|
||||
|
||||
def process_spider_output(result, start_index=0):
|
||||
# items in this iterable do not need to go through the process_spider_output
|
||||
# chain, they went through it already from the process_spider_exception method
|
||||
recovered = MutableChain()
|
||||
|
||||
method_list = islice(self.methods['process_spider_output'], start_index, None)
|
||||
for method_index, method in enumerate(method_list, start=start_index):
|
||||
if method is None:
|
||||
continue
|
||||
try:
|
||||
# might fail directly if the output value is not a generator
|
||||
result = method(response=response, result=result, spider=spider)
|
||||
except Exception as ex:
|
||||
exception_result = process_spider_exception(Failure(ex), method_index + 1)
|
||||
if isinstance(exception_result, Failure):
|
||||
raise
|
||||
return exception_result
|
||||
if _isiterable(result):
|
||||
result = _evaluate_iterable(result, method_index + 1, recovered)
|
||||
else:
|
||||
msg = (f"Middleware {_fname(method)} must return an "
|
||||
f"iterable, got {type(result)}")
|
||||
raise _InvalidOutput(msg)
|
||||
|
||||
return MutableChain(result, recovered)
|
||||
|
||||
def process_callback_output(result):
|
||||
recovered = MutableChain()
|
||||
result = _evaluate_iterable(result, 0, recovered)
|
||||
return MutableChain(process_spider_output(result), recovered)
|
||||
|
||||
dfd = mustbe_deferred(process_spider_input, response)
|
||||
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
|
||||
return dfd
|
||||
|
||||
def process_start_requests(self, start_requests, spider):
|
||||
return self._process_chain('process_start_requests', start_requests, spider)
|
||||
344
venv/lib/python3.9/site-packages/scrapy/crawler.py
Normal file
344
venv/lib/python3.9/site-packages/scrapy/crawler.py
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
import logging
|
||||
import pprint
|
||||
import signal
|
||||
import warnings
|
||||
|
||||
from twisted.internet import defer
|
||||
from zope.interface.exceptions import DoesNotImplement
|
||||
|
||||
try:
|
||||
# zope >= 5.0 only supports MultipleInvalid
|
||||
from zope.interface.exceptions import MultipleInvalid
|
||||
except ImportError:
|
||||
MultipleInvalid = None
|
||||
|
||||
from zope.interface.verify import verifyClass
|
||||
|
||||
from scrapy import signals, Spider
|
||||
from scrapy.core.engine import ExecutionEngine
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.extension import ExtensionManager
|
||||
from scrapy.interfaces import ISpiderLoader
|
||||
from scrapy.settings import overridden_settings, Settings
|
||||
from scrapy.signalmanager import SignalManager
|
||||
from scrapy.utils.log import (
|
||||
configure_logging,
|
||||
get_scrapy_root_handler,
|
||||
install_scrapy_root_handler,
|
||||
log_scrapy_info,
|
||||
LogCounterHandler,
|
||||
)
|
||||
from scrapy.utils.misc import create_instance, load_object
|
||||
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
|
||||
from scrapy.utils.reactor import install_reactor, verify_installed_reactor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Crawler:
|
||||
|
||||
def __init__(self, spidercls, settings=None):
|
||||
if isinstance(spidercls, Spider):
|
||||
raise ValueError('The spidercls argument must be a class, not an object')
|
||||
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
|
||||
self.spidercls = spidercls
|
||||
self.settings = settings.copy()
|
||||
self.spidercls.update_settings(self.settings)
|
||||
|
||||
self.signals = SignalManager(self)
|
||||
self.stats = load_object(self.settings['STATS_CLASS'])(self)
|
||||
|
||||
handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
|
||||
logging.root.addHandler(handler)
|
||||
|
||||
d = dict(overridden_settings(self.settings))
|
||||
logger.info("Overridden settings:\n%(settings)s",
|
||||
{'settings': pprint.pformat(d)})
|
||||
|
||||
if get_scrapy_root_handler() is not None:
|
||||
# scrapy root handler already installed: update it with new settings
|
||||
install_scrapy_root_handler(self.settings)
|
||||
# lambda is assigned to Crawler attribute because this way it is not
|
||||
# garbage collected after leaving __init__ scope
|
||||
self.__remove_handler = lambda: logging.root.removeHandler(handler)
|
||||
self.signals.connect(self.__remove_handler, signals.engine_stopped)
|
||||
|
||||
lf_cls = load_object(self.settings['LOG_FORMATTER'])
|
||||
self.logformatter = lf_cls.from_crawler(self)
|
||||
self.extensions = ExtensionManager.from_crawler(self)
|
||||
|
||||
self.settings.freeze()
|
||||
self.crawling = False
|
||||
self.spider = None
|
||||
self.engine = None
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def crawl(self, *args, **kwargs):
|
||||
if self.crawling:
|
||||
raise RuntimeError("Crawling already taking place")
|
||||
self.crawling = True
|
||||
|
||||
try:
|
||||
self.spider = self._create_spider(*args, **kwargs)
|
||||
self.engine = self._create_engine()
|
||||
start_requests = iter(self.spider.start_requests())
|
||||
yield self.engine.open_spider(self.spider, start_requests)
|
||||
yield defer.maybeDeferred(self.engine.start)
|
||||
except Exception:
|
||||
self.crawling = False
|
||||
if self.engine is not None:
|
||||
yield self.engine.close()
|
||||
raise
|
||||
|
||||
def _create_spider(self, *args, **kwargs):
|
||||
return self.spidercls.from_crawler(self, *args, **kwargs)
|
||||
|
||||
def _create_engine(self):
|
||||
return ExecutionEngine(self, lambda _: self.stop())
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def stop(self):
|
||||
"""Starts a graceful stop of the crawler and returns a deferred that is
|
||||
fired when the crawler is stopped."""
|
||||
if self.crawling:
|
||||
self.crawling = False
|
||||
yield defer.maybeDeferred(self.engine.stop)
|
||||
|
||||
|
||||
class CrawlerRunner:
|
||||
"""
|
||||
This is a convenient helper class that keeps track of, manages and runs
|
||||
crawlers inside an already setup :mod:`~twisted.internet.reactor`.
|
||||
|
||||
The CrawlerRunner object must be instantiated with a
|
||||
:class:`~scrapy.settings.Settings` object.
|
||||
|
||||
This class shouldn't be needed (since Scrapy is responsible of using it
|
||||
accordingly) unless writing scripts that manually handle the crawling
|
||||
process. See :ref:`run-from-script` for an example.
|
||||
"""
|
||||
|
||||
crawlers = property(
|
||||
lambda self: self._crawlers,
|
||||
doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
|
||||
":meth:`crawl` and managed by this class."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_spider_loader(settings):
|
||||
""" Get SpiderLoader instance from settings """
|
||||
cls_path = settings.get('SPIDER_LOADER_CLASS')
|
||||
loader_cls = load_object(cls_path)
|
||||
excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
|
||||
try:
|
||||
verifyClass(ISpiderLoader, loader_cls)
|
||||
except excs:
|
||||
warnings.warn(
|
||||
'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
|
||||
'not fully implement scrapy.interfaces.ISpiderLoader interface. '
|
||||
'Please add all missing methods to avoid unexpected runtime errors.',
|
||||
category=ScrapyDeprecationWarning, stacklevel=2
|
||||
)
|
||||
return loader_cls.from_settings(settings.frozencopy())
|
||||
|
||||
def __init__(self, settings=None):
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
self.settings = settings
|
||||
self.spider_loader = self._get_spider_loader(settings)
|
||||
self._crawlers = set()
|
||||
self._active = set()
|
||||
self.bootstrap_failed = False
|
||||
self._handle_twisted_reactor()
|
||||
|
||||
@property
|
||||
def spiders(self):
|
||||
warnings.warn("CrawlerRunner.spiders attribute is renamed to "
|
||||
"CrawlerRunner.spider_loader.",
|
||||
category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
return self.spider_loader
|
||||
|
||||
def crawl(self, crawler_or_spidercls, *args, **kwargs):
|
||||
"""
|
||||
Run a crawler with the provided arguments.
|
||||
|
||||
It will call the given Crawler's :meth:`~Crawler.crawl` method, while
|
||||
keeping track of it so it can be stopped later.
|
||||
|
||||
If ``crawler_or_spidercls`` isn't a :class:`~scrapy.crawler.Crawler`
|
||||
instance, this method will try to create one using this parameter as
|
||||
the spider class given to it.
|
||||
|
||||
Returns a deferred that is fired when the crawling is finished.
|
||||
|
||||
:param crawler_or_spidercls: already created crawler, or a spider class
|
||||
or spider's name inside the project to create it
|
||||
:type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
|
||||
:class:`~scrapy.spiders.Spider` subclass or string
|
||||
|
||||
:param args: arguments to initialize the spider
|
||||
|
||||
:param kwargs: keyword arguments to initialize the spider
|
||||
"""
|
||||
if isinstance(crawler_or_spidercls, Spider):
|
||||
raise ValueError(
|
||||
'The crawler_or_spidercls argument cannot be a spider object, '
|
||||
'it must be a spider class (or a Crawler object)')
|
||||
crawler = self.create_crawler(crawler_or_spidercls)
|
||||
return self._crawl(crawler, *args, **kwargs)
|
||||
|
||||
def _crawl(self, crawler, *args, **kwargs):
|
||||
self.crawlers.add(crawler)
|
||||
d = crawler.crawl(*args, **kwargs)
|
||||
self._active.add(d)
|
||||
|
||||
def _done(result):
|
||||
self.crawlers.discard(crawler)
|
||||
self._active.discard(d)
|
||||
self.bootstrap_failed |= not getattr(crawler, 'spider', None)
|
||||
return result
|
||||
|
||||
return d.addBoth(_done)
|
||||
|
||||
def create_crawler(self, crawler_or_spidercls):
|
||||
"""
|
||||
Return a :class:`~scrapy.crawler.Crawler` object.
|
||||
|
||||
* If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
|
||||
* If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
|
||||
is constructed for it.
|
||||
* If ``crawler_or_spidercls`` is a string, this function finds
|
||||
a spider with this name in a Scrapy project (using spider loader),
|
||||
then creates a Crawler instance for it.
|
||||
"""
|
||||
if isinstance(crawler_or_spidercls, Spider):
|
||||
raise ValueError(
|
||||
'The crawler_or_spidercls argument cannot be a spider object, '
|
||||
'it must be a spider class (or a Crawler object)')
|
||||
if isinstance(crawler_or_spidercls, Crawler):
|
||||
return crawler_or_spidercls
|
||||
return self._create_crawler(crawler_or_spidercls)
|
||||
|
||||
def _create_crawler(self, spidercls):
|
||||
if isinstance(spidercls, str):
|
||||
spidercls = self.spider_loader.load(spidercls)
|
||||
return Crawler(spidercls, self.settings)
|
||||
|
||||
def stop(self):
|
||||
"""
|
||||
Stops simultaneously all the crawling jobs taking place.
|
||||
|
||||
Returns a deferred that is fired when they all have ended.
|
||||
"""
|
||||
return defer.DeferredList([c.stop() for c in list(self.crawlers)])
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def join(self):
|
||||
"""
|
||||
join()
|
||||
|
||||
Returns a deferred that is fired when all managed :attr:`crawlers` have
|
||||
completed their executions.
|
||||
"""
|
||||
while self._active:
|
||||
yield defer.DeferredList(self._active)
|
||||
|
||||
def _handle_twisted_reactor(self):
|
||||
if self.settings.get("TWISTED_REACTOR"):
|
||||
verify_installed_reactor(self.settings["TWISTED_REACTOR"])
|
||||
|
||||
|
||||
class CrawlerProcess(CrawlerRunner):
|
||||
"""
|
||||
A class to run multiple scrapy crawlers in a process simultaneously.
|
||||
|
||||
This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
|
||||
for starting a :mod:`~twisted.internet.reactor` and handling shutdown
|
||||
signals, like the keyboard interrupt command Ctrl-C. It also configures
|
||||
top-level logging.
|
||||
|
||||
This utility should be a better fit than
|
||||
:class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
|
||||
:mod:`~twisted.internet.reactor` within your application.
|
||||
|
||||
The CrawlerProcess object must be instantiated with a
|
||||
:class:`~scrapy.settings.Settings` object.
|
||||
|
||||
:param install_root_handler: whether to install root logging handler
|
||||
(default: True)
|
||||
|
||||
This class shouldn't be needed (since Scrapy is responsible of using it
|
||||
accordingly) unless writing scripts that manually handle the crawling
|
||||
process. See :ref:`run-from-script` for an example.
|
||||
"""
|
||||
|
||||
def __init__(self, settings=None, install_root_handler=True):
|
||||
super().__init__(settings)
|
||||
install_shutdown_handlers(self._signal_shutdown)
|
||||
configure_logging(self.settings, install_root_handler)
|
||||
log_scrapy_info(self.settings)
|
||||
|
||||
def _signal_shutdown(self, signum, _):
|
||||
from twisted.internet import reactor
|
||||
install_shutdown_handlers(self._signal_kill)
|
||||
signame = signal_names[signum]
|
||||
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
|
||||
{'signame': signame})
|
||||
reactor.callFromThread(self._graceful_stop_reactor)
|
||||
|
||||
def _signal_kill(self, signum, _):
|
||||
from twisted.internet import reactor
|
||||
install_shutdown_handlers(signal.SIG_IGN)
|
||||
signame = signal_names[signum]
|
||||
logger.info('Received %(signame)s twice, forcing unclean shutdown',
|
||||
{'signame': signame})
|
||||
reactor.callFromThread(self._stop_reactor)
|
||||
|
||||
def start(self, stop_after_crawl=True):
|
||||
"""
|
||||
This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
|
||||
size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
|
||||
based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.
|
||||
|
||||
If ``stop_after_crawl`` is True, the reactor will be stopped after all
|
||||
crawlers have finished, using :meth:`join`.
|
||||
|
||||
:param bool stop_after_crawl: stop or not the reactor when all
|
||||
crawlers have finished
|
||||
"""
|
||||
from twisted.internet import reactor
|
||||
if stop_after_crawl:
|
||||
d = self.join()
|
||||
# Don't start the reactor if the deferreds are already fired
|
||||
if d.called:
|
||||
return
|
||||
d.addBoth(self._stop_reactor)
|
||||
|
||||
resolver_class = load_object(self.settings["DNS_RESOLVER"])
|
||||
resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
|
||||
resolver.install_on_reactor()
|
||||
tp = reactor.getThreadPool()
|
||||
tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
|
||||
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
|
||||
reactor.run(installSignalHandlers=False) # blocking call
|
||||
|
||||
def _graceful_stop_reactor(self):
|
||||
d = self.stop()
|
||||
d.addBoth(self._stop_reactor)
|
||||
return d
|
||||
|
||||
def _stop_reactor(self, _=None):
|
||||
from twisted.internet import reactor
|
||||
try:
|
||||
reactor.stop()
|
||||
except RuntimeError: # raised if already stopped or in shutdown stage
|
||||
pass
|
||||
|
||||
def _handle_twisted_reactor(self):
|
||||
if self.settings.get("TWISTED_REACTOR"):
|
||||
install_reactor(self.settings["TWISTED_REACTOR"], self.settings["ASYNCIO_EVENT_LOOP"])
|
||||
super()._handle_twisted_reactor()
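# Illustrative sketch: driving the CrawlerProcess defined above from a
# standalone script, as its docstring suggests. The spider class, its name and
# the URL below are hypothetical placeholders, not part of Scrapy itself.
if __name__ == '__main__':
    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = 'example'                      # placeholder spider name
        start_urls = ['https://example.com']  # placeholder URL

        def parse(self, response):
            yield {'title': response.css('title::text').get()}

    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(ExampleSpider)  # schedule the spider on this process
    process.start()               # start the reactor; blocks until crawling ends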
|
||||
|
|
@@ -0,0 +1,93 @@
import re
|
||||
import logging
|
||||
|
||||
from w3lib import html
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import HtmlResponse
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AjaxCrawlMiddleware:
|
||||
"""
|
||||
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
|
||||
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('AJAXCRAWL_ENABLED'):
|
||||
raise NotConfigured
|
||||
|
||||
# XXX: Google parses at least first 100k bytes; scrapy's redirect
|
||||
# middleware parses first 4k. 4k turns out to be insufficient
|
||||
# for this middleware, and parsing 100k could be slow.
|
||||
# We use something in between (32K) by default.
|
||||
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
|
||||
if not isinstance(response, HtmlResponse) or response.status != 200:
|
||||
return response
|
||||
|
||||
if request.method != 'GET':
|
||||
# other HTTP methods are either not safe or don't have a body
|
||||
return response
|
||||
|
||||
if 'ajax_crawlable' in request.meta: # prevent loops
|
||||
return response
|
||||
|
||||
if not self._has_ajax_crawlable_variant(response):
|
||||
return response
|
||||
|
||||
# scrapy already handles #! links properly
|
||||
ajax_crawl_request = request.replace(url=request.url + '#!')
|
||||
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
|
||||
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
|
||||
extra={'spider': spider})
|
||||
|
||||
ajax_crawl_request.meta['ajax_crawlable'] = True
|
||||
return ajax_crawl_request
|
||||
|
||||
def _has_ajax_crawlable_variant(self, response):
|
||||
"""
|
||||
Return True if a page without hash fragment could be "AJAX crawlable"
|
||||
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
|
||||
"""
|
||||
body = response.text[:self.lookup_bytes]
|
||||
return _has_ajaxcrawlable_meta(body)
|
||||
|
||||
|
||||
# XXX: move it to w3lib?
|
||||
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')
|
||||
|
||||
|
||||
def _has_ajaxcrawlable_meta(text):
|
||||
"""
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
|
||||
True
|
||||
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
|
||||
False
|
||||
>>> _has_ajaxcrawlable_meta('<html></html>')
|
||||
False
|
||||
"""
|
||||
|
||||
# Stripping scripts and comments is slow (about 20x slower than
|
||||
# just checking if a string is in text); this is a quick fail-fast
|
||||
# path that should work for most pages.
|
||||
if 'fragment' not in text:
|
||||
return False
|
||||
if 'content' not in text:
|
||||
return False
|
||||
|
||||
text = html.remove_tags_with_content(text, ('script', 'noscript'))
|
||||
text = html.replace_entities(text)
|
||||
text = html.remove_comments(text)
|
||||
return _ajax_crawlable_re.search(text) is not None
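# Illustrative sketch: this middleware is disabled unless AJAXCRAWL_ENABLED is
# set; both setting names below are the ones read in __init__ above, while the
# values are placeholders.
EXAMPLE_AJAXCRAWL_SETTINGS = {
    'AJAXCRAWL_ENABLED': True,   # required, otherwise NotConfigured is raised
    'AJAXCRAWL_MAXSIZE': 32768,  # number of response bytes scanned for the meta tag
}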
|
||||
|
|
@@ -0,0 +1,110 @@
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.http import Response
|
||||
from scrapy.http.cookies import CookieJar
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CookiesMiddleware:
|
||||
"""This middleware enables working with sites that need cookies"""
|
||||
|
||||
def __init__(self, debug=False):
|
||||
self.jars = defaultdict(CookieJar)
|
||||
self.debug = debug
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COOKIES_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return
|
||||
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
for cookie in self._get_request_cookies(jar, request):
|
||||
jar.set_cookie_if_ok(cookie, request)
|
||||
|
||||
# set Cookie header
|
||||
request.headers.pop('Cookie', None)
|
||||
jar.add_cookie_header(request)
|
||||
self._debug_cookie(request, spider)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_merge_cookies', False):
|
||||
return response
|
||||
|
||||
# extract cookies from Set-Cookie and drop invalid/expired cookies
|
||||
cookiejarkey = request.meta.get("cookiejar")
|
||||
jar = self.jars[cookiejarkey]
|
||||
jar.extract_cookies(response, request)
|
||||
self._debug_set_cookie(response, spider)
|
||||
|
||||
return response
|
||||
|
||||
def _debug_cookie(self, request, spider):
|
||||
if self.debug:
|
||||
cl = [to_unicode(c, errors='replace')
|
||||
for c in request.headers.getlist('Cookie')]
|
||||
if cl:
|
||||
cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
|
||||
msg = f"Sending cookies to: {request}\n{cookies}"
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _debug_set_cookie(self, response, spider):
|
||||
if self.debug:
|
||||
cl = [to_unicode(c, errors='replace')
|
||||
for c in response.headers.getlist('Set-Cookie')]
|
||||
if cl:
|
||||
cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
|
||||
msg = f"Received cookies from: {response}\n{cookies}"
|
||||
logger.debug(msg, extra={'spider': spider})
|
||||
|
||||
def _format_cookie(self, cookie, request):
|
||||
"""
|
||||
Given a dict consisting of cookie components, return its string representation.
|
||||
Decode from bytes if necessary.
|
||||
"""
|
||||
decoded = {}
|
||||
for key in ("name", "value", "path", "domain"):
|
||||
if cookie.get(key) is None:
|
||||
if key in ("name", "value"):
|
||||
msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
|
||||
logger.warning(msg.format(request, cookie, key))
|
||||
return
|
||||
continue
|
||||
if isinstance(cookie[key], str):
|
||||
decoded[key] = cookie[key]
|
||||
else:
|
||||
try:
|
||||
decoded[key] = cookie[key].decode("utf8")
|
||||
except UnicodeDecodeError:
|
||||
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
|
||||
request, cookie)
|
||||
decoded[key] = cookie[key].decode("latin1", errors="replace")
|
||||
|
||||
cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
|
||||
for key, value in decoded.items(): # path, domain
|
||||
cookie_str += f"; {key.capitalize()}={value}"
|
||||
return cookie_str
|
||||
|
||||
def _get_request_cookies(self, jar, request):
|
||||
"""
|
||||
Extract cookies from the Request.cookies attribute
|
||||
"""
|
||||
if not request.cookies:
|
||||
return []
|
||||
elif isinstance(request.cookies, dict):
|
||||
cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
|
||||
else:
|
||||
cookies = request.cookies
|
||||
formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
|
||||
response = Response(request.url, headers={"Set-Cookie": formatted})
|
||||
return jar.make_cookies(response, request)
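# Illustrative sketch: keeping several independent cookie sessions with the
# "cookiejar" meta key handled above. The URL is a hypothetical placeholder.
def _example_session_requests():
    from scrapy import Request
    # Each distinct cookiejar value maps to its own CookieJar in self.jars,
    # so these three requests do not share cookies with each other.
    return [
        Request('https://example.com/login', meta={'cookiejar': session_id})
        for session_id in range(3)
    ]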
|
||||
|
|
@@ -0,0 +1,83 @@
""" This module implements the DecompressionMiddleware which tries to recognise
|
||||
and extract the potentially compressed responses that may arrive.
|
||||
"""
|
||||
|
||||
import bz2
|
||||
import gzip
|
||||
import logging
|
||||
import tarfile
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from tempfile import mktemp
|
||||
|
||||
from scrapy.responsetypes import responsetypes
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DecompressionMiddleware:
|
||||
""" This middleware tries to recognise and extract the possibly compressed
|
||||
responses that may arrive. """
|
||||
|
||||
def __init__(self):
|
||||
self._formats = {
|
||||
'tar': self._is_tar,
|
||||
'zip': self._is_zip,
|
||||
'gz': self._is_gzip,
|
||||
'bz2': self._is_bzip2
|
||||
}
|
||||
|
||||
def _is_tar(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
|
||||
except tarfile.ReadError:
|
||||
return
|
||||
|
||||
body = tar_file.extractfile(tar_file.members[0]).read()
|
||||
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_zip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
zip_file = zipfile.ZipFile(archive)
|
||||
except zipfile.BadZipfile:
|
||||
return
|
||||
|
||||
namelist = zip_file.namelist()
|
||||
body = zip_file.read(namelist[0])
|
||||
respcls = responsetypes.from_args(filename=namelist[0], body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_gzip(self, response):
|
||||
archive = BytesIO(response.body)
|
||||
try:
|
||||
body = gzip.GzipFile(fileobj=archive).read()
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def _is_bzip2(self, response):
|
||||
try:
|
||||
body = bz2.decompress(response.body)
|
||||
except IOError:
|
||||
return
|
||||
|
||||
respcls = responsetypes.from_args(body=body)
|
||||
return response.replace(body=body, cls=respcls)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if not response.body:
|
||||
return response
|
||||
|
||||
for fmt, func in self._formats.items():
|
||||
new_response = func(response)
|
||||
if new_response:
|
||||
logger.debug('Decompressed response with format: %(responsefmt)s',
|
||||
{'responsefmt': fmt}, extra={'spider': spider})
|
||||
return new_response
|
||||
return response
|
||||
|
|
@@ -0,0 +1,22 @@
"""
|
||||
DefaultHeaders downloader middleware
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
from scrapy.utils.python import without_none_values
|
||||
|
||||
|
||||
class DefaultHeadersMiddleware:
|
||||
|
||||
def __init__(self, headers):
|
||||
self._headers = headers
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
|
||||
return cls(headers.items())
|
||||
|
||||
def process_request(self, request, spider):
|
||||
for k, v in self._headers:
|
||||
request.headers.setdefault(k, v)
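# Illustrative sketch: the headers applied above come from the
# DEFAULT_REQUEST_HEADERS setting; per-request headers always win because
# process_request() uses setdefault(). The values shown are only an example.
EXAMPLE_DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}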
|
||||
|
|
@@ -0,0 +1,26 @@
"""
|
||||
Download timeout middleware
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class DownloadTimeoutMiddleware:
|
||||
|
||||
def __init__(self, timeout=180):
|
||||
self._timeout = timeout
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self._timeout = getattr(spider, 'download_timeout', self._timeout)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if self._timeout:
|
||||
request.meta.setdefault('download_timeout', self._timeout)
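# Illustrative sketch: the timeout applied above can be set globally
# (DOWNLOAD_TIMEOUT), per spider (a download_timeout attribute), or per
# request. The URL and the number of seconds are hypothetical.
def _example_request_with_timeout():
    from scrapy import Request
    # The per-request value wins because process_request() only uses setdefault().
    return Request('https://example.com/slow', meta={'download_timeout': 5})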
|
||||
|
|
@@ -0,0 +1,31 @@
"""
|
||||
HTTP basic auth downloader middleware
|
||||
|
||||
See documentation in docs/topics/downloader-middleware.rst
|
||||
"""
|
||||
|
||||
from w3lib.http import basic_auth_header
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class HttpAuthMiddleware:
|
||||
"""Set Basic HTTP Authorization header
|
||||
(http_user and http_pass spider class attributes)"""
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls()
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
usr = getattr(spider, 'http_user', '')
|
||||
pwd = getattr(spider, 'http_pass', '')
|
||||
if usr or pwd:
|
||||
self.auth = basic_auth_header(usr, pwd)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
auth = getattr(self, 'auth', None)
|
||||
if auth and b'Authorization' not in request.headers:
|
||||
request.headers[b'Authorization'] = auth
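# Illustrative sketch: credentials are read from spider attributes when the
# spider opens, as implemented above. The spider name, URL and credentials are
# hypothetical placeholders.
from scrapy.spiders import Spider


class _ExampleAuthSpider(Spider):
    name = 'auth_example'
    http_user = 'user'    # combined into a single Basic Authorization header
    http_pass = 'secret'
    start_urls = ['https://example.com/protected']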
|
||||
|
|
@@ -0,0 +1,133 @@
from email.utils import formatdate
|
||||
from typing import Optional, Type, TypeVar
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import (
|
||||
ConnectError,
|
||||
ConnectionDone,
|
||||
ConnectionLost,
|
||||
ConnectionRefusedError,
|
||||
DNSLookupError,
|
||||
TCPTimedOutError,
|
||||
TimeoutError,
|
||||
)
|
||||
from twisted.web.client import ResponseFailed
|
||||
|
||||
from scrapy import signals
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
from scrapy.http.request import Request
|
||||
from scrapy.http.response import Response
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.statscollectors import StatsCollector
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
|
||||
HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")
|
||||
|
||||
|
||||
class HttpCacheMiddleware:
|
||||
|
||||
DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError,
|
||||
ConnectionLost, TCPTimedOutError, ResponseFailed,
|
||||
IOError)
|
||||
|
||||
def __init__(self, settings: Settings, stats: StatsCollector) -> None:
|
||||
if not settings.getbool('HTTPCACHE_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
|
||||
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
|
||||
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
|
||||
o = cls(crawler.settings, crawler.stats)
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider: Spider) -> None:
|
||||
self.storage.open_spider(spider)
|
||||
|
||||
def spider_closed(self, spider: Spider) -> None:
|
||||
self.storage.close_spider(spider)
|
||||
|
||||
def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
|
||||
if request.meta.get('dont_cache', False):
|
||||
return None
|
||||
|
||||
# Skip uncacheable requests
|
||||
if not self.policy.should_cache_request(request):
|
||||
request.meta['_dont_cache'] = True # flag as uncacheable
|
||||
return None
|
||||
|
||||
# Look for cached response and check if expired
|
||||
cachedresponse = self.storage.retrieve_response(spider, request)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/miss', spider=spider)
|
||||
if self.ignore_missing:
|
||||
self.stats.inc_value('httpcache/ignore', spider=spider)
|
||||
raise IgnoreRequest("Ignored request not in cache: %s" % request)
|
||||
return None # first time request
|
||||
|
||||
# Return cached response only if not expired
|
||||
cachedresponse.flags.append('cached')
|
||||
if self.policy.is_cached_response_fresh(cachedresponse, request):
|
||||
self.stats.inc_value('httpcache/hit', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
# Keep a reference to cached response to avoid a second cache lookup on
|
||||
# process_response hook
|
||||
request.meta['cached_response'] = cachedresponse
|
||||
|
||||
return None
|
||||
|
||||
def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
|
||||
if request.meta.get('dont_cache', False):
|
||||
return response
|
||||
|
||||
# Skip cached responses and uncacheable requests
|
||||
if 'cached' in response.flags or '_dont_cache' in request.meta:
|
||||
request.meta.pop('_dont_cache', None)
|
||||
return response
|
||||
|
||||
# RFC2616 requires origin server to set Date header,
|
||||
# https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
|
||||
if 'Date' not in response.headers:
|
||||
response.headers['Date'] = formatdate(usegmt=True)
|
||||
|
||||
# Do not validate first-hand responses
|
||||
cachedresponse = request.meta.pop('cached_response', None)
|
||||
if cachedresponse is None:
|
||||
self.stats.inc_value('httpcache/firsthand', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
if self.policy.is_cached_response_valid(cachedresponse, response, request):
|
||||
self.stats.inc_value('httpcache/revalidate', spider=spider)
|
||||
return cachedresponse
|
||||
|
||||
self.stats.inc_value('httpcache/invalidate', spider=spider)
|
||||
self._cache_response(spider, response, request, cachedresponse)
|
||||
return response
|
||||
|
||||
def process_exception(
|
||||
self, request: Request, exception: Exception, spider: Spider
|
||||
) -> Optional[Response]:
|
||||
cachedresponse = request.meta.pop('cached_response', None)
|
||||
if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
|
||||
self.stats.inc_value('httpcache/errorrecovery', spider=spider)
|
||||
return cachedresponse
|
||||
return None
|
||||
|
||||
def _cache_response(
|
||||
self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
|
||||
) -> None:
|
||||
if self.policy.should_cache_response(response, request):
|
||||
self.stats.inc_value('httpcache/store', spider=spider)
|
||||
self.storage.store_response(spider, request, response)
|
||||
else:
|
||||
self.stats.inc_value('httpcache/uncacheable', spider=spider)
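# Illustrative sketch: a typical development configuration for the cache
# middleware above; the values are placeholders, not recommendations.
EXAMPLE_HTTPCACHE_SETTINGS = {
    'HTTPCACHE_ENABLED': True,
    'HTTPCACHE_EXPIRATION_SECS': 3600,  # 0 means cached responses never expire
    'HTTPCACHE_DIR': 'httpcache',       # relative to the project data directory
    'HTTPCACHE_IGNORE_MISSING': False,  # True: ignore requests not already cached
}
# A single request can opt out with Request(..., meta={'dont_cache': True}).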
|
||||
|
|
@@ -0,0 +1,82 @@
import io
|
||||
import zlib
|
||||
|
||||
from scrapy.utils.gz import gunzip
|
||||
from scrapy.http import Response, TextResponse
|
||||
from scrapy.responsetypes import responsetypes
|
||||
from scrapy.exceptions import NotConfigured
|
||||
|
||||
|
||||
ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
|
||||
|
||||
try:
|
||||
import brotli
|
||||
ACCEPTED_ENCODINGS.append(b'br')
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import zstandard
|
||||
ACCEPTED_ENCODINGS.append(b'zstd')
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
class HttpCompressionMiddleware:
|
||||
"""This middleware allows compressed (gzip, deflate) traffic to be
|
||||
sent/received from web sites"""
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
|
||||
raise NotConfigured
|
||||
return cls()
|
||||
|
||||
def process_request(self, request, spider):
|
||||
request.headers.setdefault('Accept-Encoding',
|
||||
b", ".join(ACCEPTED_ENCODINGS))
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
|
||||
if request.method == 'HEAD':
|
||||
return response
|
||||
if isinstance(response, Response):
|
||||
content_encoding = response.headers.getlist('Content-Encoding')
|
||||
if content_encoding:
|
||||
encoding = content_encoding.pop()
|
||||
decoded_body = self._decode(response.body, encoding.lower())
|
||||
respcls = responsetypes.from_args(
|
||||
headers=response.headers, url=response.url, body=decoded_body
|
||||
)
|
||||
kwargs = dict(cls=respcls, body=decoded_body)
|
||||
if issubclass(respcls, TextResponse):
|
||||
# force recalculating the encoding until we make sure the
|
||||
# responsetypes guessing is reliable
|
||||
kwargs['encoding'] = None
|
||||
response = response.replace(**kwargs)
|
||||
if not content_encoding:
|
||||
del response.headers['Content-Encoding']
|
||||
|
||||
return response
|
||||
|
||||
def _decode(self, body, encoding):
|
||||
if encoding == b'gzip' or encoding == b'x-gzip':
|
||||
body = gunzip(body)
|
||||
|
||||
if encoding == b'deflate':
|
||||
try:
|
||||
body = zlib.decompress(body)
|
||||
except zlib.error:
|
||||
# ugly hack to work with raw deflate content that may
|
||||
# be sent by microsoft servers. For more information, see:
|
||||
# http://carsten.codimi.de/gzip.yaws/
|
||||
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
|
||||
# http://www.gzip.org/zlib/zlib_faq.html#faq38
|
||||
body = zlib.decompress(body, -15)
|
||||
if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
|
||||
body = brotli.decompress(body)
|
||||
if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
|
||||
# Using its streaming API since its simple API could handle only cases
|
||||
# where there is content size data embedded in the frame
|
||||
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
|
||||
body = reader.read()
|
||||
return body
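# Illustrative sketch: the deflate fallback used in _decode() above, isolated
# for clarity. Some servers send raw DEFLATE streams without a zlib header,
# which require wbits=-15.
def _example_inflate(data: bytes) -> bytes:
    try:
        return zlib.decompress(data)       # standard zlib-wrapped deflate
    except zlib.error:
        return zlib.decompress(data, -15)  # raw deflate stream, no zlib header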
|
||||
|
|
@@ -0,0 +1,75 @@
import base64
|
||||
from urllib.parse import unquote, urlunparse
|
||||
from urllib.request import getproxies, proxy_bypass, _parse_proxy
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
||||
class HttpProxyMiddleware:
|
||||
|
||||
def __init__(self, auth_encoding='latin-1'):
|
||||
self.auth_encoding = auth_encoding
|
||||
self.proxies = {}
|
||||
for type_, url in getproxies().items():
|
||||
try:
|
||||
self.proxies[type_] = self._get_proxy(url, type_)
|
||||
# some values such as '/var/run/docker.sock' can't be parsed
|
||||
# by _parse_proxy and as such should be skipped
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
|
||||
raise NotConfigured
|
||||
auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
|
||||
return cls(auth_encoding)
|
||||
|
||||
def _basic_auth_header(self, username, password):
|
||||
user_pass = to_bytes(
|
||||
f'{unquote(username)}:{unquote(password)}',
|
||||
encoding=self.auth_encoding)
|
||||
return base64.b64encode(user_pass)
|
||||
|
||||
def _get_proxy(self, url, orig_type):
|
||||
proxy_type, user, password, hostport = _parse_proxy(url)
|
||||
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
|
||||
|
||||
if user:
|
||||
creds = self._basic_auth_header(user, password)
|
||||
else:
|
||||
creds = None
|
||||
|
||||
return creds, proxy_url
|
||||
|
||||
def process_request(self, request, spider):
|
||||
# ignore if proxy is already set
|
||||
if 'proxy' in request.meta:
|
||||
if request.meta['proxy'] is None:
|
||||
return
|
||||
# extract credentials if present
|
||||
creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
|
||||
request.meta['proxy'] = proxy_url
|
||||
if creds and not request.headers.get('Proxy-Authorization'):
|
||||
request.headers['Proxy-Authorization'] = b'Basic ' + creds
|
||||
return
|
||||
elif not self.proxies:
|
||||
return
|
||||
|
||||
parsed = urlparse_cached(request)
|
||||
scheme = parsed.scheme
|
||||
|
||||
# 'no_proxy' is only supported by http schemes
|
||||
if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
|
||||
return
|
||||
|
||||
if scheme in self.proxies:
|
||||
self._set_proxy(request, scheme)
|
||||
|
||||
def _set_proxy(self, request, scheme):
|
||||
creds, proxy = self.proxies[scheme]
|
||||
request.meta['proxy'] = proxy
|
||||
if creds:
|
||||
request.headers['Proxy-Authorization'] = b'Basic ' + creds
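# Illustrative sketch: besides the environment variables read in __init__, a
# proxy can be set per request through request.meta; credentials embedded in
# the URL are turned into a Proxy-Authorization header by process_request()
# above. The proxy address is a hypothetical placeholder.
def _example_proxied_request():
    from scrapy import Request
    return Request(
        'https://example.com',
        meta={'proxy': 'http://user:pass@proxy.example.com:8080'},
    )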
|
||||
|
|
@@ -0,0 +1,113 @@
import logging
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from w3lib.url import safe_url_string
|
||||
|
||||
from scrapy.http import HtmlResponse
|
||||
from scrapy.utils.response import get_meta_refresh
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseRedirectMiddleware:
|
||||
|
||||
enabled_setting = 'REDIRECT_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool(self.enabled_setting):
|
||||
raise NotConfigured
|
||||
|
||||
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
|
||||
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def _redirect(self, redirected, request, spider, reason):
|
||||
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
|
||||
redirects = request.meta.get('redirect_times', 0) + 1
|
||||
|
||||
if ttl and redirects <= self.max_redirect_times:
|
||||
redirected.meta['redirect_times'] = redirects
|
||||
redirected.meta['redirect_ttl'] = ttl - 1
|
||||
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
|
||||
redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
|
||||
redirected.dont_filter = request.dont_filter
|
||||
redirected.priority = request.priority + self.priority_adjust
|
||||
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
|
||||
{'reason': reason, 'redirected': redirected, 'request': request},
|
||||
extra={'spider': spider})
|
||||
return redirected
|
||||
else:
|
||||
logger.debug("Discarding %(request)s: max redirections reached",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
raise IgnoreRequest("max redirections reached")
|
||||
|
||||
def _redirect_request_using_get(self, request, redirect_url):
|
||||
redirected = request.replace(url=redirect_url, method='GET', body='')
|
||||
redirected.headers.pop('Content-Type', None)
|
||||
redirected.headers.pop('Content-Length', None)
|
||||
return redirected
|
||||
|
||||
|
||||
class RedirectMiddleware(BaseRedirectMiddleware):
|
||||
"""
|
||||
Handle redirection of requests based on response status
|
||||
and meta-refresh html tag.
|
||||
"""
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if (
|
||||
request.meta.get('dont_redirect', False)
|
||||
or response.status in getattr(spider, 'handle_httpstatus_list', [])
|
||||
or response.status in request.meta.get('handle_httpstatus_list', [])
|
||||
or request.meta.get('handle_httpstatus_all', False)
|
||||
):
|
||||
return response
|
||||
|
||||
allowed_status = (301, 302, 303, 307, 308)
|
||||
if 'Location' not in response.headers or response.status not in allowed_status:
|
||||
return response
|
||||
|
||||
location = safe_url_string(response.headers['Location'])
|
||||
if response.headers['Location'].startswith(b'//'):
|
||||
request_scheme = urlparse(request.url).scheme
|
||||
location = request_scheme + '://' + location.lstrip('/')
|
||||
|
||||
redirected_url = urljoin(request.url, location)
|
||||
|
||||
if response.status in (301, 307, 308) or request.method == 'HEAD':
|
||||
redirected = request.replace(url=redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
redirected = self._redirect_request_using_get(request, redirected_url)
|
||||
return self._redirect(redirected, request, spider, response.status)
|
||||
|
||||
|
||||
class MetaRefreshMiddleware(BaseRedirectMiddleware):
|
||||
|
||||
enabled_setting = 'METAREFRESH_ENABLED'
|
||||
|
||||
def __init__(self, settings):
|
||||
super().__init__(settings)
|
||||
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
|
||||
self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if (
|
||||
request.meta.get('dont_redirect', False)
|
||||
or request.method == 'HEAD'
|
||||
or not isinstance(response, HtmlResponse)
|
||||
):
|
||||
return response
|
||||
|
||||
interval, url = get_meta_refresh(response,
|
||||
ignore_tags=self._ignore_tags)
|
||||
if url and interval < self._maxdelay:
|
||||
redirected = self._redirect_request_using_get(request, url)
|
||||
return self._redirect(redirected, request, spider, 'meta refresh')
|
||||
|
||||
return response
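# Illustrative sketch: per-request control of the redirect handling implemented
# above. The URL is a hypothetical placeholder.
def _example_no_redirect_request():
    from scrapy import Request
    # dont_redirect disables both middlewares for this request; for requests
    # that are redirected, the chain ends up in response.meta['redirect_urls']
    # and response.meta['redirect_reasons'].
    return Request('https://example.com/old-path', meta={'dont_redirect': True})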
|
||||
|
|
@@ -0,0 +1,97 @@
"""
|
||||
An extension to retry failed requests that are potentially caused by temporary
|
||||
problems such as a connection timeout or HTTP 500 error.
|
||||
|
||||
You can change the behaviour of this middleware by modifying the scraping settings:
|
||||
RETRY_TIMES - how many times to retry a failed page
|
||||
RETRY_HTTP_CODES - which HTTP response codes to retry
|
||||
|
||||
Failed pages are collected on the scraping process and rescheduled at the end,
|
||||
once the spider has finished crawling all regular (non failed) pages.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.error import (
|
||||
ConnectError,
|
||||
ConnectionDone,
|
||||
ConnectionLost,
|
||||
ConnectionRefusedError,
|
||||
DNSLookupError,
|
||||
TCPTimedOutError,
|
||||
TimeoutError,
|
||||
)
|
||||
from twisted.web.client import ResponseFailed
|
||||
|
||||
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.response import response_status_message
|
||||
from scrapy.core.downloader.handlers.http11 import TunnelError
|
||||
from scrapy.utils.python import global_object_name
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RetryMiddleware:
|
||||
|
||||
# IOError is raised by the HttpCompression middleware when trying to
|
||||
# decompress an empty response
|
||||
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
|
||||
ConnectionRefusedError, ConnectionDone, ConnectError,
|
||||
ConnectionLost, TCPTimedOutError, ResponseFailed,
|
||||
IOError, TunnelError)
|
||||
|
||||
def __init__(self, settings):
|
||||
if not settings.getbool('RETRY_ENABLED'):
|
||||
raise NotConfigured
|
||||
self.max_retry_times = settings.getint('RETRY_TIMES')
|
||||
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
|
||||
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler.settings)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
if request.meta.get('dont_retry', False):
|
||||
return response
|
||||
if response.status in self.retry_http_codes:
|
||||
reason = response_status_message(response.status)
|
||||
return self._retry(request, reason, spider) or response
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
if (
|
||||
isinstance(exception, self.EXCEPTIONS_TO_RETRY)
|
||||
and not request.meta.get('dont_retry', False)
|
||||
):
|
||||
return self._retry(request, exception, spider)
|
||||
|
||||
def _retry(self, request, reason, spider):
|
||||
retries = request.meta.get('retry_times', 0) + 1
|
||||
|
||||
retry_times = self.max_retry_times
|
||||
|
||||
if 'max_retry_times' in request.meta:
|
||||
retry_times = request.meta['max_retry_times']
|
||||
|
||||
stats = spider.crawler.stats
|
||||
if retries <= retry_times:
|
||||
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
|
||||
retryreq = request.copy()
|
||||
retryreq.meta['retry_times'] = retries
|
||||
retryreq.dont_filter = True
|
||||
retryreq.priority = request.priority + self.priority_adjust
|
||||
|
||||
if isinstance(reason, Exception):
|
||||
reason = global_object_name(reason.__class__)
|
||||
|
||||
stats.inc_value('retry/count')
|
||||
stats.inc_value(f'retry/reason_count/{reason}')
|
||||
return retryreq
|
||||
else:
|
||||
stats.inc_value('retry/max_reached')
|
||||
logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
|
||||
{'request': request, 'retries': retries, 'reason': reason},
|
||||
extra={'spider': spider})
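# Illustrative sketch: the usual knobs for the retry logic above; the numbers
# are placeholders, not recommendations.
EXAMPLE_RETRY_SETTINGS = {
    'RETRY_ENABLED': True,
    'RETRY_TIMES': 2,  # retries in addition to the first attempt
    'RETRY_HTTP_CODES': [500, 502, 503, 504, 522, 524, 408, 429],
}
# Per request: meta={'max_retry_times': 5} raises the cap and
# meta={'dont_retry': True} disables retrying; both keys are read above.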
|
||||
|
|
@@ -0,0 +1,109 @@
"""
|
||||
This is a middleware to respect robots.txt policies. To activate it you must
|
||||
enable this middleware and enable the ROBOTSTXT_OBEY setting.
|
||||
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from twisted.internet.defer import Deferred, maybeDeferred
|
||||
from scrapy.exceptions import NotConfigured, IgnoreRequest
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsTxtMiddleware:
|
||||
DOWNLOAD_PRIORITY = 1000
|
||||
|
||||
def __init__(self, crawler):
|
||||
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
|
||||
raise NotConfigured
|
||||
self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
|
||||
self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
|
||||
self.crawler = crawler
|
||||
self._parsers = {}
|
||||
self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))
|
||||
|
||||
# check if parser dependencies are met; this should raise an error otherwise.
|
||||
self._parserimpl.from_crawler(self.crawler, b'')
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
return cls(crawler)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if request.meta.get('dont_obey_robotstxt'):
|
||||
return
|
||||
d = maybeDeferred(self.robot_parser, request, spider)
|
||||
d.addCallback(self.process_request_2, request, spider)
|
||||
return d
|
||||
|
||||
def process_request_2(self, rp, request, spider):
|
||||
if rp is None:
|
||||
return
|
||||
|
||||
useragent = self._robotstxt_useragent
|
||||
if not useragent:
|
||||
useragent = request.headers.get(b'User-Agent', self._default_useragent)
|
||||
if not rp.allowed(request.url, useragent):
|
||||
logger.debug("Forbidden by robots.txt: %(request)s",
|
||||
{'request': request}, extra={'spider': spider})
|
||||
self.crawler.stats.inc_value('robotstxt/forbidden')
|
||||
raise IgnoreRequest("Forbidden by robots.txt")
|
||||
|
||||
def robot_parser(self, request, spider):
|
||||
url = urlparse_cached(request)
|
||||
netloc = url.netloc
|
||||
|
||||
if netloc not in self._parsers:
|
||||
self._parsers[netloc] = Deferred()
|
||||
robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
|
||||
robotsreq = Request(
|
||||
robotsurl,
|
||||
priority=self.DOWNLOAD_PRIORITY,
|
||||
meta={'dont_obey_robotstxt': True}
|
||||
)
|
||||
dfd = self.crawler.engine.download(robotsreq, spider)
|
||||
dfd.addCallback(self._parse_robots, netloc, spider)
|
||||
dfd.addErrback(self._logerror, robotsreq, spider)
|
||||
dfd.addErrback(self._robots_error, netloc)
|
||||
self.crawler.stats.inc_value('robotstxt/request_count')
|
||||
|
||||
if isinstance(self._parsers[netloc], Deferred):
|
||||
d = Deferred()
|
||||
|
||||
def cb(result):
|
||||
d.callback(result)
|
||||
return result
|
||||
self._parsers[netloc].addCallback(cb)
|
||||
return d
|
||||
else:
|
||||
return self._parsers[netloc]
|
||||
|
||||
def _logerror(self, failure, request, spider):
|
||||
if failure.type is not IgnoreRequest:
|
||||
logger.error("Error downloading %(request)s: %(f_exception)s",
|
||||
{'request': request, 'f_exception': failure.value},
|
||||
exc_info=failure_to_exc_info(failure),
|
||||
extra={'spider': spider})
|
||||
return failure
|
||||
|
||||
def _parse_robots(self, response, netloc, spider):
|
||||
self.crawler.stats.inc_value('robotstxt/response_count')
|
||||
self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
|
||||
rp = self._parserimpl.from_crawler(self.crawler, response.body)
|
||||
rp_dfd = self._parsers[netloc]
|
||||
self._parsers[netloc] = rp
|
||||
rp_dfd.callback(rp)
|
||||
|
||||
def _robots_error(self, failure, netloc):
|
||||
if failure.type is not IgnoreRequest:
|
||||
key = f'robotstxt/exception_count/{failure.type}'
|
||||
self.crawler.stats.inc_value(key)
|
||||
rp_dfd = self._parsers[netloc]
|
||||
self._parsers[netloc] = None
|
||||
rp_dfd.callback(None)
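# Illustrative sketch: enabling the middleware above from project settings.
# The parser path shown is Scrapy's default (Protego); treat the values as
# placeholders.
EXAMPLE_ROBOTSTXT_SETTINGS = {
    'ROBOTSTXT_OBEY': True,
    'ROBOTSTXT_PARSER': 'scrapy.robotstxt.ProtegoRobotParser',
    'ROBOTSTXT_USER_AGENT': None,  # optional token matched against robots.txt rules
}
# A single request can bypass the check with meta={'dont_obey_robotstxt': True}.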
|
||||
|
|
@@ -0,0 +1,34 @@
from scrapy.exceptions import NotConfigured
|
||||
from scrapy.utils.request import request_httprepr
|
||||
from scrapy.utils.response import response_httprepr
|
||||
from scrapy.utils.python import global_object_name
|
||||
|
||||
|
||||
class DownloaderStats:
|
||||
|
||||
def __init__(self, stats):
|
||||
self.stats = stats
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
if not crawler.settings.getbool('DOWNLOADER_STATS'):
|
||||
raise NotConfigured
|
||||
return cls(crawler.stats)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
self.stats.inc_value('downloader/request_count', spider=spider)
|
||||
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
|
||||
reqlen = len(request_httprepr(request))
|
||||
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
|
||||
|
||||
def process_response(self, request, response, spider):
|
||||
self.stats.inc_value('downloader/response_count', spider=spider)
|
||||
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
|
||||
reslen = len(response_httprepr(response))
|
||||
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
|
||||
return response
|
||||
|
||||
def process_exception(self, request, exception, spider):
|
||||
ex_class = global_object_name(exception.__class__)
|
||||
self.stats.inc_value('downloader/exception_count', spider=spider)
|
||||
self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)
|
||||
|
|
@@ -0,0 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class UserAgentMiddleware:
|
||||
"""This middleware allows spiders to override the user_agent"""
|
||||
|
||||
def __init__(self, user_agent='Scrapy'):
|
||||
self.user_agent = user_agent
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
o = cls(crawler.settings['USER_AGENT'])
|
||||
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
|
||||
return o
|
||||
|
||||
def spider_opened(self, spider):
|
||||
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
|
||||
|
||||
def process_request(self, request, spider):
|
||||
if self.user_agent:
|
||||
request.headers.setdefault(b'User-Agent', self.user_agent)
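# Illustrative sketch: the header applied above can be overridden per spider
# with a class attribute; the spider name and UA string are hypothetical.
from scrapy.spiders import Spider


class _ExampleUserAgentSpider(Spider):
    name = 'ua_example'
    user_agent = 'example-bot/0.1 (+https://example.com/bot)'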
|
||||
73
venv/lib/python3.9/site-packages/scrapy/dupefilters.py
Normal file
@@ -0,0 +1,73 @@
import os
|
||||
import logging
|
||||
|
||||
from scrapy.utils.job import job_dir
|
||||
from scrapy.utils.request import referer_str, request_fingerprint
|
||||
|
||||
|
||||
class BaseDupeFilter:
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
return cls()
|
||||
|
||||
def request_seen(self, request):
|
||||
return False
|
||||
|
||||
def open(self): # can return deferred
|
||||
pass
|
||||
|
||||
def close(self, reason): # can return a deferred
|
||||
pass
|
||||
|
||||
def log(self, request, spider): # log that a request has been filtered
|
||||
pass
|
||||
|
||||
|
||||
class RFPDupeFilter(BaseDupeFilter):
|
||||
"""Request Fingerprint duplicates filter"""
|
||||
|
||||
def __init__(self, path=None, debug=False):
|
||||
self.file = None
|
||||
self.fingerprints = set()
|
||||
self.logdupes = True
|
||||
self.debug = debug
|
||||
self.logger = logging.getLogger(__name__)
|
||||
if path:
|
||||
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
||||
self.file.seek(0)
|
||||
self.fingerprints.update(x.rstrip() for x in self.file)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
debug = settings.getbool('DUPEFILTER_DEBUG')
|
||||
return cls(job_dir(settings), debug)
|
||||
|
||||
def request_seen(self, request):
|
||||
fp = self.request_fingerprint(request)
|
||||
if fp in self.fingerprints:
|
||||
return True
|
||||
self.fingerprints.add(fp)
|
||||
if self.file:
|
||||
self.file.write(fp + '\n')
|
||||
|
||||
def request_fingerprint(self, request):
|
||||
return request_fingerprint(request)
|
||||
|
||||
def close(self, reason):
|
||||
if self.file:
|
||||
self.file.close()
|
||||
|
||||
def log(self, request, spider):
|
||||
if self.debug:
|
||||
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
|
||||
args = {'request': request, 'referer': referer_str(request)}
|
||||
self.logger.debug(msg, args, extra={'spider': spider})
|
||||
elif self.logdupes:
|
||||
msg = ("Filtered duplicate request: %(request)s"
|
||||
" - no more duplicates will be shown"
|
||||
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
||||
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
|
||||
self.logdupes = False
|
||||
|
||||
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
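# Illustrative sketch: interacting with the fingerprint filter above from a
# spider. The URL is a hypothetical placeholder.
def _example_refetch_request():
    from scrapy import Request
    # dont_filter=True makes the scheduler bypass request_seen() for this
    # request, so the same URL can be scheduled again; setting
    # DUPEFILTER_DEBUG=True logs every filtered duplicate instead of one summary.
    return Request('https://example.com/page', dont_filter=True)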
|
||||
89
venv/lib/python3.9/site-packages/scrapy/exceptions.py
Normal file
@@ -0,0 +1,89 @@
"""
|
||||
Scrapy core exceptions
|
||||
|
||||
These exceptions are documented in docs/topics/exceptions.rst. Please don't add
|
||||
new exceptions here without documenting them there.
|
||||
"""
|
||||
|
||||
# Internal
|
||||
|
||||
|
||||
class NotConfigured(Exception):
|
||||
"""Indicates a missing configuration situation"""
|
||||
pass
|
||||
|
||||
|
||||
class _InvalidOutput(TypeError):
|
||||
"""
|
||||
Indicates an invalid value has been returned by a middleware's processing method.
|
||||
Internal and undocumented, it should not be raised or caught by user code.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
# HTTP and crawling
|
||||
|
||||
|
||||
class IgnoreRequest(Exception):
|
||||
"""Indicates a decision was made not to process a request"""
|
||||
|
||||
|
||||
class DontCloseSpider(Exception):
|
||||
"""Request the spider not to be closed yet"""
|
||||
pass
|
||||
|
||||
|
||||
class CloseSpider(Exception):
|
||||
"""Raise this from callbacks to request the spider to be closed"""
|
||||
|
||||
def __init__(self, reason='cancelled'):
|
||||
super().__init__()
|
||||
self.reason = reason
|
||||
|
||||
|
||||
class StopDownload(Exception):
|
||||
"""
|
||||
Stop the download of the body for a given response.
|
||||
The 'fail' boolean parameter indicates whether or not the resulting partial response
|
||||
should be handled by the request errback. Note that 'fail' is a keyword-only argument.
|
||||
"""
|
||||
|
||||
def __init__(self, *, fail=True):
|
||||
super().__init__()
|
||||
self.fail = fail
|
||||
|
||||
|
||||
# Items
|
||||
|
||||
|
||||
class DropItem(Exception):
|
||||
"""Drop item from the item pipeline"""
|
||||
pass
|
||||
|
||||
|
||||
class NotSupported(Exception):
|
||||
"""Indicates a feature or method is not supported"""
|
||||
pass
|
||||
|
||||
|
||||
# Commands
|
||||
|
||||
|
||||
class UsageError(Exception):
|
||||
"""To indicate a command-line usage error"""
|
||||
|
||||
def __init__(self, *a, **kw):
|
||||
self.print_help = kw.pop('print_help', True)
|
||||
super().__init__(*a, **kw)
|
||||
|
||||
|
||||
class ScrapyDeprecationWarning(Warning):
|
||||
"""Warning category for deprecated features, since the default
|
||||
DeprecationWarning is silenced on Python 2.7+
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ContractFail(AssertionError):
|
||||
"""Error raised in case of a failing contract"""
|
||||
pass
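# Illustrative sketch: CloseSpider is the exception above most commonly raised
# from user code; the status code check is a hypothetical example.
def _example_callback(response):
    if response.status == 403:
        # asks the engine to close the spider; the reason shows up in the stats
        raise CloseSpider(reason='blocked')
    return {'url': response.url}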
|
||||
338
venv/lib/python3.9/site-packages/scrapy/exporters.py
Normal file
@@ -0,0 +1,338 @@
"""
|
||||
Item Exporters are used to export/serialize items into different formats.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import io
|
||||
import marshal
|
||||
import pickle
|
||||
import pprint
|
||||
import warnings
|
||||
from xml.sax.saxutils import XMLGenerator
|
||||
|
||||
from itemadapter import is_item, ItemAdapter
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.item import _BaseItem
|
||||
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
|
||||
from scrapy.utils.serialize import ScrapyJSONEncoder
|
||||
|
||||
|
||||
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
|
||||
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
|
||||
'JsonItemExporter', 'MarshalItemExporter']
|
||||
|
||||
|
||||
class BaseItemExporter:
|
||||
|
||||
def __init__(self, *, dont_fail=False, **kwargs):
|
||||
self._kwargs = kwargs
|
||||
self._configure(kwargs, dont_fail=dont_fail)
|
||||
|
||||
def _configure(self, options, dont_fail=False):
|
||||
"""Configure the exporter by poping options from the ``options`` dict.
|
||||
If dont_fail is set, it won't raise an exception on unexpected options
|
||||
(useful for using with keyword arguments in subclasses ``__init__`` methods)
|
||||
"""
|
||||
self.encoding = options.pop('encoding', None)
|
||||
self.fields_to_export = options.pop('fields_to_export', None)
|
||||
self.export_empty_fields = options.pop('export_empty_fields', False)
|
||||
self.indent = options.pop('indent', None)
|
||||
if not dont_fail and options:
|
||||
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
|
||||
|
||||
def export_item(self, item):
|
||||
raise NotImplementedError
|
||||
|
||||
def serialize_field(self, field, name, value):
|
||||
serializer = field.get('serializer', lambda x: x)
|
||||
return serializer(value)
|
||||
|
||||
def start_exporting(self):
|
||||
pass
|
||||
|
||||
def finish_exporting(self):
|
||||
pass
|
||||
|
||||
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
|
||||
"""Return the fields to export as an iterable of tuples
|
||||
(name, serialized_value)
|
||||
"""
|
||||
item = ItemAdapter(item)
|
||||
|
||||
if include_empty is None:
|
||||
include_empty = self.export_empty_fields
|
||||
|
||||
if self.fields_to_export is None:
|
||||
if include_empty:
|
||||
field_iter = item.field_names()
|
||||
else:
|
||||
field_iter = item.keys()
|
||||
else:
|
||||
if include_empty:
|
||||
field_iter = self.fields_to_export
|
||||
else:
|
||||
field_iter = (x for x in self.fields_to_export if x in item)
|
||||
|
||||
for field_name in field_iter:
|
||||
if field_name in item:
|
||||
field_meta = item.get_field_meta(field_name)
|
||||
value = self.serialize_field(field_meta, field_name, item[field_name])
|
||||
else:
|
||||
value = default_value
|
||||
|
||||
yield field_name, value
|
||||
|
||||
|
||||
class JsonLinesItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
self.file = file
|
||||
self._kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**self._kwargs)
|
||||
|
||||
def export_item(self, item):
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
data = self.encoder.encode(itemdict) + '\n'
|
||||
self.file.write(to_bytes(data, self.encoding))
|
||||
|
||||
|
||||
class JsonItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
super().__init__(dont_fail=True, **kwargs)
|
||||
self.file = file
|
||||
# there is a small difference between the behaviour of JsonItemExporter.indent
|
||||
# and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
|
||||
# the addition of newlines everywhere
|
||||
json_indent = self.indent if self.indent is not None and self.indent > 0 else None
|
||||
self._kwargs.setdefault('indent', json_indent)
|
||||
self._kwargs.setdefault('ensure_ascii', not self.encoding)
|
||||
self.encoder = ScrapyJSONEncoder(**self._kwargs)
|
||||
self.first_item = True
|
||||
|
||||
def _beautify_newline(self):
|
||||
if self.indent is not None:
|
||||
self.file.write(b'\n')
|
||||
|
||||
def start_exporting(self):
|
||||
self.file.write(b"[")
|
||||
self._beautify_newline()
|
||||
|
||||
def finish_exporting(self):
|
||||
self._beautify_newline()
|
||||
self.file.write(b"]")
|
||||
|
||||
def export_item(self, item):
|
||||
if self.first_item:
|
||||
self.first_item = False
|
||||
else:
|
||||
self.file.write(b',')
|
||||
self._beautify_newline()
|
||||
itemdict = dict(self._get_serialized_fields(item))
|
||||
data = self.encoder.encode(itemdict)
|
||||
self.file.write(to_bytes(data, self.encoding))
|
||||
|
||||
|
||||
class XmlItemExporter(BaseItemExporter):
|
||||
|
||||
def __init__(self, file, **kwargs):
|
||||
self.item_element = kwargs.pop('item_element', 'item')
|
||||
self.root_element = kwargs.pop('root_element', 'items')
|
||||
super().__init__(**kwargs)
|
||||
if not self.encoding:
|
||||
self.encoding = 'utf-8'
|
||||
self.xg = XMLGenerator(file, encoding=self.encoding)
|
||||
|
||||
def _beautify_newline(self, new_item=False):
|
||||
if self.indent is not None and (self.indent > 0 or new_item):
|
||||
self.xg.characters('\n')
|
||||
|
||||
def _beautify_indent(self, depth=1):
|
||||
if self.indent:
|
||||
self.xg.characters(' ' * self.indent * depth)
|
||||
|
||||
def start_exporting(self):
|
||||
self.xg.startDocument()
|
||||
self.xg.startElement(self.root_element, {})
|
||||
self._beautify_newline(new_item=True)
|
||||
|
||||
def export_item(self, item):
|
||||
self._beautify_indent(depth=1)
|
||||
self.xg.startElement(self.item_element, {})
|
||||
self._beautify_newline()
|
||||
for name, value in self._get_serialized_fields(item, default_value=''):
|
||||
self._export_xml_field(name, value, depth=2)
|
||||
self._beautify_indent(depth=1)
|
||||
        self.xg.endElement(self.item_element)
        self._beautify_newline(new_item=True)

    def finish_exporting(self):
        self.xg.endElement(self.root_element)
        self.xg.endDocument()

    def _export_xml_field(self, name, serialized_value, depth):
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, 'items'):
            self._beautify_newline()
            for subname, value in serialized_value.items():
                self._export_xml_field(subname, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for value in serialized_value:
                self._export_xml_field('value', value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, str):
            self.xg.characters(serialized_value)
        else:
            self.xg.characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()


class CsvItemExporter(BaseItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', errors=None, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        self.stream = io.TextIOWrapper(
            file,
            line_buffering=False,
            write_through=True,
            encoding=self.encoding,
            newline='',  # Windows needs this https://github.com/scrapy/scrapy/issues/3034
            errors=errors,
        )
        self.csv_writer = csv.writer(self.stream, **self._kwargs)
        self._headers_not_written = True
        self._join_multivalued = join_multivalued

    def serialize_field(self, field, name, value):
        serializer = field.get('serializer', self._join_if_needed)
        return serializer(value)

    def _join_if_needed(self, value):
        if isinstance(value, (list, tuple)):
            try:
                return self._join_multivalued.join(value)
            except TypeError:  # list in value may not contain strings
                pass
        return value

    def export_item(self, item):
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        fields = self._get_serialized_fields(item, default_value='',
                                             include_empty=True)
        values = list(self._build_row(x for _, x in fields))
        self.csv_writer.writerow(values)

    def _build_row(self, values):
        for s in values:
            try:
                yield to_unicode(s, self.encoding)
            except TypeError:
                yield s

    def _write_headers_and_set_fields_to_export(self, item):
        if self.include_headers_line:
            if not self.fields_to_export:
                # use declared field names, or keys if the item is a dict
                self.fields_to_export = ItemAdapter(item).field_names()
            row = list(self._build_row(self.fields_to_export))
            self.csv_writer.writerow(row)


class PickleItemExporter(BaseItemExporter):

    def __init__(self, file, protocol=4, **kwargs):
        super().__init__(**kwargs)
        self.file = file
        self.protocol = protocol

    def export_item(self, item):
        d = dict(self._get_serialized_fields(item))
        pickle.dump(d, self.file, self.protocol)


class MarshalItemExporter(BaseItemExporter):
    """Exports items in a Python-specific binary format (see
    :mod:`marshal`).

    :param file: The file-like object to use for exporting the data. Its
        ``write`` method should accept :class:`bytes` (a disk file
        opened in binary mode, a :class:`~io.BytesIO` object, etc)
    """

    def __init__(self, file, **kwargs):
        super().__init__(**kwargs)
        self.file = file

    def export_item(self, item):
        marshal.dump(dict(self._get_serialized_fields(item)), self.file)


class PprintItemExporter(BaseItemExporter):

    def __init__(self, file, **kwargs):
        super().__init__(**kwargs)
        self.file = file

    def export_item(self, item):
        itemdict = dict(self._get_serialized_fields(item))
        self.file.write(to_bytes(pprint.pformat(itemdict) + '\n'))


class PythonItemExporter(BaseItemExporter):
    """This is a base class for item exporters that extends
    :class:`BaseItemExporter` with support for nested items.

    It serializes items to built-in Python types, so that any serialization
    library (e.g. :mod:`json` or msgpack_) can be used on top of it.

    .. _msgpack: https://pypi.org/project/msgpack/
    """

    def _configure(self, options, dont_fail=False):
        self.binary = options.pop('binary', True)
        super()._configure(options, dont_fail)
        if self.binary:
            warnings.warn(
                "PythonItemExporter will drop support for binary export in the future",
                ScrapyDeprecationWarning)
        if not self.encoding:
            self.encoding = 'utf-8'

    def serialize_field(self, field, name, value):
        serializer = field.get('serializer', self._serialize_value)
        return serializer(value)

    def _serialize_value(self, value):
        if isinstance(value, _BaseItem):
            return self.export_item(value)
        elif is_item(value):
            return dict(self._serialize_item(value))
        elif is_listlike(value):
            return [self._serialize_value(v) for v in value]
        encode_func = to_bytes if self.binary else to_unicode
        if isinstance(value, (str, bytes)):
            return encode_func(value, encoding=self.encoding)
        return value

    def _serialize_item(self, item):
        for key, value in ItemAdapter(item).items():
            key = to_bytes(key) if self.binary else key
            yield key, self._serialize_value(value)

    def export_item(self, item):
        result = dict(self._get_serialized_fields(item))
        if self.binary:
            result = dict(self._serialize_item(result))
        return result
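
A minimal usage sketch for the CsvItemExporter defined above (not part of the committed file; the output path and the dict items are illustrative assumptions):

from scrapy.exporters import CsvItemExporter

with open('items.csv', 'wb') as f:  # the exporter wraps the binary file in a TextIOWrapper itself
    exporter = CsvItemExporter(f)
    exporter.start_exporting()
    exporter.export_item({'title': 'first', 'price': '9.99'})
    exporter.export_item({'title': 'second', 'price': ['1', '2']})  # joined with ',' by _join_if_needed
    exporter.finish_exporting()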

venv/lib/python3.9/site-packages/scrapy/extension.py (new file, 16 lines added)
@@ -0,0 +1,16 @@
"""
The Extension Manager

See documentation in docs/topics/extensions.rst
"""
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list


class ExtensionManager(MiddlewareManager):

    component_name = 'extension'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('EXTENSIONS'))

@@ -0,0 +1,68 @@
"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.

See documentation in docs/topics/extensions.rst
"""

from collections import defaultdict

from scrapy import signals
from scrapy.exceptions import NotConfigured


class CloseSpider:

    def __init__(self, crawler):
        self.crawler = crawler

        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
        }

        if not any(self.close_on.values()):
            raise NotConfigured

        self.counter = defaultdict(int)

        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def error_count(self, failure, response, spider):
        self.counter['errorcount'] += 1
        if self.counter['errorcount'] == self.close_on['errorcount']:
            self.crawler.engine.close_spider(spider, 'closespider_errorcount')

    def page_count(self, response, request, spider):
        self.counter['pagecount'] += 1
        if self.counter['pagecount'] == self.close_on['pagecount']:
            self.crawler.engine.close_spider(spider, 'closespider_pagecount')

    def spider_opened(self, spider):
        from twisted.internet import reactor
        self.task = reactor.callLater(self.close_on['timeout'],
                                      self.crawler.engine.close_spider, spider,
                                      reason='closespider_timeout')

    def item_scraped(self, item, spider):
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount')

    def spider_closed(self, spider):
        task = getattr(self, 'task', False)
        if task and task.active():
            task.cancel()
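
For reference, a settings sketch that would activate the conditions handled above; the setting names come straight from the code, the values are hypothetical examples:

CLOSESPIDER_TIMEOUT = 3600      # close the spider after one hour
CLOSESPIDER_ITEMCOUNT = 1000    # ... or after 1000 scraped items
CLOSESPIDER_PAGECOUNT = 0       # 0 disables the page-count condition
CLOSESPIDER_ERRORCOUNT = 10     # ... or after 10 spider errors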

@@ -0,0 +1,46 @@
"""
Extension for collecting core stats like items scraped and start/finish times
"""
from datetime import datetime

from scrapy import signals


class CoreStats:

    def __init__(self, stats):
        self.stats = stats
        self.start_time = None

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(o.response_received, signal=signals.response_received)
        return o

    def spider_opened(self, spider):
        self.start_time = datetime.utcnow()
        self.stats.set_value('start_time', self.start_time, spider=spider)

    def spider_closed(self, spider, reason):
        finish_time = datetime.utcnow()
        elapsed_time = finish_time - self.start_time
        elapsed_time_seconds = elapsed_time.total_seconds()
        self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
        self.stats.set_value('finish_time', finish_time, spider=spider)
        self.stats.set_value('finish_reason', reason, spider=spider)

    def item_scraped(self, item, spider):
        self.stats.inc_value('item_scraped_count', spider=spider)

    def response_received(self, spider):
        self.stats.inc_value('response_received_count', spider=spider)

    def item_dropped(self, item, spider, exception):
        reason = exception.__class__.__name__
        self.stats.inc_value('item_dropped_count', spider=spider)
        self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)

venv/lib/python3.9/site-packages/scrapy/extensions/debug.py (new file, 64 lines added)
@@ -0,0 +1,64 @@
"""
Extensions for debugging Scrapy

See documentation in docs/topics/extensions.rst
"""

import sys
import signal
import logging
import traceback
import threading
from pdb import Pdb

from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs

logger = logging.getLogger(__name__)


class StackTraceDump:

    def __init__(self, crawler=None):
        self.crawler = crawler
        try:
            signal.signal(signal.SIGUSR2, self.dump_stacktrace)
            signal.signal(signal.SIGQUIT, self.dump_stacktrace)
        except AttributeError:
            # win32 platforms don't support SIGUSR signals
            pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def dump_stacktrace(self, signum, frame):
        log_args = {
            'stackdumps': self._thread_stacks(),
            'enginestatus': format_engine_status(self.crawler.engine),
            'liverefs': format_live_refs(),
        }
        logger.info("Dumping stack trace and engine status\n"
                    "%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
                    log_args, extra={'crawler': self.crawler})

    def _thread_stacks(self):
        id2name = dict((th.ident, th.name) for th in threading.enumerate())
        dumps = ''
        for id_, frame in sys._current_frames().items():
            name = id2name.get(id_, '')
            dump = ''.join(traceback.format_stack(frame))
            dumps += f"# Thread: {name}({id_})\n{dump}\n"
        return dumps


class Debugger:
    def __init__(self):
        try:
            signal.signal(signal.SIGUSR2, self._enter_debugger)
        except AttributeError:
            # win32 platforms don't support SIGUSR signals
            pass

    def _enter_debugger(self, signum, frame):
        Pdb().set_trace(frame.f_back)

venv/lib/python3.9/site-packages/scrapy/extensions/feedexport.py (new file, 480 lines added)
@@ -0,0 +1,480 @@
"""
Feed Exports extension

See documentation in docs/topics/feed-exports.rst
"""

import logging
import os
import re
import sys
import warnings
from datetime import datetime
from tempfile import NamedTemporaryFile
from urllib.parse import unquote, urlparse

from twisted.internet import defer, threads
from w3lib.url import file_uri_to_path
from zope.interface import implementer, Interface

from scrapy import signals
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.conf import feed_complete_default_values_from_settings
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import get_func_args, without_none_values


logger = logging.getLogger(__name__)


def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
    argument_names = get_func_args(builder)
    if 'feed_options' in argument_names:
        kwargs['feed_options'] = feed_options
    else:
        warnings.warn(
            "{} does not support the 'feed_options' keyword argument. Add a "
            "'feed_options' parameter to its signature to remove this "
            "warning. This parameter will become mandatory in a future "
            "version of Scrapy."
            .format(builder.__qualname__),
            category=ScrapyDeprecationWarning
        )
    return builder(*preargs, uri, *args, **kwargs)


class IFeedStorage(Interface):
    """Interface that all Feed Storages must implement"""

    def __init__(uri, *, feed_options=None):
        """Initialize the storage with the parameters given in the URI and the
        feed-specific options (see :setting:`FEEDS`)"""

    def open(spider):
        """Open the storage for the given spider. It must return a file-like
        object that will be used for the exporters"""

    def store(file):
        """Store the given file stream"""


@implementer(IFeedStorage)
class BlockingFeedStorage:

    def open(self, spider):
        path = spider.crawler.settings['FEED_TEMPDIR']
        if path and not os.path.isdir(path):
            raise OSError('Not a Directory: ' + str(path))

        return NamedTemporaryFile(prefix='feed-', dir=path)

    def store(self, file):
        return threads.deferToThread(self._store_in_thread, file)

    def _store_in_thread(self, file):
        raise NotImplementedError


@implementer(IFeedStorage)
class StdoutFeedStorage:

    def __init__(self, uri, _stdout=None, *, feed_options=None):
        if not _stdout:
            _stdout = sys.stdout.buffer
        self._stdout = _stdout
        if feed_options and feed_options.get('overwrite', False) is True:
            logger.warning('Standard output (stdout) storage does not support '
                           'overwriting. To suppress this warning, remove the '
                           'overwrite option from your FEEDS setting, or set '
                           'it to False.')

    def open(self, spider):
        return self._stdout

    def store(self, file):
        pass


@implementer(IFeedStorage)
class FileFeedStorage:

    def __init__(self, uri, *, feed_options=None):
        self.path = file_uri_to_path(uri)
        feed_options = feed_options or {}
        self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'

    def open(self, spider):
        dirname = os.path.dirname(self.path)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return open(self.path, self.write_mode)

    def store(self, file):
        file.close()


class S3FeedStorage(BlockingFeedStorage):

    def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
                 feed_options=None):
        if not is_botocore_available():
            raise NotConfigured('missing botocore library')
        u = urlparse(uri)
        self.bucketname = u.hostname
        self.access_key = u.username or access_key
        self.secret_key = u.password or secret_key
        self.keyname = u.path[1:]  # remove first "/"
        self.acl = acl
        import botocore.session
        session = botocore.session.get_session()
        self.s3_client = session.create_client(
            's3', aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key)
        if feed_options and feed_options.get('overwrite', True) is False:
            logger.warning('S3 does not support appending to files. To '
                           'suppress this warning, remove the overwrite '
                           'option from your FEEDS setting or set it to True.')

    @classmethod
    def from_crawler(cls, crawler, uri, *, feed_options=None):
        return build_storage(
            cls,
            uri,
            access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
            secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
            acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
            feed_options=feed_options,
        )

    def _store_in_thread(self, file):
        file.seek(0)
        kwargs = {'ACL': self.acl} if self.acl else {}
        self.s3_client.put_object(
            Bucket=self.bucketname, Key=self.keyname, Body=file,
            **kwargs)
        file.close()


class GCSFeedStorage(BlockingFeedStorage):

    def __init__(self, uri, project_id, acl):
        self.project_id = project_id
        self.acl = acl
        u = urlparse(uri)
        self.bucket_name = u.hostname
        self.blob_name = u.path[1:]  # remove first "/"

    @classmethod
    def from_crawler(cls, crawler, uri):
        return cls(
            uri,
            crawler.settings['GCS_PROJECT_ID'],
            crawler.settings['FEED_STORAGE_GCS_ACL'] or None
        )

    def _store_in_thread(self, file):
        file.seek(0)
        from google.cloud.storage import Client
        client = Client(project=self.project_id)
        bucket = client.get_bucket(self.bucket_name)
        blob = bucket.blob(self.blob_name)
        blob.upload_from_file(file, predefined_acl=self.acl)


class FTPFeedStorage(BlockingFeedStorage):

    def __init__(self, uri, use_active_mode=False, *, feed_options=None):
        u = urlparse(uri)
        self.host = u.hostname
        self.port = int(u.port or '21')
        self.username = u.username
        self.password = unquote(u.password or '')
        self.path = u.path
        self.use_active_mode = use_active_mode
        self.overwrite = not feed_options or feed_options.get('overwrite', True)

    @classmethod
    def from_crawler(cls, crawler, uri, *, feed_options=None):
        return build_storage(
            cls,
            uri,
            crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
            feed_options=feed_options,
        )

    def _store_in_thread(self, file):
        ftp_store_file(
            path=self.path, file=file, host=self.host,
            port=self.port, username=self.username,
            password=self.password, use_active_mode=self.use_active_mode,
            overwrite=self.overwrite,
        )


class _FeedSlot:
    def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template):
        self.file = file
        self.exporter = exporter
        self.storage = storage
        # feed params
        self.batch_id = batch_id
        self.format = format
        self.store_empty = store_empty
        self.uri_template = uri_template
        self.uri = uri
        # flags
        self.itemcount = 0
        self._exporting = False

    def start_exporting(self):
        if not self._exporting:
            self.exporter.start_exporting()
            self._exporting = True

    def finish_exporting(self):
        if self._exporting:
            self.exporter.finish_exporting()
            self._exporting = False


class FeedExporter:

    @classmethod
    def from_crawler(cls, crawler):
        exporter = cls(crawler)
        crawler.signals.connect(exporter.open_spider, signals.spider_opened)
        crawler.signals.connect(exporter.close_spider, signals.spider_closed)
        crawler.signals.connect(exporter.item_scraped, signals.item_scraped)
        return exporter

    def __init__(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        self.feeds = {}
        self.slots = []

        if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
            raise NotConfigured

        # Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
        if self.settings['FEED_URI']:
            warnings.warn(
                'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
                'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
                category=ScrapyDeprecationWarning, stacklevel=2,
            )
            uri = str(self.settings['FEED_URI'])  # handle pathlib.Path objects
            feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
        # End: Backward compatibility for FEED_URI and FEED_FORMAT settings

        # 'FEEDS' setting takes precedence over 'FEED_URI'
        for uri, feed_options in self.settings.getdict('FEEDS').items():
            uri = str(uri)  # handle pathlib.Path objects
            self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)

        self.storages = self._load_components('FEED_STORAGES')
        self.exporters = self._load_components('FEED_EXPORTERS')
        for uri, feed_options in self.feeds.items():
            if not self._storage_supported(uri, feed_options):
                raise NotConfigured
            if not self._settings_are_valid():
                raise NotConfigured
            if not self._exporter_supported(feed_options['format']):
                raise NotConfigured

    def open_spider(self, spider):
        for uri, feed_options in self.feeds.items():
            uri_params = self._get_uri_params(spider, feed_options['uri_params'])
            self.slots.append(self._start_new_batch(
                batch_id=1,
                uri=uri % uri_params,
                feed_options=feed_options,
                spider=spider,
                uri_template=uri,
            ))

    def close_spider(self, spider):
        deferred_list = []
        for slot in self.slots:
            d = self._close_slot(slot, spider)
            deferred_list.append(d)
        return defer.DeferredList(deferred_list) if deferred_list else None

    def _close_slot(self, slot, spider):
        if not slot.itemcount and not slot.store_empty:
            # We need to call slot.storage.store nonetheless to get the file
            # properly closed.
            return defer.maybeDeferred(slot.storage.store, slot.file)
        slot.finish_exporting()
        logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
        log_args = {'format': slot.format,
                    'itemcount': slot.itemcount,
                    'uri': slot.uri}
        d = defer.maybeDeferred(slot.storage.store, slot.file)

        # Use `largs=log_args` to copy log_args into function's scope
        # instead of using `log_args` from the outer scope
        d.addCallback(
            lambda _, largs=log_args: logger.info(
                logfmt % "Stored", largs, extra={'spider': spider}
            )
        )
        d.addErrback(
            lambda f, largs=log_args: logger.error(
                logfmt % "Error storing", largs,
                exc_info=failure_to_exc_info(f), extra={'spider': spider}
            )
        )
        return d

    def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
        """
        Redirect the output data stream to a new file.
        Execute multiple times if FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
        :param batch_id: sequence number of current batch
        :param uri: uri of the new batch to start
        :param feed_options: dict with parameters of feed
        :param spider: user spider
        :param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri
        """
        storage = self._get_storage(uri, feed_options)
        file = storage.open(spider)
        exporter = self._get_exporter(
            file=file,
            format=feed_options['format'],
            fields_to_export=feed_options['fields'],
            encoding=feed_options['encoding'],
            indent=feed_options['indent'],
            **feed_options['item_export_kwargs'],
        )
        slot = _FeedSlot(
            file=file,
            exporter=exporter,
            storage=storage,
            uri=uri,
            format=feed_options['format'],
            store_empty=feed_options['store_empty'],
            batch_id=batch_id,
            uri_template=uri_template,
        )
        if slot.store_empty:
            slot.start_exporting()
        return slot

    def item_scraped(self, item, spider):
        slots = []
        for slot in self.slots:
            slot.start_exporting()
            slot.exporter.export_item(item)
            slot.itemcount += 1
            # create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
            if (
                self.feeds[slot.uri_template]['batch_item_count']
                and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
            ):
                uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
                self._close_slot(slot, spider)
                slots.append(self._start_new_batch(
                    batch_id=slot.batch_id + 1,
                    uri=slot.uri_template % uri_params,
                    feed_options=self.feeds[slot.uri_template],
                    spider=spider,
                    uri_template=slot.uri_template,
                ))
            else:
                slots.append(slot)
        self.slots = slots

    def _load_components(self, setting_prefix):
        conf = without_none_values(self.settings.getwithbase(setting_prefix))
        d = {}
        for k, v in conf.items():
            try:
                d[k] = load_object(v)
            except NotConfigured:
                pass
        return d

    def _exporter_supported(self, format):
        if format in self.exporters:
            return True
        logger.error("Unknown feed format: %(format)s", {'format': format})

    def _settings_are_valid(self):
        """
        If FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain
        %(batch_time)s or %(batch_id)d to distinguish different files of partial output
        """
        for uri_template, values in self.feeds.items():
            if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
                logger.error(
                    '%(batch_time)s or %(batch_id)d must be in the feed URI ({}) if FEED_EXPORT_BATCH_ITEM_COUNT '
                    'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
                    'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count'
                    ''.format(uri_template)
                )
                return False
        return True

    def _storage_supported(self, uri, feed_options):
        scheme = urlparse(uri).scheme
        if scheme in self.storages:
            try:
                self._get_storage(uri, feed_options)
                return True
            except NotConfigured as e:
                logger.error("Disabled feed storage scheme: %(scheme)s. "
                             "Reason: %(reason)s",
                             {'scheme': scheme, 'reason': str(e)})
        else:
            logger.error("Unknown feed storage scheme: %(scheme)s",
                         {'scheme': scheme})

    def _get_instance(self, objcls, *args, **kwargs):
        return create_instance(
            objcls, self.settings, getattr(self, 'crawler', None),
            *args, **kwargs)

    def _get_exporter(self, file, format, *args, **kwargs):
        return self._get_instance(self.exporters[format], file, *args, **kwargs)

    def _get_storage(self, uri, feed_options):
        """Fork of create_instance specific to feed storage classes

        It supports not passing the *feed_options* parameters to classes that
        do not support it, and issuing a deprecation warning instead.
        """
        feedcls = self.storages[urlparse(uri).scheme]
        crawler = getattr(self, 'crawler', None)

        def build_instance(builder, *preargs):
            return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)

        if crawler and hasattr(feedcls, 'from_crawler'):
            instance = build_instance(feedcls.from_crawler, crawler)
            method_name = 'from_crawler'
        elif hasattr(feedcls, 'from_settings'):
            instance = build_instance(feedcls.from_settings, self.settings)
            method_name = 'from_settings'
        else:
            instance = build_instance(feedcls)
            method_name = '__new__'
        if instance is None:
            raise TypeError("%s.%s returned None" % (feedcls.__qualname__, method_name))
        return instance

    def _get_uri_params(self, spider, uri_params, slot=None):
        params = {}
        for k in dir(spider):
            params[k] = getattr(spider, k)
        utc_now = datetime.utcnow()
        params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
        params['batch_time'] = utc_now.isoformat().replace(':', '-')
        params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
        uripar_function = load_object(uri_params) if uri_params else lambda x, y: None
        uripar_function(params, spider)
        return params
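
A FEEDS settings sketch matching the batch logic above (the URI and values are hypothetical; the %(batch_id)d placeholder is what _settings_are_valid requires once batch_item_count is set):

FEEDS = {
    'exports/items-%(batch_id)d.json': {
        'format': 'json',
        'batch_item_count': 100,   # a new batch file is started every 100 items
        'overwrite': True,
        'encoding': 'utf8',
    },
}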

venv/lib/python3.9/site-packages/scrapy/extensions/httpcache.py (new file, 372 lines added)
@@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary

from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint


logger = logging.getLogger(__name__)


class DummyPolicy:

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True


class RFC2616Policy:

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self._cc_parsed = WeakKeyDictionary()
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
        ]

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other is eligible for caching
        return True

    def should_cache_response(self, response, request):
        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because cache can not deal with partial contents
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif b'max-age' in cc or b'Expires' in response.headers:
            return True
        # Firefox fallbacks this statuses to one year expiration if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration requires at least one validator
        elif response.status in (200, 203, 401):
            return b'Last-Modified' in response.headers or b'ETag' in response.headers
        # Any other is probably not eligible for caching
        # Makes no sense to cache responses that does not contain expiration
        # info and can not be revalidated
        else:
            return False

    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if b'no-cache' in cc or b'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if b'max-stale' in ccreq and b'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq[b'max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if b'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if b'Last-Modified' in cachedresponse.headers:
            request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']

        if b'ETag' in cachedresponse.headers:
            request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']

    def _get_max_age(self, cc):
        try:
            return max(0, int(cc[b'max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
        cc = self._parse_cachecontrol(response)
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now

        # Try HTTP/1.0 Expires header
        if b'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers[b'Expires'])
            # When parsing Expires header fails RFC 2616 section 14.21 says we
            # should treat this as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fallback to heuristic using last-modified header
        # This is not in RFC but on Firefox caching implementation
        lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # This request can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute freshness lifetime
        return 0

    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
        currentage = 0
        # If Date header is not set we assume it is a fast connection, and
        # clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
        if now > date:
            currentage = now - date

        if b'Age' in response.headers:
            try:
                age = int(response.headers[b'Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage


class DbmCacheStorage:

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
        self.db = self.dbmodule.open(dbpath, 'c')

        logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})

    def close_spider(self, spider):
        self.db.close()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
        self.db[f'{key}_time'] = str(time())

    def _read_data(self, spider, request):
        key = self._request_key(request)
        db = self.db
        tkey = f'{key}_time'
        if tkey not in db:
            return  # not found

        ts = db[tkey]
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired

        return pickle.loads(db[f'{key}_data'])

    def _request_key(self, request):
        return request_fingerprint(request)


class FilesystemCacheStorage:

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
                     extra={'spider': spider})

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=4)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(metapath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)


def parse_cachecontrol(header):
    """Parse Cache-Control header

    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9

    >>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
    ...                                                 b'max-age': b'3600'}
    True
    >>> parse_cachecontrol(b'') == {}
    True

    """
    directives = {}
    for directive in header.split(b','):
        key, sep, val = directive.strip().partition(b'=')
        if key:
            directives[key.lower()] = val if sep else None
    return directives


def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
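
The policy and storage classes above are wired in through project settings. A hedged configuration sketch (setting names taken from the code above and from Scrapy's documented HTTPCACHE_* options; values are illustrative only):

HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0   # 0 means cached entries never expire
HTTPCACHE_GZIP = True           # store cache files gzip-compressed
HTTPCACHE_DIR = 'httpcache'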

@@ -0,0 +1,52 @@
import logging

from twisted.internet import task

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class LogStats:
    """Log basic scraping stats periodically"""

    def __init__(self, stats, interval=60.0):
        self.stats = stats
        self.interval = interval
        self.multiplier = 60.0 / self.interval
        self.task = None

    @classmethod
    def from_crawler(cls, crawler):
        interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
        if not interval:
            raise NotConfigured
        o = cls(crawler.stats, interval)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        self.pagesprev = 0
        self.itemsprev = 0

        self.task = task.LoopingCall(self.log, spider)
        self.task.start(self.interval)

    def log(self, spider):
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
        self.pagesprev, self.itemsprev = pages, items

        msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
               "scraped %(items)d items (at %(itemrate)d items/min)")
        log_args = {'pages': pages, 'pagerate': prate,
                    'items': items, 'itemrate': irate}
        logger.info(msg, log_args, extra={'spider': spider})

    def spider_closed(self, spider, reason):
        if self.task and self.task.running:
            self.task.stop()

@@ -0,0 +1,33 @@
"""
MemoryDebugger extension

See documentation in docs/topics/extensions.rst
"""

import gc

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs


class MemoryDebugger:

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
            raise NotConfigured
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider, reason):
        gc.collect()
        self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
        for cls, wdict in live_refs.items():
            if not wdict:
                continue
            self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)

venv/lib/python3.9/site-packages/scrapy/extensions/memusage.py (new file, 126 lines added)
@@ -0,0 +1,126 @@
"""
MemoryUsage extension

See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module

from twisted.internet import task

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status

logger = logging.getLogger(__name__)


class MemoryUsage:

    def __init__(self, crawler):
        if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
            raise NotConfigured
        try:
            # stdlib's resource module is only available on unix platforms.
            self.resource = import_module('resource')
        except ImportError:
            raise NotConfigured

        self.crawler = crawler
        self.warned = False
        self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
        self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
        self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
        self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
        self.mail = MailSender.from_settings(crawler.settings)
        crawler.signals.connect(self.engine_started, signal=signals.engine_started)
        crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def get_virtual_size(self):
        size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
        if sys.platform != 'darwin':
            # on macOS ru_maxrss is in bytes, on Linux it is in KB
            size *= 1024
        return size

    def engine_started(self):
        self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
        self.tasks = []
        tsk = task.LoopingCall(self.update)
        self.tasks.append(tsk)
        tsk.start(self.check_interval, now=True)
        if self.limit:
            tsk = task.LoopingCall(self._check_limit)
            self.tasks.append(tsk)
            tsk.start(self.check_interval, now=True)
        if self.warning:
            tsk = task.LoopingCall(self._check_warning)
            self.tasks.append(tsk)
            tsk.start(self.check_interval, now=True)

    def engine_stopped(self):
        for tsk in self.tasks:
            if tsk.running:
                tsk.stop()

    def update(self):
        self.crawler.stats.max_value('memusage/max', self.get_virtual_size())

    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            mem = self.limit/1024/1024
            logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
                         {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} terminated: "
                    f"memory usage exceeded {mem}M at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)

            open_spiders = self.crawler.engine.open_spiders
            if open_spiders:
                for spider in open_spiders:
                    self.crawler.engine.close_spider(spider, 'memusage_exceeded')
            else:
                self.crawler.stop()

    def _check_warning(self):
        if self.warned:  # warn only once
            return
        if self.get_virtual_size() > self.warning:
            self.crawler.stats.set_value('memusage/warning_reached', 1)
            mem = self.warning/1024/1024
            logger.warning("Memory usage reached %(memusage)dM",
                           {'memusage': mem}, extra={'crawler': self.crawler})
            if self.notify_mails:
                subj = (
                    f"{self.crawler.settings['BOT_NAME']} warning: "
                    f"memory usage reached {mem}M at {socket.gethostname()}"
                )
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/warning_notified', 1)
            self.warned = True

    def _send_report(self, rcpts, subject):
        """send notification mail with some additional useful info"""
        stats = self.crawler.stats
        s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
        s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
        s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"

        s += "ENGINE STATUS ------------------------------------------------------- \r\n"
        s += "\r\n"
        s += pformat(get_engine_status(self.crawler.engine))
        s += "\r\n"
        self.mail.send(rcpts, subject, s)

@@ -0,0 +1,40 @@
import os
import pickle

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir


class SpiderState:
    """Store and load spider state during a scraping job"""

    def __init__(self, jobdir=None):
        self.jobdir = jobdir

    @classmethod
    def from_crawler(cls, crawler):
        jobdir = job_dir(crawler.settings)
        if not jobdir:
            raise NotConfigured

        obj = cls(jobdir)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        return obj

    def spider_closed(self, spider):
        if self.jobdir:
            with open(self.statefn, 'wb') as f:
                pickle.dump(spider.state, f, protocol=4)

    def spider_opened(self, spider):
        if self.jobdir and os.path.exists(self.statefn):
            with open(self.statefn, 'rb') as f:
                spider.state = pickle.load(f)
        else:
            spider.state = {}

    @property
    def statefn(self):
        return os.path.join(self.jobdir, 'spider.state')

@@ -0,0 +1,34 @@
"""
StatsMailer extension sends an email when a spider finishes scraping.

Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
"""

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured


class StatsMailer:

    def __init__(self, stats, recipients, mail):
        self.stats = stats
        self.recipients = recipients
        self.mail = mail

    @classmethod
    def from_crawler(cls, crawler):
        recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
        if not recipients:
            raise NotConfigured
        mail = MailSender.from_settings(crawler.settings)
        o = cls(crawler.stats, recipients, mail)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_closed(self, spider):
        spider_stats = self.stats.get_stats(spider)
        body = "Global stats\n\n"
        body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
        body += f"\n\n{spider.name} stats\n\n"
        body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
        return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)

venv/lib/python3.9/site-packages/scrapy/extensions/telnet.py (new file, 114 lines added)
@@ -0,0 +1,114 @@
"""
Scrapy Telnet Console extension

See documentation in docs/topics/telnetconsole.rst
"""

import pprint
import logging
import traceback
import binascii
import os

from twisted.internet import protocol
try:
    from twisted.conch import manhole, telnet
    from twisted.conch.insults import insults
    TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
    _TWISTED_CONCH_TRACEBACK = traceback.format_exc()
    TWISTED_CONCH_AVAILABLE = False

from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.trackref import print_live_refs
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.decorators import defers


logger = logging.getLogger(__name__)

# signal to update telnet variables
# args: telnet_vars
update_telnet_vars = object()


class TelnetConsole(protocol.ServerFactory):

    def __init__(self, crawler):
        if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                'TELNETCONSOLE_ENABLED setting is True but required twisted '
                'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
        self.crawler = crawler
        self.noisy = False
        self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
        self.host = crawler.settings['TELNETCONSOLE_HOST']
        self.username = crawler.settings['TELNETCONSOLE_USERNAME']
        self.password = crawler.settings['TELNETCONSOLE_PASSWORD']

        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
            logger.info('Telnet Password: %s', self.password)

        self.crawler.signals.connect(self.start_listening, signals.engine_started)
        self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_listening(self):
        self.port = listen_tcp(self.portrange, self.host, self)
        h = self.port.getHost()
        logger.info("Telnet console listening on %(host)s:%(port)d",
                    {'host': h.host, 'port': h.port},
                    extra={'crawler': self.crawler})

    def stop_listening(self):
        self.port.stopListening()

    def protocol(self):
        class Portal:
            """An implementation of IPortal"""
            @defers
            def login(self_, credentials, mind, *interfaces):
                if not (
                    credentials.username == self.username.encode('utf8')
                    and credentials.checkPassword(self.password.encode('utf8'))
                ):
                    raise ValueError("Invalid credentials")

                protocol = telnet.TelnetBootstrapProtocol(
                    insults.ServerProtocol,
                    manhole.Manhole,
                    self._get_telnet_vars()
                )
                return (interfaces[0], protocol, lambda: None)

        return telnet.TelnetTransport(
            telnet.AuthenticatingTelnetProtocol,
            Portal()
        )

    def _get_telnet_vars(self):
        # Note: if you add entries here also update topics/telnetconsole.rst
        telnet_vars = {
            'engine': self.crawler.engine,
            'spider': self.crawler.engine.spider,
            'slot': self.crawler.engine.slot,
            'crawler': self.crawler,
            'extensions': self.crawler.extensions,
            'stats': self.crawler.stats,
            'settings': self.crawler.settings,
            'est': lambda: print_engine_status(self.crawler.engine),
            'p': pprint.pprint,
            'prefs': print_live_refs,
            'help': "This is Scrapy telnet console. For more info see: "
                    "https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
        }
        self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
        return telnet_vars
|
|
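The console fires the `update_telnet_vars` signal after building its default variables, so other extensions can add their own entries. A minimal sketch of such an extension (the class name and the extra variable are illustrative, not part of this commit):

from scrapy import signals
from scrapy.extensions.telnet import update_telnet_vars


class TelnetVarsExtension:
    """Hypothetical extension adding an extra variable to the telnet console."""

    def __init__(self, crawler):
        crawler.signals.connect(self.add_vars, signal=update_telnet_vars)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def add_vars(self, telnet_vars):
        # called from _get_telnet_vars() each time a telnet client logs in
        telnet_vars['hello'] = "available inside the telnet session as `hello`"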
@ -0,0 +1,93 @@
import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle:

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""

        # If a server needs `latency` seconds to respond then
        # we should send a request each `latency/N` seconds
        # to have N requests processed in parallel
        target_delay = latency / self.target_concurrency

        # Adjust the delay to make it closer to target_delay
        new_delay = (slot.delay + target_delay) / 2.0

        # If target delay is bigger than old delay, then use it instead of mean.
        # It works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust delay if response status != 200 and new delay is smaller
        # than old one, as error pages (and redirections) are usually small and
        # so tend to reduce latency, thus provoking a positive feedback by
        # reducing delay instead of increasing it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
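The adjustment in `_adjust_delay` is a moving average towards `latency / target_concurrency`, clamped to the configured bounds. A small standalone sketch of the same arithmetic (the numbers are made up for illustration):

def next_delay(current_delay, latency, target_concurrency, min_delay, max_delay):
    # a server answering in `latency` seconds sustains `target_concurrency`
    # parallel requests if we send one every latency / target_concurrency seconds
    target_delay = latency / target_concurrency
    new_delay = (current_delay + target_delay) / 2.0   # move halfway towards the target
    new_delay = max(target_delay, new_delay)           # never undershoot the target
    return min(max(min_delay, new_delay), max_delay)   # clamp to [min_delay, max_delay]


# e.g. with AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 and a 0.8 s response:
# next_delay(5.0, 0.8, 1.0, 0.25, 60.0) == 2.9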
18
venv/lib/python3.9/site-packages/scrapy/http/__init__.py
Normal file
@ -0,0 +1,18 @@
"""
Module containing all HTTP related classes

Use this module (instead of the more specific ones) when importing Headers,
Request and Response outside this module.
"""

from scrapy.http.headers import Headers

from scrapy.http.request import Request
from scrapy.http.request.form import FormRequest
from scrapy.http.request.rpc import XmlRpcRequest
from scrapy.http.request.json_request import JsonRequest

from scrapy.http.response import Response
from scrapy.http.response.html import HtmlResponse
from scrapy.http.response.xml import XmlResponse
from scrapy.http.response.text import TextResponse
6
venv/lib/python3.9/site-packages/scrapy/http/common.py
Normal file
@ -0,0 +1,6 @@
def obsolete_setter(setter, attrname):
    def newsetter(self, value):
        c = self.__class__.__name__
        msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
        raise AttributeError(msg)
    return newsetter
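`obsolete_setter` is what makes `url` and `body` read-only properties on Request and Response: assigning to them raises instead of mutating. A quick sketch of the resulting behaviour (the class here is illustrative):

from scrapy.http.common import obsolete_setter


class Thing:
    def _get_url(self):
        return self._url

    def _set_url(self, value):
        self._url = value

    url = property(_get_url, obsolete_setter(_set_url, 'url'))


t = Thing()
t._set_url('https://example.com')
try:
    t.url = 'https://other.example'
except AttributeError as exc:
    print(exc)   # Thing.url is not modifiable, use Thing.replace() instead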
191
venv/lib/python3.9/site-packages/scrapy/http/cookies.py
Normal file
@ -0,0 +1,191 @@
|
|||
import time
|
||||
from http.cookiejar import CookieJar as _CookieJar, DefaultCookiePolicy, IPV4_RE
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
|
||||
class CookieJar:
|
||||
def __init__(self, policy=None, check_expired_frequency=10000):
|
||||
self.policy = policy or DefaultCookiePolicy()
|
||||
self.jar = _CookieJar(self.policy)
|
||||
self.jar._cookies_lock = _DummyLock()
|
||||
self.check_expired_frequency = check_expired_frequency
|
||||
self.processed = 0
|
||||
|
||||
def extract_cookies(self, response, request):
|
||||
wreq = WrappedRequest(request)
|
||||
wrsp = WrappedResponse(response)
|
||||
return self.jar.extract_cookies(wrsp, wreq)
|
||||
|
||||
def add_cookie_header(self, request):
|
||||
wreq = WrappedRequest(request)
|
||||
self.policy._now = self.jar._now = int(time.time())
|
||||
|
||||
# the cookiejar implementation iterates through all domains
|
||||
# instead we restrict to potential matches on the domain
|
||||
req_host = urlparse_cached(request).hostname
|
||||
if not req_host:
|
||||
return
|
||||
|
||||
if not IPV4_RE.search(req_host):
|
||||
hosts = potential_domain_matches(req_host)
|
||||
if '.' not in req_host:
|
||||
hosts += [req_host + ".local"]
|
||||
else:
|
||||
hosts = [req_host]
|
||||
|
||||
cookies = []
|
||||
for host in hosts:
|
||||
if host in self.jar._cookies:
|
||||
cookies += self.jar._cookies_for_domain(host, wreq)
|
||||
|
||||
attrs = self.jar._cookie_attrs(cookies)
|
||||
if attrs:
|
||||
if not wreq.has_header("Cookie"):
|
||||
wreq.add_unredirected_header("Cookie", "; ".join(attrs))
|
||||
|
||||
self.processed += 1
|
||||
if self.processed % self.check_expired_frequency == 0:
|
||||
# This is still quite inefficient for large number of cookies
|
||||
self.jar.clear_expired_cookies()
|
||||
|
||||
@property
|
||||
def _cookies(self):
|
||||
return self.jar._cookies
|
||||
|
||||
def clear_session_cookies(self, *args, **kwargs):
|
||||
return self.jar.clear_session_cookies(*args, **kwargs)
|
||||
|
||||
def clear(self, domain=None, path=None, name=None):
|
||||
return self.jar.clear(domain, path, name)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.jar)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.jar)
|
||||
|
||||
def set_policy(self, pol):
|
||||
return self.jar.set_policy(pol)
|
||||
|
||||
def make_cookies(self, response, request):
|
||||
wreq = WrappedRequest(request)
|
||||
wrsp = WrappedResponse(response)
|
||||
return self.jar.make_cookies(wrsp, wreq)
|
||||
|
||||
def set_cookie(self, cookie):
|
||||
self.jar.set_cookie(cookie)
|
||||
|
||||
def set_cookie_if_ok(self, cookie, request):
|
||||
self.jar.set_cookie_if_ok(cookie, WrappedRequest(request))
|
||||
|
||||
|
||||
def potential_domain_matches(domain):
|
||||
"""Potential domain matches for a cookie
|
||||
|
||||
>>> potential_domain_matches('www.example.com')
|
||||
['www.example.com', 'example.com', '.www.example.com', '.example.com']
|
||||
|
||||
"""
|
||||
matches = [domain]
|
||||
try:
|
||||
start = domain.index('.') + 1
|
||||
end = domain.rindex('.')
|
||||
while start < end:
|
||||
matches.append(domain[start:])
|
||||
start = domain.index('.', start) + 1
|
||||
except ValueError:
|
||||
pass
|
||||
return matches + ['.' + d for d in matches]
|
||||
|
||||
|
||||
class _DummyLock:
|
||||
def acquire(self):
|
||||
pass
|
||||
|
||||
def release(self):
|
||||
pass
|
||||
|
||||
|
||||
class WrappedRequest:
|
||||
"""Wraps a scrapy Request class with methods defined by urllib2.Request class to interact with CookieJar class
|
||||
|
||||
see http://docs.python.org/library/urllib2.html#urllib2.Request
|
||||
"""
|
||||
|
||||
def __init__(self, request):
|
||||
self.request = request
|
||||
|
||||
def get_full_url(self):
|
||||
return self.request.url
|
||||
|
||||
def get_host(self):
|
||||
return urlparse_cached(self.request).netloc
|
||||
|
||||
def get_type(self):
|
||||
return urlparse_cached(self.request).scheme
|
||||
|
||||
def is_unverifiable(self):
|
||||
"""Unverifiable should indicate whether the request is unverifiable, as defined by RFC 2965.
|
||||
|
||||
It defaults to False. An unverifiable request is one whose URL the user did not have the
|
||||
option to approve. For example, if the request is for an image in an
|
||||
HTML document, and the user had no option to approve the automatic
|
||||
fetching of the image, this should be true.
|
||||
"""
|
||||
return self.request.meta.get('is_unverifiable', False)
|
||||
|
||||
def get_origin_req_host(self):
|
||||
return urlparse_cached(self.request).hostname
|
||||
|
||||
# python3 uses attributes instead of methods
|
||||
@property
|
||||
def full_url(self):
|
||||
return self.get_full_url()
|
||||
|
||||
@property
|
||||
def host(self):
|
||||
return self.get_host()
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self.get_type()
|
||||
|
||||
@property
|
||||
def unverifiable(self):
|
||||
return self.is_unverifiable()
|
||||
|
||||
@property
|
||||
def origin_req_host(self):
|
||||
return self.get_origin_req_host()
|
||||
|
||||
def has_header(self, name):
|
||||
return name in self.request.headers
|
||||
|
||||
def get_header(self, name, default=None):
|
||||
return to_unicode(self.request.headers.get(name, default),
|
||||
errors='replace')
|
||||
|
||||
def header_items(self):
|
||||
return [
|
||||
(to_unicode(k, errors='replace'),
|
||||
[to_unicode(x, errors='replace') for x in v])
|
||||
for k, v in self.request.headers.items()
|
||||
]
|
||||
|
||||
def add_unredirected_header(self, name, value):
|
||||
self.request.headers.appendlist(name, value)
|
||||
|
||||
|
||||
class WrappedResponse:
|
||||
|
||||
def __init__(self, response):
|
||||
self.response = response
|
||||
|
||||
def info(self):
|
||||
return self
|
||||
|
||||
def get_all(self, name, default=None):
|
||||
return [to_unicode(v, errors='replace')
|
||||
for v in self.response.headers.getlist(name)]
|
||||
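A minimal sketch of how this jar is used by the cookies middleware: extract cookies from a response, then re-attach them to a later request for the same domain (URLs and cookie values are placeholders):

from scrapy.http import Request, Response
from scrapy.http.cookies import CookieJar, potential_domain_matches

jar = CookieJar()
req = Request('https://www.example.com/login')
resp = Response('https://www.example.com/login', request=req,
                headers={'Set-Cookie': 'session=abc123; Path=/'})
jar.extract_cookies(resp, req)      # stores the session cookie

follow_up = Request('https://www.example.com/profile')
jar.add_cookie_header(follow_up)    # adds "Cookie: session=abc123"

# domain candidates checked on lookup, per the docstring above:
print(potential_domain_matches('www.example.com'))
# ['www.example.com', 'example.com', '.www.example.com', '.example.com']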
89
venv/lib/python3.9/site-packages/scrapy/http/headers.py
Normal file
@ -0,0 +1,89 @@
from w3lib.http import headers_dict_to_raw
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.python import to_unicode


class Headers(CaselessDict):
    """Case insensitive http headers dictionary"""

    def __init__(self, seq=None, encoding='utf-8'):
        self.encoding = encoding
        super().__init__(seq)

    def normkey(self, key):
        """Normalize key to bytes"""
        return self._tobytes(key.title())

    def normvalue(self, value):
        """Normalize values to bytes"""
        if value is None:
            value = []
        elif isinstance(value, (str, bytes)):
            value = [value]
        elif not hasattr(value, '__iter__'):
            value = [value]

        return [self._tobytes(x) for x in value]

    def _tobytes(self, x):
        if isinstance(x, bytes):
            return x
        elif isinstance(x, str):
            return x.encode(self.encoding)
        elif isinstance(x, int):
            return str(x).encode(self.encoding)
        else:
            raise TypeError(f'Unsupported value type: {type(x)}')

    def __getitem__(self, key):
        try:
            return super().__getitem__(key)[-1]
        except IndexError:
            return None

    def get(self, key, def_val=None):
        try:
            return super().get(key, def_val)[-1]
        except IndexError:
            return None

    def getlist(self, key, def_val=None):
        try:
            return super().__getitem__(key)
        except KeyError:
            if def_val is not None:
                return self.normvalue(def_val)
            return []

    def setlist(self, key, list_):
        self[key] = list_

    def setlistdefault(self, key, default_list=()):
        return self.setdefault(key, default_list)

    def appendlist(self, key, value):
        lst = self.getlist(key)
        lst.extend(self.normvalue(value))
        self[key] = lst

    def items(self):
        return ((k, self.getlist(k)) for k in self.keys())

    def values(self):
        return [self[k] for k in self.keys()]

    def to_string(self):
        return headers_dict_to_raw(self)

    def to_unicode_dict(self):
        """ Return headers as a CaselessDict with unicode keys
        and unicode values. Multiple values are joined with ','.
        """
        return CaselessDict(
            (to_unicode(key, encoding=self.encoding),
             to_unicode(b','.join(value), encoding=self.encoding))
            for key, value in self.items())

    def __copy__(self):
        return self.__class__(self)
    copy = __copy__
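Keys and values are normalized to bytes (keys additionally title-cased), `__getitem__` returns only the last value and `getlist` the full list. A short illustration of that behaviour:

from scrapy.http import Headers

h = Headers({'content-type': 'text/html'}, encoding='utf-8')
h['Content-Type']                  # b'text/html'  (key stored as b'Content-Type')
h.appendlist('Set-Cookie', 'a=1')
h.appendlist('set-cookie', 'b=2')  # same normalized key
h.getlist('Set-Cookie')            # [b'a=1', b'b=2']
h['Set-Cookie']                    # b'b=2'  (only the last value)
h.to_unicode_dict()['set-cookie']  # 'a=1,b=2'  (values joined with ',')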
143
venv/lib/python3.9/site-packages/scrapy/http/request/__init__.py
Normal file
@ -0,0 +1,143 @@
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.

See documentation in docs/topics/request-response.rst
"""
from w3lib.url import safe_url_string

from scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.http.common import obsolete_setter
from scrapy.utils.curl import curl_to_request_kwargs


class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, cb_kwargs=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        if not isinstance(priority, int):
            raise TypeError(f"Request priority not an integer: {priority!r}")
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
        if errback is not None and not callable(errback):
            raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)

    @property
    def cb_kwargs(self):
        if self._cb_kwargs is None:
            self._cb_kwargs = {}
        return self._cb_kwargs

    @property
    def meta(self):
        if self._meta is None:
            self._meta = {}
        return self._meta

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        if not isinstance(url, str):
            raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')

        s = safe_url_string(url, self.encoding)
        self._url = escape_ajax(s)

        if (
            '://' not in self._url
            and not self._url.startswith('about:')
            and not self._url.startswith('data:')
        ):
            raise ValueError(f'Missing scheme in request url: {self._url}')

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        if body is None:
            self._body = b''
        else:
            self._body = to_bytes(body, self.encoding)

    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    @property
    def encoding(self):
        return self._encoding

    def __str__(self):
        return f"<{self.method} {self.url}>"

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Request"""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    @classmethod
    def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs):
        """Create a Request object from a string containing a `cURL
        <https://curl.haxx.se/>`_ command. It populates the HTTP method, the
        URL, the headers, the cookies and the body. It accepts the same
        arguments as the :class:`Request` class, taking preference and
        overriding the values of the same arguments contained in the cURL
        command.

        Unrecognized options are ignored by default. To raise an error when
        finding unknown options call this method by passing
        ``ignore_unknown_options=False``.

        .. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
                     subclasses, such as :class:`~scrapy.http.JSONRequest`, or
                     :class:`~scrapy.http.XmlRpcRequest`, as well as having
                     :ref:`downloader middlewares <topics-downloader-middleware>`
                     and
                     :ref:`spider middlewares <topics-spider-middleware>`
                     enabled, such as
                     :class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
                     :class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
                     or
                     :class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
                     may modify the :class:`~scrapy.http.Request` object.

        To translate a cURL command into a Scrapy request,
        you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.

        """
        request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
        request_kwargs.update(kwargs)
        return cls(**request_kwargs)
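A short sketch of the resulting API: `url` and `body` are read-only (via `obsolete_setter` above), `replace()` builds a modified copy, and `from_curl` delegates parsing to `scrapy.utils.curl` (the URLs below are placeholders):

from scrapy import Request

req = Request('https://example.com/items', meta={'page': 1})
req.method                     # 'GET'
req.body                       # b''

post = req.replace(method='POST', body='a=1')
post.method, post.body         # ('POST', b'a=1')

# req.url = 'https://other.example'  ->  AttributeError, use replace() instead

curl_req = Request.from_curl("curl 'https://example.com/api' -H 'Accept: application/json'")
curl_req.headers[b'Accept']    # b'application/json'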
215
venv/lib/python3.9/site-packages/scrapy/http/request/form.py
Normal file
@ -0,0 +1,215 @@
|
|||
"""
|
||||
This module implements the FormRequest class which is a more convenient class
|
||||
(than Request) to generate Requests based on form data.
|
||||
|
||||
See documentation in docs/topics/request-response.rst
|
||||
"""
|
||||
|
||||
from urllib.parse import urljoin, urlencode
|
||||
|
||||
import lxml.html
|
||||
from parsel.selector import create_root_node
|
||||
from w3lib.html import strip_html5_whitespace
|
||||
|
||||
from scrapy.http.request import Request
|
||||
from scrapy.utils.python import to_bytes, is_listlike
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
|
||||
class FormRequest(Request):
|
||||
valid_form_methods = ['GET', 'POST']
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
formdata = kwargs.pop('formdata', None)
|
||||
if formdata and kwargs.get('method') is None:
|
||||
kwargs['method'] = 'POST'
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if formdata:
|
||||
items = formdata.items() if isinstance(formdata, dict) else formdata
|
||||
querystr = _urlencode(items, self.encoding)
|
||||
if self.method == 'POST':
|
||||
self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
|
||||
self._set_body(querystr)
|
||||
else:
|
||||
self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
|
||||
|
||||
@classmethod
|
||||
def from_response(cls, response, formname=None, formid=None, formnumber=0, formdata=None,
|
||||
clickdata=None, dont_click=False, formxpath=None, formcss=None, **kwargs):
|
||||
|
||||
kwargs.setdefault('encoding', response.encoding)
|
||||
|
||||
if formcss is not None:
|
||||
from parsel.csstranslator import HTMLTranslator
|
||||
formxpath = HTMLTranslator().css_to_xpath(formcss)
|
||||
|
||||
form = _get_form(response, formname, formid, formnumber, formxpath)
|
||||
formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
|
||||
url = _get_form_url(form, kwargs.pop('url', None))
|
||||
|
||||
method = kwargs.pop('method', form.method)
|
||||
if method is not None:
|
||||
method = method.upper()
|
||||
if method not in cls.valid_form_methods:
|
||||
method = 'GET'
|
||||
|
||||
return cls(url=url, method=method, formdata=formdata, **kwargs)
|
||||
|
||||
|
||||
def _get_form_url(form, url):
|
||||
if url is None:
|
||||
action = form.get('action')
|
||||
if action is None:
|
||||
return form.base_url
|
||||
return urljoin(form.base_url, strip_html5_whitespace(action))
|
||||
return urljoin(form.base_url, url)
|
||||
|
||||
|
||||
def _urlencode(seq, enc):
|
||||
values = [(to_bytes(k, enc), to_bytes(v, enc))
|
||||
for k, vs in seq
|
||||
for v in (vs if is_listlike(vs) else [vs])]
|
||||
return urlencode(values, doseq=1)
|
||||
|
||||
|
||||
def _get_form(response, formname, formid, formnumber, formxpath):
|
||||
"""Find the form element """
|
||||
root = create_root_node(response.text, lxml.html.HTMLParser,
|
||||
base_url=get_base_url(response))
|
||||
forms = root.xpath('//form')
|
||||
if not forms:
|
||||
raise ValueError(f"No <form> element found in {response}")
|
||||
|
||||
if formname is not None:
|
||||
f = root.xpath(f'//form[@name="{formname}"]')
|
||||
if f:
|
||||
return f[0]
|
||||
|
||||
if formid is not None:
|
||||
f = root.xpath(f'//form[@id="{formid}"]')
|
||||
if f:
|
||||
return f[0]
|
||||
|
||||
# Get form element from xpath, if not found, go up
|
||||
if formxpath is not None:
|
||||
nodes = root.xpath(formxpath)
|
||||
if nodes:
|
||||
el = nodes[0]
|
||||
while True:
|
||||
if el.tag == 'form':
|
||||
return el
|
||||
el = el.getparent()
|
||||
if el is None:
|
||||
break
|
||||
raise ValueError(f'No <form> element found with {formxpath}')
|
||||
|
||||
# If we get here, it means that either formname was None
|
||||
# or invalid
|
||||
if formnumber is not None:
|
||||
try:
|
||||
form = forms[formnumber]
|
||||
except IndexError:
|
||||
raise IndexError(f"Form number {formnumber} not found in {response}")
|
||||
else:
|
||||
return form
|
||||
|
||||
|
||||
def _get_inputs(form, formdata, dont_click, clickdata, response):
|
||||
try:
|
||||
formdata_keys = dict(formdata or ()).keys()
|
||||
except (ValueError, TypeError):
|
||||
raise ValueError('formdata should be a dict or iterable of tuples')
|
||||
|
||||
if not formdata:
|
||||
formdata = ()
|
||||
inputs = form.xpath('descendant::textarea'
|
||||
'|descendant::select'
|
||||
'|descendant::input[not(@type) or @type['
|
||||
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
|
||||
' and (../@checked or'
|
||||
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
|
||||
namespaces={
|
||||
"re": "http://exslt.org/regular-expressions"})
|
||||
values = [(k, '' if v is None else v)
|
||||
for k, v in (_value(e) for e in inputs)
|
||||
if k and k not in formdata_keys]
|
||||
|
||||
if not dont_click:
|
||||
clickable = _get_clickable(clickdata, form)
|
||||
if clickable and clickable[0] not in formdata and not clickable[0] is None:
|
||||
values.append(clickable)
|
||||
|
||||
if isinstance(formdata, dict):
|
||||
formdata = formdata.items()
|
||||
|
||||
values.extend((k, v) for k, v in formdata if v is not None)
|
||||
return values
|
||||
|
||||
|
||||
def _value(ele):
|
||||
n = ele.name
|
||||
v = ele.value
|
||||
if ele.tag == 'select':
|
||||
return _select_value(ele, n, v)
|
||||
return n, v
|
||||
|
||||
|
||||
def _select_value(ele, n, v):
|
||||
multiple = ele.multiple
|
||||
if v is None and not multiple:
|
||||
# Match browser behaviour on simple select tag without options selected
|
||||
# And for select tags wihout options
|
||||
o = ele.value_options
|
||||
return (n, o[0]) if o else (None, None)
|
||||
elif v is not None and multiple:
|
||||
# This is a workround to bug in lxml fixed 2.3.1
|
||||
# fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
|
||||
selected_options = ele.xpath('.//option[@selected]')
|
||||
v = [(o.get('value') or o.text or '').strip() for o in selected_options]
|
||||
return n, v
|
||||
|
||||
|
||||
def _get_clickable(clickdata, form):
|
||||
"""
|
||||
Returns the clickable element specified in clickdata,
|
||||
if the latter is given. If not, it returns the first
|
||||
clickable element found
|
||||
"""
|
||||
clickables = list(form.xpath(
|
||||
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
|
||||
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
|
||||
namespaces={"re": "http://exslt.org/regular-expressions"}
|
||||
))
|
||||
if not clickables:
|
||||
return
|
||||
|
||||
# If we don't have clickdata, we just use the first clickable element
|
||||
if clickdata is None:
|
||||
el = clickables[0]
|
||||
return (el.get('name'), el.get('value') or '')
|
||||
|
||||
# If clickdata is given, we compare it to the clickable elements to find a
|
||||
# match. We first look to see if the number is specified in clickdata,
|
||||
# because that uniquely identifies the element
|
||||
nr = clickdata.get('nr', None)
|
||||
if nr is not None:
|
||||
try:
|
||||
el = list(form.inputs)[nr]
|
||||
except IndexError:
|
||||
pass
|
||||
else:
|
||||
return (el.get('name'), el.get('value') or '')
|
||||
|
||||
# We didn't find it, so now we build an XPath expression out of the other
|
||||
# arguments, because they can be used as such
|
||||
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
|
||||
el = form.xpath(xpath)
|
||||
if len(el) == 1:
|
||||
return (el[0].get('name'), el[0].get('value') or '')
|
||||
elif len(el) > 1:
|
||||
raise ValueError(f"Multiple elements found ({el!r}) matching the "
|
||||
f"criteria in clickdata: {clickdata!r}")
|
||||
else:
|
||||
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
|
||||
|
|
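In practice FormRequest is used in two ways: passing `formdata` directly, or letting `from_response` pre-fill the fields of a form found in a page. A small sketch of both (URLs, field names and values are placeholders):

from scrapy.http import FormRequest, HtmlResponse

# formdata with no explicit method defaults to POST and urlencodes the body
req = FormRequest('https://example.com/search', formdata={'q': 'scrapy'})
req.method                          # 'POST'
req.body                            # b'q=scrapy'
req.headers[b'Content-Type']        # b'application/x-www-form-urlencoded'

# from_response() picks up the form's action, method and existing inputs
page = HtmlResponse(
    url='https://example.com/login',
    body=b'<form action="/session" method="post">'
         b'<input type="hidden" name="csrf" value="token123">'
         b'<input type="text" name="user"></form>',
)
login = FormRequest.from_response(page, formdata={'user': 'jane'})
login.url                               # 'https://example.com/session'
sorted(login.body.decode().split('&'))  # ['csrf=token123', 'user=jane']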
@ -0,0 +1,57 @@
"""
This module implements the JsonRequest class which is a more convenient class
(than Request) to generate JSON Requests.

See documentation in docs/topics/request-response.rst
"""

import copy
import json
import warnings

from scrapy.http.request import Request
from scrapy.utils.deprecate import create_deprecated_class


class JsonRequest(Request):
    def __init__(self, *args, **kwargs):
        dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {}))
        dumps_kwargs.setdefault('sort_keys', True)
        self._dumps_kwargs = dumps_kwargs

        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        data_passed = data is not None

        if body_passed and data_passed:
            warnings.warn('Both body and data passed. data will be ignored')

        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)

            if 'method' not in kwargs:
                kwargs['method'] = 'POST'

        super().__init__(*args, **kwargs)
        self.headers.setdefault('Content-Type', 'application/json')
        self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')

    def replace(self, *args, **kwargs):
        body_passed = kwargs.get('body', None) is not None
        data = kwargs.pop('data', None)
        data_passed = data is not None

        if body_passed and data_passed:
            warnings.warn('Both body and data passed. data will be ignored')

        elif not body_passed and data_passed:
            kwargs['body'] = self._dumps(data)

        return super().replace(*args, **kwargs)

    def _dumps(self, data):
        """Convert to JSON """
        return json.dumps(data, **self._dumps_kwargs)


JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
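When `data` is given and `body` is not, the payload is serialized with `sort_keys=True` and the method defaults to POST; a quick sketch (URL and payload are placeholders):

from scrapy.http import JsonRequest

req = JsonRequest('https://example.com/api', data={'b': 2, 'a': 1})
req.method                       # 'POST'
req.body                         # b'{"a": 1, "b": 2}'
req.headers[b'Content-Type']     # b'application/json'

# replace() re-serializes a new payload the same way
req2 = req.replace(data={'a': 1, 'c': 3})
req2.body                        # b'{"a": 1, "c": 3}'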
35
venv/lib/python3.9/site-packages/scrapy/http/request/rpc.py
Normal file
@ -0,0 +1,35 @@
"""
This module implements the XmlRpcRequest class which is a more convenient class
(than Request) to generate xml-rpc requests.

See documentation in docs/topics/request-response.rst
"""
import xmlrpc.client as xmlrpclib

from scrapy.http.request import Request
from scrapy.utils.python import get_func_args


DUMPS_ARGS = get_func_args(xmlrpclib.dumps)


class XmlRpcRequest(Request):

    def __init__(self, *args, **kwargs):
        encoding = kwargs.get('encoding', None)
        if 'body' not in kwargs and 'params' in kwargs:
            kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
            kwargs['body'] = xmlrpclib.dumps(**kw)

        # spec defines that requests must use POST method
        kwargs.setdefault('method', 'POST')

        # xmlrpc queries multiple times over the same url
        kwargs.setdefault('dont_filter', True)

        # restore encoding
        if encoding is not None:
            kwargs['encoding'] = encoding

        super().__init__(*args, **kwargs)
        self.headers.setdefault('Content-Type', 'text/xml')
@ -0,0 +1,196 @@
|
|||
"""
|
||||
This module implements the Response class which is used to represent HTTP
|
||||
responses in Scrapy.
|
||||
|
||||
See documentation in docs/topics/request-response.rst
|
||||
"""
|
||||
from typing import Generator
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from scrapy.exceptions import NotSupported
|
||||
from scrapy.http.common import obsolete_setter
|
||||
from scrapy.http.headers import Headers
|
||||
from scrapy.http.request import Request
|
||||
from scrapy.link import Link
|
||||
from scrapy.utils.trackref import object_ref
|
||||
|
||||
|
||||
class Response(object_ref):
|
||||
|
||||
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
|
||||
request=None, certificate=None, ip_address=None):
|
||||
self.headers = Headers(headers or {})
|
||||
self.status = int(status)
|
||||
self._set_body(body)
|
||||
self._set_url(url)
|
||||
self.request = request
|
||||
self.flags = [] if flags is None else list(flags)
|
||||
self.certificate = certificate
|
||||
self.ip_address = ip_address
|
||||
|
||||
@property
|
||||
def cb_kwargs(self):
|
||||
try:
|
||||
return self.request.cb_kwargs
|
||||
except AttributeError:
|
||||
raise AttributeError(
|
||||
"Response.cb_kwargs not available, this response "
|
||||
"is not tied to any request"
|
||||
)
|
||||
|
||||
@property
|
||||
def meta(self):
|
||||
try:
|
||||
return self.request.meta
|
||||
except AttributeError:
|
||||
raise AttributeError(
|
||||
"Response.meta not available, this response "
|
||||
"is not tied to any request"
|
||||
)
|
||||
|
||||
def _get_url(self):
|
||||
return self._url
|
||||
|
||||
def _set_url(self, url):
|
||||
if isinstance(url, str):
|
||||
self._url = url
|
||||
else:
|
||||
raise TypeError(f'{type(self).__name__} url must be str, '
|
||||
f'got {type(url).__name__}')
|
||||
|
||||
url = property(_get_url, obsolete_setter(_set_url, 'url'))
|
||||
|
||||
def _get_body(self):
|
||||
return self._body
|
||||
|
||||
def _set_body(self, body):
|
||||
if body is None:
|
||||
self._body = b''
|
||||
elif not isinstance(body, bytes):
|
||||
raise TypeError(
|
||||
"Response body must be bytes. "
|
||||
"If you want to pass unicode body use TextResponse "
|
||||
"or HtmlResponse.")
|
||||
else:
|
||||
self._body = body
|
||||
|
||||
body = property(_get_body, obsolete_setter(_set_body, 'body'))
|
||||
|
||||
def __str__(self):
|
||||
return f"<{self.status} {self.url}>"
|
||||
|
||||
__repr__ = __str__
|
||||
|
||||
def copy(self):
|
||||
"""Return a copy of this Response"""
|
||||
return self.replace()
|
||||
|
||||
def replace(self, *args, **kwargs):
|
||||
"""Create a new Response with the same attributes except for those
|
||||
given new values.
|
||||
"""
|
||||
for x in ['url', 'status', 'headers', 'body',
|
||||
'request', 'flags', 'certificate', 'ip_address']:
|
||||
kwargs.setdefault(x, getattr(self, x))
|
||||
cls = kwargs.pop('cls', self.__class__)
|
||||
return cls(*args, **kwargs)
|
||||
|
||||
def urljoin(self, url):
|
||||
"""Join this Response's url with a possible relative url to form an
|
||||
absolute interpretation of the latter."""
|
||||
return urljoin(self.url, url)
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
"""For subclasses of TextResponse, this will return the body
|
||||
as str
|
||||
"""
|
||||
raise AttributeError("Response content isn't text")
|
||||
|
||||
def css(self, *a, **kw):
|
||||
"""Shortcut method implemented only by responses whose content
|
||||
is text (subclasses of TextResponse).
|
||||
"""
|
||||
raise NotSupported("Response content isn't text")
|
||||
|
||||
def xpath(self, *a, **kw):
|
||||
"""Shortcut method implemented only by responses whose content
|
||||
is text (subclasses of TextResponse).
|
||||
"""
|
||||
raise NotSupported("Response content isn't text")
|
||||
|
||||
def follow(self, url, callback=None, method='GET', headers=None, body=None,
|
||||
cookies=None, meta=None, encoding='utf-8', priority=0,
|
||||
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
|
||||
# type: (...) -> Request
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
It accepts the same arguments as ``Request.__init__`` method,
|
||||
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
|
||||
not only an absolute URL.
|
||||
|
||||
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
|
||||
method which supports selectors in addition to absolute/relative URLs
|
||||
and Link objects.
|
||||
|
||||
.. versionadded:: 2.0
|
||||
The *flags* parameter.
|
||||
"""
|
||||
if isinstance(url, Link):
|
||||
url = url.url
|
||||
elif url is None:
|
||||
raise ValueError("url can't be None")
|
||||
url = self.urljoin(url)
|
||||
|
||||
return Request(
|
||||
url=url,
|
||||
callback=callback,
|
||||
method=method,
|
||||
headers=headers,
|
||||
body=body,
|
||||
cookies=cookies,
|
||||
meta=meta,
|
||||
encoding=encoding,
|
||||
priority=priority,
|
||||
dont_filter=dont_filter,
|
||||
errback=errback,
|
||||
cb_kwargs=cb_kwargs,
|
||||
flags=flags,
|
||||
)
|
||||
|
||||
def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
|
||||
cookies=None, meta=None, encoding='utf-8', priority=0,
|
||||
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
|
||||
# type: (...) -> Generator[Request, None, None]
|
||||
"""
|
||||
.. versionadded:: 2.0
|
||||
|
||||
Return an iterable of :class:`~.Request` instances to follow all links
|
||||
in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
|
||||
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
|
||||
not only absolute URLs.
|
||||
|
||||
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow_all`
|
||||
method which supports selectors in addition to absolute/relative URLs
|
||||
and Link objects.
|
||||
"""
|
||||
if not hasattr(urls, '__iter__'):
|
||||
raise TypeError("'urls' argument must be an iterable")
|
||||
return (
|
||||
self.follow(
|
||||
url=url,
|
||||
callback=callback,
|
||||
method=method,
|
||||
headers=headers,
|
||||
body=body,
|
||||
cookies=cookies,
|
||||
meta=meta,
|
||||
encoding=encoding,
|
||||
priority=priority,
|
||||
dont_filter=dont_filter,
|
||||
errback=errback,
|
||||
cb_kwargs=cb_kwargs,
|
||||
flags=flags,
|
||||
)
|
||||
for url in urls
|
||||
)
|
||||
|
|
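A plain `Response` only knows about bytes; `urljoin` and `follow` resolve relative URLs against it, while `text`/`css`/`xpath` are deliberately unsupported until `TextResponse` (next files). A brief sketch (URLs are placeholders):

from scrapy.http import Request, Response

resp = Response('https://example.com/docs/', status=200, body=b'\x00\x01',
                request=Request('https://example.com/docs/'))
resp.urljoin('page2.html')        # 'https://example.com/docs/page2.html'

req = resp.follow('page2.html')   # a Request for the absolute URL above
req.url                           # 'https://example.com/docs/page2.html'

resp.meta                         # {} -- proxied from the attached request
# resp.css('a') would raise NotSupported: "Response content isn't text"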
@ -0,0 +1,12 @@
"""
This module implements the HtmlResponse class which adds encoding
discovering through HTML encoding declarations to the TextResponse class.

See documentation in docs/topics/request-response.rst
"""

from scrapy.http.response.text import TextResponse


class HtmlResponse(TextResponse):
    pass
265
venv/lib/python3.9/site-packages/scrapy/http/response/text.py
Normal file
@ -0,0 +1,265 @@
|
|||
"""
|
||||
This module implements the TextResponse class which adds encoding handling and
|
||||
discovering (through HTTP headers) to base Response class.
|
||||
|
||||
See documentation in docs/topics/request-response.rst
|
||||
"""
|
||||
|
||||
import json
|
||||
import warnings
|
||||
from contextlib import suppress
|
||||
from typing import Generator
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import parsel
|
||||
from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
|
||||
http_content_type_encoding, resolve_encoding)
|
||||
from w3lib.html import strip_html5_whitespace
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.http import Request
|
||||
from scrapy.http.response import Response
|
||||
from scrapy.utils.python import memoizemethod_noargs, to_unicode
|
||||
from scrapy.utils.response import get_base_url
|
||||
|
||||
_NONE = object()
|
||||
|
||||
|
||||
class TextResponse(Response):
|
||||
|
||||
_DEFAULT_ENCODING = 'ascii'
|
||||
_cached_decoded_json = _NONE
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._encoding = kwargs.pop('encoding', None)
|
||||
self._cached_benc = None
|
||||
self._cached_ubody = None
|
||||
self._cached_selector = None
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _set_url(self, url):
|
||||
if isinstance(url, str):
|
||||
self._url = to_unicode(url, self.encoding)
|
||||
else:
|
||||
super()._set_url(url)
|
||||
|
||||
def _set_body(self, body):
|
||||
self._body = b'' # used by encoding detection
|
||||
if isinstance(body, str):
|
||||
if self._encoding is None:
|
||||
raise TypeError('Cannot convert unicode body - '
|
||||
f'{type(self).__name__} has no encoding')
|
||||
self._body = body.encode(self._encoding)
|
||||
else:
|
||||
super()._set_body(body)
|
||||
|
||||
def replace(self, *args, **kwargs):
|
||||
kwargs.setdefault('encoding', self.encoding)
|
||||
return Response.replace(self, *args, **kwargs)
|
||||
|
||||
@property
|
||||
def encoding(self):
|
||||
return self._declared_encoding() or self._body_inferred_encoding()
|
||||
|
||||
def _declared_encoding(self):
|
||||
return (
|
||||
self._encoding
|
||||
or self._headers_encoding()
|
||||
or self._body_declared_encoding()
|
||||
)
|
||||
|
||||
def body_as_unicode(self):
|
||||
"""Return body as unicode"""
|
||||
warnings.warn('Response.body_as_unicode() is deprecated, '
|
||||
'please use Response.text instead.',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return self.text
|
||||
|
||||
def json(self):
|
||||
"""
|
||||
.. versionadded:: 2.2
|
||||
|
||||
Deserialize a JSON document to a Python object.
|
||||
"""
|
||||
if self._cached_decoded_json is _NONE:
|
||||
self._cached_decoded_json = json.loads(self.text)
|
||||
return self._cached_decoded_json
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
""" Body as unicode """
|
||||
# access self.encoding before _cached_ubody to make sure
|
||||
# _body_inferred_encoding is called
|
||||
benc = self.encoding
|
||||
if self._cached_ubody is None:
|
||||
charset = f'charset={benc}'
|
||||
self._cached_ubody = html_to_unicode(charset, self.body)[1]
|
||||
return self._cached_ubody
|
||||
|
||||
def urljoin(self, url):
|
||||
"""Join this Response's url with a possible relative url to form an
|
||||
absolute interpretation of the latter."""
|
||||
return urljoin(get_base_url(self), url)
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _headers_encoding(self):
|
||||
content_type = self.headers.get(b'Content-Type', b'')
|
||||
return http_content_type_encoding(to_unicode(content_type))
|
||||
|
||||
def _body_inferred_encoding(self):
|
||||
if self._cached_benc is None:
|
||||
content_type = to_unicode(self.headers.get(b'Content-Type', b''))
|
||||
benc, ubody = html_to_unicode(content_type, self.body,
|
||||
auto_detect_fun=self._auto_detect_fun,
|
||||
default_encoding=self._DEFAULT_ENCODING)
|
||||
self._cached_benc = benc
|
||||
self._cached_ubody = ubody
|
||||
return self._cached_benc
|
||||
|
||||
def _auto_detect_fun(self, text):
|
||||
for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
|
||||
try:
|
||||
text.decode(enc)
|
||||
except UnicodeError:
|
||||
continue
|
||||
return resolve_encoding(enc)
|
||||
|
||||
@memoizemethod_noargs
|
||||
def _body_declared_encoding(self):
|
||||
return html_body_declared_encoding(self.body)
|
||||
|
||||
@property
|
||||
def selector(self):
|
||||
from scrapy.selector import Selector
|
||||
if self._cached_selector is None:
|
||||
self._cached_selector = Selector(self)
|
||||
return self._cached_selector
|
||||
|
||||
def xpath(self, query, **kwargs):
|
||||
return self.selector.xpath(query, **kwargs)
|
||||
|
||||
def css(self, query):
|
||||
return self.selector.css(query)
|
||||
|
||||
def follow(self, url, callback=None, method='GET', headers=None, body=None,
|
||||
cookies=None, meta=None, encoding=None, priority=0,
|
||||
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
|
||||
# type: (...) -> Request
|
||||
"""
|
||||
Return a :class:`~.Request` instance to follow a link ``url``.
|
||||
It accepts the same arguments as ``Request.__init__`` method,
|
||||
but ``url`` can be not only an absolute URL, but also
|
||||
|
||||
* a relative URL
|
||||
* a :class:`~scrapy.link.Link` object, e.g. the result of
|
||||
:ref:`topics-link-extractors`
|
||||
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
|
||||
``response.css('a.my_link')[0]``
|
||||
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
|
||||
``response.css('a::attr(href)')[0]`` or
|
||||
``response.xpath('//img/@src')[0]``
|
||||
|
||||
See :ref:`response-follow-example` for usage examples.
|
||||
"""
|
||||
if isinstance(url, parsel.Selector):
|
||||
url = _url_from_selector(url)
|
||||
elif isinstance(url, parsel.SelectorList):
|
||||
raise ValueError("SelectorList is not supported")
|
||||
encoding = self.encoding if encoding is None else encoding
|
||||
return super().follow(
|
||||
url=url,
|
||||
callback=callback,
|
||||
method=method,
|
||||
headers=headers,
|
||||
body=body,
|
||||
cookies=cookies,
|
||||
meta=meta,
|
||||
encoding=encoding,
|
||||
priority=priority,
|
||||
dont_filter=dont_filter,
|
||||
errback=errback,
|
||||
cb_kwargs=cb_kwargs,
|
||||
flags=flags,
|
||||
)
|
||||
|
||||
def follow_all(self, urls=None, callback=None, method='GET', headers=None, body=None,
|
||||
cookies=None, meta=None, encoding=None, priority=0,
|
||||
dont_filter=False, errback=None, cb_kwargs=None, flags=None,
|
||||
css=None, xpath=None):
|
||||
# type: (...) -> Generator[Request, None, None]
|
||||
"""
|
||||
A generator that produces :class:`~.Request` instances to follow all
|
||||
links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
|
||||
``__init__`` method, except that each ``urls`` element does not need to be
|
||||
an absolute URL, it can be any of the following:
|
||||
|
||||
* a relative URL
|
||||
* a :class:`~scrapy.link.Link` object, e.g. the result of
|
||||
:ref:`topics-link-extractors`
|
||||
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
|
||||
``response.css('a.my_link')[0]``
|
||||
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
|
||||
``response.css('a::attr(href)')[0]`` or
|
||||
``response.xpath('//img/@src')[0]``
|
||||
|
||||
In addition, ``css`` and ``xpath`` arguments are accepted to perform the link extraction
|
||||
within the ``follow_all`` method (only one of ``urls``, ``css`` and ``xpath`` is accepted).
|
||||
|
||||
Note that when passing a ``SelectorList`` as argument for the ``urls`` parameter or
|
||||
using the ``css`` or ``xpath`` parameters, this method will not produce requests for
|
||||
selectors from which links cannot be obtained (for instance, anchor tags without an
|
||||
``href`` attribute)
|
||||
"""
|
||||
arguments = [x for x in (urls, css, xpath) if x is not None]
|
||||
if len(arguments) != 1:
|
||||
raise ValueError(
|
||||
"Please supply exactly one of the following arguments: urls, css, xpath"
|
||||
)
|
||||
if not urls:
|
||||
if css:
|
||||
urls = self.css(css)
|
||||
if xpath:
|
||||
urls = self.xpath(xpath)
|
||||
if isinstance(urls, parsel.SelectorList):
|
||||
selectors = urls
|
||||
urls = []
|
||||
for sel in selectors:
|
||||
with suppress(_InvalidSelector):
|
||||
urls.append(_url_from_selector(sel))
|
||||
return super().follow_all(
|
||||
urls=urls,
|
||||
callback=callback,
|
||||
method=method,
|
||||
headers=headers,
|
||||
body=body,
|
||||
cookies=cookies,
|
||||
meta=meta,
|
||||
encoding=encoding,
|
||||
priority=priority,
|
||||
dont_filter=dont_filter,
|
||||
errback=errback,
|
||||
cb_kwargs=cb_kwargs,
|
||||
flags=flags,
|
||||
)
|
||||
|
||||
|
||||
class _InvalidSelector(ValueError):
|
||||
"""
|
||||
Raised when a URL cannot be obtained from a Selector
|
||||
"""
|
||||
|
||||
|
||||
def _url_from_selector(sel):
|
||||
# type: (parsel.Selector) -> str
|
||||
if isinstance(sel.root, str):
|
||||
# e.g. ::attr(href) result
|
||||
return strip_html5_whitespace(sel.root)
|
||||
if not hasattr(sel.root, 'tag'):
|
||||
raise _InvalidSelector(f"Unsupported selector: {sel}")
|
||||
if sel.root.tag not in ('a', 'link'):
|
||||
raise _InvalidSelector("Only <a> and <link> elements are supported; "
|
||||
f"got <{sel.root.tag}>")
|
||||
href = sel.root.get('href')
|
||||
if href is None:
|
||||
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
|
||||
return strip_html5_whitespace(href)
|
||||
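TextResponse adds the decoded `text`, the cached `selector` behind `css()`/`xpath()`, `json()`, and a `follow()` that also accepts selectors. A compact sketch (URLs and bodies are placeholders):

from scrapy.http import HtmlResponse, TextResponse

page = HtmlResponse(
    url='https://example.com/list',
    body=b'<html><body><a href="/item/1">first</a></body></html>',
    encoding='utf-8',
)
page.css('a::attr(href)').get()        # '/item/1'
page.follow(page.css('a')[0]).url      # 'https://example.com/item/1'

api = TextResponse('https://example.com/api', body=b'{"ok": true}', encoding='utf-8')
api.json()                             # {'ok': True}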
12
venv/lib/python3.9/site-packages/scrapy/http/response/xml.py
Normal file
@ -0,0 +1,12 @@
"""
This module implements the XmlResponse class which adds encoding
discovering through XML encoding declarations to the TextResponse class.

See documentation in docs/topics/request-response.rst
"""

from scrapy.http.response.text import TextResponse


class XmlResponse(TextResponse):
    pass
18
venv/lib/python3.9/site-packages/scrapy/interfaces.py
Normal file
@ -0,0 +1,18 @@
from zope.interface import Interface


class ISpiderLoader(Interface):

    def from_settings(settings):
        """Return an instance of the class for the given settings"""

    def load(spider_name):
        """Return the Spider class for the given spider name. If the spider
        name is not found, it must raise a KeyError."""

    def list():
        """Return a list with the names of all spiders available in the
        project"""

    def find_by_request(request):
        """Return the list of spiders names that can handle the given request"""
158
venv/lib/python3.9/site-packages/scrapy/item.py
Normal file
@ -0,0 +1,158 @@
|
|||
"""
|
||||
Scrapy Item
|
||||
|
||||
See documentation in docs/topics/item.rst
|
||||
"""
|
||||
|
||||
from abc import ABCMeta
|
||||
from collections.abc import MutableMapping
|
||||
from copy import deepcopy
|
||||
from pprint import pformat
|
||||
from warnings import warn
|
||||
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
from scrapy.utils.trackref import object_ref
|
||||
|
||||
|
||||
class _BaseItem(object_ref):
|
||||
"""
|
||||
Temporary class used internally to avoid the deprecation
|
||||
warning raised by isinstance checks using BaseItem.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class _BaseItemMeta(ABCMeta):
|
||||
def __instancecheck__(cls, instance):
|
||||
if cls is BaseItem:
|
||||
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super().__instancecheck__(instance)
|
||||
|
||||
|
||||
class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
|
||||
"""
|
||||
Deprecated, please use :class:`scrapy.item.Item` instead
|
||||
"""
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)):
|
||||
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super().__new__(cls, *args, **kwargs)
|
||||
|
||||
|
||||
class Field(dict):
|
||||
"""Container of field metadata"""
|
||||
|
||||
|
||||
class ItemMeta(_BaseItemMeta):
|
||||
"""Metaclass_ of :class:`Item` that handles field definitions.
|
||||
|
||||
.. _metaclass: https://realpython.com/python-metaclasses
|
||||
"""
|
||||
|
||||
def __new__(mcs, class_name, bases, attrs):
|
||||
classcell = attrs.pop('__classcell__', None)
|
||||
new_bases = tuple(base._class for base in bases if hasattr(base, '_class'))
|
||||
_class = super().__new__(mcs, 'x_' + class_name, new_bases, attrs)
|
||||
|
||||
fields = getattr(_class, 'fields', {})
|
||||
new_attrs = {}
|
||||
for n in dir(_class):
|
||||
v = getattr(_class, n)
|
||||
if isinstance(v, Field):
|
||||
fields[n] = v
|
||||
elif n in attrs:
|
||||
new_attrs[n] = attrs[n]
|
||||
|
||||
new_attrs['fields'] = fields
|
||||
new_attrs['_class'] = _class
|
||||
if classcell is not None:
|
||||
new_attrs['__classcell__'] = classcell
|
||||
return super().__new__(mcs, class_name, bases, new_attrs)
|
||||
|
||||
|
||||
class DictItem(MutableMapping, BaseItem):
|
||||
|
||||
fields = {}
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if issubclass(cls, DictItem) and not issubclass(cls, Item):
|
||||
warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
return super().__new__(cls, *args, **kwargs)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._values = {}
|
||||
if args or kwargs: # avoid creating dict for most common case
|
||||
for k, v in dict(*args, **kwargs).items():
|
||||
self[k] = v
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self._values[key]
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
if key in self.fields:
|
||||
self._values[key] = value
|
||||
else:
|
||||
raise KeyError(f"{self.__class__.__name__} does not support field: {key}")
|
||||
|
||||
def __delitem__(self, key):
|
||||
del self._values[key]
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name in self.fields:
|
||||
raise AttributeError(f"Use item[{name!r}] to get field value")
|
||||
raise AttributeError(name)
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
if not name.startswith('_'):
|
||||
raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
|
||||
super().__setattr__(name, value)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._values)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._values)
|
||||
|
||||
__hash__ = BaseItem.__hash__
|
||||
|
||||
def keys(self):
|
||||
return self._values.keys()
|
||||
|
||||
def __repr__(self):
|
||||
return pformat(dict(self))
|
||||
|
||||
def copy(self):
|
||||
return self.__class__(self)
|
||||
|
||||
def deepcopy(self):
|
||||
"""Return a :func:`~copy.deepcopy` of this item.
|
||||
"""
|
||||
return deepcopy(self)
|
||||
|
||||
|
||||
class Item(DictItem, metaclass=ItemMeta):
|
||||
"""
|
||||
Base class for scraped items.
|
||||
|
||||
In Scrapy, an object is considered an ``item`` if it is an instance of either
|
||||
:class:`Item` or :class:`dict`, or any subclass. For example, when the output of a
|
||||
spider callback is evaluated, only instances of :class:`Item` or
|
||||
:class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
|
||||
|
||||
If you need instances of a custom class to be considered items by Scrapy,
|
||||
you must inherit from either :class:`Item` or :class:`dict`.
|
||||
|
||||
Items must declare :class:`Field` attributes, which are processed and stored
|
||||
in the ``fields`` attribute. This restricts the set of allowed field names
|
||||
and prevents typos, raising ``KeyError`` when referring to undefined fields.
|
||||
Additionally, fields can be used to define metadata and control the way
|
||||
data is processed internally. Please refer to the :ref:`documentation
|
||||
about fields <topics-items-fields>` for additional information.
|
||||
|
||||
Unlike instances of :class:`dict`, instances of :class:`Item` may be
|
||||
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
|
||||
"""
|
||||
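Items behave like dicts restricted to their declared fields: an undeclared key raises `KeyError` and attribute-style assignment is rejected. A minimal sketch (the item class is illustrative):

import scrapy


class Product(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field(serializer=float)   # arbitrary field metadata, kept in Product.fields


item = Product(name='book')
item['price'] = 12.5
dict(item)                 # {'name': 'book', 'price': 12.5}
item.copy()                # shallow copy of the same class

try:
    item['author'] = 'x'   # not a declared field
except KeyError as exc:
    print(exc)             # Product does not support field: author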
53
venv/lib/python3.9/site-packages/scrapy/link.py
Normal file
@ -0,0 +1,53 @@
|
|||
"""
|
||||
This module defines the Link object used in Link extractors.
|
||||
|
||||
For actual link extractors implementation see scrapy.linkextractors, or
|
||||
its documentation in: docs/topics/link-extractors.rst
|
||||
"""
|
||||
|
||||
|
||||
class Link:
|
||||
"""Link objects represent an extracted link by the LinkExtractor.
|
||||
|
||||
Using the anchor tag sample below to illustrate the parameters::
|
||||
|
||||
<a href="https://example.com/nofollow.html#foo" rel="nofollow">Dont follow this one</a>
|
||||
|
||||
:param url: the absolute url being linked to in the anchor tag.
|
||||
From the sample, this is ``https://example.com/nofollow.html``.
|
||||
|
||||
:param text: the text in the anchor tag. From the sample, this is ``Dont follow this one``.
|
||||
|
||||
:param fragment: the part of the url after the hash symbol. From the sample, this is ``foo``.
|
||||
|
||||
:param nofollow: an indication of the presence or absence of a nofollow value in the ``rel`` attribute
|
||||
of the anchor tag.
|
||||
"""
|
||||
|
||||
__slots__ = ['url', 'text', 'fragment', 'nofollow']
|
||||
|
||||
def __init__(self, url, text='', fragment='', nofollow=False):
|
||||
if not isinstance(url, str):
|
||||
got = url.__class__.__name__
|
||||
raise TypeError(f"Link urls must be str objects, got {got}")
|
||||
self.url = url
|
||||
self.text = text
|
||||
self.fragment = fragment
|
||||
self.nofollow = nofollow
|
||||
|
||||
def __eq__(self, other):
|
||||
return (
|
||||
self.url == other.url
|
||||
and self.text == other.text
|
||||
and self.fragment == other.fragment
|
||||
and self.nofollow == other.nofollow
|
||||
)
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
f'Link(url={self.url!r}, text={self.text!r}, '
|
||||
f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
|
||||
)
|
||||
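A short sketch of how the Link object above behaves when constructed and compared; the URL and text values are illustrative::

    from scrapy.link import Link

    link = Link(url='https://example.com/nofollow.html', text='Dont follow this one',
                fragment='foo', nofollow=True)
    same = Link(url='https://example.com/nofollow.html', text='Dont follow this one',
                fragment='foo', nofollow=True)
    assert link == same              # equality compares all four attributes
    assert hash(link) == hash(same)  # so Link objects deduplicate cleanly in sets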
@@ -0,0 +1,136 @@
"""
scrapy.linkextractors

This package contains a collection of Link Extractors.

For more info see docs/topics/link-extractors.rst
"""
import re
from urllib.parse import urlparse
from warnings import warn

from parsel.csstranslator import HTMLTranslator
from w3lib.url import canonicalize_url

from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.url import (
    url_is_from_any_domain, url_has_any_extension,
)


# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
    # archives
    '7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',

    # images
    'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
    'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',

    # audio
    'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',

    # video
    '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
    'm4a', 'm4v', 'flv', 'webm',

    # office suites
    'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
    'odp',

    # other
    'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
]


_re_type = type(re.compile("", 0))


def _matches(url, regexs):
    return any(r.search(url) for r in regexs)


def _is_valid_url(url):
    return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}


class FilteringLinkExtractor:

    _csstranslator = HTMLTranslator()

    def __new__(cls, *args, **kwargs):
        from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
        if issubclass(cls, FilteringLinkExtractor) and not issubclass(cls, LxmlLinkExtractor):
            warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
                 'please use scrapy.linkextractors.LinkExtractor instead',
                 ScrapyDeprecationWarning, stacklevel=2)
        return super().__new__(cls)

    def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
                 restrict_xpaths, canonicalize, deny_extensions, restrict_css, restrict_text):

        self.link_extractor = link_extractor

        self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
                          for x in arg_to_iter(allow)]
        self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
                         for x in arg_to_iter(deny)]

        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))

        self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
        self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
                                          arg_to_iter(restrict_css)))

        self.canonicalize = canonicalize
        if deny_extensions is None:
            deny_extensions = IGNORED_EXTENSIONS
        self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
        self.restrict_text = [x if isinstance(x, _re_type) else re.compile(x)
                              for x in arg_to_iter(restrict_text)]

    def _link_allowed(self, link):
        if not _is_valid_url(link.url):
            return False
        if self.allow_res and not _matches(link.url, self.allow_res):
            return False
        if self.deny_res and _matches(link.url, self.deny_res):
            return False
        parsed_url = urlparse(link.url)
        if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
            return False
        if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
            return False
        if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
            return False
        if self.restrict_text and not _matches(link.text, self.restrict_text):
            return False
        return True

    def matches(self, url):

        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
            return False
        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
            return False

        allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
        denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
        return any(allowed) and not any(denied)

    def _process_links(self, links):
        links = [x for x in links if self._link_allowed(x)]
        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)
        links = self.link_extractor._process_links(links)
        return links

    def _extract_links(self, *args, **kwargs):
        return self.link_extractor._extract_links(*args, **kwargs)


# Top-level imports
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor
@@ -0,0 +1,164 @@
"""
Link extractor based on lxml.html
"""
import operator
from functools import partial
from urllib.parse import urljoin

import lxml.etree as etree
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string

from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url


# from lxml/src/lxml/html/__init__.py
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

_collect_string_content = etree.XPath("string()")


def _nons(tag):
    if isinstance(tag, str):
        if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
            return tag.split('}')[-1]
    return tag


def _identity(x):
    return x


def _canonicalize_link_url(link):
    return canonicalize_url(link.url, keep_fragments=True)


class LxmlParserLinkExtractor:
    def __init__(
        self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
    ):
        self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
        self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
        self.process_attr = process if callable(process) else _identity
        self.unique = unique
        self.strip = strip
        self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url

    def _iter_links(self, document):
        for el in document.iter(etree.Element):
            if not self.scan_tag(_nons(el.tag)):
                continue
            attribs = el.attrib
            for attrib in attribs:
                if not self.scan_attr(attrib):
                    continue
                yield (el, attrib, attribs[attrib])

    def _extract_links(self, selector, response_url, response_encoding, base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            url = safe_url_string(url, encoding=response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or '',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)
        return self._deduplicate_if_needed(links)

    def extract_links(self, response):
        base_url = get_base_url(response)
        return self._extract_links(response.selector, response.url, response.encoding, base_url)

    def _process_links(self, links):
        """Normalize and filter extracted links

        The subclass should override it if necessary
        """
        return self._deduplicate_if_needed(links)

    def _deduplicate_if_needed(self, links):
        if self.unique:
            return unique_list(links, key=self.link_key)
        return links


class LxmlLinkExtractor(FilteringLinkExtractor):

    def __init__(
        self,
        allow=(),
        deny=(),
        allow_domains=(),
        deny_domains=(),
        restrict_xpaths=(),
        tags=('a', 'area'),
        attrs=('href',),
        canonicalize=False,
        unique=True,
        process_value=None,
        deny_extensions=None,
        restrict_css=(),
        strip=True,
        restrict_text=None,
    ):
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        lx = LxmlParserLinkExtractor(
            tag=partial(operator.contains, tags),
            attr=partial(operator.contains, attrs),
            unique=unique,
            process=process_value,
            strip=strip,
            canonicalized=canonicalize
        )
        super().__init__(
            link_extractor=lx,
            allow=allow,
            deny=deny,
            allow_domains=allow_domains,
            deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths,
            restrict_css=restrict_css,
            canonicalize=canonicalize,
            deny_extensions=deny_extensions,
            restrict_text=restrict_text,
        )

    def extract_links(self, response):
        """Returns a list of :class:`~scrapy.link.Link` objects from the
        specified :class:`response <scrapy.http.Response>`.

        Only links that match the settings passed to the ``__init__`` method of
        the link extractor are returned.

        Duplicate links are omitted.
        """
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            docs = [
                subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)
            ]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(links))
        return unique_list(all_links)
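A hedged sketch of using LxmlLinkExtractor (exposed as ``scrapy.linkextractors.LinkExtractor``) inside a spider callback; the spider name, domain and ``allow`` pattern are illustrative assumptions::

    from scrapy import Spider
    from scrapy.linkextractors import LinkExtractor

    class ExampleSpider(Spider):
        name = 'example'
        start_urls = ['https://example.com/']

        def parse(self, response):
            # keep only category pages on the allowed domain; extensions from
            # IGNORED_EXTENSIONS are filtered out by default (deny_extensions=None)
            extractor = LinkExtractor(allow=r'/category/', allow_domains=['example.com'])
            for link in extractor.extract_links(response):
                yield response.follow(link.url, callback=self.parse)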
88
venv/lib/python3.9/site-packages/scrapy/loader/__init__.py
Normal file
@@ -0,0 +1,88 @@
"""
Item Loader

See documentation in docs/topics/loaders.rst
"""
import itemloaders

from scrapy.item import Item
from scrapy.selector import Selector


class ItemLoader(itemloaders.ItemLoader):
    """
    A user-friendly abstraction to populate an :ref:`item <topics-items>` with data
    by applying :ref:`field processors <topics-loaders-processors>` to scraped data.
    When instantiated with a ``selector`` or a ``response`` it supports
    data extraction from web pages using :ref:`selectors <topics-selectors>`.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        or :meth:`~ItemLoader.add_value`.
    :type item: scrapy.item.Item

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath`, :meth:`add_css`, :meth:`replace_xpath`, or
        :meth:`replace_css` method.
    :type selector: :class:`~scrapy.selector.Selector` object

    :param response: The response used to construct the selector using the
        :attr:`default_selector_class`, unless the selector argument is given,
        in which case this argument is ignored.
    :type response: :class:`~scrapy.http.Response` object

    If no item is given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    The item, selector, response and remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context` attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so, when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item Loader.

    .. attribute:: default_item_class

        An :ref:`item <topics-items>` class (or factory), used to instantiate
        items when not given in the ``__init__`` method.

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: default_selector_class

        The class used to construct the :attr:`selector` of this
        :class:`ItemLoader`, if only a response is given in the ``__init__`` method.
        If a selector is given in the ``__init__`` method this attribute is ignored.
        This attribute is sometimes overridden in subclasses.

    .. attribute:: selector

        The :class:`~scrapy.selector.Selector` object to extract data from.
        It's either the selector given in the ``__init__`` method or one created from
        the response given in the ``__init__`` method using the
        :attr:`default_selector_class`. This attribute is meant to be
        read-only.
    """

    default_item_class = Item
    default_selector_class = Selector

    def __init__(self, item=None, selector=None, response=None, parent=None, **context):
        if selector is None and response is not None:
            selector = self.default_selector_class(response)
        context.update(response=response)
        super().__init__(item=item, selector=selector, parent=parent, **context)
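A minimal sketch of the ItemLoader workflow described above, assuming a hypothetical ``Product`` item that declares ``name``, ``price`` and ``url`` fields; the CSS/XPath expressions are illustrative::

    from itemloaders.processors import MapCompose, TakeFirst
    from scrapy.loader import ItemLoader

    class ProductLoader(ItemLoader):
        default_output_processor = TakeFirst()
        name_in = MapCompose(str.strip)   # field-specific input processor

    def parse_product(response):
        loader = ProductLoader(item=Product(), response=response)
        loader.add_css('name', 'h1.product-name::text')
        loader.add_xpath('price', '//span[@class="price"]/text()')
        loader.add_value('url', response.url)
        return loader.load_item()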
21
venv/lib/python3.9/site-packages/scrapy/loader/common.py
Normal file
@@ -0,0 +1,21 @@
"""Common functions used in Item Loaders code"""

import warnings

from itemloaders import common

from scrapy.utils.deprecate import ScrapyDeprecationWarning


def wrap_loader_context(function, context):
    """Wrap functions that receive loader_context to contain the context
    "pre-loaded" and expose an interface that receives only one argument
    """
    warnings.warn(
        "scrapy.loader.common.wrap_loader_context has moved to a new library. "
        "Please update your reference to itemloaders.common.wrap_loader_context",
        ScrapyDeprecationWarning,
        stacklevel=2
    )
    return common.wrap_loader_context(function, context)
21
venv/lib/python3.9/site-packages/scrapy/loader/processors.py
Normal file
@@ -0,0 +1,21 @@
"""
This module provides some commonly used processors for Item Loaders.

See documentation in docs/topics/loaders.rst
"""
from itemloaders import processors

from scrapy.utils.deprecate import create_deprecated_class


MapCompose = create_deprecated_class('MapCompose', processors.MapCompose)

Compose = create_deprecated_class('Compose', processors.Compose)

TakeFirst = create_deprecated_class('TakeFirst', processors.TakeFirst)

Identity = create_deprecated_class('Identity', processors.Identity)

SelectJmes = create_deprecated_class('SelectJmes', processors.SelectJmes)

Join = create_deprecated_class('Join', processors.Join)
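Since the names above are only deprecation shims around the ``itemloaders`` package, new code would import the processors from ``itemloaders`` directly, for example::

    # preferred: no deprecation machinery involved
    from itemloaders.processors import Join, MapCompose, TakeFirst

    # still importable for backward compatibility, but deprecated in favour
    # of the itemloaders versions
    from scrapy.loader.processors import TakeFirst as LegacyTakeFirst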
147
venv/lib/python3.9/site-packages/scrapy/logformatter.py
Normal file
@@ -0,0 +1,147 @@
import os
import logging

from twisted.python.failure import Failure

from scrapy.utils.request import referer_str

SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
ITEMERRORMSG = "Error processing %(item)s"
SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"


class LogFormatter:
    """Class for generating log messages for different actions.

    All methods must return a dictionary listing the parameters ``level``, ``msg``
    and ``args`` which are going to be used for constructing the log message when
    calling ``logging.log``.

    Dictionary keys for the method outputs:

    * ``level`` is the log level for that action, you can use those from the
      `python logging library <https://docs.python.org/3/library/logging.html>`_ :
      ``logging.DEBUG``, ``logging.INFO``, ``logging.WARNING``, ``logging.ERROR``
      and ``logging.CRITICAL``.
    * ``msg`` should be a string that can contain different formatting placeholders.
      This string, formatted with the provided ``args``, is going to be the long message
      for that action.
    * ``args`` should be a tuple or dict with the formatting placeholders for ``msg``.
      The final log message is computed as ``msg % args``.

    Users can define their own ``LogFormatter`` class if they want to customize how
    each action is logged or if they want to omit it entirely. In order to omit
    logging an action the method must return ``None``.

    Here is an example on how to create a custom log formatter to lower the severity level of
    the log message when an item is dropped from the pipeline::

        class PoliteLogFormatter(logformatter.LogFormatter):
            def dropped(self, item, exception, response, spider):
                return {
                    'level': logging.INFO,  # lowering the level from logging.WARNING
                    'msg': "Dropped: %(exception)s" + os.linesep + "%(item)s",
                    'args': {
                        'exception': exception,
                        'item': item,
                    }
                }
    """

    def crawled(self, request, response, spider):
        """Logs a message when the crawler finds a webpage."""
        request_flags = f' {str(request.flags)}' if request.flags else ''
        response_flags = f' {str(response.flags)}' if response.flags else ''
        return {
            'level': logging.DEBUG,
            'msg': CRAWLEDMSG,
            'args': {
                'status': response.status,
                'request': request,
                'request_flags': request_flags,
                'referer': referer_str(request),
                'response_flags': response_flags,
                # backward compatibility with Scrapy logformatter below 1.4 version
                'flags': response_flags
            }
        }

    def scraped(self, item, response, spider):
        """Logs a message when an item is scraped by a spider."""
        if isinstance(response, Failure):
            src = response.getErrorMessage()
        else:
            src = response
        return {
            'level': logging.DEBUG,
            'msg': SCRAPEDMSG,
            'args': {
                'src': src,
                'item': item,
            }
        }

    def dropped(self, item, exception, response, spider):
        """Logs a message when an item is dropped while it is passing through the item pipeline."""
        return {
            'level': logging.WARNING,
            'msg': DROPPEDMSG,
            'args': {
                'exception': exception,
                'item': item,
            }
        }

    def item_error(self, item, exception, response, spider):
        """Logs a message when an item causes an error while it is passing
        through the item pipeline.

        .. versionadded:: 2.0
        """
        return {
            'level': logging.ERROR,
            'msg': ITEMERRORMSG,
            'args': {
                'item': item,
            }
        }

    def spider_error(self, failure, request, response, spider):
        """Logs an error message from a spider.

        .. versionadded:: 2.0
        """
        return {
            'level': logging.ERROR,
            'msg': SPIDERERRORMSG,
            'args': {
                'request': request,
                'referer': referer_str(request),
            }
        }

    def download_error(self, failure, request, spider, errmsg=None):
        """Logs a download error message from a spider (typically coming from
        the engine).

        .. versionadded:: 2.0
        """
        args = {'request': request}
        if errmsg:
            msg = DOWNLOADERRORMSG_LONG
            args['errmsg'] = errmsg
        else:
            msg = DOWNLOADERRORMSG_SHORT
        return {
            'level': logging.ERROR,
            'msg': msg,
            'args': args,
        }

    @classmethod
    def from_crawler(cls, crawler):
        return cls()
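A custom formatter such as the ``PoliteLogFormatter`` shown in the docstring above is activated through the ``LOG_FORMATTER`` setting; the project module path here is a hypothetical example::

    # settings.py
    LOG_FORMATTER = 'myproject.logformatters.PoliteLogFormatter'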
140
venv/lib/python3.9/site-packages/scrapy/mail.py
Normal file
@@ -0,0 +1,140 @@
"""
Mail sending helpers

See documentation in docs/topics/email.rst
"""
import logging
from email import encoders as Encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.utils import COMMASPACE, formatdate
from io import BytesIO

from twisted.internet import defer, ssl

from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import to_bytes


logger = logging.getLogger(__name__)


def _to_bytes_or_none(text):
    if text is None:
        return None
    return to_bytes(text)


class MailSender:
    def __init__(
        self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
        smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
    ):
        self.smtphost = smtphost
        self.smtpport = smtpport
        self.smtpuser = _to_bytes_or_none(smtpuser)
        self.smtppass = _to_bytes_or_none(smtppass)
        self.smtptls = smtptls
        self.smtpssl = smtpssl
        self.mailfrom = mailfrom
        self.debug = debug

    @classmethod
    def from_settings(cls, settings):
        return cls(
            smtphost=settings['MAIL_HOST'],
            mailfrom=settings['MAIL_FROM'],
            smtpuser=settings['MAIL_USER'],
            smtppass=settings['MAIL_PASS'],
            smtpport=settings.getint('MAIL_PORT'),
            smtptls=settings.getbool('MAIL_TLS'),
            smtpssl=settings.getbool('MAIL_SSL'),
        )

    def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
        from twisted.internet import reactor
        if attachs:
            msg = MIMEMultipart()
        else:
            msg = MIMENonMultipart(*mimetype.split('/', 1))

        to = list(arg_to_iter(to))
        cc = list(arg_to_iter(cc))

        msg['From'] = self.mailfrom
        msg['To'] = COMMASPACE.join(to)
        msg['Date'] = formatdate(localtime=True)
        msg['Subject'] = subject
        rcpts = to[:]
        if cc:
            rcpts.extend(cc)
            msg['Cc'] = COMMASPACE.join(cc)

        if charset:
            msg.set_charset(charset)

        if attachs:
            msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
            for attach_name, mimetype, f in attachs:
                part = MIMEBase(*mimetype.split('/'))
                part.set_payload(f.read())
                Encoders.encode_base64(part)
                part.add_header('Content-Disposition', 'attachment', filename=attach_name)
                msg.attach(part)
        else:
            msg.set_payload(body)

        if _callback:
            _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)

        if self.debug:
            logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
                         'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
                         {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                          'mailattachs': len(attachs)})
            return

        dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
        dfd.addCallbacks(
            callback=self._sent_ok,
            errback=self._sent_failed,
            callbackArgs=[to, cc, subject, len(attachs)],
            errbackArgs=[to, cc, subject, len(attachs)],
        )
        reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
        return dfd

    def _sent_ok(self, result, to, cc, subject, nattachs):
        logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
                    'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
                    {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                     'mailattachs': nattachs})

    def _sent_failed(self, failure, to, cc, subject, nattachs):
        errstr = str(failure.value)
        logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
                     'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
                     '- %(mailerr)s',
                     {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                      'mailattachs': nattachs, 'mailerr': errstr})

    def _sendmail(self, to_addrs, msg):
        # Import twisted.mail here because it is not available in python3
        from twisted.internet import reactor
        from twisted.mail.smtp import ESMTPSenderFactory
        msg = BytesIO(msg)
        d = defer.Deferred()
        factory = ESMTPSenderFactory(
            self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
            heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
        )
        factory.noisy = False

        if self.smtpssl:
            reactor.connectSSL(self.smtphost, self.smtpport, factory, ssl.ClientContextFactory())
        else:
            reactor.connectTCP(self.smtphost, self.smtpport, factory)

        return d
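A short sketch of sending a notification with the MailSender helper above, for example from an extension or pipeline; it assumes a ``crawler`` object is in scope, and the addresses are illustrative::

    from scrapy.mail import MailSender

    # build the sender from the crawler settings (MAIL_HOST, MAIL_FROM, ...)
    mailer = MailSender.from_settings(crawler.settings)

    # returns a Deferred unless debug mode is enabled
    mailer.send(
        to=['someone@example.com'],
        subject='Scrape finished',
        body='The spider closed without errors.',
        cc=['team@example.com'],
    )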
75
venv/lib/python3.9/site-packages/scrapy/middleware.py
Normal file
@@ -0,0 +1,75 @@
from collections import defaultdict, deque
import logging
import pprint

from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both

logger = logging.getLogger(__name__)


class MiddlewareManager:
    """Base class for implementing middleware managers"""

    component_name = 'foo middleware'

    def __init__(self, *middlewares):
        self.middlewares = middlewares
        self.methods = defaultdict(deque)
        for mw in middlewares:
            self._add_middleware(mw)

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        raise NotImplementedError

    @classmethod
    def from_settings(cls, settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        enabled = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                mw = create_instance(mwcls, settings, crawler)
                middlewares.append(mw)
                enabled.append(clspath)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    logger.warning("Disabled %(clsname)s: %(eargs)s",
                                   {'clsname': clsname, 'eargs': e.args[0]},
                                   extra={'crawler': crawler})

        logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                    {'componentname': cls.component_name,
                     'enabledlist': pprint.pformat(enabled)},
                    extra={'crawler': crawler})
        return cls(*middlewares)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings, crawler)

    def _add_middleware(self, mw):
        if hasattr(mw, 'open_spider'):
            self.methods['open_spider'].append(mw.open_spider)
        if hasattr(mw, 'close_spider'):
            self.methods['close_spider'].appendleft(mw.close_spider)

    def _process_parallel(self, methodname, obj, *args):
        return process_parallel(self.methods[methodname], obj, *args)

    def _process_chain(self, methodname, obj, *args):
        return process_chain(self.methods[methodname], obj, *args)

    def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args):
        return process_chain_both(self.methods[cb_methodname],
                                  self.methods[eb_methodname], obj, *args)

    def open_spider(self, spider):
        return self._process_parallel('open_spider', spider)

    def close_spider(self, spider):
        return self._process_parallel('close_spider', spider)
750
venv/lib/python3.9/site-packages/scrapy/mime.types
Normal file
@@ -0,0 +1,750 @@
###############################################################################
|
||||
#
|
||||
# MIME-TYPES and the extensions that represent them
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
|
||||
application/activemessage
|
||||
application/andrew-inset ez
|
||||
application/annodex anx
|
||||
application/applefile
|
||||
application/atom+xml atom
|
||||
application/atomcat+xml atomcat
|
||||
application/atomserv+xml atomsrv
|
||||
application/atomicmail
|
||||
application/batch-SMTP
|
||||
application/beep+xml
|
||||
application/bbolin lin
|
||||
application/cals-1840
|
||||
application/cap cap pcap
|
||||
application/commonground
|
||||
application/cu-seeme cu
|
||||
application/cybercash
|
||||
application/davmount+xml davmount
|
||||
application/dca-rft
|
||||
application/dec-dx
|
||||
application/docbook+xml
|
||||
application/dsptype tsp
|
||||
application/dvcs
|
||||
application/ecmascript es
|
||||
application/edi-consent
|
||||
application/edi-x12
|
||||
application/edifact
|
||||
application/eshop
|
||||
application/font-tdpfr
|
||||
application/futuresplash spl
|
||||
application/ghostview
|
||||
application/hta hta
|
||||
application/http
|
||||
application/hyperstudio
|
||||
application/iges
|
||||
application/index
|
||||
application/index.cmd
|
||||
application/index.obj
|
||||
application/index.response
|
||||
application/index.vnd
|
||||
application/iotp
|
||||
application/ipp
|
||||
application/isup
|
||||
application/java-archive jar
|
||||
application/java-serialized-object ser
|
||||
application/java-vm class
|
||||
application/javascript js
|
||||
application/m3g m3g
|
||||
application/mac-binhex40 hqx
|
||||
application/mac-compactpro cpt
|
||||
application/macwriteii
|
||||
application/marc
|
||||
application/mathematica nb nbp
|
||||
application/ms-tnef
|
||||
application/msaccess mdb
|
||||
application/msword doc dot
|
||||
application/news-message-id
|
||||
application/news-transmission
|
||||
application/ocsp-request
|
||||
application/ocsp-response
|
||||
application/octet-stream bin
|
||||
application/oda oda
|
||||
application/ogg ogx
|
||||
application/parityfec
|
||||
application/pdf pdf
|
||||
application/pgp-encrypted
|
||||
application/pgp-keys key
|
||||
application/pgp-signature pgp
|
||||
application/pics-rules prf
|
||||
application/pkcs10
|
||||
application/pkcs7-mime
|
||||
application/pkcs7-signature
|
||||
application/pkix-cert
|
||||
application/pkix-crl
|
||||
application/pkixcmp
|
||||
application/postscript ps ai eps espi epsf eps2 eps3
|
||||
application/prs.alvestrand.titrax-sheet
|
||||
application/prs.cww
|
||||
application/prs.nprend
|
||||
application/qsig
|
||||
application/rar rar
|
||||
application/rdf+xml rdf
|
||||
application/remote-printing
|
||||
application/riscos
|
||||
application/rss+xml rss
|
||||
application/rtf rtf
|
||||
application/sdp
|
||||
application/set-payment
|
||||
application/set-payment-initiation
|
||||
application/set-registration
|
||||
application/set-registration-initiation
|
||||
application/sgml
|
||||
application/sgml-open-catalog
|
||||
application/sieve
|
||||
application/slate
|
||||
application/smil smi smil
|
||||
application/timestamp-query
|
||||
application/timestamp-reply
|
||||
application/vemmi
|
||||
application/whoispp-query
|
||||
application/whoispp-response
|
||||
application/wita
|
||||
application/x400-bp
|
||||
application/xhtml+xml xhtml xht
|
||||
application/xml xml xsl xsd
|
||||
application/xml-dtd
|
||||
application/xml-external-parsed-entity
|
||||
application/xspf+xml xspf
|
||||
application/zip zip
|
||||
application/vnd.3M.Post-it-Notes
|
||||
application/vnd.accpac.simply.aso
|
||||
application/vnd.accpac.simply.imp
|
||||
application/vnd.acucobol
|
||||
application/vnd.aether.imp
|
||||
application/vnd.anser-web-certificate-issue-initiation
|
||||
application/vnd.anser-web-funds-transfer-initiation
|
||||
application/vnd.audiograph
|
||||
application/vnd.bmi
|
||||
application/vnd.businessobjects
|
||||
application/vnd.canon-cpdl
|
||||
application/vnd.canon-lips
|
||||
application/vnd.cinderella cdy
|
||||
application/vnd.claymore
|
||||
application/vnd.commerce-battelle
|
||||
application/vnd.commonspace
|
||||
application/vnd.comsocaller
|
||||
application/vnd.contact.cmsg
|
||||
application/vnd.cosmocaller
|
||||
application/vnd.ctc-posml
|
||||
application/vnd.cups-postscript
|
||||
application/vnd.cups-raster
|
||||
application/vnd.cups-raw
|
||||
application/vnd.cybank
|
||||
application/vnd.dna
|
||||
application/vnd.dpgraph
|
||||
application/vnd.dxr
|
||||
application/vnd.ecdis-update
|
||||
application/vnd.ecowin.chart
|
||||
application/vnd.ecowin.filerequest
|
||||
application/vnd.ecowin.fileupdate
|
||||
application/vnd.ecowin.series
|
||||
application/vnd.ecowin.seriesrequest
|
||||
application/vnd.ecowin.seriesupdate
|
||||
application/vnd.enliven
|
||||
application/vnd.epson.esf
|
||||
application/vnd.epson.msf
|
||||
application/vnd.epson.quickanime
|
||||
application/vnd.epson.salt
|
||||
application/vnd.epson.ssf
|
||||
application/vnd.ericsson.quickcall
|
||||
application/vnd.eudora.data
|
||||
application/vnd.fdf
|
||||
application/vnd.ffsns
|
||||
application/vnd.flographit
|
||||
application/vnd.framemaker
|
||||
application/vnd.fsc.weblaunch
|
||||
application/vnd.fujitsu.oasys
|
||||
application/vnd.fujitsu.oasys2
|
||||
application/vnd.fujitsu.oasys3
|
||||
application/vnd.fujitsu.oasysgp
|
||||
application/vnd.fujitsu.oasysprs
|
||||
application/vnd.fujixerox.ddd
|
||||
application/vnd.fujixerox.docuworks
|
||||
application/vnd.fujixerox.docuworks.binder
|
||||
application/vnd.fut-misnet
|
||||
application/vnd.google-earth.kml+xml kml
|
||||
application/vnd.google-earth.kmz kmz
|
||||
application/vnd.grafeq
|
||||
application/vnd.groove-account
|
||||
application/vnd.groove-identity-message
|
||||
application/vnd.groove-injector
|
||||
application/vnd.groove-tool-message
|
||||
application/vnd.groove-tool-template
|
||||
application/vnd.groove-vcard
|
||||
application/vnd.hhe.lesson-player
|
||||
application/vnd.hp-HPGL
|
||||
application/vnd.hp-PCL
|
||||
application/vnd.hp-PCLXL
|
||||
application/vnd.hp-hpid
|
||||
application/vnd.hp-hps
|
||||
application/vnd.httphone
|
||||
application/vnd.hzn-3d-crossword
|
||||
application/vnd.ibm.MiniPay
|
||||
application/vnd.ibm.afplinedata
|
||||
application/vnd.ibm.modcap
|
||||
application/vnd.informix-visionary
|
||||
application/vnd.intercon.formnet
|
||||
application/vnd.intertrust.digibox
|
||||
application/vnd.intertrust.nncp
|
||||
application/vnd.intu.qbo
|
||||
application/vnd.intu.qfx
|
||||
application/vnd.irepository.package+xml
|
||||
application/vnd.is-xpr
|
||||
application/vnd.japannet-directory-service
|
||||
application/vnd.japannet-jpnstore-wakeup
|
||||
application/vnd.japannet-payment-wakeup
|
||||
application/vnd.japannet-registration
|
||||
application/vnd.japannet-registration-wakeup
|
||||
application/vnd.japannet-setstore-wakeup
|
||||
application/vnd.japannet-verification
|
||||
application/vnd.japannet-verification-wakeup
|
||||
application/vnd.koan
|
||||
application/vnd.lotus-1-2-3
|
||||
application/vnd.lotus-approach
|
||||
application/vnd.lotus-freelance
|
||||
application/vnd.lotus-notes
|
||||
application/vnd.lotus-organizer
|
||||
application/vnd.lotus-screencam
|
||||
application/vnd.lotus-wordpro
|
||||
application/vnd.mcd
|
||||
application/vnd.mediastation.cdkey
|
||||
application/vnd.meridian-slingshot
|
||||
application/vnd.mif
|
||||
application/vnd.minisoft-hp3000-save
|
||||
application/vnd.mitsubishi.misty-guard.trustweb
|
||||
application/vnd.mobius.daf
|
||||
application/vnd.mobius.dis
|
||||
application/vnd.mobius.msl
|
||||
application/vnd.mobius.plc
|
||||
application/vnd.mobius.txf
|
||||
application/vnd.motorola.flexsuite
|
||||
application/vnd.motorola.flexsuite.adsi
|
||||
application/vnd.motorola.flexsuite.fis
|
||||
application/vnd.motorola.flexsuite.gotap
|
||||
application/vnd.motorola.flexsuite.kmr
|
||||
application/vnd.motorola.flexsuite.ttc
|
||||
application/vnd.motorola.flexsuite.wem
|
||||
application/vnd.mozilla.xul+xml xul
|
||||
application/vnd.ms-artgalry
|
||||
application/vnd.ms-asf
|
||||
application/vnd.ms-excel xls xlb xlt
|
||||
application/vnd.ms-lrm
|
||||
application/vnd.ms-pki.seccat cat
|
||||
application/vnd.ms-pki.stl stl
|
||||
application/vnd.ms-powerpoint ppt pps
|
||||
application/vnd.ms-project
|
||||
application/vnd.ms-tnef
|
||||
application/vnd.ms-works
|
||||
application/vnd.mseq
|
||||
application/vnd.msign
|
||||
application/vnd.music-niff
|
||||
application/vnd.musician
|
||||
application/vnd.netfpx
|
||||
application/vnd.noblenet-directory
|
||||
application/vnd.noblenet-sealer
|
||||
application/vnd.noblenet-web
|
||||
application/vnd.novadigm.EDM
|
||||
application/vnd.novadigm.EDX
|
||||
application/vnd.novadigm.EXT
|
||||
application/vnd.oasis.opendocument.chart odc
|
||||
application/vnd.oasis.opendocument.database odb
|
||||
application/vnd.oasis.opendocument.formula odf
|
||||
application/vnd.oasis.opendocument.graphics odg
|
||||
application/vnd.oasis.opendocument.graphics-template otg
|
||||
application/vnd.oasis.opendocument.image odi
|
||||
application/vnd.oasis.opendocument.presentation odp
|
||||
application/vnd.oasis.opendocument.presentation-template otp
|
||||
application/vnd.oasis.opendocument.spreadsheet ods
|
||||
application/vnd.oasis.opendocument.spreadsheet-template ots
|
||||
application/vnd.oasis.opendocument.text odt
|
||||
application/vnd.oasis.opendocument.text-master odm
|
||||
application/vnd.oasis.opendocument.text-template ott
|
||||
application/vnd.oasis.opendocument.text-web oth
|
||||
application/vnd.osa.netdeploy
|
||||
application/vnd.palm
|
||||
application/vnd.pg.format
|
||||
application/vnd.pg.osasli
|
||||
application/vnd.powerbuilder6
|
||||
application/vnd.powerbuilder6-s
|
||||
application/vnd.powerbuilder7
|
||||
application/vnd.powerbuilder7-s
|
||||
application/vnd.powerbuilder75
|
||||
application/vnd.powerbuilder75-s
|
||||
application/vnd.previewsystems.box
|
||||
application/vnd.publishare-delta-tree
|
||||
application/vnd.pvi.ptid1
|
||||
application/vnd.pwg-xhtml-print+xml
|
||||
application/vnd.rapid
|
||||
application/vnd.rim.cod cod
|
||||
application/vnd.s3sms
|
||||
application/vnd.seemail
|
||||
application/vnd.shana.informed.formdata
|
||||
application/vnd.shana.informed.formtemplate
|
||||
application/vnd.shana.informed.interchange
|
||||
application/vnd.shana.informed.package
|
||||
application/vnd.smaf mmf
|
||||
application/vnd.sss-cod
|
||||
application/vnd.sss-dtf
|
||||
application/vnd.sss-ntf
|
||||
application/vnd.stardivision.calc sdc
|
||||
application/vnd.stardivision.chart sds
|
||||
application/vnd.stardivision.draw sda
|
||||
application/vnd.stardivision.impress sdd
|
||||
application/vnd.stardivision.math sdf
|
||||
application/vnd.stardivision.writer sdw
|
||||
application/vnd.stardivision.writer-global sgl
|
||||
application/vnd.street-stream
|
||||
application/vnd.sun.xml.calc sxc
|
||||
application/vnd.sun.xml.calc.template stc
|
||||
application/vnd.sun.xml.draw sxd
|
||||
application/vnd.sun.xml.draw.template std
|
||||
application/vnd.sun.xml.impress sxi
|
||||
application/vnd.sun.xml.impress.template sti
|
||||
application/vnd.sun.xml.math sxm
|
||||
application/vnd.sun.xml.writer sxw
|
||||
application/vnd.sun.xml.writer.global sxg
|
||||
application/vnd.sun.xml.writer.template stw
|
||||
application/vnd.svd
|
||||
application/vnd.swiftview-ics
|
||||
application/vnd.symbian.install sis
|
||||
application/vnd.triscape.mxs
|
||||
application/vnd.trueapp
|
||||
application/vnd.truedoc
|
||||
application/vnd.tve-trigger
|
||||
application/vnd.ufdl
|
||||
application/vnd.uplanet.alert
|
||||
application/vnd.uplanet.alert-wbxml
|
||||
application/vnd.uplanet.bearer-choice
|
||||
application/vnd.uplanet.bearer-choice-wbxml
|
||||
application/vnd.uplanet.cacheop
|
||||
application/vnd.uplanet.cacheop-wbxml
|
||||
application/vnd.uplanet.channel
|
||||
application/vnd.uplanet.channel-wbxml
|
||||
application/vnd.uplanet.list
|
||||
application/vnd.uplanet.list-wbxml
|
||||
application/vnd.uplanet.listcmd
|
||||
application/vnd.uplanet.listcmd-wbxml
|
||||
application/vnd.uplanet.signal
|
||||
application/vnd.vcx
|
||||
application/vnd.vectorworks
|
||||
application/vnd.vidsoft.vidconference
|
||||
application/vnd.visio vsd
|
||||
application/vnd.vividence.scriptfile
|
||||
application/vnd.wap.sic
|
||||
application/vnd.wap.slc
|
||||
application/vnd.wap.wbxml wbxml
|
||||
application/vnd.wap.wmlc wmlc
|
||||
application/vnd.wap.wmlscriptc wmlsc
|
||||
application/vnd.webturbo
|
||||
application/vnd.wordperfect wpd
|
||||
application/vnd.wordperfect5.1 wp5
|
||||
application/vnd.wrq-hp3000-labelled
|
||||
application/vnd.wt.stf
|
||||
application/vnd.xara
|
||||
application/vnd.xfdl
|
||||
application/vnd.yellowriver-custom-menu
|
||||
application/x-123 wk
|
||||
application/x-7z-compressed 7z
|
||||
application/x-abiword abw
|
||||
application/x-apple-diskimage dmg
|
||||
application/x-bcpio bcpio
|
||||
application/x-bittorrent torrent
|
||||
application/x-cab cab
|
||||
application/x-cbr cbr
|
||||
application/x-cbz cbz
|
||||
application/x-cdf cdf cda
|
||||
application/x-cdlink vcd
|
||||
application/x-chess-pgn pgn
|
||||
application/x-core
|
||||
application/x-cpio cpio
|
||||
application/x-csh csh
|
||||
application/x-debian-package deb udeb
|
||||
application/x-director dcr dir dxr
|
||||
application/x-dms dms
|
||||
application/x-doom wad
|
||||
application/x-dvi dvi
|
||||
application/x-httpd-eruby rhtml
|
||||
application/x-executable
|
||||
application/x-font pfa pfb gsf pcf pcf.Z
|
||||
application/x-freemind mm
|
||||
application/x-futuresplash spl
|
||||
application/x-gnumeric gnumeric
|
||||
application/x-go-sgf sgf
|
||||
application/x-graphing-calculator gcf
|
||||
application/x-gtar gtar tgz taz
|
||||
application/x-hdf hdf
|
||||
application/x-httpd-php phtml pht php
|
||||
application/x-httpd-php-source phps
|
||||
application/x-httpd-php3 php3
|
||||
application/x-httpd-php3-preprocessed php3p
|
||||
application/x-httpd-php4 php4
|
||||
application/x-ica ica
|
||||
application/x-info info
|
||||
application/x-internet-signup ins isp
|
||||
application/x-iphone iii
|
||||
application/x-iso9660-image iso
|
||||
application/x-jam jam
|
||||
application/x-java-applet
|
||||
application/x-java-bean
|
||||
application/x-java-jnlp-file jnlp
|
||||
application/x-jmol jmz
|
||||
application/x-kchart chrt
|
||||
application/x-kdelnk
|
||||
application/x-killustrator kil
|
||||
application/x-koan skp skd skt skm
|
||||
application/x-kpresenter kpr kpt
|
||||
application/x-kspread ksp
|
||||
application/x-kword kwd kwt
|
||||
application/x-latex latex
|
||||
application/x-lha lha
|
||||
application/x-lyx lyx
|
||||
application/x-lzh lzh
|
||||
application/x-lzx lzx
|
||||
application/x-maker frm maker frame fm fb book fbdoc
|
||||
application/x-mif mif
|
||||
application/x-ms-wmd wmd
|
||||
application/x-ms-wmz wmz
|
||||
application/x-msdos-program com exe bat dll
|
||||
application/x-msi msi
|
||||
application/x-netcdf nc
|
||||
application/x-ns-proxy-autoconfig pac dat
|
||||
application/x-nwc nwc
|
||||
application/x-object o
|
||||
application/x-oz-application oza
|
||||
application/x-pkcs7-certreqresp p7r
|
||||
application/x-pkcs7-crl crl
|
||||
application/x-python-code pyc pyo
|
||||
application/x-qgis qgs shp shx
|
||||
application/x-quicktimeplayer qtl
|
||||
application/x-redhat-package-manager rpm
|
||||
application/x-ruby rb
|
||||
application/x-rx
|
||||
application/x-sh sh
|
||||
application/x-shar shar
|
||||
application/x-shellscript
|
||||
application/x-shockwave-flash swf swfl
|
||||
application/x-stuffit sit sitx
|
||||
application/x-sv4cpio sv4cpio
|
||||
application/x-sv4crc sv4crc
|
||||
application/x-tar tar
|
||||
application/x-tcl tcl
|
||||
application/x-tex-gf gf
|
||||
application/x-tex-pk pk
|
||||
application/x-texinfo texinfo texi
|
||||
application/x-trash ~ % bak old sik
|
||||
application/x-troff t tr roff
|
||||
application/x-troff-man man
|
||||
application/x-troff-me me
|
||||
application/x-troff-ms ms
|
||||
application/x-ustar ustar
|
||||
application/x-videolan
|
||||
application/x-wais-source src
|
||||
application/x-wingz wz
|
||||
application/x-x509-ca-cert crt
|
||||
application/x-xcf xcf
|
||||
application/x-xfig fig
|
||||
application/x-xpinstall xpi
|
||||
|
||||
audio/32kadpcm
|
||||
audio/3gpp
|
||||
audio/amr amr
|
||||
audio/amr-wb awb
|
||||
audio/amr amr
|
||||
audio/amr-wb awb
|
||||
audio/annodex axa
|
||||
audio/basic au snd
|
||||
audio/flac flac
|
||||
audio/g.722.1
|
||||
audio/l16
|
||||
audio/midi mid midi kar
|
||||
audio/mp4a-latm
|
||||
audio/mpa-robust
|
||||
audio/mpeg mpga mpega mp2 mp3 m4a
|
||||
audio/mpegurl m3u
|
||||
audio/ogg oga ogg spx
|
||||
audio/parityfec
|
||||
audio/prs.sid sid
|
||||
audio/telephone-event
|
||||
audio/tone
|
||||
audio/vnd.cisco.nse
|
||||
audio/vnd.cns.anp1
|
||||
audio/vnd.cns.inf1
|
||||
audio/vnd.digital-winds
|
||||
audio/vnd.everad.plj
|
||||
audio/vnd.lucent.voice
|
||||
audio/vnd.nortel.vbk
|
||||
audio/vnd.nuera.ecelp4800
|
||||
audio/vnd.nuera.ecelp7470
|
||||
audio/vnd.nuera.ecelp9600
|
||||
audio/vnd.octel.sbc
|
||||
audio/vnd.qcelp
|
||||
audio/vnd.rhetorex.32kadpcm
|
||||
audio/vnd.vmx.cvsd
|
||||
audio/x-aiff aif aiff aifc
|
||||
audio/x-gsm gsm
|
||||
audio/x-mpegurl m3u
|
||||
audio/x-ms-wma wma
|
||||
audio/x-ms-wax wax
|
||||
audio/x-pn-realaudio-plugin
|
||||
audio/x-pn-realaudio ra rm ram
|
||||
audio/x-realaudio ra
|
||||
audio/x-scpls pls
|
||||
audio/x-sd2 sd2
|
||||
audio/x-wav wav
|
||||
|
||||
chemical/x-alchemy alc
|
||||
chemical/x-cache cac cache
|
||||
chemical/x-cache-csf csf
|
||||
chemical/x-cactvs-binary cbin cascii ctab
|
||||
chemical/x-cdx cdx
|
||||
chemical/x-cerius cer
|
||||
chemical/x-chem3d c3d
|
||||
chemical/x-chemdraw chm
|
||||
chemical/x-cif cif
|
||||
chemical/x-cmdf cmdf
|
||||
chemical/x-cml cml
|
||||
chemical/x-compass cpa
|
||||
chemical/x-crossfire bsd
|
||||
chemical/x-csml csml csm
|
||||
chemical/x-ctx ctx
|
||||
chemical/x-cxf cxf cef
|
||||
#chemical/x-daylight-smiles smi
|
||||
chemical/x-embl-dl-nucleotide emb embl
|
||||
chemical/x-galactic-spc spc
|
||||
chemical/x-gamess-input inp gam gamin
|
||||
chemical/x-gaussian-checkpoint fch fchk
|
||||
chemical/x-gaussian-cube cub
|
||||
chemical/x-gaussian-input gau gjc gjf
|
||||
chemical/x-gaussian-log gal
|
||||
chemical/x-gcg8-sequence gcg
|
||||
chemical/x-genbank gen
|
||||
chemical/x-hin hin
|
||||
chemical/x-isostar istr ist
|
||||
chemical/x-jcamp-dx jdx dx
|
||||
chemical/x-kinemage kin
|
||||
chemical/x-macmolecule mcm
|
||||
chemical/x-macromodel-input mmd mmod
|
||||
chemical/x-mdl-molfile mol
|
||||
chemical/x-mdl-rdfile rd
|
||||
chemical/x-mdl-rxnfile rxn
|
||||
chemical/x-mdl-sdfile sd sdf
|
||||
chemical/x-mdl-tgf tgf
|
||||
#chemical/x-mif mif
|
||||
chemical/x-mmcif mcif
|
||||
chemical/x-mol2 mol2
|
||||
chemical/x-molconn-Z b
|
||||
chemical/x-mopac-graph gpt
|
||||
chemical/x-mopac-input mop mopcrt mpc zmt
|
||||
chemical/x-mopac-out moo
|
||||
chemical/x-mopac-vib mvb
|
||||
chemical/x-ncbi-asn1 asn
|
||||
chemical/x-ncbi-asn1-ascii prt ent
|
||||
chemical/x-ncbi-asn1-binary val aso
|
||||
chemical/x-ncbi-asn1-spec asn
|
||||
chemical/x-pdb pdb ent
|
||||
chemical/x-rosdal ros
|
||||
chemical/x-swissprot sw
|
||||
chemical/x-vamas-iso14976 vms
|
||||
chemical/x-vmd vmd
|
||||
chemical/x-xtel xtel
|
||||
chemical/x-xyz xyz
|
||||
|
||||
image/cgm
|
||||
image/g3fax
|
||||
image/gif gif
|
||||
image/ief ief
|
||||
image/jpeg jpeg jpg jpe
|
||||
image/naplps
|
||||
image/pcx pcx
|
||||
image/png png
|
||||
image/prs.btif
|
||||
image/prs.pti
|
||||
image/svg+xml svg svgz
|
||||
image/tiff tiff tif
|
||||
image/vnd.cns.inf2
|
||||
image/vnd.djvu djvu djv
|
||||
image/vnd.dwg
|
||||
image/vnd.dxf
|
||||
image/vnd.fastbidsheet
|
||||
image/vnd.fpx
|
||||
image/vnd.fst
|
||||
image/vnd.fujixerox.edmics-mmr
|
||||
image/vnd.fujixerox.edmics-rlc
|
||||
image/vnd.mix
|
||||
image/vnd.net-fpx
|
||||
image/vnd.svf
|
||||
image/vnd.wap.wbmp wbmp
|
||||
image/vnd.xiff
|
||||
image/x-cmu-raster ras
|
||||
image/x-coreldraw cdr
|
||||
image/x-coreldrawpattern pat
|
||||
image/x-coreldrawtemplate cdt
|
||||
image/x-corelphotopaint cpt
|
||||
image/x-icon ico
|
||||
image/x-jg art
|
||||
image/x-jng jng
|
||||
image/x-ms-bmp bmp
|
||||
image/x-photoshop psd
|
||||
image/x-portable-anymap pnm
|
||||
image/x-portable-bitmap pbm
|
||||
image/x-portable-graymap pgm
|
||||
image/x-portable-pixmap ppm
|
||||
image/x-rgb rgb
|
||||
image/x-xbitmap xbm
|
||||
image/x-xpixmap xpm
|
||||
image/x-xwindowdump xwd
|
||||
|
||||
inode/chardevice
|
||||
inode/blockdevice
|
||||
inode/directory-locked
|
||||
inode/directory
|
||||
inode/fifo
|
||||
inode/socket
|
||||
|
||||
message/delivery-status
|
||||
message/disposition-notification
|
||||
message/external-body
|
||||
message/http
|
||||
message/s-http
|
||||
message/news
|
||||
message/partial
|
||||
message/rfc822 eml
|
||||
|
||||
model/iges igs iges
|
||||
model/mesh msh mesh silo
|
||||
model/vnd.dwf
|
||||
model/vnd.flatland.3dml
|
||||
model/vnd.gdl
|
||||
model/vnd.gs-gdl
|
||||
model/vnd.gtw
|
||||
model/vnd.mts
|
||||
model/vnd.vtu
|
||||
model/vrml wrl vrml
|
||||
|
||||
multipart/alternative
|
||||
multipart/appledouble
|
||||
multipart/byteranges
|
||||
multipart/digest
|
||||
multipart/encrypted
|
||||
multipart/form-data
|
||||
multipart/header-set
|
||||
multipart/mixed
|
||||
multipart/parallel
|
||||
multipart/related
|
||||
multipart/report
|
||||
multipart/signed
|
||||
multipart/voice-message
|
||||
|
||||
text/calendar ics icz
|
||||
text/css css
|
||||
text/csv csv
|
||||
text/directory
|
||||
text/english
|
||||
text/enriched
|
||||
text/h323 323
|
||||
text/html html htm shtml
|
||||
text/iuls uls
|
||||
text/mathml mml
|
||||
text/parityfec
|
||||
text/plain asc txt text pot brf
|
||||
text/prs.lines.tag
|
||||
text/rfc822-headers
|
||||
text/richtext rtx
|
||||
text/rtf
|
||||
text/scriptlet sct wsc
|
||||
text/t140
|
||||
text/texmacs tm ts
|
||||
text/tab-separated-values tsv
|
||||
text/uri-list
|
||||
text/vnd.abc
|
||||
text/vnd.curl
|
||||
text/vnd.DMClientScript
|
||||
text/vnd.flatland.3dml
|
||||
text/vnd.fly
|
||||
text/vnd.fmi.flexstor
|
||||
text/vnd.in3d.3dml
|
||||
text/vnd.in3d.spot
|
||||
text/vnd.IPTC.NewsML
|
||||
text/vnd.IPTC.NITF
|
||||
text/vnd.latex-z
|
||||
text/vnd.motorola.reflex
|
||||
text/vnd.ms-mediapackage
|
||||
text/vnd.sun.j2me.app-descriptor jad
|
||||
text/vnd.wap.si
|
||||
text/vnd.wap.sl
|
||||
text/vnd.wap.wml wml
|
||||
text/vnd.wap.wmlscript wmls
|
||||
text/x-bibtex bib
|
||||
text/x-boo boo
|
||||
text/x-c++hdr h++ hpp hxx hh
|
||||
text/x-c++src c++ cpp cxx cc
|
||||
text/x-chdr h
|
||||
text/x-component htc
|
||||
text/x-crontab
|
||||
text/x-csh csh
|
||||
text/x-csrc c
|
||||
text/x-dsrc d
|
||||
text/x-diff diff patch
|
||||
text/x-haskell hs
|
||||
text/x-java java
|
||||
text/x-literate-haskell lhs
|
||||
text/x-makefile
|
||||
text/x-moc moc
|
||||
text/x-pascal p pas
|
||||
text/x-pcs-gcd gcd
|
||||
text/x-perl pl pm
|
||||
text/x-python py
|
||||
text/x-scala scala
|
||||
text/x-server-parsed-html
|
||||
text/x-setext etx
|
||||
text/x-sh sh
|
||||
text/x-tcl tcl tk
|
||||
text/x-tex tex ltx sty cls
|
||||
text/x-vcalendar vcs
|
||||
text/x-vcard vcf
|
||||
|
||||
video/3gpp 3gp
|
||||
video/annodex axv
|
||||
video/dl dl
|
||||
video/dv dif dv
|
||||
video/fli fli
|
||||
video/gl gl
|
||||
video/mpeg mpeg mpg mpe
|
||||
video/mp4 mp4
|
||||
video/quicktime qt mov
|
||||
video/mp4v-es
|
||||
video/ogg ogv
|
||||
video/parityfec
|
||||
video/pointer
|
||||
video/vnd.fvt
|
||||
video/vnd.motorola.video
|
||||
video/vnd.motorola.videop
|
||||
video/vnd.mpegurl mxu
|
||||
video/vnd.mts
|
||||
video/vnd.nokia.interleaved-multimedia
|
||||
video/vnd.vivo
|
||||
video/x-flv flv
|
||||
video/x-la-asf lsf lsx
|
||||
video/x-mng mng
|
||||
video/x-ms-asf asf asx
|
||||
video/x-ms-wm wm
|
||||
video/x-ms-wmv wmv
|
||||
video/x-ms-wmx wmx
|
||||
video/x-ms-wvx wvx
|
||||
video/x-msvideo avi
|
||||
video/x-sgi-movie movie
|
||||
video/x-matroska mpv
|
||||
|
||||
x-conference/x-cooltalk ice
|
||||
|
||||
x-epoc/x-sisx-app sisx
|
||||
x-world/x-vrml vrm vrml wrl
|
||||
|
||||
x-scrapy/test scrapytest
|
||||
@@ -0,0 +1,26 @@
"""
Item pipeline

See documentation in docs/item-pipeline.rst
"""

from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import deferred_f_from_coro_f


class ItemPipelineManager(MiddlewareManager):

    component_name = 'item pipeline'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('ITEM_PIPELINES'))

    def _add_middleware(self, pipe):
        super(ItemPipelineManager, self)._add_middleware(pipe)
        if hasattr(pipe, 'process_item'):
            self.methods['process_item'].append(deferred_f_from_coro_f(pipe.process_item))

    def process_item(self, item, spider):
        return self._process_chain('process_item', item, spider)
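The manager above chains the ``process_item`` methods of the components listed in the ``ITEM_PIPELINES`` setting. A minimal sketch of such a pipeline and its activation; the project path, pipeline name and ``price`` field are illustrative assumptions::

    # pipelines.py
    from scrapy.exceptions import DropItem

    class PriceValidationPipeline:
        def process_item(self, item, spider):
            # items without a price are dropped before reaching later pipelines
            if not item.get('price'):
                raise DropItem(f'Missing price in {item!r}')
            return item

    # settings.py
    ITEM_PIPELINES = {
        'myproject.pipelines.PriceValidationPipeline': 300,
    }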
514
venv/lib/python3.9/site-packages/scrapy/pipelines/files.py
Normal file
514
venv/lib/python3.9/site-packages/scrapy/pipelines/files.py
Normal file
|
|
@ -0,0 +1,514 @@
|
|||
"""
|
||||
Files Pipeline
|
||||
|
||||
See documentation in topics/media-pipeline.rst
|
||||
"""
|
||||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from contextlib import suppress
|
||||
from ftplib import FTP
|
||||
from io import BytesIO
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from twisted.internet import defer, threads
|
||||
|
||||
from scrapy.exceptions import IgnoreRequest, NotConfigured
|
||||
from scrapy.http import Request
|
||||
from scrapy.pipelines.media import MediaPipeline
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.boto import is_botocore_available
|
||||
from scrapy.utils.datatypes import CaselessDict
|
||||
from scrapy.utils.ftp import ftp_store_file
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
from scrapy.utils.misc import md5sum
|
||||
from scrapy.utils.python import to_bytes
|
||||
from scrapy.utils.request import referer_str
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileException(Exception):
|
||||
"""General media error exception"""
|
||||
|
||||
|
||||
class FSFilesStore:
|
||||
def __init__(self, basedir):
|
||||
if '://' in basedir:
|
||||
basedir = basedir.split('://', 1)[1]
|
||||
self.basedir = basedir
|
||||
self._mkdir(self.basedir)
|
||||
self.created_directories = defaultdict(set)
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
self._mkdir(os.path.dirname(absolute_path), info)
|
||||
with open(absolute_path, 'wb') as f:
|
||||
f.write(buf.getvalue())
|
||||
|
||||
def stat_file(self, path, info):
|
||||
absolute_path = self._get_filesystem_path(path)
|
||||
try:
|
||||
last_modified = os.path.getmtime(absolute_path)
|
||||
except os.error:
|
||||
return {}
|
||||
|
||||
with open(absolute_path, 'rb') as f:
|
||||
checksum = md5sum(f)
|
||||
|
||||
return {'last_modified': last_modified, 'checksum': checksum}
|
||||
|
||||
def _get_filesystem_path(self, path):
|
||||
path_comps = path.split('/')
|
||||
return os.path.join(self.basedir, *path_comps)
|
||||
|
||||
def _mkdir(self, dirname, domain=None):
|
||||
seen = self.created_directories[domain] if domain else set()
|
||||
if dirname not in seen:
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
seen.add(dirname)
|
||||
|
||||
|
||||
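A quick usage sketch for the filesystem store above, assuming Scrapy is importable; the directory, file name and payload are placeholders.

# Usage sketch for FSFilesStore; paths and content are placeholders.
from io import BytesIO
from scrapy.pipelines.files import FSFilesStore

store = FSFilesStore("/tmp/scrapy-files")               # hypothetical base directory
store.persist_file("full/example.bin", BytesIO(b"example payload"), info=None)
# -> writes /tmp/scrapy-files/full/example.bin
print(store.stat_file("full/example.bin", info=None))   # {'last_modified': ..., 'checksum': ...}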
class S3FilesStore:
|
||||
AWS_ACCESS_KEY_ID = None
|
||||
AWS_SECRET_ACCESS_KEY = None
|
||||
AWS_ENDPOINT_URL = None
|
||||
AWS_REGION_NAME = None
|
||||
AWS_USE_SSL = None
|
||||
AWS_VERIFY = None
|
||||
|
||||
POLICY = 'private'  # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
|
||||
HEADERS = {
|
||||
'Cache-Control': 'max-age=172800',
|
||||
}
|
||||
|
||||
def __init__(self, uri):
|
||||
if not is_botocore_available():
|
||||
raise NotConfigured('missing botocore library')
|
||||
import botocore.session
|
||||
session = botocore.session.get_session()
|
||||
self.s3_client = session.create_client(
|
||||
's3',
|
||||
aws_access_key_id=self.AWS_ACCESS_KEY_ID,
|
||||
aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
|
||||
endpoint_url=self.AWS_ENDPOINT_URL,
|
||||
region_name=self.AWS_REGION_NAME,
|
||||
use_ssl=self.AWS_USE_SSL,
|
||||
verify=self.AWS_VERIFY
|
||||
)
|
||||
if not uri.startswith("s3://"):
|
||||
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
|
||||
self.bucket, self.prefix = uri[5:].split('/', 1)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
def _onsuccess(boto_key):
|
||||
checksum = boto_key['ETag'].strip('"')
|
||||
last_modified = boto_key['LastModified']
|
||||
modified_stamp = time.mktime(last_modified.timetuple())
|
||||
return {'checksum': checksum, 'last_modified': modified_stamp}
|
||||
|
||||
return self._get_boto_key(path).addCallback(_onsuccess)
|
||||
|
||||
def _get_boto_key(self, path):
|
||||
key_name = f'{self.prefix}{path}'
|
||||
return threads.deferToThread(
|
||||
self.s3_client.head_object,
|
||||
Bucket=self.bucket,
|
||||
Key=key_name)
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
"""Upload file to S3 storage"""
|
||||
key_name = f'{self.prefix}{path}'
|
||||
buf.seek(0)
|
||||
extra = self._headers_to_botocore_kwargs(self.HEADERS)
|
||||
if headers:
|
||||
extra.update(self._headers_to_botocore_kwargs(headers))
|
||||
return threads.deferToThread(
|
||||
self.s3_client.put_object,
|
||||
Bucket=self.bucket,
|
||||
Key=key_name,
|
||||
Body=buf,
|
||||
Metadata={k: str(v) for k, v in (meta or {}).items()},
|
||||
ACL=self.POLICY,
|
||||
**extra)
|
||||
|
||||
def _headers_to_botocore_kwargs(self, headers):
|
||||
""" Convert headers to botocore keyword agruments.
|
||||
"""
|
||||
# This is required while we need to support both boto and botocore.
|
||||
mapping = CaselessDict({
|
||||
'Content-Type': 'ContentType',
|
||||
'Cache-Control': 'CacheControl',
|
||||
'Content-Disposition': 'ContentDisposition',
|
||||
'Content-Encoding': 'ContentEncoding',
|
||||
'Content-Language': 'ContentLanguage',
|
||||
'Content-Length': 'ContentLength',
|
||||
'Content-MD5': 'ContentMD5',
|
||||
'Expires': 'Expires',
|
||||
'X-Amz-Grant-Full-Control': 'GrantFullControl',
|
||||
'X-Amz-Grant-Read': 'GrantRead',
|
||||
'X-Amz-Grant-Read-ACP': 'GrantReadACP',
|
||||
'X-Amz-Grant-Write-ACP': 'GrantWriteACP',
|
||||
'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus',
|
||||
'X-Amz-Object-Lock-Mode': 'ObjectLockMode',
|
||||
'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate',
|
||||
'X-Amz-Request-Payer': 'RequestPayer',
|
||||
'X-Amz-Server-Side-Encryption': 'ServerSideEncryption',
|
||||
'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId',
|
||||
'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey',
|
||||
'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5',
|
||||
'X-Amz-Storage-Class': 'StorageClass',
|
||||
'X-Amz-Tagging': 'Tagging',
|
||||
'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation',
|
||||
})
|
||||
extra = {}
|
||||
for key, value in headers.items():
|
||||
try:
|
||||
kwarg = mapping[key]
|
||||
except KeyError:
|
||||
raise TypeError(f'Header "{key}" is not supported by botocore')
|
||||
else:
|
||||
extra[kwarg] = value
|
||||
return extra
|
||||
|
||||
|
||||
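For reference, a hedged settings sketch that would route storage to the S3 store above; the bucket, prefix and credentials are placeholders, and the keys shown are the ones read in FilesPipeline.from_settings further down.

# Hypothetical settings.py fragment for S3-backed file storage.
ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
FILES_STORE = "s3://my-bucket/scraped-files/"   # placeholder bucket and prefix
AWS_ACCESS_KEY_ID = "..."                       # placeholder credentials
AWS_SECRET_ACCESS_KEY = "..."
AWS_ENDPOINT_URL = None                         # set for S3-compatible services
FILES_STORE_S3_ACL = "private"                  # copied into S3FilesStore.POLICY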
class GCSFilesStore:
|
||||
|
||||
GCS_PROJECT_ID = None
|
||||
|
||||
CACHE_CONTROL = 'max-age=172800'
|
||||
|
||||
# The bucket's default object ACL will be applied to the object.
|
||||
# Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
|
||||
POLICY = None
|
||||
|
||||
def __init__(self, uri):
|
||||
from google.cloud import storage
|
||||
client = storage.Client(project=self.GCS_PROJECT_ID)
|
||||
bucket, prefix = uri[5:].split('/', 1)
|
||||
self.bucket = client.bucket(bucket)
|
||||
self.prefix = prefix
|
||||
permissions = self.bucket.test_iam_permissions(
|
||||
['storage.objects.get', 'storage.objects.create']
|
||||
)
|
||||
if 'storage.objects.get' not in permissions:
|
||||
logger.warning(
|
||||
"No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
|
||||
"Checking if files are up to date will be impossible. Files will be downloaded every time.",
|
||||
{'bucket': bucket}
|
||||
)
|
||||
if 'storage.objects.create' not in permissions:
|
||||
logger.error(
|
||||
"No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
|
||||
{'bucket': bucket}
|
||||
)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
def _onsuccess(blob):
|
||||
if blob:
|
||||
checksum = blob.md5_hash
|
||||
last_modified = time.mktime(blob.updated.timetuple())
|
||||
return {'checksum': checksum, 'last_modified': last_modified}
|
||||
else:
|
||||
return {}
|
||||
|
||||
return threads.deferToThread(self.bucket.get_blob, path).addCallback(_onsuccess)
|
||||
|
||||
def _get_content_type(self, headers):
|
||||
if headers and 'Content-Type' in headers:
|
||||
return headers['Content-Type']
|
||||
else:
|
||||
return 'application/octet-stream'
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
blob = self.bucket.blob(self.prefix + path)
|
||||
blob.cache_control = self.CACHE_CONTROL
|
||||
blob.metadata = {k: str(v) for k, v in (meta or {}).items()}
|
||||
return threads.deferToThread(
|
||||
blob.upload_from_string,
|
||||
data=buf.getvalue(),
|
||||
content_type=self._get_content_type(headers),
|
||||
predefined_acl=self.POLICY
|
||||
)
|
||||
|
||||
|
||||
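Similarly, a hedged settings sketch for the Google Cloud Storage store above; bucket, project and ACL are placeholders, matching the keys read in FilesPipeline.from_settings.

# Hypothetical settings.py fragment for GCS-backed file storage.
ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
FILES_STORE = "gs://my-bucket/scraped-files/"   # placeholder bucket and prefix
GCS_PROJECT_ID = "my-project-id"                # placeholder project
FILES_STORE_GCS_ACL = "publicRead"              # optional; copied into GCSFilesStore.POLICY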
class FTPFilesStore:
|
||||
|
||||
FTP_USERNAME = None
|
||||
FTP_PASSWORD = None
|
||||
USE_ACTIVE_MODE = None
|
||||
|
||||
def __init__(self, uri):
|
||||
if not uri.startswith("ftp://"):
|
||||
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
|
||||
u = urlparse(uri)
|
||||
self.host = u.hostname
|
||||
self.port = int(u.port or 21)
|
||||
self.username = u.username or self.FTP_USERNAME
|
||||
self.password = u.password or self.FTP_PASSWORD
|
||||
self.basedir = u.path.rstrip('/')
|
||||
|
||||
def persist_file(self, path, buf, info, meta=None, headers=None):
|
||||
path = f'{self.basedir}/{path}'
|
||||
return threads.deferToThread(
|
||||
ftp_store_file, path=path, file=buf,
|
||||
host=self.host, port=self.port, username=self.username,
|
||||
password=self.password, use_active_mode=self.USE_ACTIVE_MODE
|
||||
)
|
||||
|
||||
def stat_file(self, path, info):
|
||||
def _stat_file(path):
|
||||
try:
|
||||
ftp = FTP()
|
||||
ftp.connect(self.host, self.port)
|
||||
ftp.login(self.username, self.password)
|
||||
if self.USE_ACTIVE_MODE:
|
||||
ftp.set_pasv(False)
|
||||
file_path = f"{self.basedir}/{path}"
|
||||
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
|
||||
m = hashlib.md5()
|
||||
ftp.retrbinary(f'RETR {file_path}', m.update)
|
||||
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
|
||||
# The file doesn't exist
|
||||
except Exception:
|
||||
return {}
|
||||
return threads.deferToThread(_stat_file, path)
|
||||
|
||||
|
||||
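And a hedged settings sketch for the FTP store above; host, path and credentials are placeholders, matching the keys read in FilesPipeline.from_settings.

# Hypothetical settings.py fragment for FTP-backed file storage.
ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
FILES_STORE = "ftp://ftp.example.com:21/scraped-files"  # placeholder host and path
FTP_USER = "anonymous"                                  # placeholder credentials
FTP_PASSWORD = "guest"
FEED_STORAGE_FTP_ACTIVE = False   # True switches FTPFilesStore to active mode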
class FilesPipeline(MediaPipeline):
|
||||
"""Abstract pipeline that implement the file downloading
|
||||
|
||||
This pipeline tries to minimize network transfers and file processing
|
||||
by stat-ing files to determine whether each file is new, up to date, or
|
||||
expired.
|
||||
|
||||
``new`` files are those that the pipeline has never processed; they need to be
|
||||
downloaded from the supplier site the first time.
|
||||
|
||||
``uptodate`` files are the ones that the pipeline processed and are still
|
||||
valid files.
|
||||
|
||||
``expired`` files are those that the pipeline already processed but whose last
|
||||
modification happened a long time ago, so reprocessing is recommended to
|
||||
refresh them in case they changed.
|
||||
|
||||
"""
|
||||
|
||||
MEDIA_NAME = "file"
|
||||
EXPIRES = 90
|
||||
STORE_SCHEMES = {
|
||||
'': FSFilesStore,
|
||||
'file': FSFilesStore,
|
||||
's3': S3FilesStore,
|
||||
'gs': GCSFilesStore,
|
||||
'ftp': FTPFilesStore
|
||||
}
|
||||
DEFAULT_FILES_URLS_FIELD = 'file_urls'
|
||||
DEFAULT_FILES_RESULT_FIELD = 'files'
|
||||
|
||||
def __init__(self, store_uri, download_func=None, settings=None):
|
||||
if not store_uri:
|
||||
raise NotConfigured
|
||||
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
|
||||
cls_name = "FilesPipeline"
|
||||
self.store = self._get_store(store_uri)
|
||||
resolve = functools.partial(self._key_for_pipe,
|
||||
base_class_name=cls_name,
|
||||
settings=settings)
|
||||
self.expires = settings.getint(
|
||||
resolve('FILES_EXPIRES'), self.EXPIRES
|
||||
)
|
||||
if not hasattr(self, "FILES_URLS_FIELD"):
|
||||
self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
|
||||
if not hasattr(self, "FILES_RESULT_FIELD"):
|
||||
self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
|
||||
self.files_urls_field = settings.get(
|
||||
resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
|
||||
)
|
||||
self.files_result_field = settings.get(
|
||||
resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
|
||||
)
|
||||
|
||||
super().__init__(download_func=download_func, settings=settings)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
|
||||
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
|
||||
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
|
||||
s3store.AWS_VERIFY = settings['AWS_VERIFY']
|
||||
s3store.POLICY = settings['FILES_STORE_S3_ACL']
|
||||
|
||||
gcs_store = cls.STORE_SCHEMES['gs']
|
||||
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
|
||||
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None
|
||||
|
||||
ftp_store = cls.STORE_SCHEMES['ftp']
|
||||
ftp_store.FTP_USERNAME = settings['FTP_USER']
|
||||
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
|
||||
|
||||
store_uri = settings['FILES_STORE']
|
||||
return cls(store_uri, settings=settings)
|
||||
|
||||
def _get_store(self, uri):
|
||||
if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir
|
||||
scheme = 'file'
|
||||
else:
|
||||
scheme = urlparse(uri).scheme
|
||||
store_cls = self.STORE_SCHEMES[scheme]
|
||||
return store_cls(uri)
|
||||
|
||||
def media_to_download(self, request, info, *, item=None):
|
||||
def _onsuccess(result):
|
||||
if not result:
|
||||
return  # returning None forces download
|
||||
|
||||
last_modified = result.get('last_modified', None)
|
||||
if not last_modified:
|
||||
return  # returning None forces download
|
||||
|
||||
age_seconds = time.time() - last_modified
|
||||
age_days = age_seconds / 60 / 60 / 24
|
||||
if age_days > self.expires:
|
||||
return  # returning None forces download
|
||||
|
||||
referer = referer_str(request)
|
||||
logger.debug(
|
||||
'File (uptodate): Downloaded %(medianame)s from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, 'uptodate')
|
||||
|
||||
checksum = result.get('checksum', None)
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}
|
||||
|
||||
path = self.file_path(request, info=info, item=item)
|
||||
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
|
||||
dfd.addCallbacks(_onsuccess, lambda _: None)
|
||||
dfd.addErrback(
|
||||
lambda f:
|
||||
logger.error(self.__class__.__name__ + '.store.stat_file',
|
||||
exc_info=failure_to_exc_info(f),
|
||||
extra={'spider': info.spider})
|
||||
)
|
||||
return dfd
|
||||
|
||||
def media_failed(self, failure, request, info):
|
||||
if not isinstance(failure.value, IgnoreRequest):
|
||||
referer = referer_str(request)
|
||||
logger.warning(
|
||||
'File (unknown-error): Error downloading %(medianame)s from '
|
||||
'%(request)s referred in <%(referer)s>: %(exception)s',
|
||||
{'medianame': self.MEDIA_NAME, 'request': request,
|
||||
'referer': referer, 'exception': failure.value},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
|
||||
raise FileException
|
||||
|
||||
def media_downloaded(self, response, request, info, *, item=None):
|
||||
referer = referer_str(request)
|
||||
|
||||
if response.status != 200:
|
||||
logger.warning(
|
||||
'File (code: %(status)s): Error downloading file from '
|
||||
'%(request)s referred in <%(referer)s>',
|
||||
{'status': response.status,
|
||||
'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('download-error')
|
||||
|
||||
if not response.body:
|
||||
logger.warning(
|
||||
'File (empty-content): Empty file from %(request)s referred '
|
||||
'in <%(referer)s>: no-content',
|
||||
{'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException('empty-content')
|
||||
|
||||
status = 'cached' if 'cached' in response.flags else 'downloaded'
|
||||
logger.debug(
|
||||
'File (%(status)s): Downloaded file from %(request)s referred in '
|
||||
'<%(referer)s>',
|
||||
{'status': status, 'request': request, 'referer': referer},
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
self.inc_stats(info.spider, status)
|
||||
|
||||
try:
|
||||
path = self.file_path(request, response=response, info=info, item=item)
|
||||
checksum = self.file_downloaded(response, request, info, item=item)
|
||||
except FileException as exc:
|
||||
logger.warning(
|
||||
'File (error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>: %(errormsg)s',
|
||||
{'request': request, 'referer': referer, 'errormsg': str(exc)},
|
||||
extra={'spider': info.spider}, exc_info=True
|
||||
)
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.error(
|
||||
'File (unknown-error): Error processing file from %(request)s '
|
||||
'referred in <%(referer)s>',
|
||||
{'request': request, 'referer': referer},
|
||||
exc_info=True, extra={'spider': info.spider}
|
||||
)
|
||||
raise FileException(str(exc))
|
||||
|
||||
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}
|
||||
|
||||
def inc_stats(self, spider, status):
|
||||
spider.crawler.stats.inc_value('file_count', spider=spider)
|
||||
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
|
||||
|
||||
# Overridable Interface
|
||||
def get_media_requests(self, item, info):
|
||||
urls = ItemAdapter(item).get(self.files_urls_field, [])
|
||||
return [Request(u) for u in urls]
|
||||
|
||||
def file_downloaded(self, response, request, info, *, item=None):
|
||||
path = self.file_path(request, response=response, info=info, item=item)
|
||||
buf = BytesIO(response.body)
|
||||
checksum = md5sum(buf)
|
||||
buf.seek(0)
|
||||
self.store.persist_file(path, buf, info)
|
||||
return checksum
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
with suppress(KeyError):
|
||||
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
|
||||
return item
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
media_ext = os.path.splitext(request.url)[1]
|
||||
# Handle empty and unusual extensions by guessing the mime type and deriving
|
||||
# the extension from it, defaulting to an empty string otherwise
|
||||
if media_ext not in mimetypes.types_map:
|
||||
media_ext = ''
|
||||
media_type = mimetypes.guess_type(request.url)[0]
|
||||
if media_type:
|
||||
media_ext = mimetypes.guess_extension(media_type)
|
||||
return f'full/{media_guid}{media_ext}'
|
||||
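The default file_path above stores everything as full/<sha1><ext>. A common customization is overriding it in a subclass; the sketch below is illustrative (the class name is an assumption) and keeps the parent behaviour as a fallback.

# Illustrative subclass: group stored files by the source domain.
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline


class PerDomainFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        default = super().file_path(request, response=response, info=info, item=item)
        domain = urlparse(request.url).netloc or "unknown"
        # e.g. 'files.example.com/full/<sha1>.pdf'
        return f"{domain}/{default}"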
176
venv/lib/python3.9/site-packages/scrapy/pipelines/images.py
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
"""
|
||||
Images Pipeline
|
||||
|
||||
See documentation in topics/media-pipeline.rst
|
||||
"""
|
||||
import functools
|
||||
import hashlib
|
||||
from contextlib import suppress
|
||||
from io import BytesIO
|
||||
|
||||
from itemadapter import ItemAdapter
|
||||
from PIL import Image
|
||||
|
||||
from scrapy.exceptions import DropItem
|
||||
from scrapy.http import Request
|
||||
from scrapy.pipelines.files import FileException, FilesPipeline
|
||||
# TODO: from scrapy.pipelines.media import MediaPipeline
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.misc import md5sum
|
||||
from scrapy.utils.python import to_bytes
|
||||
|
||||
|
||||
class NoimagesDrop(DropItem):
|
||||
"""Product with no images exception"""
|
||||
|
||||
|
||||
class ImageException(FileException):
|
||||
"""General image error exception"""
|
||||
|
||||
|
||||
class ImagesPipeline(FilesPipeline):
|
||||
"""Abstract pipeline that implement the image thumbnail generation logic
|
||||
|
||||
"""
|
||||
|
||||
MEDIA_NAME = 'image'
|
||||
|
||||
# Uppercase attributes kept for backward compatibility with code that subclasses
|
||||
# ImagesPipeline. They may be overridden by settings.
|
||||
MIN_WIDTH = 0
|
||||
MIN_HEIGHT = 0
|
||||
EXPIRES = 90
|
||||
THUMBS = {}
|
||||
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
|
||||
DEFAULT_IMAGES_RESULT_FIELD = 'images'
|
||||
|
||||
def __init__(self, store_uri, download_func=None, settings=None):
|
||||
super().__init__(store_uri, settings=settings, download_func=download_func)
|
||||
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
|
||||
resolve = functools.partial(self._key_for_pipe,
|
||||
base_class_name="ImagesPipeline",
|
||||
settings=settings)
|
||||
self.expires = settings.getint(
|
||||
resolve("IMAGES_EXPIRES"), self.EXPIRES
|
||||
)
|
||||
|
||||
if not hasattr(self, "IMAGES_RESULT_FIELD"):
|
||||
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
|
||||
if not hasattr(self, "IMAGES_URLS_FIELD"):
|
||||
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
|
||||
|
||||
self.images_urls_field = settings.get(
|
||||
resolve('IMAGES_URLS_FIELD'),
|
||||
self.IMAGES_URLS_FIELD
|
||||
)
|
||||
self.images_result_field = settings.get(
|
||||
resolve('IMAGES_RESULT_FIELD'),
|
||||
self.IMAGES_RESULT_FIELD
|
||||
)
|
||||
self.min_width = settings.getint(
|
||||
resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
|
||||
)
|
||||
self.min_height = settings.getint(
|
||||
resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
|
||||
)
|
||||
self.thumbs = settings.get(
|
||||
resolve('IMAGES_THUMBS'), self.THUMBS
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_settings(cls, settings):
|
||||
s3store = cls.STORE_SCHEMES['s3']
|
||||
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
|
||||
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
|
||||
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
|
||||
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
|
||||
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
|
||||
s3store.AWS_VERIFY = settings['AWS_VERIFY']
|
||||
s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
|
||||
|
||||
gcs_store = cls.STORE_SCHEMES['gs']
|
||||
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
|
||||
gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None
|
||||
|
||||
ftp_store = cls.STORE_SCHEMES['ftp']
|
||||
ftp_store.FTP_USERNAME = settings['FTP_USER']
|
||||
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
|
||||
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
|
||||
|
||||
store_uri = settings['IMAGES_STORE']
|
||||
return cls(store_uri, settings=settings)
|
||||
|
||||
def file_downloaded(self, response, request, info, *, item=None):
|
||||
return self.image_downloaded(response, request, info, item=item)
|
||||
|
||||
def image_downloaded(self, response, request, info, *, item=None):
|
||||
checksum = None
|
||||
for path, image, buf in self.get_images(response, request, info, item=item):
|
||||
if checksum is None:
|
||||
buf.seek(0)
|
||||
checksum = md5sum(buf)
|
||||
width, height = image.size
|
||||
self.store.persist_file(
|
||||
path, buf, info,
|
||||
meta={'width': width, 'height': height},
|
||||
headers={'Content-Type': 'image/jpeg'})
|
||||
return checksum
|
||||
|
||||
def get_images(self, response, request, info, *, item=None):
|
||||
path = self.file_path(request, response=response, info=info, item=item)
|
||||
orig_image = Image.open(BytesIO(response.body))
|
||||
|
||||
width, height = orig_image.size
|
||||
if width < self.min_width or height < self.min_height:
|
||||
raise ImageException("Image too small "
|
||||
f"({width}x{height} < "
|
||||
f"{self.min_width}x{self.min_height})")
|
||||
|
||||
image, buf = self.convert_image(orig_image)
|
||||
yield path, image, buf
|
||||
|
||||
for thumb_id, size in self.thumbs.items():
|
||||
thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
|
||||
thumb_image, thumb_buf = self.convert_image(image, size)
|
||||
yield thumb_path, thumb_image, thumb_buf
|
||||
|
||||
def convert_image(self, image, size=None):
|
||||
if image.format == 'PNG' and image.mode == 'RGBA':
|
||||
background = Image.new('RGBA', image.size, (255, 255, 255))
|
||||
background.paste(image, image)
|
||||
image = background.convert('RGB')
|
||||
elif image.mode == 'P':
|
||||
image = image.convert("RGBA")
|
||||
background = Image.new('RGBA', image.size, (255, 255, 255))
|
||||
background.paste(image, image)
|
||||
image = background.convert('RGB')
|
||||
elif image.mode != 'RGB':
|
||||
image = image.convert('RGB')
|
||||
|
||||
if size:
|
||||
image = image.copy()
|
||||
image.thumbnail(size, Image.ANTIALIAS)
|
||||
|
||||
buf = BytesIO()
|
||||
image.save(buf, 'JPEG')
|
||||
return image, buf
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
urls = ItemAdapter(item).get(self.images_urls_field, [])
|
||||
return [Request(u) for u in urls]
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
with suppress(KeyError):
|
||||
ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
|
||||
return item
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return f'full/{image_guid}.jpg'
|
||||
|
||||
def thumb_path(self, request, thumb_id, response=None, info=None):
|
||||
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
|
||||
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
|
||||
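A hedged settings sketch enabling the ImagesPipeline above; paths and sizes are placeholders, and every key shown corresponds to a setting read in __init__ or from_settings.

# Hypothetical settings.py fragment for the images pipeline.
ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
IMAGES_STORE = "/data/images"      # placeholder path (s3://, gs:// and ftp:// also work)
IMAGES_EXPIRES = 30                # re-download files older than 30 days
IMAGES_MIN_WIDTH = 100             # images smaller than 100x100 are skipped
IMAGES_MIN_HEIGHT = 100
IMAGES_THUMBS = {                  # thumbnail name -> (width, height)
    "small": (50, 50),
    "big": (270, 270),
}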
251
venv/lib/python3.9/site-packages/scrapy/pipelines/media.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
import functools
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from inspect import signature
|
||||
from warnings import warn
|
||||
|
||||
from twisted.internet.defer import Deferred, DeferredList
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.datatypes import SequenceExclude
|
||||
from scrapy.utils.defer import mustbe_deferred, defer_result
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
from scrapy.utils.request import request_fingerprint
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MediaPipeline:
|
||||
|
||||
LOG_FAILED_RESULTS = True
|
||||
|
||||
class SpiderInfo:
|
||||
def __init__(self, spider):
|
||||
self.spider = spider
|
||||
self.downloading = set()
|
||||
self.downloaded = {}
|
||||
self.waiting = defaultdict(list)
|
||||
|
||||
def __init__(self, download_func=None, settings=None):
|
||||
self.download_func = download_func
|
||||
self._expects_item = {}
|
||||
|
||||
if isinstance(settings, dict) or settings is None:
|
||||
settings = Settings(settings)
|
||||
resolve = functools.partial(self._key_for_pipe,
|
||||
base_class_name="MediaPipeline",
|
||||
settings=settings)
|
||||
self.allow_redirects = settings.getbool(
|
||||
resolve('MEDIA_ALLOW_REDIRECTS'), False
|
||||
)
|
||||
self._handle_statuses(self.allow_redirects)
|
||||
|
||||
# Check if deprecated methods are being used and make them compatible
|
||||
self._make_compatible()
|
||||
|
||||
def _handle_statuses(self, allow_redirects):
|
||||
self.handle_httpstatus_list = None
|
||||
if allow_redirects:
|
||||
self.handle_httpstatus_list = SequenceExclude(range(300, 400))
|
||||
|
||||
def _key_for_pipe(self, key, base_class_name=None, settings=None):
|
||||
"""
|
||||
>>> MediaPipeline()._key_for_pipe("IMAGES")
|
||||
'IMAGES'
|
||||
>>> class MyPipe(MediaPipeline):
|
||||
... pass
|
||||
>>> MyPipe()._key_for_pipe("IMAGES", base_class_name="MediaPipeline")
|
||||
'MYPIPE_IMAGES'
|
||||
"""
|
||||
class_name = self.__class__.__name__
|
||||
formatted_key = f"{class_name.upper()}_{key}"
|
||||
if (
|
||||
not base_class_name
|
||||
or class_name == base_class_name
|
||||
or settings and not settings.get(formatted_key)
|
||||
):
|
||||
return key
|
||||
return formatted_key
|
||||
|
||||
@classmethod
|
||||
def from_crawler(cls, crawler):
|
||||
try:
|
||||
pipe = cls.from_settings(crawler.settings)
|
||||
except AttributeError:
|
||||
pipe = cls()
|
||||
pipe.crawler = crawler
|
||||
return pipe
|
||||
|
||||
def open_spider(self, spider):
|
||||
self.spiderinfo = self.SpiderInfo(spider)
|
||||
|
||||
def process_item(self, item, spider):
|
||||
info = self.spiderinfo
|
||||
requests = arg_to_iter(self.get_media_requests(item, info))
|
||||
dlist = [self._process_request(r, info, item) for r in requests]
|
||||
dfd = DeferredList(dlist, consumeErrors=1)
|
||||
return dfd.addCallback(self.item_completed, item, info)
|
||||
|
||||
def _process_request(self, request, info, item):
|
||||
fp = request_fingerprint(request)
|
||||
cb = request.callback or (lambda _: _)
|
||||
eb = request.errback
|
||||
request.callback = None
|
||||
request.errback = None
|
||||
|
||||
# Return cached result if request was already seen
|
||||
if fp in info.downloaded:
|
||||
return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)
|
||||
|
||||
# Otherwise, wait for result
|
||||
wad = Deferred().addCallbacks(cb, eb)
|
||||
info.waiting[fp].append(wad)
|
||||
|
||||
# Check if request is downloading right now to avoid doing it twice
|
||||
if fp in info.downloading:
|
||||
return wad
|
||||
|
||||
# Download request checking media_to_download hook output first
|
||||
info.downloading.add(fp)
|
||||
dfd = mustbe_deferred(self.media_to_download, request, info, item=item)
|
||||
dfd.addCallback(self._check_media_to_download, request, info, item=item)
|
||||
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
|
||||
dfd.addErrback(lambda f: logger.error(
|
||||
f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
|
||||
)
|
||||
return dfd.addBoth(lambda _: wad) # it must return wad at last
|
||||
|
||||
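_process_request keys both caches (downloaded and downloading) on the request fingerprint, so each media URL is fetched at most once per spider run. A small sketch of that keying, assuming Scrapy is installed:

# Two requests for the same URL share a fingerprint, so the second hit is
# served from MediaPipeline's in-memory cache instead of being re-downloaded.
from scrapy import Request
from scrapy.utils.request import request_fingerprint

fp1 = request_fingerprint(Request("https://example.com/cover.jpg"))
fp2 = request_fingerprint(Request("https://example.com/cover.jpg", callback=lambda r: r))
assert fp1 == fp2   # the callback does not affect the fingerprint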
def _make_compatible(self):
|
||||
"""Make overridable methods of MediaPipeline and subclasses backwards compatible"""
|
||||
methods = [
|
||||
"file_path", "media_to_download", "media_downloaded",
|
||||
"file_downloaded", "image_downloaded", "get_images"
|
||||
]
|
||||
|
||||
for method_name in methods:
|
||||
method = getattr(self, method_name, None)
|
||||
if callable(method):
|
||||
setattr(self, method_name, self._compatible(method))
|
||||
|
||||
def _compatible(self, func):
|
||||
"""Wrapper for overridable methods to allow backwards compatibility"""
|
||||
self._check_signature(func)
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
if self._expects_item[func.__name__]:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
kwargs.pop('item', None)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
def _check_signature(self, func):
|
||||
sig = signature(func)
|
||||
self._expects_item[func.__name__] = True
|
||||
|
||||
if 'item' not in sig.parameters:
|
||||
old_params = str(sig)[1:-1]
|
||||
new_params = old_params + ", *, item=None"
|
||||
warn(f'{func.__name__}(self, {old_params}) is deprecated, '
|
||||
f'please use {func.__name__}(self, {new_params})',
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
self._expects_item[func.__name__] = False
|
||||
|
||||
def _modify_media_request(self, request):
|
||||
if self.handle_httpstatus_list:
|
||||
request.meta['handle_httpstatus_list'] = self.handle_httpstatus_list
|
||||
else:
|
||||
request.meta['handle_httpstatus_all'] = True
|
||||
|
||||
def _check_media_to_download(self, result, request, info, item):
|
||||
if result is not None:
|
||||
return result
|
||||
if self.download_func:
|
||||
# this ugly code was left only to support tests. TODO: remove
|
||||
dfd = mustbe_deferred(self.download_func, request, info.spider)
|
||||
dfd.addCallbacks(
|
||||
callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
|
||||
errback=self.media_failed, errbackArgs=(request, info))
|
||||
else:
|
||||
self._modify_media_request(request)
|
||||
dfd = self.crawler.engine.download(request, info.spider)
|
||||
dfd.addCallbacks(
|
||||
callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
|
||||
errback=self.media_failed, errbackArgs=(request, info))
|
||||
return dfd
|
||||
|
||||
def _cache_result_and_execute_waiters(self, result, fp, info):
|
||||
if isinstance(result, Failure):
|
||||
# minimize cached information for failure
|
||||
result.cleanFailure()
|
||||
result.frames = []
|
||||
result.stack = None
|
||||
|
||||
# This code fixes a memory leak by avoiding keeping references to
|
||||
# the Request and Response objects on the Media Pipeline cache.
|
||||
#
|
||||
# What happens when the media_downloaded callback raises an
|
||||
# exception, for example a FileException('download-error') when
|
||||
# the Response status code is not 200 OK, is that the original
|
||||
# StopIteration exception (which in turn contains the failed
|
||||
# Response and by extension, the original Request) gets encapsulated
|
||||
# within the FileException context.
|
||||
#
|
||||
# Originally, Scrapy was using twisted.internet.defer.returnValue
|
||||
# inside functions decorated with twisted.internet.defer.inlineCallbacks,
|
||||
# encapsulating the returned Response in a _DefGen_Return exception
|
||||
# instead of a StopIteration.
|
||||
#
|
||||
# To avoid keeping references to the Response and therefore Request
|
||||
# objects on the Media Pipeline cache, we should wipe the context of
|
||||
# the encapsulated exception when it is a StopIteration instance
|
||||
#
|
||||
# This problem does not occur in Python 2.7 since we don't have
|
||||
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
|
||||
context = getattr(result.value, '__context__', None)
|
||||
if isinstance(context, StopIteration):
|
||||
setattr(result.value, '__context__', None)
|
||||
|
||||
info.downloading.remove(fp)
|
||||
info.downloaded[fp] = result # cache result
|
||||
for wad in info.waiting.pop(fp):
|
||||
defer_result(result).chainDeferred(wad)
|
||||
|
||||
# Overridable Interface
|
||||
def media_to_download(self, request, info, *, item=None):
|
||||
"""Check request before starting download"""
|
||||
pass
|
||||
|
||||
def get_media_requests(self, item, info):
|
||||
"""Returns the media requests to download"""
|
||||
pass
|
||||
|
||||
def media_downloaded(self, response, request, info, *, item=None):
|
||||
"""Handler for success downloads"""
|
||||
return response
|
||||
|
||||
def media_failed(self, failure, request, info):
|
||||
"""Handler for failed downloads"""
|
||||
return failure
|
||||
|
||||
def item_completed(self, results, item, info):
|
||||
"""Called per item when all media requests has been processed"""
|
||||
if self.LOG_FAILED_RESULTS:
|
||||
for ok, value in results:
|
||||
if not ok:
|
||||
logger.error(
|
||||
'%(class)s found errors processing %(item)s',
|
||||
{'class': self.__class__.__name__, 'item': item},
|
||||
exc_info=failure_to_exc_info(value),
|
||||
extra={'spider': info.spider}
|
||||
)
|
||||
return item
|
||||
|
||||
def file_path(self, request, response=None, info=None, *, item=None):
|
||||
"""Returns the path where downloaded media should be stored"""
|
||||
pass
|
||||
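The methods above form the overridable interface. A minimal, illustrative subclass (not part of this diff; field names are assumptions) only needs to say what to download and how to record the results:

# Illustrative MediaPipeline subclass working on dict-like items.
from scrapy import Request
from scrapy.pipelines.media import MediaPipeline


class AttachmentPipeline(MediaPipeline):
    def get_media_requests(self, item, info):
        # One download per URL in a hypothetical 'attachment_urls' field.
        return [Request(url) for url in item.get("attachment_urls", [])]

    def media_downloaded(self, response, request, info, *, item=None):
        # Keep only what item_completed() needs.
        return {"url": request.url, "status": response.status, "size": len(response.body)}

    def item_completed(self, results, item, info):
        item["attachments"] = [value for ok, value in results if ok]
        return item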