Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@@ -0,0 +1 @@
2.4.1

View file

@@ -0,0 +1,42 @@
"""
Scrapy - a web crawling and web scraping framework written for Python
"""
import pkgutil
import sys
import warnings
from twisted import version as _txv
# Declare top-level shortcuts
from scrapy.spiders import Spider
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from scrapy.item import Item, Field
__all__ = [
'__version__', 'version_info', 'twisted_version', 'Spider',
'Request', 'FormRequest', 'Selector', 'Item', 'Field',
]
# Scrapy and Twisted versions
__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split('.'))
twisted_version = (_txv.major, _txv.minor, _txv.micro)
# Check minimum required Python version
if sys.version_info < (3, 6):
print("Scrapy %s requires Python 3.6+" % __version__)
sys.exit(1)
# Ignore noisy twisted deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='twisted')
del pkgutil
del sys
del warnings
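For context, a minimal sketch of how these top-level shortcuts are typically combined; the QuoteItem/QuotesSpider names and the quotes.toscrape.com URL are illustrative and not part of this commit:

import scrapy

class QuoteItem(scrapy.Item):
    # Field declarations picked up by item adapters and feed exports
    text = scrapy.Field()
    author = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield QuoteItem(
                text=quote.css('span.text::text').get(),
                author=quote.css('small.author::text').get(),
            )
        # follow pagination links with the Request shortcut
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)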

View file

@@ -0,0 +1,4 @@
from scrapy.cmdline import execute
if __name__ == '__main__':
execute()

View file

@@ -0,0 +1,173 @@
import sys
import os
import optparse
import cProfile
import inspect
import pkg_resources
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.misc import walk_modules
from scrapy.utils.project import inside_project, get_project_settings
from scrapy.utils.python import garbage_collect
def _iter_command_classes(module_name):
# TODO: add `name` attribute to commands and merge this function with
# scrapy.utils.spider.iter_spider_classes
for module in walk_modules(module_name):
for obj in vars(module).values():
if (
inspect.isclass(obj)
and issubclass(obj, ScrapyCommand)
and obj.__module__ == module.__name__
and not obj == ScrapyCommand
):
yield obj
def _get_commands_from_module(module, inproject):
d = {}
for cmd in _iter_command_classes(module):
if inproject or not cmd.requires_project:
cmdname = cmd.__module__.split('.')[-1]
d[cmdname] = cmd()
return d
def _get_commands_from_entry_points(inproject, group='scrapy.commands'):
cmds = {}
for entry_point in pkg_resources.iter_entry_points(group):
obj = entry_point.load()
if inspect.isclass(obj):
cmds[entry_point.name] = obj()
else:
raise Exception(f"Invalid entry point {entry_point.name}")
return cmds
def _get_commands_dict(settings, inproject):
cmds = _get_commands_from_module('scrapy.commands', inproject)
cmds.update(_get_commands_from_entry_points(inproject))
cmds_module = settings['COMMANDS_MODULE']
if cmds_module:
cmds.update(_get_commands_from_module(cmds_module, inproject))
return cmds
def _pop_command_name(argv):
i = 0
for arg in argv[1:]:
if not arg.startswith('-'):
del argv[i]
return arg
i += 1
def _print_header(settings, inproject):
version = scrapy.__version__
if inproject:
print(f"Scrapy {version} - project: {settings['BOT_NAME']}\n")
else:
print(f"Scrapy {version} - no active project\n")
def _print_commands(settings, inproject):
_print_header(settings, inproject)
print("Usage:")
print(" scrapy <command> [options] [args]\n")
print("Available commands:")
cmds = _get_commands_dict(settings, inproject)
for cmdname, cmdclass in sorted(cmds.items()):
print(f" {cmdname:<13} {cmdclass.short_desc()}")
if not inproject:
print()
print(" [ more ] More commands available when run from project directory")
print()
print('Use "scrapy <command> -h" to see more info about a command')
def _print_unknown_command(settings, cmdname, inproject):
_print_header(settings, inproject)
print(f"Unknown command: {cmdname}\n")
print('Use "scrapy" to see available commands')
def _run_print_help(parser, func, *a, **kw):
try:
func(*a, **kw)
except UsageError as e:
if str(e):
parser.error(str(e))
if e.print_help:
parser.print_help()
sys.exit(2)
def execute(argv=None, settings=None):
if argv is None:
argv = sys.argv
if settings is None:
settings = get_project_settings()
# set EDITOR from environment if available
try:
editor = os.environ['EDITOR']
except KeyError:
pass
else:
settings['EDITOR'] = editor
inproject = inside_project()
cmds = _get_commands_dict(settings, inproject)
cmdname = _pop_command_name(argv)
parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
conflict_handler='resolve')
if not cmdname:
_print_commands(settings, inproject)
sys.exit(0)
elif cmdname not in cmds:
_print_unknown_command(settings, cmdname, inproject)
sys.exit(2)
cmd = cmds[cmdname]
parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
parser.description = cmd.long_desc()
settings.setdict(cmd.default_settings, priority='command')
cmd.settings = settings
cmd.add_options(parser)
opts, args = parser.parse_args(args=argv[1:])
_run_print_help(parser, cmd.process_options, args, opts)
cmd.crawler_process = CrawlerProcess(settings)
_run_print_help(parser, _run_command, cmd, args, opts)
sys.exit(cmd.exitcode)
def _run_command(cmd, args, opts):
if opts.profile:
_run_command_profiled(cmd, args, opts)
else:
cmd.run(args, opts)
def _run_command_profiled(cmd, args, opts):
if opts.profile:
sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
loc = locals()
p = cProfile.Profile()
p.runctx('cmd.run(args, opts)', globals(), loc)
if opts.profile:
p.dump_stats(opts.profile)
if __name__ == '__main__':
try:
execute()
finally:
# Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
# http://doc.pypy.org/en/latest/cpython_differences.html
# ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
garbage_collect()
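execute() defaults to sys.argv and the project settings, but it can also be driven programmatically, which is handy for wrappers and debugging. A hedged sketch (the spider name 'myspider' is hypothetical); note that execute() terminates the process via sys.exit() with the command's exit code:

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl myspider -s LOG_LEVEL=INFO"
# from inside the project directory.
execute(['scrapy', 'crawl', 'myspider', '-s', 'LOG_LEVEL=INFO'])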

View file

@@ -0,0 +1,137 @@
"""
Base class for Scrapy commands
"""
import os
from optparse import OptionGroup
from twisted.python import failure
from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli
from scrapy.exceptions import UsageError
class ScrapyCommand:
requires_project = False
crawler_process = None
# default settings to be used for this command instead of global defaults
default_settings = {}
exitcode = 0
def __init__(self):
self.settings = None # set in scrapy.cmdline
def set_crawler(self, crawler):
if hasattr(self, '_crawler'):
raise RuntimeError("crawler already set")
self._crawler = crawler
def syntax(self):
"""
Command syntax (preferably one-line). Do not include command name.
"""
return ""
def short_desc(self):
"""
A short description of the command
"""
return ""
def long_desc(self):
"""A long description of the command. Return short description when not
available. It cannot contain newlines, since contents will be formatted
by optparser which removes newlines and wraps text.
"""
return self.short_desc()
def help(self):
"""An extensive help for the command. It will be shown when using the
"help" command. It can contain newlines, since no post-formatting will
be applied to its contents.
"""
return self.long_desc()
def add_options(self, parser):
"""
Populate the option parser with the options available for this command
"""
group = OptionGroup(parser, "Global Options")
group.add_option("--logfile", metavar="FILE",
help="log file. if omitted stderr will be used")
group.add_option("-L", "--loglevel", metavar="LEVEL", default=None,
help=f"log level (default: {self.settings['LOG_LEVEL']})")
group.add_option("--nolog", action="store_true",
help="disable logging completely")
group.add_option("--profile", metavar="FILE", default=None,
help="write python cProfile stats to FILE")
group.add_option("--pidfile", metavar="FILE",
help="write process ID to FILE")
group.add_option("-s", "--set", action="append", default=[], metavar="NAME=VALUE",
help="set/override setting (may be repeated)")
group.add_option("--pdb", action="store_true", help="enable pdb on failure")
parser.add_option_group(group)
def process_options(self, args, opts):
try:
self.settings.setdict(arglist_to_dict(opts.set),
priority='cmdline')
except ValueError:
raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)
if opts.logfile:
self.settings.set('LOG_ENABLED', True, priority='cmdline')
self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')
if opts.loglevel:
self.settings.set('LOG_ENABLED', True, priority='cmdline')
self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')
if opts.nolog:
self.settings.set('LOG_ENABLED', False, priority='cmdline')
if opts.pidfile:
with open(opts.pidfile, "w") as f:
f.write(str(os.getpid()) + os.linesep)
if opts.pdb:
failure.startDebugMode()
def run(self, args, opts):
"""
Entry point for running commands
"""
raise NotImplementedError
class BaseRunSpiderCommand(ScrapyCommand):
"""
Common class used to share functionality between the crawl, parse and runspider commands
"""
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE", action="append",
help="append scraped items to the end of FILE (use - for stdout)")
parser.add_option("-O", "--overwrite-output", metavar="FILE", action="append",
help="dump scraped items into FILE, overwriting any existing file")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items")
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
if opts.output or opts.overwrite_output:
feeds = feed_process_params_from_cli(
self.settings,
opts.output,
opts.output_format,
opts.overwrite_output,
)
self.settings.set('FEEDS', feeds, priority='cmdline')
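New subcommands plug in either through the scrapy.commands entry-point group or through the COMMANDS_MODULE setting; _get_commands_dict() in cmdline.py maps them by module name. A minimal sketch of a custom command, assuming a hypothetical project package myproject with COMMANDS_MODULE = 'myproject.commands' in its settings:

# myproject/commands/hello.py -- invoked as "scrapy hello"
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def syntax(self):
        return "[options]"

    def short_desc(self):
        return "Print the project bot name"

    def run(self, args, opts):
        # self.settings is assigned by scrapy.cmdline before run() is called
        print(self.settings['BOT_NAME'])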

View file

@@ -0,0 +1,58 @@
import sys
import time
import subprocess
from urllib.parse import urlencode
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor
class Command(ScrapyCommand):
default_settings = {
'LOG_LEVEL': 'INFO',
'LOGSTATS_INTERVAL': 1,
'CLOSESPIDER_TIMEOUT': 10,
}
def short_desc(self):
return "Run quick benchmark test"
def run(self, args, opts):
with _BenchServer():
self.crawler_process.crawl(_BenchSpider, total=100000)
self.crawler_process.start()
class _BenchServer:
def __enter__(self):
from scrapy.utils.test import get_testenv
pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE,
env=get_testenv())
self.proc.stdout.readline()
def __exit__(self, exc_type, exc_value, traceback):
self.proc.kill()
self.proc.wait()
time.sleep(0.2)
class _BenchSpider(scrapy.Spider):
"""A spider that follows all links"""
name = 'follow'
total = 10000
show = 20
baseurl = 'http://localhost:8998'
link_extractor = LinkExtractor()
def start_requests(self):
qargs = {'total': self.total, 'show': self.show}
url = f'{self.baseurl}?{urlencode(qargs, doseq=1)}'
return [scrapy.Request(url, dont_filter=True)]
def parse(self, response):
for link in self.link_extractor.extract_links(response):
yield scrapy.Request(link.url, callback=self.parse)

View file

@@ -0,0 +1,96 @@
import time
from collections import defaultdict
from unittest import TextTestRunner, TextTestResult as _TextTestResult
from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.misc import load_object, set_environ
from scrapy.utils.conf import build_component_list
class TextTestResult(_TextTestResult):
def printSummary(self, start, stop):
write = self.stream.write
writeln = self.stream.writeln
run = self.testsRun
plural = "s" if run != 1 else ""
writeln(self.separator2)
writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
writeln()
infos = []
if not self.wasSuccessful():
write("FAILED")
failed, errored = map(len, (self.failures, self.errors))
if failed:
infos.append(f"failures={failed}")
if errored:
infos.append(f"errors={errored}")
else:
write("OK")
if infos:
writeln(f" ({', '.join(infos)})")
else:
write("\n")
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
def syntax(self):
return "[options] <spider>"
def short_desc(self):
return "Check spider contracts"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-l", "--list", dest="list", action="store_true",
help="only list contracts, without checking them")
parser.add_option("-v", "--verbose", dest="verbose", default=False, action='store_true',
help="print contract tests for all spiders")
def run(self, args, opts):
# load contracts
contracts = build_component_list(self.settings.getwithbase('SPIDER_CONTRACTS'))
conman = ContractsManager(load_object(c) for c in contracts)
runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)
# contract requests
contract_reqs = defaultdict(list)
spider_loader = self.crawler_process.spider_loader
with set_environ(SCRAPY_CHECK='true'):
for spidername in args or spider_loader.list():
spidercls = spider_loader.load(spidername)
spidercls.start_requests = lambda s: conman.from_spider(s, result)
tested_methods = conman.tested_methods_from_spidercls(spidercls)
if opts.list:
for method in tested_methods:
contract_reqs[spidercls.name].append(method)
elif tested_methods:
self.crawler_process.crawl(spidercls)
# start checks
if opts.list:
for spider, methods in sorted(contract_reqs.items()):
if not methods and not opts.verbose:
continue
print(spider)
for method in sorted(methods):
print(f' * {method}')
else:
start = time.time()
self.crawler_process.start()
stop = time.time()
result.printErrors()
result.printSummary(start, stop)
self.exitcode = int(not result.wasSuccessful())
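scrapy check builds its test requests from contract annotations in spider callback docstrings (see scrapy/contracts later in this commit). A sketch of a callback this command would exercise; the spider, URL and CSS selectors are illustrative:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'

    def parse(self, response):
        """ Contracts live in the docstring parsed by `scrapy check`.
        @url http://quotes.toscrape.com/
        @returns items 1 16
        @returns requests 0 0
        @scrapes text author
        """
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }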

View file

@@ -0,0 +1,33 @@
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
class Command(BaseRunSpiderCommand):
requires_project = True
def syntax(self):
return "[options] <spider>"
def short_desc(self):
return "Run a spider"
def run(self, args, opts):
if len(args) < 1:
raise UsageError()
elif len(args) > 1:
raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
spname = args[0]
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
self.exitcode = 1
else:
self.crawler_process.start()
if (
self.crawler_process.bootstrap_failed
or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
):
self.exitcode = 1

View file

@@ -0,0 +1,39 @@
import sys
import os
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
def syntax(self):
return "<spider>"
def short_desc(self):
return "Edit spider"
def long_desc(self):
return ("Edit a spider using the editor defined in the EDITOR environment"
" variable or else the EDITOR setting")
def _err(self, msg):
sys.stderr.write(msg + os.linesep)
self.exitcode = 1
def run(self, args, opts):
if len(args) != 1:
raise UsageError()
editor = self.settings['EDITOR']
try:
spidercls = self.crawler_process.spider_loader.load(args[0])
except KeyError:
return self._err(f"Spider not found: {args[0]}")
sfile = sys.modules[spidercls.__module__].__file__
sfile = sfile.replace('.pyc', '.py')
self.exitcode = os.system(f'{editor} "{sfile}"')

View file

@@ -0,0 +1,70 @@
import sys
from w3lib.url import is_url
from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
class Command(ScrapyCommand):
requires_project = False
def syntax(self):
return "[options] <url>"
def short_desc(self):
return "Fetch a URL using the Scrapy downloader"
def long_desc(self):
return (
"Fetch a URL using the Scrapy downloader and print its content"
" to stdout. You may want to use --nolog to disable logging"
)
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--spider", dest="spider", help="use this spider")
parser.add_option("--headers", dest="headers", action="store_true",
help="print response HTTP headers instead of body")
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
help="do not handle HTTP 3xx status codes and print response as-is")
def _print_headers(self, headers, prefix):
for key, values in headers.items():
for value in values:
self._print_bytes(prefix + b' ' + key + b': ' + value)
def _print_response(self, response, opts):
if opts.headers:
self._print_headers(response.request.headers, b'>')
print('>')
self._print_headers(response.headers, b'<')
else:
self._print_bytes(response.body)
def _print_bytes(self, bytes_):
sys.stdout.buffer.write(bytes_ + b'\n')
def run(self, args, opts):
if len(args) != 1 or not is_url(args[0]):
raise UsageError()
request = Request(args[0], callback=self._print_response,
cb_kwargs={"opts": opts}, dont_filter=True)
# by default, let the framework handle redirects,
# i.e. the command handles all codes except 3xx
if not opts.no_redirect:
request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
else:
request.meta['handle_httpstatus_all'] = True
spidercls = DefaultSpider
spider_loader = self.crawler_process.spider_loader
if opts.spider:
spidercls = spider_loader.load(opts.spider)
else:
spidercls = spidercls_for_request(spider_loader, request, spidercls)
self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
self.crawler_process.start()

View file

@@ -0,0 +1,149 @@
import os
import shutil
import string
from importlib import import_module
from os.path import join, dirname, abspath, exists, splitext
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError
def sanitize_module_name(module_name):
"""Sanitize the given module name, by replacing dashes and points
with underscores and prefixing it with a letter if it doesn't start
with one
"""
module_name = module_name.replace('-', '_').replace('.', '_')
if module_name[0] not in string.ascii_letters:
module_name = "a" + module_name
return module_name
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False}
def syntax(self):
return "[options] <name> <domain>"
def short_desc(self):
return "Generate new spider using pre-defined templates"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-l", "--list", dest="list", action="store_true",
help="List available templates")
parser.add_option("-e", "--edit", dest="edit", action="store_true",
help="Edit spider after creating it")
parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
help="Dump template to standard output")
parser.add_option("-t", "--template", dest="template", default="basic",
help="Uses a custom template.")
parser.add_option("--force", dest="force", action="store_true",
help="If the spider already exists, overwrite it with the template")
def run(self, args, opts):
if opts.list:
self._list_templates()
return
if opts.dump:
template_file = self._find_template(opts.dump)
if template_file:
with open(template_file, "r") as f:
print(f.read())
return
if len(args) != 2:
raise UsageError()
name, domain = args[0:2]
module = sanitize_module_name(name)
if self.settings.get('BOT_NAME') == module:
print("Cannot create a spider with the same name as your project")
return
if not opts.force and self._spider_exists(name):
return
template_file = self._find_template(opts.template)
if template_file:
self._genspider(module, name, domain, opts.template, template_file)
if opts.edit:
self.exitcode = os.system(f'scrapy edit "{name}"')
def _genspider(self, module, name, domain, template_name, template_file):
"""Generate the spider module, based on the given template"""
capitalized_module = ''.join(s.capitalize() for s in module.split('_'))
tvars = {
'project_name': self.settings.get('BOT_NAME'),
'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
'module': module,
'name': name,
'domain': domain,
'classname': f'{capitalized_module}Spider'
}
if self.settings.get('NEWSPIDER_MODULE'):
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
spiders_dir = abspath(dirname(spiders_module.__file__))
else:
spiders_module = None
spiders_dir = "."
spider_file = f"{join(spiders_dir, module)}.py"
shutil.copyfile(template_file, spider_file)
render_templatefile(spider_file, **tvars)
print(f"Created spider {name!r} using template {template_name!r} ",
end=('' if spiders_module else '\n'))
if spiders_module:
print(f"in module:\n {spiders_module.__name__}.{module}")
def _find_template(self, template):
template_file = join(self.templates_dir, f'{template}.tmpl')
if exists(template_file):
return template_file
print(f"Unable to find template: {template}\n")
print('Use "scrapy genspider --list" to see all available templates.')
def _list_templates(self):
print("Available templates:")
for filename in sorted(os.listdir(self.templates_dir)):
if filename.endswith('.tmpl'):
print(f" {splitext(filename)[0]}")
def _spider_exists(self, name):
if not self.settings.get('NEWSPIDER_MODULE'):
# if run as a standalone command and a file with the same name already exists
if exists(name + ".py"):
print(f"{abspath(name + '.py')} already exists")
return True
return False
try:
spidercls = self.crawler_process.spider_loader.load(name)
except KeyError:
pass
else:
# if spider with same name exists
print(f"Spider {name!r} already exists in module:")
print(f" {spidercls.__module__}")
return True
# a file with the same name exists in the target directory
spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
spiders_dir = dirname(spiders_module.__file__)
spiders_dir_abs = abspath(spiders_dir)
if exists(join(spiders_dir_abs, name + ".py")):
print(f"{join(spiders_dir_abs, (name + '.py'))} already exists")
return True
return False
@property
def templates_dir(self):
return join(
self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
'spiders'
)
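For reference, a spider generated from the default 'basic' template looks roughly like this (a sketch of the rendered template; the actual .tmpl files ship under scrapy/templates/spiders and are not part of this excerpt):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        pass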

View file

@@ -0,0 +1,14 @@
from scrapy.commands import ScrapyCommand
class Command(ScrapyCommand):
requires_project = True
default_settings = {'LOG_ENABLED': False}
def short_desc(self):
return "List available spiders"
def run(self, args, opts):
for s in sorted(self.crawler_process.spider_loader.list()):
print(s)

View file

@@ -0,0 +1,256 @@
import json
import logging
from itemadapter import is_item, ItemAdapter
from w3lib.url import is_url
from scrapy.commands import BaseRunSpiderCommand
from scrapy.http import Request
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError
logger = logging.getLogger(__name__)
class Command(BaseRunSpiderCommand):
requires_project = True
spider = None
items = {}
requests = {}
first_response = None
def syntax(self):
return "[options] <url>"
def short_desc(self):
return "Parse URL (using its spider) and print the results"
def add_options(self, parser):
BaseRunSpiderCommand.add_options(self, parser)
parser.add_option("--spider", dest="spider", default=None,
help="use this spider without looking for one")
parser.add_option("--pipelines", action="store_true",
help="process items through pipelines")
parser.add_option("--nolinks", dest="nolinks", action="store_true",
help="don't show links to follow (extracted requests)")
parser.add_option("--noitems", dest="noitems", action="store_true",
help="don't show scraped items")
parser.add_option("--nocolour", dest="nocolour", action="store_true",
help="avoid using pygments to colorize the output")
parser.add_option("-r", "--rules", dest="rules", action="store_true",
help="use CrawlSpider rules to discover the callback")
parser.add_option("-c", "--callback", dest="callback",
help="use this callback for parsing, instead looking for a callback")
parser.add_option("-m", "--meta", dest="meta",
help="inject extra meta into the Request, it must be a valid raw json string")
parser.add_option("--cbkwargs", dest="cbkwargs",
help="inject extra callback kwargs into the Request, it must be a valid raw json string")
parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
help="maximum depth for parsing requests [default: %default]")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
help="print each depth level one by one")
@property
def max_level(self):
max_items, max_requests = 0, 0
if self.items:
max_items = max(self.items)
if self.requests:
max_requests = max(self.requests)
return max(max_items, max_requests)
def add_items(self, lvl, new_items):
old_items = self.items.get(lvl, [])
self.items[lvl] = old_items + new_items
def add_requests(self, lvl, new_reqs):
old_reqs = self.requests.get(lvl, [])
self.requests[lvl] = old_reqs + new_reqs
def print_items(self, lvl=None, colour=True):
if lvl is None:
items = [item for lst in self.items.values() for item in lst]
else:
items = self.items.get(lvl, [])
print("# Scraped Items ", "-" * 60)
display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
def print_requests(self, lvl=None, colour=True):
if lvl is None:
if self.requests:
requests = self.requests[max(self.requests)]
else:
requests = []
else:
requests = self.requests.get(lvl, [])
print("# Requests ", "-" * 65)
display.pprint(requests, colorize=colour)
def print_results(self, opts):
colour = not opts.nocolour
if opts.verbose:
for level in range(1, self.max_level + 1):
print(f'\n>>> DEPTH LEVEL: {level} <<<')
if not opts.noitems:
self.print_items(level, colour)
if not opts.nolinks:
self.print_requests(level, colour)
else:
print(f'\n>>> STATUS DEPTH LEVEL {self.max_level} <<<')
if not opts.noitems:
self.print_items(colour=colour)
if not opts.nolinks:
self.print_requests(colour=colour)
def run_callback(self, response, callback, cb_kwargs=None):
cb_kwargs = cb_kwargs or {}
items, requests = [], []
for x in iterate_spider_output(callback(response, **cb_kwargs)):
if is_item(x):
items.append(x)
elif isinstance(x, Request):
requests.append(x)
return items, requests
def get_callback_from_rules(self, spider, response):
if getattr(spider, 'rules', None):
for rule in spider.rules:
if rule.link_extractor.matches(response.url):
return rule.callback or "parse"
else:
logger.error('No CrawlSpider rules found in spider %(spider)r, '
'please specify a callback to use for parsing',
{'spider': spider.name})
def set_spidercls(self, url, opts):
spider_loader = self.crawler_process.spider_loader
if opts.spider:
try:
self.spidercls = spider_loader.load(opts.spider)
except KeyError:
logger.error('Unable to find spider: %(spider)s',
{'spider': opts.spider})
else:
self.spidercls = spidercls_for_request(spider_loader, Request(url))
if not self.spidercls:
logger.error('Unable to find spider for: %(url)s', {'url': url})
def _start_requests(spider):
yield self.prepare_request(spider, Request(url), opts)
self.spidercls.start_requests = _start_requests
def start_parsing(self, url, opts):
self.crawler_process.crawl(self.spidercls, **opts.spargs)
self.pcrawler = list(self.crawler_process.crawlers)[0]
self.crawler_process.start()
if not self.first_response:
logger.error('No response downloaded for: %(url)s',
{'url': url})
def prepare_request(self, spider, request, opts):
def callback(response, **cb_kwargs):
# memorize the first response
if not self.first_response:
self.first_response = response
# determine real callback
cb = response.meta['_callback']
if not cb:
if opts.callback:
cb = opts.callback
elif opts.rules and self.first_response == response:
cb = self.get_callback_from_rules(spider, response)
if not cb:
logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
{'url': response.url, 'spider': spider.name})
return
else:
cb = 'parse'
if not callable(cb):
cb_method = getattr(spider, cb, None)
if callable(cb_method):
cb = cb_method
else:
logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
{'callback': cb, 'spider': spider.name})
return
# parse items and requests
depth = response.meta['_depth']
items, requests = self.run_callback(response, cb, cb_kwargs)
if opts.pipelines:
itemproc = self.pcrawler.engine.scraper.itemproc
for item in items:
itemproc.process_item(item, spider)
self.add_items(depth, items)
self.add_requests(depth, requests)
scraped_data = items if opts.output else []
if depth < opts.depth:
for req in requests:
req.meta['_depth'] = depth + 1
req.meta['_callback'] = req.callback
req.callback = callback
scraped_data += requests
return scraped_data
# update request meta if any extra meta was passed through the --meta/-m opts.
if opts.meta:
request.meta.update(opts.meta)
# update cb_kwargs if any extra values were passed through the --cbkwargs option.
if opts.cbkwargs:
request.cb_kwargs.update(opts.cbkwargs)
request.meta['_depth'] = 1
request.meta['_callback'] = request.callback
request.callback = callback
return request
def process_options(self, args, opts):
BaseRunSpiderCommand.process_options(self, args, opts)
self.process_request_meta(opts)
self.process_request_cb_kwargs(opts)
def process_request_meta(self, opts):
if opts.meta:
try:
opts.meta = json.loads(opts.meta)
except ValueError:
raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
"Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)
def process_request_cb_kwargs(self, opts):
if opts.cbkwargs:
try:
opts.cbkwargs = json.loads(opts.cbkwargs)
except ValueError:
raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
"Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)
def run(self, args, opts):
# parse arguments
if not len(args) == 1 or not is_url(args[0]):
raise UsageError()
else:
url = args[0]
# prepare spidercls
self.set_spidercls(url, opts)
if self.spidercls and opts.depth > 0:
self.start_parsing(url, opts)
self.print_results(opts)
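Both -m/--meta and --cbkwargs expect raw JSON strings, as enforced by process_request_meta() and process_request_cb_kwargs() above. A hedged sketch of driving the command programmatically, assuming a project with a spider named 'quotes' (spider, URL and meta values are illustrative):

from scrapy.cmdline import execute

# Parse one page with the spider's parse() callback, follow extracted
# requests one extra depth level, and inject extra meta into the Request.
execute([
    'scrapy', 'parse', 'http://quotes.toscrape.com/',
    '--spider=quotes', '-c', 'parse', '-d', '2',
    '--meta', '{"foo": "bar"}',
])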

View file

@@ -0,0 +1,59 @@
import sys
import os
from importlib import import_module
from scrapy.utils.spider import iter_spider_classes
from scrapy.exceptions import UsageError
from scrapy.commands import BaseRunSpiderCommand
def _import_file(filepath):
abspath = os.path.abspath(filepath)
dirname, file = os.path.split(abspath)
fname, fext = os.path.splitext(file)
if fext not in ('.py', '.pyw'):
raise ValueError(f"Not a Python source file: {abspath}")
if dirname:
sys.path = [dirname] + sys.path
try:
module = import_module(fname)
finally:
if dirname:
sys.path.pop(0)
return module
class Command(BaseRunSpiderCommand):
requires_project = False
default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
def syntax(self):
return "[options] <spider_file>"
def short_desc(self):
return "Run a self-contained spider (without creating a project)"
def long_desc(self):
return "Run the spider defined in the given file"
def run(self, args, opts):
if len(args) != 1:
raise UsageError()
filename = args[0]
if not os.path.exists(filename):
raise UsageError(f"File not found: {filename}\n")
try:
module = _import_file(filename)
except (ImportError, ValueError) as e:
raise UsageError(f"Unable to load {filename!r}: {e}\n")
spclasses = list(iter_spider_classes(module))
if not spclasses:
raise UsageError(f"No spider found in file: {filename}\n")
spidercls = spclasses.pop()
self.crawler_process.crawl(spidercls, **opts.spargs)
self.crawler_process.start()
if self.crawler_process.bootstrap_failed:
self.exitcode = 1
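runspider imports the given file and runs the last spider class defined in it, so the file has to be self-contained. A sketch of such a file (name, URL and settings are illustrative):

# standalone.py -- run with: scrapy runspider standalone.py -O links.json
import scrapy

class StandaloneSpider(scrapy.Spider):
    name = 'standalone'
    start_urls = ['http://quotes.toscrape.com/']
    custom_settings = {'DOWNLOAD_DELAY': 0.5}

    def parse(self, response):
        for href in response.css('a::attr(href)').getall():
            yield {'link': response.urljoin(href)}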

View file

@@ -0,0 +1,47 @@
import json
from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
def syntax(self):
return "[options]"
def short_desc(self):
return "Get settings values"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--get", dest="get", metavar="SETTING",
help="print raw setting value")
parser.add_option("--getbool", dest="getbool", metavar="SETTING",
help="print setting value, interpreted as a boolean")
parser.add_option("--getint", dest="getint", metavar="SETTING",
help="print setting value, interpreted as an integer")
parser.add_option("--getfloat", dest="getfloat", metavar="SETTING",
help="print setting value, interpreted as a float")
parser.add_option("--getlist", dest="getlist", metavar="SETTING",
help="print setting value, interpreted as a list")
def run(self, args, opts):
settings = self.crawler_process.settings
if opts.get:
s = settings.get(opts.get)
if isinstance(s, BaseSettings):
print(json.dumps(s.copy_to_dict()))
else:
print(s)
elif opts.getbool:
print(settings.getbool(opts.getbool))
elif opts.getint:
print(settings.getint(opts.getint))
elif opts.getfloat:
print(settings.getfloat(opts.getfloat))
elif opts.getlist:
print(settings.getlist(opts.getlist))

View file

@@ -0,0 +1,80 @@
"""
Scrapy Shell
See documentation in docs/topics/shell.rst
"""
from threading import Thread
from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.shell import Shell
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import guess_scheme
class Command(ScrapyCommand):
requires_project = False
default_settings = {
'KEEP_ALIVE': True,
'LOGSTATS_INTERVAL': 0,
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
}
def syntax(self):
return "[url|file]"
def short_desc(self):
return "Interactive scraping console"
def long_desc(self):
return ("Interactive console for scraping the given url or file. "
"Use ./file.html syntax or full path for local file.")
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-c", dest="code",
help="evaluate the code in the shell, print the result and exit")
parser.add_option("--spider", dest="spider",
help="use this spider")
parser.add_option("--no-redirect", dest="no_redirect", action="store_true", default=False,
help="do not handle HTTP 3xx status codes and print response as-is")
def update_vars(self, vars):
"""You can use this function to update the Scrapy objects that will be
available in the shell
"""
pass
def run(self, args, opts):
url = args[0] if args else None
if url:
# first argument may be a local file
url = guess_scheme(url)
spider_loader = self.crawler_process.spider_loader
spidercls = DefaultSpider
if opts.spider:
spidercls = spider_loader.load(opts.spider)
elif url:
spidercls = spidercls_for_request(spider_loader, Request(url),
spidercls, log_multiple=True)
# The crawler is created this way since the Shell manually handles the
# crawling engine, so the set up in the crawl method won't work
crawler = self.crawler_process._create_crawler(spidercls)
# The Shell class needs a persistent engine in the crawler
crawler.engine = crawler._create_engine()
crawler.engine.start()
self._start_crawler_thread()
shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
shell.start(url=url, redirect=not opts.no_redirect)
def _start_crawler_thread(self):
t = Thread(target=self.crawler_process.start,
kwargs={'stop_after_crawl': False})
t.daemon = True
t.start()
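update_vars() is an extension hook: a subclass can inject extra objects into the namespace the interactive shell exposes. A hedged sketch; wiring the subclass up (for example through COMMANDS_MODULE) is assumed and not shown here:

import json
from scrapy.commands.shell import Command as ShellCommand

class Command(ShellCommand):
    def update_vars(self, vars):
        # 'vars' is the dict of names made available inside the shell
        vars['loads'] = json.loads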

View file

@@ -0,0 +1,128 @@
import re
import os
import string
from importlib import import_module
from os.path import join, exists, abspath
from shutil import ignore_patterns, move, copy2, copystat
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.template import render_templatefile, string_camelcase
from scrapy.exceptions import UsageError
TEMPLATES_TO_RENDER = (
('scrapy.cfg',),
('${project_name}', 'settings.py.tmpl'),
('${project_name}', 'items.py.tmpl'),
('${project_name}', 'pipelines.py.tmpl'),
('${project_name}', 'middlewares.py.tmpl'),
)
IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')
def _make_writable(path):
current_permissions = os.stat(path).st_mode
os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)
class Command(ScrapyCommand):
requires_project = False
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
def syntax(self):
return "<project_name> [project_dir]"
def short_desc(self):
return "Create new project"
def _is_valid_name(self, project_name):
def _module_exists(module_name):
try:
import_module(module_name)
return True
except ImportError:
return False
if not re.search(r'^[_a-zA-Z]\w*$', project_name):
print('Error: Project names must begin with a letter and contain'
' only\nletters, numbers and underscores')
elif _module_exists(project_name):
print(f'Error: Module {project_name!r} already exists')
else:
return True
return False
def _copytree(self, src, dst):
"""
Since the original shutil.copytree always creates the destination directory,
a simplified copy of it is used here so that an existing directory can be
reused.
More info at:
https://github.com/scrapy/scrapy/pull/2005
"""
ignore = IGNORE
names = os.listdir(src)
ignored_names = ignore(src, names)
if not os.path.exists(dst):
os.makedirs(dst)
for name in names:
if name in ignored_names:
continue
srcname = os.path.join(src, name)
dstname = os.path.join(dst, name)
if os.path.isdir(srcname):
self._copytree(srcname, dstname)
else:
copy2(srcname, dstname)
_make_writable(dstname)
copystat(src, dst)
_make_writable(dst)
def run(self, args, opts):
if len(args) not in (1, 2):
raise UsageError()
project_name = args[0]
project_dir = args[0]
if len(args) == 2:
project_dir = args[1]
if exists(join(project_dir, 'scrapy.cfg')):
self.exitcode = 1
print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
return
if not self._is_valid_name(project_name):
self.exitcode = 1
return
self._copytree(self.templates_dir, abspath(project_dir))
move(join(project_dir, 'module'), join(project_dir, project_name))
for paths in TEMPLATES_TO_RENDER:
path = join(*paths)
tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name))
render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name))
print(f"New Scrapy project '{project_name}', using template directory "
f"'{self.templates_dir}', created in:")
print(f" {abspath(project_dir)}\n")
print("You can start your first spider with:")
print(f" cd {project_dir}")
print(" scrapy genspider example example.com")
@property
def templates_dir(self):
return join(
self.settings['TEMPLATES_DIR'] or join(scrapy.__path__[0], 'templates'),
'project'
)

View file

@@ -0,0 +1,29 @@
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.utils.versions import scrapy_components_versions
class Command(ScrapyCommand):
default_settings = {'LOG_ENABLED': False,
'SPIDER_LOADER_WARN_ONLY': True}
def syntax(self):
return "[-v]"
def short_desc(self):
return "Print Scrapy version"
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
help="also display twisted/python/platform info (useful for bug reports)")
def run(self, args, opts):
if opts.verbose:
versions = scrapy_components_versions()
width = max(len(n) for (n, _) in versions)
for name, version in versions:
print(f"{name:<{width}} : {version}")
else:
print(f"Scrapy {scrapy.__version__}")

View file

@@ -0,0 +1,18 @@
from scrapy.commands import fetch
from scrapy.utils.response import open_in_browser
class Command(fetch.Command):
def short_desc(self):
return "Open URL in browser, as seen by Scrapy"
def long_desc(self):
return "Fetch a URL using the Scrapy downloader and show its contents in a browser"
def add_options(self, parser):
super().add_options(parser)
parser.remove_option("--headers")
def _print_response(self, response, opts):
open_in_browser(response)

View file

@@ -0,0 +1,179 @@
import sys
import re
from functools import wraps
from inspect import getmembers
from unittest import TestCase
from scrapy.http import Request
from scrapy.utils.spider import iterate_spider_output
from scrapy.utils.python import get_spec
class ContractsManager:
contracts = {}
def __init__(self, contracts):
for contract in contracts:
self.contracts[contract.name] = contract
def tested_methods_from_spidercls(self, spidercls):
is_method = re.compile(r"^\s*@", re.MULTILINE).search
methods = []
for key, value in getmembers(spidercls):
if callable(value) and value.__doc__ and is_method(value.__doc__):
methods.append(key)
return methods
def extract_contracts(self, method):
contracts = []
for line in method.__doc__.split('\n'):
line = line.strip()
if line.startswith('@'):
name, args = re.match(r'@(\w+)\s*(.*)', line).groups()
args = re.split(r'\s+', args)
contracts.append(self.contracts[name](method, *args))
return contracts
def from_spider(self, spider, results):
requests = []
for method in self.tested_methods_from_spidercls(type(spider)):
bound_method = spider.__getattribute__(method)
try:
requests.append(self.from_method(bound_method, results))
except Exception:
case = _create_testcase(bound_method, 'contract')
results.addError(case, sys.exc_info())
return requests
def from_method(self, method, results):
contracts = self.extract_contracts(method)
if contracts:
request_cls = Request
for contract in contracts:
if contract.request_cls is not None:
request_cls = contract.request_cls
# calculate request args
args, kwargs = get_spec(request_cls.__init__)
# Don't filter requests to allow
# testing different callbacks on the same URL.
kwargs['dont_filter'] = True
kwargs['callback'] = method
for contract in contracts:
kwargs = contract.adjust_request_args(kwargs)
args.remove('self')
# check if all positional arguments are defined in kwargs
if set(args).issubset(set(kwargs)):
request = request_cls(**kwargs)
# execute pre and post hooks in order
for contract in reversed(contracts):
request = contract.add_pre_hook(request, results)
for contract in contracts:
request = contract.add_post_hook(request, results)
self._clean_req(request, method, results)
return request
def _clean_req(self, request, method, results):
""" stop the request from returning objects and records any errors """
cb = request.callback
@wraps(cb)
def cb_wrapper(response, **cb_kwargs):
try:
output = cb(response, **cb_kwargs)
output = list(iterate_spider_output(output))
except Exception:
case = _create_testcase(method, 'callback')
results.addError(case, sys.exc_info())
def eb_wrapper(failure):
case = _create_testcase(method, 'errback')
exc_info = failure.type, failure.value, failure.getTracebackObject()
results.addError(case, exc_info)
request.callback = cb_wrapper
request.errback = eb_wrapper
class Contract:
""" Abstract class for contracts """
request_cls = None
def __init__(self, method, *args):
self.testcase_pre = _create_testcase(method, f'@{self.name} pre-hook')
self.testcase_post = _create_testcase(method, f'@{self.name} post-hook')
self.args = args
def add_pre_hook(self, request, results):
if hasattr(self, 'pre_process'):
cb = request.callback
@wraps(cb)
def wrapper(response, **cb_kwargs):
try:
results.startTest(self.testcase_pre)
self.pre_process(response)
results.stopTest(self.testcase_pre)
except AssertionError:
results.addFailure(self.testcase_pre, sys.exc_info())
except Exception:
results.addError(self.testcase_pre, sys.exc_info())
else:
results.addSuccess(self.testcase_pre)
finally:
return list(iterate_spider_output(cb(response, **cb_kwargs)))
request.callback = wrapper
return request
def add_post_hook(self, request, results):
if hasattr(self, 'post_process'):
cb = request.callback
@wraps(cb)
def wrapper(response, **cb_kwargs):
output = list(iterate_spider_output(cb(response, **cb_kwargs)))
try:
results.startTest(self.testcase_post)
self.post_process(output)
results.stopTest(self.testcase_post)
except AssertionError:
results.addFailure(self.testcase_post, sys.exc_info())
except Exception:
results.addError(self.testcase_post, sys.exc_info())
else:
results.addSuccess(self.testcase_post)
finally:
return output
request.callback = wrapper
return request
def adjust_request_args(self, args):
return args
def _create_testcase(method, desc):
spider = method.__self__.name
class ContractTestCase(TestCase):
def __str__(_self):
return f"[{spider}] {method.__name__} ({desc})"
name = f'{spider}_{method.__name__}'
setattr(ContractTestCase, name, lambda x: x)
return ContractTestCase(name)
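Contract is designed for subclassing: adjust_request_args() tweaks the keyword arguments the manager builds the Request from, while pre_process/post_process hooks (wrapped by add_pre_hook/add_post_hook above) run against the response and the callback output. A hedged sketch of a custom contract; the class, its name and the SPIDER_CONTRACTS wiring are assumptions, not part of this commit:

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail

class HasHeaderContract(Contract):
    """ Check that the response carries the given headers, e.g.:
    @has_header Content-Type
    """
    name = 'has_header'

    def pre_process(self, response):
        for header in self.args:
            if header not in response.headers:
                raise ContractFail(f"Missing response header: {header}")

# settings.py (the path is hypothetical):
# SPIDER_CONTRACTS = {'myproject.contracts.HasHeaderContract': 10}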

View file

@@ -0,0 +1,108 @@
import json
from itemadapter import is_item, ItemAdapter
from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request
# contracts
class UrlContract(Contract):
""" Contract to set the url of the request (mandatory)
@url http://scrapy.org
"""
name = 'url'
def adjust_request_args(self, args):
args['url'] = self.args[0]
return args
class CallbackKeywordArgumentsContract(Contract):
""" Contract to set the keyword arguments for the request.
The value should be a JSON-encoded dictionary, e.g.:
@cb_kwargs {"arg1": "some value"}
"""
name = 'cb_kwargs'
def adjust_request_args(self, args):
args['cb_kwargs'] = json.loads(' '.join(self.args))
return args
class ReturnsContract(Contract):
""" Contract to check the output of a callback
general form:
@returns request(s)/item(s) [min=1 [max]]
e.g.:
@returns request
@returns request 2
@returns request 2 10
@returns request 0 10
"""
name = 'returns'
object_type_verifiers = {
'request': lambda x: isinstance(x, Request),
'requests': lambda x: isinstance(x, Request),
'item': is_item,
'items': is_item,
}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if len(self.args) not in [1, 2, 3]:
raise ValueError(
f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
)
self.obj_name = self.args[0] or None
self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
try:
self.min_bound = int(self.args[1])
except IndexError:
self.min_bound = 1
try:
self.max_bound = int(self.args[2])
except IndexError:
self.max_bound = float('inf')
def post_process(self, output):
occurrences = 0
for x in output:
if self.obj_type_verifier(x):
occurrences += 1
assertion = (self.min_bound <= occurrences <= self.max_bound)
if not assertion:
if self.min_bound == self.max_bound:
expected = self.min_bound
else:
expected = f'{self.min_bound}..{self.max_bound}'
raise ContractFail(f"Returned {occurrences} {self.obj_name}, expected {expected}")
class ScrapesContract(Contract):
""" Contract to check presence of fields in scraped items
@scrapes page_name page_body
"""
name = 'scrapes'
def post_process(self, output):
for x in output:
if is_item(x):
missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
if missing:
missing_fields = ", ".join(missing)
raise ContractFail(f"Missing fields: {missing_fields}")

View file

@@ -0,0 +1,3 @@
"""
Scrapy core library classes and functions.
"""

View file

@@ -0,0 +1,201 @@
import random
from time import time
from datetime import datetime
from collections import deque
from twisted.internet import defer, task
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.httpobj import urlparse_cached
from scrapy.resolver import dnscache
from scrapy import signals
from scrapy.core.downloader.middleware import DownloaderMiddlewareManager
from scrapy.core.downloader.handlers import DownloadHandlers
class Slot:
"""Downloader slot"""
def __init__(self, concurrency, delay, randomize_delay):
self.concurrency = concurrency
self.delay = delay
self.randomize_delay = randomize_delay
self.active = set()
self.queue = deque()
self.transferring = set()
self.lastseen = 0
self.latercall = None
def free_transfer_slots(self):
return self.concurrency - len(self.transferring)
def download_delay(self):
if self.randomize_delay:
return random.uniform(0.5 * self.delay, 1.5 * self.delay)
return self.delay
def close(self):
if self.latercall and self.latercall.active():
self.latercall.cancel()
def __repr__(self):
cls_name = self.__class__.__name__
return (f"{cls_name}(concurrency={self.concurrency!r}, "
f"delay={self.delay:.2f}, "
f"randomize_delay={self.randomize_delay!r})")
def __str__(self):
return (
f"<downloader.Slot concurrency={self.concurrency!r} "
f"delay={self.delay:.2f} randomize_delay={self.randomize_delay!r} "
f"len(active)={len(self.active)} len(queue)={len(self.queue)} "
f"len(transferring)={len(self.transferring)} "
f"lastseen={datetime.fromtimestamp(self.lastseen).isoformat()}>"
)
def _get_concurrency_delay(concurrency, spider, settings):
delay = settings.getfloat('DOWNLOAD_DELAY')
if hasattr(spider, 'download_delay'):
delay = spider.download_delay
if hasattr(spider, 'max_concurrent_requests'):
concurrency = spider.max_concurrent_requests
return concurrency, delay
class Downloader:
DOWNLOAD_SLOT = 'download_slot'
def __init__(self, crawler):
self.settings = crawler.settings
self.signals = crawler.signals
self.slots = {}
self.active = set()
self.handlers = DownloadHandlers(crawler)
self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
self._slot_gc_loop = task.LoopingCall(self._slot_gc)
self._slot_gc_loop.start(60)
def fetch(self, request, spider):
def _deactivate(response):
self.active.remove(request)
return response
self.active.add(request)
dfd = self.middleware.download(self._enqueue_request, request, spider)
return dfd.addBoth(_deactivate)
def needs_backout(self):
return len(self.active) >= self.total_concurrency
def _get_slot(self, request, spider):
key = self._get_slot_key(request, spider)
if key not in self.slots:
conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency
conc, delay = _get_concurrency_delay(conc, spider, self.settings)
self.slots[key] = Slot(conc, delay, self.randomize_delay)
return key, self.slots[key]
def _get_slot_key(self, request, spider):
if self.DOWNLOAD_SLOT in request.meta:
return request.meta[self.DOWNLOAD_SLOT]
key = urlparse_cached(request).hostname or ''
if self.ip_concurrency:
key = dnscache.get(key, key)
return key
def _enqueue_request(self, request, spider):
key, slot = self._get_slot(request, spider)
request.meta[self.DOWNLOAD_SLOT] = key
def _deactivate(response):
slot.active.remove(request)
return response
slot.active.add(request)
self.signals.send_catch_log(signal=signals.request_reached_downloader,
request=request,
spider=spider)
deferred = defer.Deferred().addBoth(_deactivate)
slot.queue.append((request, deferred))
self._process_queue(spider, slot)
return deferred
def _process_queue(self, spider, slot):
from twisted.internet import reactor
if slot.latercall and slot.latercall.active():
return
# Delay queue processing if a download_delay is configured
now = time()
delay = slot.download_delay()
if delay:
penalty = delay - now + slot.lastseen
if penalty > 0:
slot.latercall = reactor.callLater(penalty, self._process_queue, spider, slot)
return
# Process enqueued requests if there are free slots to transfer for this slot
while slot.queue and slot.free_transfer_slots() > 0:
slot.lastseen = now
request, deferred = slot.queue.popleft()
dfd = self._download(slot, request, spider)
dfd.chainDeferred(deferred)
# prevent burst if inter-request delays were configured
if delay:
self._process_queue(spider, slot)
break
def _download(self, slot, request, spider):
# The order is very important for the following deferreds. Do not change!
# 1. Create the download deferred
dfd = mustbe_deferred(self.handlers.download_request, request, spider)
# 2. Notify response_downloaded listeners about the recent download
# before querying queue for next request
def _downloaded(response):
self.signals.send_catch_log(signal=signals.response_downloaded,
response=response,
request=request,
spider=spider)
return response
dfd.addCallback(_downloaded)
# 3. After response arrives, remove the request from transferring
# state to free up the transferring slot so it can be used by the
# following requests (perhaps those which came from the downloader
# middleware itself)
slot.transferring.add(request)
def finish_transferring(_):
slot.transferring.remove(request)
self._process_queue(spider, slot)
self.signals.send_catch_log(signal=signals.request_left_downloader,
request=request,
spider=spider)
return _
return dfd.addBoth(finish_transferring)
def close(self):
self._slot_gc_loop.stop()
for slot in self.slots.values():
slot.close()
def _slot_gc(self, age=60):
mintime = time() - age
for key, slot in list(self.slots.items()):
if not slot.active and slot.lastseen + slot.delay < mintime:
self.slots.pop(key).close()
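_get_concurrency_delay() lets a spider override the global throttling settings: a download_delay attribute replaces DOWNLOAD_DELAY and max_concurrent_requests replaces the per-slot concurrency. A minimal sketch (spider name and URL are illustrative):

import scrapy

class PoliteSpider(scrapy.Spider):
    name = 'polite'
    start_urls = ['http://quotes.toscrape.com/']

    # Read by Downloader._get_slot() through _get_concurrency_delay();
    # with RANDOMIZE_DOWNLOAD_DELAY enabled, Slot.download_delay() jitters
    # the delay between 0.5x and 1.5x of this value.
    download_delay = 2.0
    max_concurrent_requests = 1

    def parse(self, response):
        pass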

View file

@@ -0,0 +1,94 @@
from OpenSSL import SSL
from twisted.internet.ssl import optionsForClientTLS, CertificateOptions, platformTrust, AcceptableCiphers
from twisted.web.client import BrowserLikePolicyForHTTPS
from twisted.web.iweb import IPolicyForHTTPS
from zope.interface.declarations import implementer
from scrapy.core.downloader.tls import ScrapyClientTLSOptions, DEFAULT_CIPHERS
@implementer(IPolicyForHTTPS)
class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
"""
Non-peer-certificate verifying HTTPS context factory
Default OpenSSL method is TLS_METHOD (also called SSLv23_METHOD)
which allows TLS protocol negotiation
'A TLS/SSL connection established with [this method] may
understand the SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols.'
"""
def __init__(self, method=SSL.SSLv23_METHOD, tls_verbose_logging=False, tls_ciphers=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self._ssl_method = method
self.tls_verbose_logging = tls_verbose_logging
if tls_ciphers:
self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
else:
self.tls_ciphers = DEFAULT_CIPHERS
@classmethod
def from_settings(cls, settings, method=SSL.SSLv23_METHOD, *args, **kwargs):
tls_verbose_logging = settings.getbool('DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING')
tls_ciphers = settings['DOWNLOADER_CLIENT_TLS_CIPHERS']
return cls(method=method, tls_verbose_logging=tls_verbose_logging, tls_ciphers=tls_ciphers, *args, **kwargs)
def getCertificateOptions(self):
# setting verify=True will require you to provide CAs
# to verify against; in other words: it's not that simple
# backward-compatible SSL/TLS method:
#
# * this will respect `method` attribute in often recommended
# `ScrapyClientContextFactory` subclass
# (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
#
# * getattr() for `_ssl_method` attribute for context factories
# not calling super().__init__
return CertificateOptions(
verify=False,
method=getattr(self, 'method', getattr(self, '_ssl_method', None)),
fixBrokenPeers=True,
acceptableCiphers=self.tls_ciphers,
)
# kept for old-style HTTP/1.0 downloader context twisted calls,
# e.g. connectSSL()
def getContext(self, hostname=None, port=None):
return self.getCertificateOptions().getContext()
def creatorForNetloc(self, hostname, port):
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext(),
verbose_logging=self.tls_verbose_logging)
@implementer(IPolicyForHTTPS)
class BrowserLikeContextFactory(ScrapyClientContextFactory):
"""
Twisted-recommended context factory for web clients.
Quoting the documentation of the :class:`~twisted.web.client.Agent` class:
The default is to use a
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS`, so unless you
have special requirements you can leave this as-is.
:meth:`creatorForNetloc` is the same as
:class:`~twisted.web.client.BrowserLikePolicyForHTTPS` except this context
factory allows setting the TLS/SSL method to use.
The default OpenSSL method is ``TLS_METHOD`` (also called
``SSLv23_METHOD``) which allows TLS protocol negotiation.
"""
def creatorForNetloc(self, hostname, port):
# trustRoot set to platformTrust() will use the platform's root CAs.
#
# This means that a website like https://www.cacert.org will be rejected
# by default, since CAcert.org CA certificate is seldom shipped.
return optionsForClientTLS(
hostname=hostname.decode("ascii"),
trustRoot=platformTrust(),
extraCertificateOptions={'method': self._ssl_method},
)
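The factory in use is selected by the DOWNLOADER_CLIENTCONTEXTFACTORY setting (not shown in this excerpt), and from_settings() above reads two further knobs. A hedged settings.py sketch; the values are examples, not defaults:

# settings.py
# Verify server certificates against the platform trust store:
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.BrowserLikeContextFactory'

# Consumed by ScrapyClientContextFactory.from_settings():
DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = True
DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT:!aNULL:!eNULL'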

View file

@@ -0,0 +1,81 @@
"""Download handlers for different schemes"""
import logging
from twisted.internet import defer
from scrapy import signals
from scrapy.exceptions import NotConfigured, NotSupported
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import without_none_values
logger = logging.getLogger(__name__)
class DownloadHandlers:
def __init__(self, crawler):
self._crawler = crawler
self._schemes = {} # stores acceptable schemes on instancing
self._handlers = {} # stores instanced handlers for schemes
self._notconfigured = {} # remembers failed handlers
handlers = without_none_values(
crawler.settings.getwithbase('DOWNLOAD_HANDLERS'))
for scheme, clspath in handlers.items():
self._schemes[scheme] = clspath
self._load_handler(scheme, skip_lazy=True)
crawler.signals.connect(self._close, signals.engine_stopped)
def _get_handler(self, scheme):
"""Lazy-load the downloadhandler for a scheme
only on the first request for that scheme.
"""
if scheme in self._handlers:
return self._handlers[scheme]
if scheme in self._notconfigured:
return None
if scheme not in self._schemes:
self._notconfigured[scheme] = 'no handler available for that scheme'
return None
return self._load_handler(scheme)
def _load_handler(self, scheme, skip_lazy=False):
path = self._schemes[scheme]
try:
dhcls = load_object(path)
if skip_lazy and getattr(dhcls, 'lazy', True):
return None
dh = create_instance(
objcls=dhcls,
settings=self._crawler.settings,
crawler=self._crawler,
)
except NotConfigured as ex:
self._notconfigured[scheme] = str(ex)
return None
except Exception as ex:
logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
{"clspath": path, "scheme": scheme},
exc_info=True, extra={'crawler': self._crawler})
self._notconfigured[scheme] = str(ex)
return None
else:
self._handlers[scheme] = dh
return dh
def download_request(self, request, spider):
scheme = urlparse_cached(request).scheme
handler = self._get_handler(scheme)
if not handler:
raise NotSupported(f"Unsupported URL scheme '{scheme}': {self._notconfigured[scheme]}")
return handler.download_request(request, spider)
@defer.inlineCallbacks
def _close(self, *_a, **_kw):
for dh in self._handlers.values():
if hasattr(dh, 'close'):
yield dh.close()
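# Usage sketch: DOWNLOAD_HANDLERS (read via getwithbase above) maps URL schemes
# to handler class paths. Handlers whose class sets `lazy = False` are
# instantiated eagerly at startup; the rest are loaded on the first request for
# their scheme. A hypothetical registration of a custom handler (the dotted
# path is made up for illustration):
#
#     DOWNLOAD_HANDLERS = {
#         "gopher": "myproject.handlers.GopherDownloadHandler",
#     }
#
# Mapping a scheme to None disables it, since the dict is filtered through
# without_none_values above.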

View file

@@ -0,0 +1,22 @@
from w3lib.url import parse_data_uri
from scrapy.http import TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
class DataURIDownloadHandler:
lazy = False
@defers
def download_request(self, request, spider):
uri = parse_data_uri(request.url)
respcls = responsetypes.from_mimetype(uri.media_type)
resp_kwargs = {}
if (issubclass(respcls, TextResponse)
and uri.media_type.split('/')[0] == 'text'):
charset = uri.media_type_parameters.get('charset')
resp_kwargs['encoding'] = charset
return respcls(url=request.url, body=uri.data, **resp_kwargs)

View file

@@ -0,0 +1,16 @@
from w3lib.url import file_uri_to_path
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
class FileDownloadHandler:
lazy = False
@defers
def download_request(self, request, spider):
filepath = file_uri_to_path(request.url)
with open(filepath, 'rb') as fo:
body = fo.read()
respcls = responsetypes.from_args(filename=filepath, body=body)
return respcls(url=request.url, body=body)

View file

@@ -0,0 +1,119 @@
"""
An asynchronous FTP file download handler for Scrapy that emulates an HTTP response.
FTP connection parameters are passed using the request meta field:
- ftp_user (required)
- ftp_password (required)
- ftp_passive (by default, enabled) sets FTP connection passive mode
- ftp_local_filename
- If not given, the file data is returned in response.body, as in a normal Scrapy Response,
which implies that the entire file is kept in memory.
- If given, the file data is saved to a local file with the given name.
This helps to avoid memory issues when downloading very big files. In addition, for
convenience, the local file name is also given in the response body.
The status of the built HTTP response is, by default,
- 200 in case of success
- 404 in case the specified file was not found on the server (FTP code 550)
or the corresponding FTP exception is raised otherwise.
The mapping from FTP command return codes to HTTP response codes is defined in the
CODE_MAPPING attribute of the handler class. The key 'default' is used for any code
that is not explicitly present among the map keys. You may need to override this
mapping if you want a different behaviour than the default.
In case of a status 200 response, response.headers will include two keys:
'Local Filename' - the value of the local filename, if given
'Size' - the size of the downloaded data
"""
import re
from io import BytesIO
from urllib.parse import unquote
from twisted.internet.protocol import ClientCreator, Protocol
from twisted.protocols.ftp import CommandFailed, FTPClient
from scrapy.http import Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
class ReceivedDataProtocol(Protocol):
def __init__(self, filename=None):
self.__filename = filename
self.body = open(filename, "wb") if filename else BytesIO()
self.size = 0
def dataReceived(self, data):
self.body.write(data)
self.size += len(data)
@property
def filename(self):
return self.__filename
def close(self):
self.body.close() if self.filename else self.body.seek(0)
_CODE_RE = re.compile(r"\d+")
class FTPDownloadHandler:
lazy = False
CODE_MAPPING = {
"550": 404,
"default": 503,
}
def __init__(self, settings):
self.default_user = settings['FTP_USER']
self.default_password = settings['FTP_PASSWORD']
self.passive_mode = settings['FTP_PASSIVE_MODE']
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def download_request(self, request, spider):
from twisted.internet import reactor
parsed_url = urlparse_cached(request)
user = request.meta.get("ftp_user", self.default_user)
password = request.meta.get("ftp_password", self.default_password)
passive_mode = 1 if bool(request.meta.get("ftp_passive",
self.passive_mode)) else 0
creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode)
dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
def gotClient(self, client, request, filepath):
self.client = client
protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
return client.retrieveFile(filepath, protocol).addCallbacks(
callback=self._build_response,
callbackArgs=(request, protocol),
errback=self._failed,
errbackArgs=(request,),
)
def _build_response(self, result, request, protocol):
self.result = result
respcls = responsetypes.from_args(url=request.url)
protocol.close()
body = protocol.filename or protocol.body.read()
headers = {"local filename": protocol.filename or '', "size": protocol.size}
return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
def _failed(self, result, request):
message = result.getErrorMessage()
if result.type == CommandFailed:
m = _CODE_RE.search(message)
if m:
ftpcode = m.group()
httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
return Response(url=request.url, status=httpcode, body=to_bytes(message))
raise result.type(result.value)

View file

@@ -0,0 +1,4 @@
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
from scrapy.core.downloader.handlers.http11 import (
HTTP11DownloadHandler as HTTPDownloadHandler,
)

View file

@@ -0,0 +1,37 @@
"""Download handlers for http and https schemes
"""
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_unicode
class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
self.HTTPClientFactory = load_object(settings['DOWNLOADER_HTTPCLIENTFACTORY'])
self.ClientContextFactory = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
self._settings = settings
self._crawler = crawler
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings, crawler)
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
factory = self.HTTPClientFactory(request)
self._connect(factory)
return factory.deferred
def _connect(self, factory):
from twisted.internet import reactor
host, port = to_unicode(factory.host), factory.port
if factory.scheme == b'https':
client_context_factory = create_instance(
objcls=self.ClientContextFactory,
settings=self._settings,
crawler=self._crawler,
)
return reactor.connectSSL(host, port, factory, client_context_factory)
else:
return reactor.connectTCP(host, port, factory)
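# Usage sketch: this HTTP/1.0 handler is not the default (the http module above
# aliases the HTTP/1.1 handler as HTTPDownloadHandler); a project could opt into
# it per scheme via DOWNLOAD_HANDLERS, reusing the import path shown earlier:
#
#     DOWNLOAD_HANDLERS = {
#         "http": "scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler",
#         "https": "scrapy.core.downloader.handlers.http10.HTTP10DownloadHandler",
#     }
#
# DOWNLOADER_HTTPCLIENTFACTORY and DOWNLOADER_CLIENTCONTEXTFACTORY (read in
# __init__ above) then control which factory classes this handler uses.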

View file

@@ -0,0 +1,568 @@
"""Download handlers for http and https schemes"""
import ipaddress
import logging
import re
import warnings
from contextlib import suppress
from io import BytesIO
from time import time
from urllib.parse import urldefrag
from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
from twisted.web.client import Agent, HTTPConnectionPool, ResponseDone, ResponseFailed, URI
from twisted.web.http import _DataLoss, PotentialDataLoss
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import IBodyProducer, UNKNOWN_LENGTH
from zope.interface import implementer
from scrapy import signals
from scrapy.core.downloader.tls import openssl_methods
from scrapy.core.downloader.webclient import _parse
from scrapy.exceptions import ScrapyDeprecationWarning, StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import to_bytes, to_unicode
logger = logging.getLogger(__name__)
class HTTP11DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
self._crawler = crawler
from twisted.internet import reactor
self._pool = HTTPConnectionPool(reactor, persistent=True)
self._pool.maxPersistentPerHost = settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
self._pool._factory.noisy = False
self._sslMethod = openssl_methods[settings.get('DOWNLOADER_CLIENT_TLS_METHOD')]
self._contextFactoryClass = load_object(settings['DOWNLOADER_CLIENTCONTEXTFACTORY'])
# try method-aware context factory
try:
self._contextFactory = create_instance(
objcls=self._contextFactoryClass,
settings=settings,
crawler=crawler,
method=self._sslMethod,
)
except TypeError:
# use context factory defaults
self._contextFactory = create_instance(
objcls=self._contextFactoryClass,
settings=settings,
crawler=crawler,
)
msg = f"""
'{settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]}' does not accept `method` \
argument (type OpenSSL.SSL method, e.g. OpenSSL.SSL.SSLv23_METHOD) and/or \
`tls_verbose_logging` argument and/or `tls_ciphers` argument.\
Please upgrade your context factory class to handle them or ignore them."""
warnings.warn(msg)
self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
self._disconnect_timeout = 1
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings, crawler)
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
agent = ScrapyAgent(
contextFactory=self._contextFactory,
pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
fail_on_dataloss=self._fail_on_dataloss,
crawler=self._crawler,
)
return agent.download_request(request)
def close(self):
from twisted.internet import reactor
d = self._pool.closeCachedConnections()
# closeCachedConnections will hang on network or server issues, so
# we'll manually timeout the deferred.
#
# Twisted issue addressing this problem can be found here:
# https://twistedmatrix.com/trac/ticket/7738.
#
# closeCachedConnections doesn't handle external errbacks, so we'll
# issue a callback after `_disconnect_timeout` seconds.
delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])
def cancel_delayed_call(result):
if delayed_call.active():
delayed_call.cancel()
return result
d.addBoth(cancel_delayed_call)
return d
class TunnelError(Exception):
"""An HTTP CONNECT tunnel could not be established by the proxy."""
class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
"""An endpoint that tunnels through proxies to allow HTTPS downloads. To
accomplish that, this endpoint sends an HTTP CONNECT to the proxy.
The HTTP CONNECT is always sent when using this endpoint; this could
be improved, as the CONNECT is redundant if the connection associated
with this endpoint comes from the pool and a CONNECT has already been issued
for it.
"""
_responseMatcher = re.compile(br'HTTP/1\.. (?P<status>\d{3})(?P<reason>.{,32})')
def __init__(self, reactor, host, port, proxyConf, contextFactory, timeout=30, bindAddress=None):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
self._tunnelReadyDeferred = defer.Deferred()
self._tunneledHost = host
self._tunneledPort = port
self._contextFactory = contextFactory
self._connectBuffer = bytearray()
def requestTunnel(self, protocol):
"""Asks the proxy to open a tunnel."""
tunnelReq = tunnel_request_data(self._tunneledHost, self._tunneledPort, self._proxyAuthHeader)
protocol.transport.write(tunnelReq)
self._protocolDataReceived = protocol.dataReceived
protocol.dataReceived = self.processProxyResponse
self._protocol = protocol
return protocol
def processProxyResponse(self, rcvd_bytes):
"""Processes the response from the proxy. If the tunnel is successfully
created, notifies the client that we are ready to send requests. If not
raises a TunnelError.
"""
self._connectBuffer += rcvd_bytes
# make sure that enough (all) bytes are consumed
# and that we've got all HTTP headers (ending with a blank line)
# from the proxy so that we don't send those bytes to the TLS layer
#
# see https://github.com/scrapy/scrapy/issues/2491
if b'\r\n\r\n' not in self._connectBuffer:
return
self._protocol.dataReceived = self._protocolDataReceived
respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
if respm and int(respm.group('status')) == 200:
# set proper Server Name Indication extension
sslOptions = self._contextFactory.creatorForNetloc(self._tunneledHost, self._tunneledPort)
self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
self._tunnelReadyDeferred.callback(self._protocol)
else:
if respm:
extra = {'status': int(respm.group('status')),
'reason': respm.group('reason').strip()}
else:
extra = rcvd_bytes[:32]
self._tunnelReadyDeferred.errback(
TunnelError('Could not open CONNECT tunnel with proxy '
f'{self._host}:{self._port} [{extra!r}]')
)
def connectFailed(self, reason):
"""Propagates the errback to the appropriate deferred."""
self._tunnelReadyDeferred.errback(reason)
def connect(self, protocolFactory):
self._protocolFactory = protocolFactory
connectDeferred = super().connect(protocolFactory)
connectDeferred.addCallback(self.requestTunnel)
connectDeferred.addErrback(self.connectFailed)
return self._tunnelReadyDeferred
def tunnel_request_data(host, port, proxy_auth_header=None):
r"""
Return binary content of a CONNECT request.
>>> from scrapy.utils.python import to_unicode as s
>>> s(tunnel_request_data("example.com", 8080))
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
>>> s(tunnel_request_data("example.com", 8080, b"123"))
'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
>>> s(tunnel_request_data(b"example.com", "8090"))
'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
"""
host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
tunnel_req += b'Host: ' + host_value + b'\r\n'
if proxy_auth_header:
tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
tunnel_req += b'\r\n'
return tunnel_req
class TunnelingAgent(Agent):
"""An agent that uses a L{TunnelingTCP4ClientEndpoint} to make HTTPS
downloads. It may look strange that we have chosen to subclass Agent and not
ProxyAgent but consider that after the tunnel is opened the proxy is
transparent to the client; thus the agent should behave like there is no
proxy involved.
"""
def __init__(self, reactor, proxyConf, contextFactory=None,
connectTimeout=None, bindAddress=None, pool=None):
super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf = proxyConf
self._contextFactory = contextFactory
def _getEndpoint(self, uri):
return TunnelingTCP4ClientEndpoint(
reactor=self._reactor,
host=uri.host,
port=uri.port,
proxyConf=self._proxyConf,
contextFactory=self._contextFactory,
timeout=self._endpointFactory._connectTimeout,
bindAddress=self._endpointFactory._bindAddress,
)
def _requestWithEndpoint(self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath):
# proxy host and port are required for HTTP pool `key`
# otherwise, same remote host connection request could reuse
# a cached tunneled connection to a different proxy
key = key + self._proxyConf
return super()._requestWithEndpoint(
key=key,
endpoint=endpoint,
method=method,
parsedURI=parsedURI,
headers=headers,
bodyProducer=bodyProducer,
requestPath=requestPath,
)
class ScrapyProxyAgent(Agent):
def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None):
super().__init__(
reactor=reactor,
connectTimeout=connectTimeout,
bindAddress=bindAddress,
pool=pool,
)
self._proxyURI = URI.fromBytes(proxyURI)
def request(self, method, uri, headers=None, bodyProducer=None):
"""
Issue a new request via the configured proxy.
"""
# Cache *all* connections under the same key, since we are only
# connecting to a single destination, the proxy:
return self._requestWithEndpoint(
key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
endpoint=self._getEndpoint(self._proxyURI),
method=method,
parsedURI=URI.fromBytes(uri),
headers=headers,
bodyProducer=bodyProducer,
requestPath=uri,
)
class ScrapyAgent:
_Agent = Agent
_ProxyAgent = ScrapyProxyAgent
_TunnelingAgent = TunnelingAgent
def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=None, pool=None,
maxsize=0, warnsize=0, fail_on_dataloss=True, crawler=None):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
self._pool = pool
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._txresponse = None
self._crawler = crawler
def _get_agent(self, request, timeout):
from twisted.internet import reactor
bindaddress = request.meta.get('bindaddress') or self._bindAddress
proxy = request.meta.get('proxy')
if proxy:
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
scheme = _parse(request.url)[0]
proxyHost = to_unicode(proxyHost)
omitConnectTunnel = b'noconnect' in proxyParams
if omitConnectTunnel:
warnings.warn("Using HTTPS proxies in the noconnect mode is deprecated. "
"If you use Crawlera, it doesn't require this mode anymore, "
"so you should update scrapy-crawlera to 1.3.0+ "
"and remove '?noconnect' from the Crawlera URL.",
ScrapyDeprecationWarning)
if scheme == b'https' and not omitConnectTunnel:
proxyAuth = request.headers.get(b'Proxy-Authorization', None)
proxyConf = (proxyHost, proxyPort, proxyAuth)
return self._TunnelingAgent(
reactor=reactor,
proxyConf=proxyConf,
contextFactory=self._contextFactory,
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
else:
return self._ProxyAgent(
reactor=reactor,
proxyURI=to_bytes(proxy, encoding='ascii'),
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
return self._Agent(
reactor=reactor,
contextFactory=self._contextFactory,
connectTimeout=timeout,
bindAddress=bindaddress,
pool=self._pool,
)
def download_request(self, request):
from twisted.internet import reactor
timeout = request.meta.get('download_timeout') or self._connectTimeout
agent = self._get_agent(request, timeout)
# request details
url = urldefrag(request.url)[0]
method = to_bytes(request.method)
headers = TxHeaders(request.headers)
if isinstance(agent, self._TunnelingAgent):
headers.removeHeader(b'Proxy-Authorization')
if request.body:
bodyproducer = _RequestBodyProducer(request.body)
else:
bodyproducer = None
start_time = time()
d = agent.request(method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
# set download latency
d.addCallback(self._cb_latency, request, start_time)
# response body is ready to be consumed
d.addCallback(self._cb_bodyready, request)
d.addCallback(self._cb_bodydone, request, url)
# check download timeout
self._timeout_cl = reactor.callLater(timeout, d.cancel)
d.addBoth(self._cb_timeout, request, url, timeout)
return d
def _cb_timeout(self, result, request, url, timeout):
if self._timeout_cl.active():
self._timeout_cl.cancel()
return result
# needed for HTTPS requests, otherwise _ResponseReader doesn't
# receive connectionLost()
if self._txresponse:
self._txresponse._transport.stopProducing()
raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
def _cb_latency(self, result, request, start_time):
request.meta['download_latency'] = time() - start_time
return result
def _cb_bodyready(self, txresponse, request):
# deliverBody hangs for responses without body
if txresponse.length == 0:
return {
"txresponse": txresponse,
"body": b"",
"flags": None,
"certificate": None,
"ip_address": None,
}
maxsize = request.meta.get('download_maxsize', self._maxsize)
warnsize = request.meta.get('download_warnsize', self._warnsize)
expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
fail_on_dataloss = request.meta.get('download_fail_on_dataloss', self._fail_on_dataloss)
if maxsize and expected_size > maxsize:
warning_msg = ("Cancelling download of %(url)s: expected response "
"size (%(size)s) larger than download max size (%(maxsize)s).")
warning_args = {'url': request.url, 'size': expected_size, 'maxsize': maxsize}
logger.warning(warning_msg, warning_args)
txresponse._transport._producer.loseConnection()
raise defer.CancelledError(warning_msg % warning_args)
if warnsize and expected_size > warnsize:
logger.warning("Expected response size (%(size)s) larger than "
"download warn size (%(warnsize)s) in request %(request)s.",
{'size': expected_size, 'warnsize': warnsize, 'request': request})
def _cancel(_):
# Abort connection immediately.
txresponse._transport._producer.abortConnection()
d = defer.Deferred(_cancel)
txresponse.deliverBody(
_ResponseReader(
finished=d,
txresponse=txresponse,
request=request,
maxsize=maxsize,
warnsize=warnsize,
fail_on_dataloss=fail_on_dataloss,
crawler=self._crawler,
)
)
# save response for timeouts
self._txresponse = txresponse
return d
def _cb_bodydone(self, result, request, url):
headers = Headers(result["txresponse"].headers.getAllRawHeaders())
respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
response = respcls(
url=url,
status=int(result["txresponse"].code),
headers=headers,
body=result["body"],
flags=result["flags"],
certificate=result["certificate"],
ip_address=result["ip_address"],
)
if result.get("failure"):
result["failure"].value.response = response
return result["failure"]
return response
@implementer(IBodyProducer)
class _RequestBodyProducer:
def __init__(self, body):
self.body = body
self.length = len(body)
def startProducing(self, consumer):
consumer.write(self.body)
return defer.succeed(None)
def pauseProducing(self):
pass
def stopProducing(self):
pass
class _ResponseReader(protocol.Protocol):
def __init__(self, finished, txresponse, request, maxsize, warnsize, fail_on_dataloss, crawler):
self._finished = finished
self._txresponse = txresponse
self._request = request
self._bodybuf = BytesIO()
self._maxsize = maxsize
self._warnsize = warnsize
self._fail_on_dataloss = fail_on_dataloss
self._fail_on_dataloss_warned = False
self._reached_warnsize = False
self._bytes_received = 0
self._certificate = None
self._ip_address = None
self._crawler = crawler
def _finish_response(self, flags=None, failure=None):
self._finished.callback({
"txresponse": self._txresponse,
"body": self._bodybuf.getvalue(),
"flags": flags,
"certificate": self._certificate,
"ip_address": self._ip_address,
"failure": failure,
})
def connectionMade(self):
if self._certificate is None:
with suppress(AttributeError):
self._certificate = ssl.Certificate(self.transport._producer.getPeerCertificate())
if self._ip_address is None:
self._ip_address = ipaddress.ip_address(self.transport._producer.getPeer().host)
def dataReceived(self, bodyBytes):
# This may be called several times after cancel was called, with buffered data.
if self._finished.called:
return
self._bodybuf.write(bodyBytes)
self._bytes_received += len(bodyBytes)
bytes_received_result = self._crawler.signals.send_catch_log(
signal=signals.bytes_received,
data=bodyBytes,
request=self._request,
spider=self._crawler.spider,
)
for handler, result in bytes_received_result:
if isinstance(result, Failure) and isinstance(result.value, StopDownload):
logger.debug("Download stopped for %(request)s from signal handler %(handler)s",
{"request": self._request, "handler": handler.__qualname__})
self.transport._producer.loseConnection()
failure = result if result.value.fail else None
self._finish_response(flags=["download_stopped"], failure=failure)
if self._maxsize and self._bytes_received > self._maxsize:
logger.warning("Received (%(bytes)s) bytes larger than download "
"max size (%(maxsize)s) in request %(request)s.",
{'bytes': self._bytes_received,
'maxsize': self._maxsize,
'request': self._request})
# Clear buffer earlier to avoid keeping data in memory for a long time.
self._bodybuf.truncate(0)
self._finished.cancel()
if self._warnsize and self._bytes_received > self._warnsize and not self._reached_warnsize:
self._reached_warnsize = True
logger.warning("Received more bytes than download "
"warn size (%(warnsize)s) in request %(request)s.",
{'warnsize': self._warnsize,
'request': self._request})
def connectionLost(self, reason):
if self._finished.called:
return
if reason.check(ResponseDone):
self._finish_response()
return
if reason.check(PotentialDataLoss):
self._finish_response(flags=["partial"])
return
if reason.check(ResponseFailed) and any(r.check(_DataLoss) for r in reason.value.reasons):
if not self._fail_on_dataloss:
self._finish_response(flags=["dataloss"])
return
elif not self._fail_on_dataloss_warned:
logger.warning("Got data loss in %s. If you want to process broken "
"responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
" -- This message won't be shown in further requests",
self._txresponse.request.absoluteURI.decode())
self._fail_on_dataloss_warned = True
self._finished.errback(reason)
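# Usage sketch: the per-request knobs consumed above come from Request.meta;
# a hypothetical request exercising them (all values are placeholders):
#
#     from scrapy import Request
#
#     request = Request(
#         url="https://example.com/big-file",
#         meta={
#             "download_timeout": 30,                     # seconds before the deferred is cancelled
#             "download_maxsize": 10 * 1024 * 1024,       # hard limit: download is aborted
#             "download_warnsize": 1024 * 1024,           # soft limit: only a warning is logged
#             "download_fail_on_dataloss": False,         # tolerate truncated responses
#             "proxy": "http://proxy.example.com:8080",   # routed via ScrapyProxyAgent/TunnelingAgent
#         },
#     )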

View file

@@ -0,0 +1,82 @@
from urllib.parse import unquote
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.exceptions import NotConfigured
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import create_instance
class S3DownloadHandler:
def __init__(self, settings, *,
crawler=None,
aws_access_key_id=None, aws_secret_access_key=None,
httpdownloadhandler=HTTPDownloadHandler, **kw):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
if not aws_access_key_id:
aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
if not aws_secret_access_key:
aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
# If no credentials could be found anywhere,
# consider this an anonymous connection request by default;
# unless 'anon' was set explicitly (True/False).
anon = kw.get('anon')
if anon is None and not aws_access_key_id and not aws_secret_access_key:
kw['anon'] = True
self.anon = kw.get('anon')
self._signer = None
import botocore.auth
import botocore.credentials
kw.pop('anon', None)
if kw:
raise TypeError(f'Unexpected keyword arguments: {kw}')
if not self.anon:
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
self._signer = SignerCls(botocore.credentials.Credentials(
aws_access_key_id, aws_secret_access_key))
_http_handler = create_instance(
objcls=httpdownloadhandler,
settings=settings,
crawler=crawler,
)
self._download_http = _http_handler.download_request
@classmethod
def from_crawler(cls, crawler, **kwargs):
return cls(crawler.settings, crawler=crawler, **kwargs)
def download_request(self, request, spider):
p = urlparse_cached(request)
scheme = 'https' if request.meta.get('is_secure') else 'http'
bucket = p.hostname
path = p.path + '?' + p.query if p.query else p.path
url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
if self.anon:
request = request.replace(url=url)
elif self._signer is not None:
import botocore.awsrequest
awsrequest = botocore.awsrequest.AWSRequest(
method=request.method,
url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
headers=request.headers.to_unicode_dict(),
data=request.body)
self._signer.add_auth(awsrequest)
request = request.replace(
url=url, headers=awsrequest.headers.items())
else:
signed_headers = self.conn.make_request(
method=request.method,
bucket=bucket,
key=unquote(p.path),
query_args=unquote(p.query),
headers=request.headers,
data=request.body,
)
request = request.replace(url=url, headers=signed_headers)
return self._download_http(request, spider)
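# Usage sketch: credentials fall back to the AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY settings read above; with neither present the handler
# behaves as an anonymous client. Assuming the handler is registered for the
# "s3" scheme in DOWNLOAD_HANDLERS, usage could look like this (placeholders):
#
#     # settings.py
#     AWS_ACCESS_KEY_ID = "AKIA..."        # placeholder value
#     AWS_SECRET_ACCESS_KEY = "..."        # placeholder value
#
#     # in a spider
#     from scrapy import Request
#     request = Request("s3://mybucket/path/key.json")
#
# download_request above rewrites such URLs to <bucket>.s3.amazonaws.com
# (https only when meta["is_secure"] is set) and signs them with botocore
# when credentials are available.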

View file

@@ -0,0 +1,84 @@
"""
Downloader Middleware manager
See documentation in docs/topics/downloader-middleware.rst
"""
from twisted.internet import defer
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
from scrapy.middleware import MiddlewareManager
from scrapy.utils.defer import mustbe_deferred, deferred_from_coro
from scrapy.utils.conf import build_component_list
class DownloaderMiddlewareManager(MiddlewareManager):
component_name = 'downloader middleware'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(
settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
def _add_middleware(self, mw):
if hasattr(mw, 'process_request'):
self.methods['process_request'].append(mw.process_request)
if hasattr(mw, 'process_response'):
self.methods['process_response'].appendleft(mw.process_response)
if hasattr(mw, 'process_exception'):
self.methods['process_exception'].appendleft(mw.process_exception)
def download(self, download_func, request, spider):
@defer.inlineCallbacks
def process_request(request):
for method in self.methods['process_request']:
response = yield deferred_from_coro(method(request=request, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_request must return None, Response or "
f"Request, got {response.__class__.__name__}"
)
if response:
return response
return (yield download_func(request=request, spider=spider))
@defer.inlineCallbacks
def process_response(response):
if response is None:
raise TypeError("Received None in process_response")
elif isinstance(response, Request):
return response
for method in self.methods['process_response']:
response = yield deferred_from_coro(method(request=request, response=response, spider=spider))
if not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_response must return Response or Request, "
f"got {type(response)}"
)
if isinstance(response, Request):
return response
return response
@defer.inlineCallbacks
def process_exception(failure):
exception = failure.value
for method in self.methods['process_exception']:
response = yield deferred_from_coro(method(request=request, exception=exception, spider=spider))
if response is not None and not isinstance(response, (Response, Request)):
raise _InvalidOutput(
f"Middleware {method.__self__.__class__.__name__}"
".process_exception must return None, Response or "
f"Request, got {type(response)}"
)
if response:
return response
return failure
deferred = mustbe_deferred(process_request, request)
deferred.addErrback(process_exception)
deferred.addCallback(process_response)
return deferred
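# Usage sketch: a downloader middleware only needs to implement some of the
# three hooks wired up in _add_middleware above. A hypothetical example,
# enabled through DOWNLOADER_MIDDLEWARES (dotted path and priority are
# placeholders):
#
#     from time import time
#
#     class TimingMiddleware:
#         def process_request(self, request, spider):
#             request.meta["_started"] = time()  # returning None continues the chain
#
#         def process_response(self, request, response, spider):
#             elapsed = time() - request.meta["_started"]
#             spider.logger.debug("%s took %.2fs", request.url, elapsed)
#             return response  # must return a Response or a Request
#
#     DOWNLOADER_MIDDLEWARES = {"myproject.middlewares.TimingMiddleware": 543}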

View file

@@ -0,0 +1,78 @@
import logging
from OpenSSL import SSL
from service_identity.exceptions import CertificateError
from twisted.internet._sslverify import ClientTLSOptions, verifyHostname, VerificationError
from twisted.internet.ssl import AcceptableCiphers
from scrapy.utils.ssl import x509name_to_string, get_temp_key_info
logger = logging.getLogger(__name__)
METHOD_SSLv3 = 'SSLv3'
METHOD_TLS = 'TLS'
METHOD_TLSv10 = 'TLSv1.0'
METHOD_TLSv11 = 'TLSv1.1'
METHOD_TLSv12 = 'TLSv1.2'
openssl_methods = {
METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
METHOD_SSLv3: SSL.SSLv3_METHOD, # SSL 3 (NOT recommended)
METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
METHOD_TLSv11: getattr(SSL, 'TLSv1_1_METHOD', 5), # TLS 1.1 only
METHOD_TLSv12: getattr(SSL, 'TLSv1_2_METHOD', 6), # TLS 1.2 only
}
class ScrapyClientTLSOptions(ClientTLSOptions):
"""
SSL Client connection creator ignoring certificate verification errors
(for genuinely invalid certificates or bugs in verification code).
Same as Twisted's private _sslverify.ClientTLSOptions,
except that VerificationError, CertificateError and ValueError
exceptions are caught, so that the connection is not closed; warnings
are logged instead. Also, logging of HTTPS connection parameters is added.
"""
def __init__(self, hostname, ctx, verbose_logging=False):
super().__init__(hostname, ctx)
self.verbose_logging = verbose_logging
def _identityVerifyingInfoCallback(self, connection, where, ret):
if where & SSL.SSL_CB_HANDSHAKE_START:
connection.set_tlsext_host_name(self._hostnameBytes)
elif where & SSL.SSL_CB_HANDSHAKE_DONE:
if self.verbose_logging:
logger.debug('SSL connection to %s using protocol %s, cipher %s',
self._hostnameASCII,
connection.get_protocol_version_name(),
connection.get_cipher_name(),
)
server_cert = connection.get_peer_certificate()
logger.debug('SSL connection certificate: issuer "%s", subject "%s"',
x509name_to_string(server_cert.get_issuer()),
x509name_to_string(server_cert.get_subject()),
)
key_info = get_temp_key_info(connection._ssl)
if key_info:
logger.debug('SSL temp key: %s', key_info)
try:
verifyHostname(connection, self._hostnameASCII)
except (CertificateError, VerificationError) as e:
logger.warning(
'Remote certificate is not valid for hostname "{}"; {}'.format(
self._hostnameASCII, e))
except ValueError as e:
logger.warning(
'Ignoring error while verifying certificate '
'from host "{}" (exception: {})'.format(
self._hostnameASCII, repr(e)))
DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT')

View file

@@ -0,0 +1,212 @@
from time import time
from urllib.parse import urlparse, urlunparse, urldefrag
from twisted.web.http import HTTPClient
from twisted.internet import defer, reactor
from twisted.internet.protocol import ClientFactory
from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
from scrapy.responsetypes import responsetypes
def _parsed_url_args(parsed):
# Assume parsed is urlparse-d from Request.url,
# which was passed via safe_url_string and is ascii-only.
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
path = to_bytes(path, encoding="ascii")
host = to_bytes(parsed.hostname, encoding="ascii")
port = parsed.port
scheme = to_bytes(parsed.scheme, encoding="ascii")
netloc = to_bytes(parsed.netloc, encoding="ascii")
if port is None:
port = 443 if scheme == b'https' else 80
return scheme, netloc, host, port, path
def _parse(url):
""" Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string
and is ascii-only.
"""
url = url.strip()
parsed = urlparse(url)
return _parsed_url_args(parsed)
class ScrapyHTTPPageGetter(HTTPClient):
delimiter = b'\n'
def connectionMade(self):
self.headers = Headers() # bucket for response headers
# Method command
self.sendCommand(self.factory.method, self.factory.path)
# Headers
for key, values in self.factory.headers.items():
for value in values:
self.sendHeader(key, value)
self.endHeaders()
# Body
if self.factory.body is not None:
self.transport.write(self.factory.body)
def lineReceived(self, line):
return HTTPClient.lineReceived(self, line.rstrip())
def handleHeader(self, key, value):
self.headers.appendlist(key, value)
def handleStatus(self, version, status, message):
self.factory.gotStatus(version, status, message)
def handleEndHeaders(self):
self.factory.gotHeaders(self.headers)
def connectionLost(self, reason):
self._connection_lost_reason = reason
HTTPClient.connectionLost(self, reason)
self.factory.noPage(reason)
def handleResponse(self, response):
if self.factory.method.upper() == b'HEAD':
self.factory.page(b'')
elif self.length is not None and self.length > 0:
self.factory.noPage(self._connection_lost_reason)
else:
self.factory.page(response)
self.transport.loseConnection()
def timeout(self):
self.transport.loseConnection()
# transport cleanup needed for HTTPS connections
if self.factory.url.startswith(b'https'):
self.transport.stopProducing()
self.factory.noPage(
defer.TimeoutError(f"Getting {self.factory.url} took longer "
f"than {self.factory.timeout} seconds."))
# This class used to inherit from Twisted's
# twisted.web.client.HTTPClientFactory. When that class was deprecated in
# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
protocol = ScrapyHTTPPageGetter
waiting = 1
noisy = False
followRedirect = False
afterFoundGet = False
def _build_response(self, body, request):
request.meta['download_latency'] = self.headers_time - self.start_time
status = int(self.status)
headers = Headers(self.response_headers)
respcls = responsetypes.from_args(headers=headers, url=self._url)
return respcls(url=self._url, status=status, headers=headers, body=body)
def _set_connection_attributes(self, request):
parsed = urlparse_cached(request)
self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
proxy = request.meta.get('proxy')
if proxy:
self.scheme, _, self.host, self.port, _ = _parse(proxy)
self.path = self.url
def __init__(self, request, timeout=180):
self._url = urldefrag(request.url)[0]
# converting to bytes to comply with the Twisted interface
self.url = to_bytes(self._url, encoding='ascii')
self.method = to_bytes(request.method, encoding='ascii')
self.body = request.body or None
self.headers = Headers(request.headers)
self.response_headers = None
self.timeout = request.meta.get('download_timeout') or timeout
self.start_time = time()
self.deferred = defer.Deferred().addCallback(self._build_response, request)
# Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
# to have _disconnectedDeferred. See Twisted r32329.
# As Scrapy implements its own redirect handling, there is no need
# to add the _waitForDisconnect callback.
# Specifically, this avoids the AttributeError exception when the
# clientConnectionFailed method is called.
self._disconnectedDeferred = defer.Deferred()
self._set_connection_attributes(request)
# set Host header based on url
self.headers.setdefault('Host', self.netloc)
# set Content-Length based on the length of the body
if self.body is not None:
self.headers['Content-Length'] = len(self.body)
# just in case a broken HTTP/1.1 server decides to keep the connection alive
self.headers.setdefault("Connection", "close")
# Content-Length must be specified in POST method even with no body
elif self.method == b'POST':
self.headers['Content-Length'] = 0
def __repr__(self):
return f"<{self.__class__.__name__}: {self.url}>"
def _cancelTimeout(self, result, timeoutCall):
if timeoutCall.active():
timeoutCall.cancel()
return result
def buildProtocol(self, addr):
p = ClientFactory.buildProtocol(self, addr)
p.followRedirect = self.followRedirect
p.afterFoundGet = self.afterFoundGet
if self.timeout:
timeoutCall = reactor.callLater(self.timeout, p.timeout)
self.deferred.addBoth(self._cancelTimeout, timeoutCall)
return p
def gotHeaders(self, headers):
self.headers_time = time()
self.response_headers = headers
def gotStatus(self, version, status, message):
"""
Set the status of the request on us.
@param version: The HTTP version.
@type version: L{bytes}
@param status: The HTTP status code, an integer represented as a
bytestring.
@type status: L{bytes}
@param message: The HTTP status message.
@type message: L{bytes}
"""
self.version, self.status, self.message = version, status, message
def page(self, page):
if self.waiting:
self.waiting = 0
self.deferred.callback(page)
def noPage(self, reason):
if self.waiting:
self.waiting = 0
self.deferred.errback(reason)
def clientConnectionFailed(self, _, reason):
"""
When a connection attempt fails, the request cannot be issued. If no
result has yet been provided to the result Deferred, provide the
connection failure reason as an error result.
"""
if self.waiting:
self.waiting = 0
# If the connection attempt failed, there is nothing more to
# disconnect, so just fire that Deferred now.
self._disconnectedDeferred.callback(None)
self.deferred.errback(reason)

View file

@@ -0,0 +1,360 @@
"""
This is the Scrapy engine which controls the Scheduler, Downloader and Spiders.
For more information see docs/topics/architecture.rst
"""
import logging
from time import time
from twisted.internet import defer, task
from twisted.python.failure import Failure
from scrapy import signals
from scrapy.core.scraper import Scraper
from scrapy.exceptions import DontCloseSpider
from scrapy.http import Response, Request
from scrapy.utils.misc import load_object
from scrapy.utils.reactor import CallLaterOnce
from scrapy.utils.log import logformatter_adapter, failure_to_exc_info
logger = logging.getLogger(__name__)
class Slot:
def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
self.closing = False
self.inprogress = set() # requests in progress
self.start_requests = iter(start_requests)
self.close_if_idle = close_if_idle
self.nextcall = nextcall
self.scheduler = scheduler
self.heartbeat = task.LoopingCall(nextcall.schedule)
def add_request(self, request):
self.inprogress.add(request)
def remove_request(self, request):
self.inprogress.remove(request)
self._maybe_fire_closing()
def close(self):
self.closing = defer.Deferred()
self._maybe_fire_closing()
return self.closing
def _maybe_fire_closing(self):
if self.closing and not self.inprogress:
if self.nextcall:
self.nextcall.cancel()
if self.heartbeat.running:
self.heartbeat.stop()
self.closing.callback(None)
class ExecutionEngine:
def __init__(self, crawler, spider_closed_callback):
self.crawler = crawler
self.settings = crawler.settings
self.signals = crawler.signals
self.logformatter = crawler.logformatter
self.slot = None
self.spider = None
self.running = False
self.paused = False
self.scheduler_cls = load_object(self.settings['SCHEDULER'])
downloader_cls = load_object(self.settings['DOWNLOADER'])
self.downloader = downloader_cls(crawler)
self.scraper = Scraper(crawler)
self._spider_closed_callback = spider_closed_callback
@defer.inlineCallbacks
def start(self):
"""Start the execution engine"""
if self.running:
raise RuntimeError("Engine already running")
self.start_time = time()
yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
self.running = True
self._closewait = defer.Deferred()
yield self._closewait
def stop(self):
"""Stop the execution engine gracefully"""
if not self.running:
raise RuntimeError("Engine not running")
self.running = False
dfd = self._close_all_spiders()
return dfd.addBoth(lambda _: self._finish_stopping_engine())
def close(self):
"""Close the execution engine gracefully.
If it has already been started, stop it. In all cases, close all spiders
and the downloader.
"""
if self.running:
# Will also close spiders and downloader
return self.stop()
elif self.open_spiders:
# Will also close downloader
return self._close_all_spiders()
else:
return defer.succeed(self.downloader.close())
def pause(self):
"""Pause the execution engine"""
self.paused = True
def unpause(self):
"""Resume the execution engine"""
self.paused = False
def _next_request(self, spider):
slot = self.slot
if not slot:
return
if self.paused:
return
while not self._needs_backout(spider):
if not self._next_request_from_scheduler(spider):
break
if slot.start_requests and not self._needs_backout(spider):
try:
request = next(slot.start_requests)
except StopIteration:
slot.start_requests = None
except Exception:
slot.start_requests = None
logger.error('Error while obtaining start requests',
exc_info=True, extra={'spider': spider})
else:
self.crawl(request, spider)
if self.spider_is_idle(spider) and slot.close_if_idle:
self._spider_idle(spider)
def _needs_backout(self, spider):
slot = self.slot
return (
not self.running
or slot.closing
or self.downloader.needs_backout()
or self.scraper.slot.needs_backout()
)
def _next_request_from_scheduler(self, spider):
slot = self.slot
request = slot.scheduler.next_request()
if not request:
return
d = self._download(request, spider)
d.addBoth(self._handle_downloader_output, request, spider)
d.addErrback(lambda f: logger.info('Error while handling downloader output',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
d.addBoth(lambda _: slot.remove_request(request))
d.addErrback(lambda f: logger.info('Error while removing request from slot',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
d.addBoth(lambda _: slot.nextcall.schedule())
d.addErrback(lambda f: logger.info('Error while scheduling new request',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
return d
def _handle_downloader_output(self, response, request, spider):
if not isinstance(response, (Request, Response, Failure)):
raise TypeError(
"Incorrect type: expected Request, Response or Failure, got "
f"{type(response)}: {response!r}"
)
# downloader middleware can return requests (for example, redirects)
if isinstance(response, Request):
self.crawl(response, spider)
return
# response is a Response or Failure
d = self.scraper.enqueue_scrape(response, request, spider)
d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
return d
def spider_is_idle(self, spider):
if not self.scraper.slot.is_idle():
# scraper is not idle
return False
if self.downloader.active:
# downloader has pending requests
return False
if self.slot.start_requests is not None:
# not all start requests are handled
return False
if self.slot.scheduler.has_pending_requests():
# scheduler has pending requests
return False
return True
@property
def open_spiders(self):
return [self.spider] if self.spider else []
def has_capacity(self):
"""Does the engine have capacity to handle more spiders"""
return not bool(self.slot)
def crawl(self, request, spider):
if spider not in self.open_spiders:
raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
self.schedule(request, spider)
self.slot.nextcall.schedule()
def schedule(self, request, spider):
self.signals.send_catch_log(signals.request_scheduled, request=request, spider=spider)
if not self.slot.scheduler.enqueue_request(request):
self.signals.send_catch_log(signals.request_dropped, request=request, spider=spider)
def download(self, request, spider):
d = self._download(request, spider)
d.addBoth(self._downloaded, self.slot, request, spider)
return d
def _downloaded(self, response, slot, request, spider):
slot.remove_request(request)
return self.download(response, spider) if isinstance(response, Request) else response
def _download(self, request, spider):
slot = self.slot
slot.add_request(request)
def _on_success(response):
if not isinstance(response, (Response, Request)):
raise TypeError(
"Incorrect type: expected Response or Request, got "
f"{type(response)}: {response!r}"
)
if isinstance(response, Response):
if response.request is None:
response.request = request
logkws = self.logformatter.crawled(response.request, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
self.signals.send_catch_log(
signal=signals.response_received,
response=response,
request=response.request,
spider=spider,
)
return response
def _on_complete(_):
slot.nextcall.schedule()
return _
dwld = self.downloader.fetch(request, spider)
dwld.addCallbacks(_on_success)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True):
if not self.has_capacity():
raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
self.slot = slot
self.spider = spider
yield scheduler.open(spider)
yield self.scraper.open_spider(spider)
self.crawler.stats.open_spider(spider)
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
slot.nextcall.schedule()
slot.heartbeat.start(5)
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = self.signals.send_catch_log(signals.spider_idle, spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) for _, x in res):
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
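# Usage sketch: an extension can keep the spider alive by handling spider_idle
# and raising DontCloseSpider, as described in the docstring above. A
# hypothetical example (has_more_work is a made-up helper):
#
#     from scrapy import signals
#     from scrapy.exceptions import DontCloseSpider
#
#     class KeepAlive:
#         @classmethod
#         def from_crawler(cls, crawler):
#             ext = cls()
#             crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
#             return ext
#
#         def spider_idle(self, spider):
#             if has_more_work(spider):
#                 raise DontCloseSpider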
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slot
if slot.closing:
return slot.closing
logger.info("Closing spider (%(reason)s)",
{'reason': reason},
extra={'spider': spider})
dfd = slot.close()
def log_failure(msg):
def errback(failure):
logger.error(
msg,
exc_info=failure_to_exc_info(failure),
extra={'spider': spider}
)
return errback
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log_failure('Scraper close failure'))
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log_failure('Scheduler close failure'))
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed, spider=spider, reason=reason))
dfd.addErrback(log_failure('Error while sending spider_close signal'))
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log_failure('Stats close failure'))
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
{'reason': reason},
extra={'spider': spider}))
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log_failure('Error while unassigning slot'))
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log_failure('Error while unassigning spider'))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self._closewait.callback(None)

View file

@@ -0,0 +1,182 @@
import os
import json
import logging
import warnings
from os.path import join, exists
from queuelib import PriorityQueue
from scrapy.utils.misc import load_object, create_instance
from scrapy.utils.job import job_dir
from scrapy.utils.deprecate import ScrapyDeprecationWarning
logger = logging.getLogger(__name__)
class Scheduler:
"""
Scrapy Scheduler. It allows enqueueing requests and then getting
the next request to download. The Scheduler also handles duplicate
filtering, via the dupefilter.
Prioritization and queueing are not performed by the Scheduler itself.
The user sets the ``priority`` field for each Request, and a PriorityQueue
(defined by :setting:`SCHEDULER_PRIORITY_QUEUE`) uses these priorities
to dequeue requests in the desired order.
The Scheduler uses two PriorityQueue instances, configured to work in memory
and on disk (the latter is optional). When the on-disk queue is present, it is
used by default, and the in-memory queue is used as a fallback for cases where
the disk queue can't handle a request (i.e. can't serialize it).
:setting:`SCHEDULER_MEMORY_QUEUE` and
:setting:`SCHEDULER_DISK_QUEUE` allow specifying the lower-level queue classes
that the PriorityQueue instances are instantiated with, to keep requests
in memory and on disk, respectively.
Overall, the Scheduler is an object which holds several PriorityQueue instances
(in-memory and on-disk) and implements the fallback logic between them.
It also handles dupefilters.
"""
def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
logunser=False, stats=None, pqclass=None, crawler=None):
self.df = dupefilter
self.dqdir = self._dqdir(jobdir)
self.pqclass = pqclass
self.dqclass = dqclass
self.mqclass = mqclass
self.logunser = logunser
self.stats = stats
self.crawler = crawler
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
dupefilter = create_instance(dupefilter_cls, settings, crawler)
pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
if pqclass is PriorityQueue:
warnings.warn("SCHEDULER_PRIORITY_QUEUE='queuelib.PriorityQueue'"
" is no longer supported because of API changes; "
"please use 'scrapy.pqueues.ScrapyPriorityQueue'",
ScrapyDeprecationWarning)
from scrapy.pqueues import ScrapyPriorityQueue
pqclass = ScrapyPriorityQueue
dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
logunser = settings.getbool('SCHEDULER_DEBUG')
return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
mqclass=mqclass, crawler=crawler)
def has_pending_requests(self):
return len(self) > 0
def open(self, spider):
self.spider = spider
self.mqs = self._mq()
self.dqs = self._dq() if self.dqdir else None
return self.df.open()
def close(self, reason):
if self.dqs:
state = self.dqs.close()
self._write_dqs_state(self.dqdir, state)
return self.df.close(reason)
def enqueue_request(self, request):
if not request.dont_filter and self.df.request_seen(request):
self.df.log(request, self.spider)
return False
dqok = self._dqpush(request)
if dqok:
self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
else:
self._mqpush(request)
self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
self.stats.inc_value('scheduler/enqueued', spider=self.spider)
return True
def next_request(self):
request = self.mqs.pop()
if request:
self.stats.inc_value('scheduler/dequeued/memory', spider=self.spider)
else:
request = self._dqpop()
if request:
self.stats.inc_value('scheduler/dequeued/disk', spider=self.spider)
if request:
self.stats.inc_value('scheduler/dequeued', spider=self.spider)
return request
def __len__(self):
return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)
def _dqpush(self, request):
if self.dqs is None:
return
try:
self.dqs.push(request)
except ValueError as e: # non serializable request
if self.logunser:
msg = ("Unable to serialize request: %(request)s - reason:"
" %(reason)s - no more unserializable requests will be"
" logged (stats being collected)")
logger.warning(msg, {'request': request, 'reason': e},
exc_info=True, extra={'spider': self.spider})
self.logunser = False
self.stats.inc_value('scheduler/unserializable',
spider=self.spider)
return
else:
return True
def _mqpush(self, request):
self.mqs.push(request)
def _dqpop(self):
if self.dqs:
return self.dqs.pop()
def _mq(self):
""" Create a new priority queue instance, with in-memory storage """
return create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.mqclass,
key='')
def _dq(self):
""" Create a new priority queue instance, with disk storage """
state = self._read_dqs_state(self.dqdir)
q = create_instance(self.pqclass,
settings=None,
crawler=self.crawler,
downstream_queue_cls=self.dqclass,
key=self.dqdir,
startprios=state)
if q:
logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
{'queuesize': len(q)}, extra={'spider': self.spider})
return q
def _dqdir(self, jobdir):
""" Return a folder name to keep disk queue state at """
if jobdir:
dqdir = join(jobdir, 'requests.queue')
if not exists(dqdir):
os.makedirs(dqdir)
return dqdir
def _read_dqs_state(self, dqdir):
path = join(dqdir, 'active.json')
if not exists(path):
return ()
with open(path) as f:
return json.load(f)
def _write_dqs_state(self, dqdir, state):
with open(join(dqdir, 'active.json'), 'w') as f:
json.dump(state, f)
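The disk queues above only come into play when a job directory is configured. A minimal sketch, assuming an existing project with a spider named 'quotes' (both the name and the path are hypothetical): setting JOBDIR makes job_dir(settings) return a folder, so pending requests are persisted under requests.queue/ plus active.json and the crawl can be resumed after a restart.

# Illustrative only: 'quotes' and the JOBDIR path are assumptions.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('JOBDIR', 'crawls/quotes-run-1')  # enables the scheduler's disk queues

process = CrawlerProcess(settings)
process.crawl('quotes')  # spider name resolved through the project's spider loader
process.start()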

View file

@ -0,0 +1,260 @@
"""This module implements the Scraper component which parses responses and
extracts information from them"""
import logging
from collections import deque
from itemadapter import is_item
from twisted.internet import defer
from twisted.python.failure import Failure
from scrapy import signals
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
from scrapy.http import Request, Response
from scrapy.utils.defer import defer_fail, defer_succeed, iter_errback, parallel
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output
logger = logging.getLogger(__name__)
class Slot:
"""Scraper slot (one per running spider)"""
MIN_RESPONSE_SIZE = 1024
def __init__(self, max_active_size=5000000):
self.max_active_size = max_active_size
self.queue = deque()
self.active = set()
self.active_size = 0
self.itemproc_size = 0
self.closing = None
def add_response_request(self, response, request):
deferred = defer.Deferred()
self.queue.append((response, request, deferred))
if isinstance(response, Response):
self.active_size += max(len(response.body), self.MIN_RESPONSE_SIZE)
else:
self.active_size += self.MIN_RESPONSE_SIZE
return deferred
def next_response_request_deferred(self):
response, request, deferred = self.queue.popleft()
self.active.add(request)
return response, request, deferred
def finish_response(self, response, request):
self.active.remove(request)
if isinstance(response, Response):
self.active_size -= max(len(response.body), self.MIN_RESPONSE_SIZE)
else:
self.active_size -= self.MIN_RESPONSE_SIZE
def is_idle(self):
return not (self.queue or self.active)
def needs_backout(self):
return self.active_size > self.max_active_size
class Scraper:
def __init__(self, crawler):
self.slot = None
self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
self.itemproc = itemproc_cls.from_crawler(crawler)
self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
self.crawler = crawler
self.signals = crawler.signals
self.logformatter = crawler.logformatter
@defer.inlineCallbacks
def open_spider(self, spider):
"""Open the given spider for scraping and allocate resources for it"""
self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
yield self.itemproc.open_spider(spider)
def close_spider(self, spider):
"""Close a spider being scraped and release its resources"""
slot = self.slot
slot.closing = defer.Deferred()
slot.closing.addCallback(self.itemproc.close_spider)
self._check_if_closing(spider, slot)
return slot.closing
def is_idle(self):
"""Return True if there isn't any more spiders to process"""
return not self.slot
def _check_if_closing(self, spider, slot):
if slot.closing and slot.is_idle():
slot.closing.callback(spider)
def enqueue_scrape(self, response, request, spider):
slot = self.slot
dfd = slot.add_response_request(response, request)
def finish_scraping(_):
slot.finish_response(response, request)
self._check_if_closing(spider, slot)
self._scrape_next(spider, slot)
return _
dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
self._scrape_next(spider, slot)
return dfd
def _scrape_next(self, spider, slot):
while slot.queue:
response, request, deferred = slot.next_response_request_deferred()
self._scrape(response, request, spider).chainDeferred(deferred)
def _scrape(self, result, request, spider):
"""
Handle the downloaded response or failure through the spider callback/errback
"""
if not isinstance(result, (Response, Failure)):
raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
dfd = self._scrape2(result, request, spider) # returns spider's processed output
dfd.addErrback(self.handle_spider_error, request, result, spider)
dfd.addCallback(self.handle_spider_output, request, result, spider)
return dfd
def _scrape2(self, result, request, spider):
"""
        Handle the different cases of the request's result being a Response or a Failure
"""
if isinstance(result, Response):
return self.spidermw.scrape_response(self.call_spider, result, request, spider)
else: # result is a Failure
dfd = self.call_spider(result, request, spider)
return dfd.addErrback(self._log_download_errors, result, request, spider)
def call_spider(self, result, request, spider):
if isinstance(result, Response):
if getattr(result, "request", None) is None:
result.request = request
callback = result.request.callback or spider._parse
warn_on_generator_with_return_value(spider, callback)
dfd = defer_succeed(result)
dfd.addCallback(callback, **result.request.cb_kwargs)
else: # result is a Failure
result.request = request
warn_on_generator_with_return_value(spider, request.errback)
dfd = defer_fail(result)
dfd.addErrback(request.errback)
return dfd.addCallback(iterate_spider_output)
def handle_spider_error(self, _failure, request, response, spider):
exc = _failure.value
if isinstance(exc, CloseSpider):
self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
return
logkws = self.logformatter.spider_error(_failure, request, response, spider)
logger.log(
*logformatter_adapter(logkws),
exc_info=failure_to_exc_info(_failure),
extra={'spider': spider}
)
self.signals.send_catch_log(
signal=signals.spider_error,
failure=_failure, response=response,
spider=spider
)
self.crawler.stats.inc_value(
f"spider_exceptions/{_failure.value.__class__.__name__}",
spider=spider
)
def handle_spider_output(self, result, request, response, spider):
if not result:
return defer_succeed(None)
it = iter_errback(result, self.handle_spider_error, request, response, spider)
dfd = parallel(it, self.concurrent_items, self._process_spidermw_output,
request, response, spider)
return dfd
def _process_spidermw_output(self, output, request, response, spider):
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""
if isinstance(output, Request):
self.crawler.engine.crawl(request=output, spider=spider)
elif is_item(output):
self.slot.itemproc_size += 1
dfd = self.itemproc.process_item(output, spider)
dfd.addBoth(self._itemproc_finished, output, response, spider)
return dfd
elif output is None:
pass
else:
typename = type(output).__name__
logger.error(
'Spider must return request, item, or None, got %(typename)r in %(request)s',
{'request': request, 'typename': typename},
extra={'spider': spider},
)
def _log_download_errors(self, spider_failure, download_failure, request, spider):
"""Log and silence errors that come from the engine (typically download
        errors that got propagated through here)
"""
if isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest):
if download_failure.frames:
logkws = self.logformatter.download_error(download_failure, request, spider)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
exc_info=failure_to_exc_info(download_failure),
)
else:
errmsg = download_failure.getErrorMessage()
if errmsg:
logkws = self.logformatter.download_error(
download_failure, request, spider, errmsg)
logger.log(
*logformatter_adapter(logkws),
extra={'spider': spider},
)
if spider_failure is not download_failure:
return spider_failure
def _itemproc_finished(self, output, item, response, spider):
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
self.slot.itemproc_size -= 1
if isinstance(output, Failure):
ex = output.value
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
else:
logkws = self.logformatter.item_error(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider},
exc_info=failure_to_exc_info(output))
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
else:
logkws = self.logformatter.scraped(output, response, spider)
if logkws is not None:
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)
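With default settings the itemproc object driven above is the item pipeline manager, so a pipeline that raises DropItem lands in the DropItem branch of _itemproc_finished. A hypothetical pipeline for illustration (the class and the 'price' field are assumptions, not part of this commit):

from scrapy.exceptions import DropItem

class RequirePricePipeline:
    """Example pipeline: drop any item that lacks a 'price' value."""

    def process_item(self, item, spider):
        if not item.get('price'):
            # surfaces as the item_dropped signal / "dropped" log line above
            raise DropItem(f"Missing price in {item!r}")
        return item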

View file

@ -0,0 +1,128 @@
"""
Spider Middleware manager
See documentation in docs/topics/spider-middleware.rst
"""
from itertools import islice
from twisted.python.failure import Failure
from scrapy.exceptions import _InvalidOutput
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import mustbe_deferred
from scrapy.utils.python import MutableChain
def _isiterable(possible_iterator):
return hasattr(possible_iterator, '__iter__')
def _fname(f):
return f"{f.__self__.__class__.__name__}.{f.__func__.__name__}"
class SpiderMiddlewareManager(MiddlewareManager):
component_name = 'spider middleware'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
def _add_middleware(self, mw):
super()._add_middleware(mw)
if hasattr(mw, 'process_spider_input'):
self.methods['process_spider_input'].append(mw.process_spider_input)
if hasattr(mw, 'process_start_requests'):
self.methods['process_start_requests'].appendleft(mw.process_start_requests)
process_spider_output = getattr(mw, 'process_spider_output', None)
self.methods['process_spider_output'].appendleft(process_spider_output)
process_spider_exception = getattr(mw, 'process_spider_exception', None)
self.methods['process_spider_exception'].appendleft(process_spider_exception)
def scrape_response(self, scrape_func, response, request, spider):
def process_spider_input(response):
for method in self.methods['process_spider_input']:
try:
result = method(response=response, spider=spider)
if result is not None:
msg = (f"Middleware {_fname(method)} must return None "
f"or raise an exception, got {type(result)}")
raise _InvalidOutput(msg)
except _InvalidOutput:
raise
except Exception:
return scrape_func(Failure(), request, spider)
return scrape_func(response, request, spider)
def _evaluate_iterable(iterable, exception_processor_index, recover_to):
try:
for r in iterable:
yield r
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), exception_processor_index)
if isinstance(exception_result, Failure):
raise
recover_to.extend(exception_result)
def process_spider_exception(_failure, start_index=0):
exception = _failure.value
# don't handle _InvalidOutput exception
if isinstance(exception, _InvalidOutput):
return _failure
method_list = islice(self.methods['process_spider_exception'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
result = method(response=response, exception=exception, spider=spider)
if _isiterable(result):
# stop exception handling by handing control over to the
# process_spider_output chain if an iterable has been returned
return process_spider_output(result, method_index + 1)
elif result is None:
continue
else:
msg = (f"Middleware {_fname(method)} must return None "
f"or an iterable, got {type(result)}")
raise _InvalidOutput(msg)
return _failure
def process_spider_output(result, start_index=0):
            # items in this iterable do not need to go through the process_spider_output
            # chain; they already went through it in the process_spider_exception method
recovered = MutableChain()
method_list = islice(self.methods['process_spider_output'], start_index, None)
for method_index, method in enumerate(method_list, start=start_index):
if method is None:
continue
try:
# might fail directly if the output value is not a generator
result = method(response=response, result=result, spider=spider)
except Exception as ex:
exception_result = process_spider_exception(Failure(ex), method_index + 1)
if isinstance(exception_result, Failure):
raise
return exception_result
if _isiterable(result):
result = _evaluate_iterable(result, method_index + 1, recovered)
else:
msg = (f"Middleware {_fname(method)} must return an "
f"iterable, got {type(result)}")
raise _InvalidOutput(msg)
return MutableChain(result, recovered)
def process_callback_output(result):
recovered = MutableChain()
result = _evaluate_iterable(result, 0, recovered)
return MutableChain(process_spider_output(result), recovered)
dfd = mustbe_deferred(process_spider_input, response)
dfd.addCallbacks(callback=process_callback_output, errback=process_spider_exception)
return dfd
def process_start_requests(self, start_requests, spider):
return self._process_chain('process_start_requests', start_requests, spider)
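For reference, a skeletal spider middleware implementing the three hooks this manager dispatches; the class name and the settings path in the trailing comment are assumptions, not part of this commit.

class NoOpSpiderMiddleware:
    """Skeleton illustrating the contracts enforced by SpiderMiddlewareManager."""

    def process_spider_input(self, response, spider):
        # must return None or raise; any other value triggers _InvalidOutput
        return None

    def process_spider_output(self, response, result, spider):
        # must return an iterable of requests/items
        for element in result:
            yield element

    def process_spider_exception(self, response, exception, spider):
        # None passes the failure along; an iterable recovers the output chain
        return None

# settings.py (assumed project path):
# SPIDER_MIDDLEWARES = {'myproject.middlewares.NoOpSpiderMiddleware': 543}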

View file

@ -0,0 +1,344 @@
import logging
import pprint
import signal
import warnings
from twisted.internet import defer
from zope.interface.exceptions import DoesNotImplement
try:
# zope >= 5.0 only supports MultipleInvalid
from zope.interface.exceptions import MultipleInvalid
except ImportError:
MultipleInvalid = None
from zope.interface.verify import verifyClass
from scrapy import signals, Spider
from scrapy.core.engine import ExecutionEngine
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.extension import ExtensionManager
from scrapy.interfaces import ISpiderLoader
from scrapy.settings import overridden_settings, Settings
from scrapy.signalmanager import SignalManager
from scrapy.utils.log import (
configure_logging,
get_scrapy_root_handler,
install_scrapy_root_handler,
log_scrapy_info,
LogCounterHandler,
)
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.reactor import install_reactor, verify_installed_reactor
logger = logging.getLogger(__name__)
class Crawler:
def __init__(self, spidercls, settings=None):
if isinstance(spidercls, Spider):
raise ValueError('The spidercls argument must be a class, not an object')
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.spidercls = spidercls
self.settings = settings.copy()
self.spidercls.update_settings(self.settings)
self.signals = SignalManager(self)
self.stats = load_object(self.settings['STATS_CLASS'])(self)
handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
logging.root.addHandler(handler)
d = dict(overridden_settings(self.settings))
logger.info("Overridden settings:\n%(settings)s",
{'settings': pprint.pformat(d)})
if get_scrapy_root_handler() is not None:
# scrapy root handler already installed: update it with new settings
install_scrapy_root_handler(self.settings)
# lambda is assigned to Crawler attribute because this way it is not
# garbage collected after leaving __init__ scope
self.__remove_handler = lambda: logging.root.removeHandler(handler)
self.signals.connect(self.__remove_handler, signals.engine_stopped)
lf_cls = load_object(self.settings['LOG_FORMATTER'])
self.logformatter = lf_cls.from_crawler(self)
self.extensions = ExtensionManager.from_crawler(self)
self.settings.freeze()
self.crawling = False
self.spider = None
self.engine = None
@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
if self.crawling:
raise RuntimeError("Crawling already taking place")
self.crawling = True
try:
self.spider = self._create_spider(*args, **kwargs)
self.engine = self._create_engine()
start_requests = iter(self.spider.start_requests())
yield self.engine.open_spider(self.spider, start_requests)
yield defer.maybeDeferred(self.engine.start)
except Exception:
self.crawling = False
if self.engine is not None:
yield self.engine.close()
raise
def _create_spider(self, *args, **kwargs):
return self.spidercls.from_crawler(self, *args, **kwargs)
def _create_engine(self):
return ExecutionEngine(self, lambda _: self.stop())
@defer.inlineCallbacks
def stop(self):
"""Starts a graceful stop of the crawler and returns a deferred that is
fired when the crawler is stopped."""
if self.crawling:
self.crawling = False
yield defer.maybeDeferred(self.engine.stop)
class CrawlerRunner:
"""
This is a convenient helper class that keeps track of, manages and runs
crawlers inside an already setup :mod:`~twisted.internet.reactor`.
The CrawlerRunner object must be instantiated with a
:class:`~scrapy.settings.Settings` object.
    This class shouldn't be needed (since Scrapy is responsible for using it
accordingly) unless writing scripts that manually handle the crawling
process. See :ref:`run-from-script` for an example.
"""
crawlers = property(
lambda self: self._crawlers,
doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
":meth:`crawl` and managed by this class."
)
@staticmethod
def _get_spider_loader(settings):
""" Get SpiderLoader instance from settings """
cls_path = settings.get('SPIDER_LOADER_CLASS')
loader_cls = load_object(cls_path)
excs = (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
try:
verifyClass(ISpiderLoader, loader_cls)
except excs:
warnings.warn(
'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
'not fully implement scrapy.interfaces.ISpiderLoader interface. '
'Please add all missing methods to avoid unexpected runtime errors.',
category=ScrapyDeprecationWarning, stacklevel=2
)
return loader_cls.from_settings(settings.frozencopy())
def __init__(self, settings=None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
self.spider_loader = self._get_spider_loader(settings)
self._crawlers = set()
self._active = set()
self.bootstrap_failed = False
self._handle_twisted_reactor()
@property
def spiders(self):
warnings.warn("CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning, stacklevel=2)
return self.spider_loader
def crawl(self, crawler_or_spidercls, *args, **kwargs):
"""
Run a crawler with the provided arguments.
It will call the given Crawler's :meth:`~Crawler.crawl` method, while
keeping track of it so it can be stopped later.
If ``crawler_or_spidercls`` isn't a :class:`~scrapy.crawler.Crawler`
instance, this method will try to create one using this parameter as
the spider class given to it.
Returns a deferred that is fired when the crawling is finished.
:param crawler_or_spidercls: already created crawler, or a spider class
or spider's name inside the project to create it
:type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
:class:`~scrapy.spiders.Spider` subclass or string
:param args: arguments to initialize the spider
:param kwargs: keyword arguments to initialize the spider
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
crawler = self.create_crawler(crawler_or_spidercls)
return self._crawl(crawler, *args, **kwargs)
def _crawl(self, crawler, *args, **kwargs):
self.crawlers.add(crawler)
d = crawler.crawl(*args, **kwargs)
self._active.add(d)
def _done(result):
self.crawlers.discard(crawler)
self._active.discard(d)
self.bootstrap_failed |= not getattr(crawler, 'spider', None)
return result
return d.addBoth(_done)
def create_crawler(self, crawler_or_spidercls):
"""
Return a :class:`~scrapy.crawler.Crawler` object.
* If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
* If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
is constructed for it.
* If ``crawler_or_spidercls`` is a string, this function finds
a spider with this name in a Scrapy project (using spider loader),
then creates a Crawler instance for it.
"""
if isinstance(crawler_or_spidercls, Spider):
raise ValueError(
'The crawler_or_spidercls argument cannot be a spider object, '
'it must be a spider class (or a Crawler object)')
if isinstance(crawler_or_spidercls, Crawler):
return crawler_or_spidercls
return self._create_crawler(crawler_or_spidercls)
def _create_crawler(self, spidercls):
if isinstance(spidercls, str):
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, self.settings)
def stop(self):
"""
        Simultaneously stops all the crawling jobs taking place.
Returns a deferred that is fired when they all have ended.
"""
return defer.DeferredList([c.stop() for c in list(self.crawlers)])
@defer.inlineCallbacks
def join(self):
"""
join()
Returns a deferred that is fired when all managed :attr:`crawlers` have
completed their executions.
"""
while self._active:
yield defer.DeferredList(self._active)
def _handle_twisted_reactor(self):
if self.settings.get("TWISTED_REACTOR"):
verify_installed_reactor(self.settings["TWISTED_REACTOR"])
class CrawlerProcess(CrawlerRunner):
"""
A class to run multiple scrapy crawlers in a process simultaneously.
This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
for starting a :mod:`~twisted.internet.reactor` and handling shutdown
signals, like the keyboard interrupt command Ctrl-C. It also configures
top-level logging.
This utility should be a better fit than
:class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
:mod:`~twisted.internet.reactor` within your application.
The CrawlerProcess object must be instantiated with a
:class:`~scrapy.settings.Settings` object.
:param install_root_handler: whether to install root logging handler
(default: True)
    This class shouldn't be needed (since Scrapy is responsible for using it
accordingly) unless writing scripts that manually handle the crawling
process. See :ref:`run-from-script` for an example.
"""
def __init__(self, settings=None, install_root_handler=True):
super().__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
configure_logging(self.settings, install_root_handler)
log_scrapy_info(self.settings)
def _signal_shutdown(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(self._signal_kill)
signame = signal_names[signum]
logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
{'signame': signame})
reactor.callFromThread(self._graceful_stop_reactor)
def _signal_kill(self, signum, _):
from twisted.internet import reactor
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
logger.info('Received %(signame)s twice, forcing unclean shutdown',
{'signame': signame})
reactor.callFromThread(self._stop_reactor)
def start(self, stop_after_crawl=True):
"""
This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
based on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.
If ``stop_after_crawl`` is True, the reactor will be stopped after all
crawlers have finished, using :meth:`join`.
        :param bool stop_after_crawl: whether to stop the reactor when all
            crawlers have finished
"""
from twisted.internet import reactor
if stop_after_crawl:
d = self.join()
# Don't start the reactor if the deferreds are already fired
if d.called:
return
d.addBoth(self._stop_reactor)
resolver_class = load_object(self.settings["DNS_RESOLVER"])
resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
resolver.install_on_reactor()
tp = reactor.getThreadPool()
tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
reactor.run(installSignalHandlers=False) # blocking call
def _graceful_stop_reactor(self):
d = self.stop()
d.addBoth(self._stop_reactor)
return d
def _stop_reactor(self, _=None):
from twisted.internet import reactor
try:
reactor.stop()
except RuntimeError: # raised if already stopped or in shutdown stage
pass
def _handle_twisted_reactor(self):
if self.settings.get("TWISTED_REACTOR"):
install_reactor(self.settings["TWISTED_REACTOR"], self.settings["ASYNCIO_EVENT_LOOP"])
super()._handle_twisted_reactor()
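The :ref:`run-from-script` pattern referenced in the docstrings above boils down to a few lines. The sketch below is illustrative only; the spider, URL, and selectors are assumptions rather than anything defined in this commit.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})  # installs shutdown handlers and logging
    process.crawl(QuotesSpider)
    process.start()  # starts the Twisted reactor; blocks until the crawl finishes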

View file

@ -0,0 +1,93 @@
import re
import logging
from w3lib import html
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
logger = logging.getLogger(__name__)
class AjaxCrawlMiddleware:
"""
Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
def __init__(self, settings):
if not settings.getbool('AJAXCRAWL_ENABLED'):
raise NotConfigured
# XXX: Google parses at least first 100k bytes; scrapy's redirect
# middleware parses first 4k. 4k turns out to be insufficient
# for this middleware, and parsing 100k could be slow.
# We use something in between (32K) by default.
self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_response(self, request, response, spider):
if not isinstance(response, HtmlResponse) or response.status != 200:
return response
if request.method != 'GET':
# other HTTP methods are either not safe or don't have a body
return response
if 'ajax_crawlable' in request.meta: # prevent loops
return response
if not self._has_ajax_crawlable_variant(response):
return response
# scrapy already handles #! links properly
ajax_crawl_request = request.replace(url=request.url + '#!')
logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
{'ajax_crawl_request': ajax_crawl_request, 'request': request},
extra={'spider': spider})
ajax_crawl_request.meta['ajax_crawlable'] = True
return ajax_crawl_request
def _has_ajax_crawlable_variant(self, response):
"""
Return True if a page without hash fragment could be "AJAX crawlable"
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
body = response.text[:self.lookup_bytes]
return _has_ajaxcrawlable_meta(body)
# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>')
def _has_ajaxcrawlable_meta(text):
"""
>>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
True
>>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
True
>>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
False
>>> _has_ajaxcrawlable_meta('<html></html>')
False
"""
# Stripping scripts and comments is slow (about 20x slower than
# just checking if a string is in text); this is a quick fail-fast
# path that should work for most pages.
if 'fragment' not in text:
return False
if 'content' not in text:
return False
text = html.remove_tags_with_content(text, ('script', 'noscript'))
text = html.replace_entities(text)
text = html.remove_comments(text)
return _ajax_crawlable_re.search(text) is not None

View file

@ -0,0 +1,110 @@
import logging
from collections import defaultdict
from scrapy.exceptions import NotConfigured
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.utils.python import to_unicode
logger = logging.getLogger(__name__)
class CookiesMiddleware:
"""This middleware enables working with sites that need cookies"""
def __init__(self, debug=False):
self.jars = defaultdict(CookieJar)
self.debug = debug
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COOKIES_ENABLED'):
raise NotConfigured
return cls(crawler.settings.getbool('COOKIES_DEBUG'))
def process_request(self, request, spider):
if request.meta.get('dont_merge_cookies', False):
return
cookiejarkey = request.meta.get("cookiejar")
jar = self.jars[cookiejarkey]
for cookie in self._get_request_cookies(jar, request):
jar.set_cookie_if_ok(cookie, request)
# set Cookie header
request.headers.pop('Cookie', None)
jar.add_cookie_header(request)
self._debug_cookie(request, spider)
def process_response(self, request, response, spider):
if request.meta.get('dont_merge_cookies', False):
return response
# extract cookies from Set-Cookie and drop invalid/expired cookies
cookiejarkey = request.meta.get("cookiejar")
jar = self.jars[cookiejarkey]
jar.extract_cookies(response, request)
self._debug_set_cookie(response, spider)
return response
def _debug_cookie(self, request, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in request.headers.getlist('Cookie')]
if cl:
cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
msg = f"Sending cookies to: {request}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
def _debug_set_cookie(self, response, spider):
if self.debug:
cl = [to_unicode(c, errors='replace')
for c in response.headers.getlist('Set-Cookie')]
if cl:
cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
msg = f"Received cookies from: {response}\n{cookies}"
logger.debug(msg, extra={'spider': spider})
def _format_cookie(self, cookie, request):
"""
Given a dict consisting of cookie components, return its string representation.
Decode from bytes if necessary.
"""
decoded = {}
for key in ("name", "value", "path", "domain"):
if cookie.get(key) is None:
if key in ("name", "value"):
msg = "Invalid cookie found in request {}: {} ('{}' is missing)"
logger.warning(msg.format(request, cookie, key))
return
continue
if isinstance(cookie[key], str):
decoded[key] = cookie[key]
else:
try:
decoded[key] = cookie[key].decode("utf8")
except UnicodeDecodeError:
logger.warning("Non UTF-8 encoded cookie found in request %s: %s",
request, cookie)
decoded[key] = cookie[key].decode("latin1", errors="replace")
cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
for key, value in decoded.items(): # path, domain
cookie_str += f"; {key.capitalize()}={value}"
return cookie_str
def _get_request_cookies(self, jar, request):
"""
Extract cookies from the Request.cookies attribute
"""
if not request.cookies:
return []
elif isinstance(request.cookies, dict):
cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
else:
cookies = request.cookies
formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
response = Response(request.url, headers={"Set-Cookie": formatted})
return jar.make_cookies(response, request)
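A hypothetical spider fragment showing the 'cookiejar' meta key handled above; each distinct key gets its own CookieJar in self.jars, which is how separate sessions against the same site are kept apart. The URLs are placeholders.

import scrapy

class SessionSpider(scrapy.Spider):
    name = 'sessions'

    def start_requests(self):
        for session_id, url in enumerate(['https://example.com/a',
                                          'https://example.com/b']):
            yield scrapy.Request(url, meta={'cookiejar': session_id})

    def parse(self, response):
        # keep using the same jar for follow-up requests
        yield response.follow('/next', meta={'cookiejar': response.meta['cookiejar']})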

View file

@ -0,0 +1,83 @@
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
import bz2
import gzip
import logging
import tarfile
import zipfile
from io import BytesIO
from tempfile import mktemp
from scrapy.responsetypes import responsetypes
logger = logging.getLogger(__name__)
class DecompressionMiddleware:
""" This middleware tries to recognise and extract the possibly compressed
responses that may arrive. """
def __init__(self):
self._formats = {
'tar': self._is_tar,
'zip': self._is_zip,
'gz': self._is_gzip,
'bz2': self._is_bzip2
}
def _is_tar(self, response):
archive = BytesIO(response.body)
try:
tar_file = tarfile.open(name=mktemp(), fileobj=archive)
except tarfile.ReadError:
return
body = tar_file.extractfile(tar_file.members[0]).read()
respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
return response.replace(body=body, cls=respcls)
def _is_zip(self, response):
archive = BytesIO(response.body)
try:
zip_file = zipfile.ZipFile(archive)
except zipfile.BadZipfile:
return
namelist = zip_file.namelist()
body = zip_file.read(namelist[0])
respcls = responsetypes.from_args(filename=namelist[0], body=body)
return response.replace(body=body, cls=respcls)
def _is_gzip(self, response):
archive = BytesIO(response.body)
try:
body = gzip.GzipFile(fileobj=archive).read()
except IOError:
return
respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)
def _is_bzip2(self, response):
try:
body = bz2.decompress(response.body)
except IOError:
return
respcls = responsetypes.from_args(body=body)
return response.replace(body=body, cls=respcls)
def process_response(self, request, response, spider):
if not response.body:
return response
for fmt, func in self._formats.items():
new_response = func(response)
if new_response:
logger.debug('Decompressed response with format: %(responsefmt)s',
{'responsefmt': fmt}, extra={'spider': spider})
return new_response
return response

View file

@ -0,0 +1,22 @@
"""
DefaultHeaders downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
from scrapy.utils.python import without_none_values
class DefaultHeadersMiddleware:
def __init__(self, headers):
self._headers = headers
@classmethod
def from_crawler(cls, crawler):
headers = without_none_values(crawler.settings['DEFAULT_REQUEST_HEADERS'])
return cls(headers.items())
def process_request(self, request, spider):
for k, v in self._headers:
request.headers.setdefault(k, v)

View file

@ -0,0 +1,26 @@
"""
Download timeout middleware
See documentation in docs/topics/downloader-middleware.rst
"""
from scrapy import signals
class DownloadTimeoutMiddleware:
def __init__(self, timeout=180):
self._timeout = timeout
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings.getfloat('DOWNLOAD_TIMEOUT'))
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
self._timeout = getattr(spider, 'download_timeout', self._timeout)
def process_request(self, request, spider):
if self._timeout:
request.meta.setdefault('download_timeout', self._timeout)

View file

@ -0,0 +1,31 @@
"""
HTTP basic auth downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
from w3lib.http import basic_auth_header
from scrapy import signals
class HttpAuthMiddleware:
"""Set Basic HTTP Authorization header
(http_user and http_pass spider class attributes)"""
@classmethod
def from_crawler(cls, crawler):
o = cls()
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
usr = getattr(spider, 'http_user', '')
pwd = getattr(spider, 'http_pass', '')
if usr or pwd:
self.auth = basic_auth_header(usr, pwd)
def process_request(self, request, spider):
auth = getattr(self, 'auth', None)
if auth and b'Authorization' not in request.headers:
request.headers[b'Authorization'] = auth
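A minimal spider using the http_user / http_pass class attributes read in spider_opened above; the credentials and URL are placeholders for illustration.

import scrapy

class IntranetSpider(scrapy.Spider):
    name = 'intranet'
    http_user = 'someuser'   # assumed credentials, illustration only
    http_pass = 'somepass'
    start_urls = ['https://intranet.example.com/']

    def parse(self, response):
        yield {'status': response.status}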

View file

@ -0,0 +1,133 @@
from email.utils import formatdate
from typing import Optional, Type, TypeVar
from twisted.internet import defer
from twisted.internet.error import (
ConnectError,
ConnectionDone,
ConnectionLost,
ConnectionRefusedError,
DNSLookupError,
TCPTimedOutError,
TimeoutError,
)
from twisted.web.client import ResponseFailed
from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from scrapy.utils.misc import load_object
HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")
class HttpCacheMiddleware:
DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError)
def __init__(self, settings: Settings, stats: StatsCollector) -> None:
if not settings.getbool('HTTPCACHE_ENABLED'):
raise NotConfigured
self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
self.stats = stats
@classmethod
def from_crawler(cls: Type[HttpCacheMiddlewareTV], crawler: Crawler) -> HttpCacheMiddlewareTV:
o = cls(crawler.settings, crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_opened(self, spider: Spider) -> None:
self.storage.open_spider(spider)
def spider_closed(self, spider: Spider) -> None:
self.storage.close_spider(spider)
def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
if request.meta.get('dont_cache', False):
return None
# Skip uncacheable requests
if not self.policy.should_cache_request(request):
request.meta['_dont_cache'] = True # flag as uncacheable
return None
# Look for cached response and check if expired
cachedresponse = self.storage.retrieve_response(spider, request)
if cachedresponse is None:
self.stats.inc_value('httpcache/miss', spider=spider)
if self.ignore_missing:
self.stats.inc_value('httpcache/ignore', spider=spider)
raise IgnoreRequest("Ignored request not in cache: %s" % request)
return None # first time request
# Return cached response only if not expired
cachedresponse.flags.append('cached')
if self.policy.is_cached_response_fresh(cachedresponse, request):
self.stats.inc_value('httpcache/hit', spider=spider)
return cachedresponse
# Keep a reference to cached response to avoid a second cache lookup on
# process_response hook
request.meta['cached_response'] = cachedresponse
return None
def process_response(self, request: Request, response: Response, spider: Spider) -> Response:
if request.meta.get('dont_cache', False):
return response
# Skip cached responses and uncacheable requests
if 'cached' in response.flags or '_dont_cache' in request.meta:
request.meta.pop('_dont_cache', None)
return response
# RFC2616 requires origin server to set Date header,
# https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
if 'Date' not in response.headers:
response.headers['Date'] = formatdate(usegmt=True)
# Do not validate first-hand responses
cachedresponse = request.meta.pop('cached_response', None)
if cachedresponse is None:
self.stats.inc_value('httpcache/firsthand', spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response
if self.policy.is_cached_response_valid(cachedresponse, response, request):
self.stats.inc_value('httpcache/revalidate', spider=spider)
return cachedresponse
self.stats.inc_value('httpcache/invalidate', spider=spider)
self._cache_response(spider, response, request, cachedresponse)
return response
def process_exception(
self, request: Request, exception: Exception, spider: Spider
) -> Optional[Response]:
cachedresponse = request.meta.pop('cached_response', None)
if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
self.stats.inc_value('httpcache/errorrecovery', spider=spider)
return cachedresponse
return None
def _cache_response(
self, spider: Spider, response: Response, request: Request, cachedresponse: Optional[Response]
) -> None:
if self.policy.should_cache_response(response, request):
self.stats.inc_value('httpcache/store', spider=spider)
self.storage.store_response(spider, request, response)
else:
self.stats.inc_value('httpcache/uncacheable', spider=spider)
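For orientation, a hedged settings.py fragment that switches this middleware on; the values mirror Scrapy's documented defaults and are illustrative, not mandated by this commit.

# settings.py -- illustrative values only
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0   # 0 = cached responses never expire
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'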

View file

@ -0,0 +1,82 @@
import io
import zlib
from scrapy.utils.gz import gunzip
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.exceptions import NotConfigured
ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
try:
import brotli
ACCEPTED_ENCODINGS.append(b'br')
except ImportError:
pass
try:
import zstandard
ACCEPTED_ENCODINGS.append(b'zstd')
except ImportError:
pass
class HttpCompressionMiddleware:
"""This middleware allows compressed (gzip, deflate) traffic to be
sent/received from web sites"""
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('COMPRESSION_ENABLED'):
raise NotConfigured
return cls()
def process_request(self, request, spider):
request.headers.setdefault('Accept-Encoding',
b", ".join(ACCEPTED_ENCODINGS))
def process_response(self, request, response, spider):
if request.method == 'HEAD':
return response
if isinstance(response, Response):
content_encoding = response.headers.getlist('Content-Encoding')
if content_encoding:
encoding = content_encoding.pop()
decoded_body = self._decode(response.body, encoding.lower())
respcls = responsetypes.from_args(
headers=response.headers, url=response.url, body=decoded_body
)
kwargs = dict(cls=respcls, body=decoded_body)
if issubclass(respcls, TextResponse):
# force recalculating the encoding until we make sure the
# responsetypes guessing is reliable
kwargs['encoding'] = None
response = response.replace(**kwargs)
if not content_encoding:
del response.headers['Content-Encoding']
return response
def _decode(self, body, encoding):
if encoding == b'gzip' or encoding == b'x-gzip':
body = gunzip(body)
if encoding == b'deflate':
try:
body = zlib.decompress(body)
except zlib.error:
# ugly hack to work with raw deflate content that may
# be sent by microsoft servers. For more information, see:
# http://carsten.codimi.de/gzip.yaws/
# http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
# http://www.gzip.org/zlib/zlib_faq.html#faq38
body = zlib.decompress(body, -15)
if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
body = brotli.decompress(body)
if encoding == b'zstd' and b'zstd' in ACCEPTED_ENCODINGS:
# Using its streaming API since its simple API could handle only cases
# where there is content size data embedded in the frame
reader = zstandard.ZstdDecompressor().stream_reader(io.BytesIO(body))
body = reader.read()
return body

View file

@ -0,0 +1,75 @@
import base64
from urllib.parse import unquote, urlunparse
from urllib.request import getproxies, proxy_bypass, _parse_proxy
from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
class HttpProxyMiddleware:
def __init__(self, auth_encoding='latin-1'):
self.auth_encoding = auth_encoding
self.proxies = {}
for type_, url in getproxies().items():
try:
self.proxies[type_] = self._get_proxy(url, type_)
# some values such as '/var/run/docker.sock' can't be parsed
# by _parse_proxy and as such should be skipped
except ValueError:
continue
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
raise NotConfigured
auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
return cls(auth_encoding)
def _basic_auth_header(self, username, password):
user_pass = to_bytes(
f'{unquote(username)}:{unquote(password)}',
encoding=self.auth_encoding)
return base64.b64encode(user_pass)
def _get_proxy(self, url, orig_type):
proxy_type, user, password, hostport = _parse_proxy(url)
proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
if user:
creds = self._basic_auth_header(user, password)
else:
creds = None
return creds, proxy_url
def process_request(self, request, spider):
# ignore if proxy is already set
if 'proxy' in request.meta:
if request.meta['proxy'] is None:
return
# extract credentials if present
creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
request.meta['proxy'] = proxy_url
if creds and not request.headers.get('Proxy-Authorization'):
request.headers['Proxy-Authorization'] = b'Basic ' + creds
return
elif not self.proxies:
return
parsed = urlparse_cached(request)
scheme = parsed.scheme
# 'no_proxy' is only supported by http schemes
if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
return
if scheme in self.proxies:
self._set_proxy(request, scheme)
def _set_proxy(self, request, scheme):
creds, proxy = self.proxies[scheme]
request.meta['proxy'] = proxy
if creds:
request.headers['Proxy-Authorization'] = b'Basic ' + creds
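An illustrative request exercising the request.meta['proxy'] branch above: the middleware strips the credentials out of the proxy URL and moves them into a Proxy-Authorization header. The proxy host and credentials are placeholders.

import scrapy

class ProxiedSpider(scrapy.Spider):
    name = 'proxied'

    def start_requests(self):
        yield scrapy.Request(
            'https://example.com/',
            meta={'proxy': 'http://user:pass@proxy.example.com:8080'},  # placeholder proxy
        )

    def parse(self, response):
        yield {'status': response.status}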

View file

@ -0,0 +1,113 @@
import logging
from urllib.parse import urljoin, urlparse
from w3lib.url import safe_url_string
from scrapy.http import HtmlResponse
from scrapy.utils.response import get_meta_refresh
from scrapy.exceptions import IgnoreRequest, NotConfigured
logger = logging.getLogger(__name__)
class BaseRedirectMiddleware:
enabled_setting = 'REDIRECT_ENABLED'
def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured
self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def _redirect(self, redirected, request, spider, reason):
ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
redirects = request.meta.get('redirect_times', 0) + 1
if ttl and redirects <= self.max_redirect_times:
redirected.meta['redirect_times'] = redirects
redirected.meta['redirect_ttl'] = ttl - 1
redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + [request.url]
redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + [reason]
redirected.dont_filter = request.dont_filter
redirected.priority = request.priority + self.priority_adjust
logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
{'reason': reason, 'redirected': redirected, 'request': request},
extra={'spider': spider})
return redirected
else:
logger.debug("Discarding %(request)s: max redirections reached",
{'request': request}, extra={'spider': spider})
raise IgnoreRequest("max redirections reached")
def _redirect_request_using_get(self, request, redirect_url):
redirected = request.replace(url=redirect_url, method='GET', body='')
redirected.headers.pop('Content-Type', None)
redirected.headers.pop('Content-Length', None)
return redirected
class RedirectMiddleware(BaseRedirectMiddleware):
"""
Handle redirection of requests based on response status
and meta-refresh html tag.
"""
def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or response.status in getattr(spider, 'handle_httpstatus_list', [])
or response.status in request.meta.get('handle_httpstatus_list', [])
or request.meta.get('handle_httpstatus_all', False)
):
return response
allowed_status = (301, 302, 303, 307, 308)
if 'Location' not in response.headers or response.status not in allowed_status:
return response
location = safe_url_string(response.headers['Location'])
if response.headers['Location'].startswith(b'//'):
request_scheme = urlparse(request.url).scheme
location = request_scheme + '://' + location.lstrip('/')
redirected_url = urljoin(request.url, location)
if response.status in (301, 307, 308) or request.method == 'HEAD':
redirected = request.replace(url=redirected_url)
return self._redirect(redirected, request, spider, response.status)
redirected = self._redirect_request_using_get(request, redirected_url)
return self._redirect(redirected, request, spider, response.status)
class MetaRefreshMiddleware(BaseRedirectMiddleware):
enabled_setting = 'METAREFRESH_ENABLED'
def __init__(self, settings):
super().__init__(settings)
self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')
def process_response(self, request, response, spider):
if (
request.meta.get('dont_redirect', False)
or request.method == 'HEAD'
or not isinstance(response, HtmlResponse)
):
return response
interval, url = get_meta_refresh(response,
ignore_tags=self._ignore_tags)
if url and interval < self._maxdelay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, 'meta refresh')
return response
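Two meta keys checked in process_response above can be set per request to opt out of redirect handling; a short, hypothetical example (the URL is a placeholder):

import scrapy

# keep the 301/302 response itself instead of following the Location header
request = scrapy.Request(
    'http://example.com/old-page',
    meta={'dont_redirect': True, 'handle_httpstatus_list': [301, 302]},
)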

View file

@ -0,0 +1,97 @@
"""
An extension to retry failed requests that are potentially caused by temporary
problems such as a connection timeout or HTTP 500 error.
You can change the behaviour of this middleware by modifying the scraping settings:
RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry
Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
"""
import logging
from twisted.internet import defer
from twisted.internet.error import (
ConnectError,
ConnectionDone,
ConnectionLost,
ConnectionRefusedError,
DNSLookupError,
TCPTimedOutError,
TimeoutError,
)
from twisted.web.client import ResponseFailed
from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name
logger = logging.getLogger(__name__)
class RetryMiddleware:
# IOError is raised by the HttpCompression middleware when trying to
# decompress an empty response
EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
ConnectionRefusedError, ConnectionDone, ConnectError,
ConnectionLost, TCPTimedOutError, ResponseFailed,
IOError, TunnelError)
def __init__(self, settings):
if not settings.getbool('RETRY_ENABLED'):
raise NotConfigured
self.max_retry_times = settings.getint('RETRY_TIMES')
self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_response(self, request, response, spider):
if request.meta.get('dont_retry', False):
return response
if response.status in self.retry_http_codes:
reason = response_status_message(response.status)
return self._retry(request, reason, spider) or response
return response
def process_exception(self, request, exception, spider):
if (
isinstance(exception, self.EXCEPTIONS_TO_RETRY)
and not request.meta.get('dont_retry', False)
):
return self._retry(request, exception, spider)
def _retry(self, request, reason, spider):
retries = request.meta.get('retry_times', 0) + 1
retry_times = self.max_retry_times
if 'max_retry_times' in request.meta:
retry_times = request.meta['max_retry_times']
stats = spider.crawler.stats
if retries <= retry_times:
logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})
retryreq = request.copy()
retryreq.meta['retry_times'] = retries
retryreq.dont_filter = True
retryreq.priority = request.priority + self.priority_adjust
if isinstance(reason, Exception):
reason = global_object_name(reason.__class__)
stats.inc_value('retry/count')
stats.inc_value(f'retry/reason_count/{reason}')
return retryreq
else:
stats.inc_value('retry/max_reached')
logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
{'request': request, 'retries': retries, 'reason': reason},
extra={'spider': spider})
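A hedged sketch of the knobs RetryMiddleware reads: project-wide settings plus the per-request 'max_retry_times' and 'dont_retry' meta keys. The concrete values are illustrative (the RETRY_HTTP_CODES list shown matches the documented default).

import scrapy

# settings.py -- illustrative values
RETRY_ENABLED = True
RETRY_TIMES = 2
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

# per-request overrides
flaky = scrapy.Request('https://example.com/flaky', meta={'max_retry_times': 5})
once = scrapy.Request('https://example.com/once', meta={'dont_retry': True})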

View file

@ -0,0 +1,109 @@
"""
This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
import logging
from twisted.internet.defer import Deferred, maybeDeferred
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object
logger = logging.getLogger(__name__)
class RobotsTxtMiddleware:
DOWNLOAD_PRIORITY = 1000
def __init__(self, crawler):
if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
raise NotConfigured
self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
self.crawler = crawler
self._parsers = {}
self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))
        # check if parser dependencies are met; this should raise an error otherwise.
self._parserimpl.from_crawler(self.crawler, b'')
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def process_request(self, request, spider):
if request.meta.get('dont_obey_robotstxt'):
return
d = maybeDeferred(self.robot_parser, request, spider)
d.addCallback(self.process_request_2, request, spider)
return d
def process_request_2(self, rp, request, spider):
if rp is None:
return
useragent = self._robotstxt_useragent
if not useragent:
useragent = request.headers.get(b'User-Agent', self._default_useragent)
if not rp.allowed(request.url, useragent):
logger.debug("Forbidden by robots.txt: %(request)s",
{'request': request}, extra={'spider': spider})
self.crawler.stats.inc_value('robotstxt/forbidden')
raise IgnoreRequest("Forbidden by robots.txt")
def robot_parser(self, request, spider):
url = urlparse_cached(request)
netloc = url.netloc
if netloc not in self._parsers:
self._parsers[netloc] = Deferred()
robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
robotsreq = Request(
robotsurl,
priority=self.DOWNLOAD_PRIORITY,
meta={'dont_obey_robotstxt': True}
)
dfd = self.crawler.engine.download(robotsreq, spider)
dfd.addCallback(self._parse_robots, netloc, spider)
dfd.addErrback(self._logerror, robotsreq, spider)
dfd.addErrback(self._robots_error, netloc)
self.crawler.stats.inc_value('robotstxt/request_count')
if isinstance(self._parsers[netloc], Deferred):
d = Deferred()
def cb(result):
d.callback(result)
return result
self._parsers[netloc].addCallback(cb)
return d
else:
return self._parsers[netloc]
def _logerror(self, failure, request, spider):
if failure.type is not IgnoreRequest:
logger.error("Error downloading %(request)s: %(f_exception)s",
{'request': request, 'f_exception': failure.value},
exc_info=failure_to_exc_info(failure),
extra={'spider': spider})
return failure
def _parse_robots(self, response, netloc, spider):
self.crawler.stats.inc_value('robotstxt/response_count')
self.crawler.stats.inc_value(f'robotstxt/response_status_count/{response.status}')
rp = self._parserimpl.from_crawler(self.crawler, response.body)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = rp
rp_dfd.callback(rp)
def _robots_error(self, failure, netloc):
if failure.type is not IgnoreRequest:
key = f'robotstxt/exception_count/{failure.type}'
self.crawler.stats.inc_value(key)
rp_dfd = self._parsers[netloc]
self._parsers[netloc] = None
rp_dfd.callback(None)
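A short settings.py fragment for this middleware; ROBOTSTXT_USER_AGENT is optional and the value shown is an assumption, while the commented line names the default parser shipped with this Scrapy version.

# settings.py -- illustrative values only
ROBOTSTXT_OBEY = True
ROBOTSTXT_USER_AGENT = 'MyCrawler'   # optional; otherwise the request's User-Agent is used
# ROBOTSTXT_PARSER = 'scrapy.robotstxt.ProtegoRobotParser'   # the default implementation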

View file

@ -0,0 +1,34 @@
from scrapy.exceptions import NotConfigured
from scrapy.utils.request import request_httprepr
from scrapy.utils.response import response_httprepr
from scrapy.utils.python import global_object_name
class DownloaderStats:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('DOWNLOADER_STATS'):
raise NotConfigured
return cls(crawler.stats)
def process_request(self, request, spider):
self.stats.inc_value('downloader/request_count', spider=spider)
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
reqlen = len(request_httprepr(request))
self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
def process_response(self, request, response, spider):
self.stats.inc_value('downloader/response_count', spider=spider)
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
reslen = len(response_httprepr(response))
self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
return response
def process_exception(self, request, exception, spider):
ex_class = global_object_name(exception.__class__)
self.stats.inc_value('downloader/exception_count', spider=spider)
self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider)

View file

@ -0,0 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals
class UserAgentMiddleware:
"""This middleware allows spiders to override the user_agent"""
def __init__(self, user_agent='Scrapy'):
self.user_agent = user_agent
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.settings['USER_AGENT'])
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
return o
def spider_opened(self, spider):
self.user_agent = getattr(spider, 'user_agent', self.user_agent)
def process_request(self, request, spider):
if self.user_agent:
request.headers.setdefault(b'User-Agent', self.user_agent)

View file

@ -0,0 +1,73 @@
import os
import logging
from scrapy.utils.job import job_dir
from scrapy.utils.request import referer_str, request_fingerprint
class BaseDupeFilter:
@classmethod
def from_settings(cls, settings):
return cls()
def request_seen(self, request):
return False
def open(self): # can return deferred
pass
def close(self, reason): # can return a deferred
pass
def log(self, request, spider): # log that a request has been filtered
pass
class RFPDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""
def __init__(self, path=None, debug=False):
self.file = None
self.fingerprints = set()
self.logdupes = True
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
self.file.seek(0)
self.fingerprints.update(x.rstrip() for x in self.file)
@classmethod
def from_settings(cls, settings):
debug = settings.getbool('DUPEFILTER_DEBUG')
return cls(job_dir(settings), debug)
def request_seen(self, request):
fp = self.request_fingerprint(request)
if fp in self.fingerprints:
return True
self.fingerprints.add(fp)
if self.file:
self.file.write(fp + '\n')
def request_fingerprint(self, request):
return request_fingerprint(request)
def close(self, reason):
if self.file:
self.file.close()
def log(self, request, spider):
if self.debug:
msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
args = {'request': request, 'referer': referer_str(request)}
self.logger.debug(msg, args, extra={'spider': spider})
elif self.logdupes:
msg = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
self.logger.debug(msg, {'request': request}, extra={'spider': spider})
self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
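A quick sketch of how the fingerprint-based filter above behaves in isolation, assuming it is importable as scrapy.dupefilters.RFPDupeFilter; the URL is illustrative.

from scrapy.http import Request
from scrapy.dupefilters import RFPDupeFilter    # assumed import path

df = RFPDupeFilter()                            # in-memory only; pass a JOBDIR path to persist
first = Request('https://example.com/page?id=1')
repeat = Request('https://example.com/page?id=1')

print(df.request_seen(first))                   # falsy: fingerprint recorded as new
print(df.request_seen(repeat))                  # True: same fingerprint, would be filtered
df.close('finished')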

View file

@ -0,0 +1,89 @@
"""
Scrapy core exceptions
These exceptions are documented in docs/topics/exceptions.rst. Please don't add
new exceptions here without documenting them there.
"""
# Internal
class NotConfigured(Exception):
"""Indicates a missing configuration situation"""
pass
class _InvalidOutput(TypeError):
"""
Indicates an invalid value has been returned by a middleware's processing method.
Internal and undocumented, it should not be raised or caught by user code.
"""
pass
# HTTP and crawling
class IgnoreRequest(Exception):
"""Indicates a decision was made not to process a request"""
class DontCloseSpider(Exception):
"""Request the spider not to be closed yet"""
pass
class CloseSpider(Exception):
"""Raise this from callbacks to request the spider to be closed"""
def __init__(self, reason='cancelled'):
super().__init__()
self.reason = reason
class StopDownload(Exception):
"""
Stop the download of the body for a given response.
The 'fail' boolean parameter indicates whether or not the resulting partial response
should be handled by the request errback. Note that 'fail' is a keyword-only argument.
"""
def __init__(self, *, fail=True):
super().__init__()
self.fail = fail
# Items
class DropItem(Exception):
"""Drop item from the item pipeline"""
pass
class NotSupported(Exception):
"""Indicates a feature or method is not supported"""
pass
# Commands
class UsageError(Exception):
"""To indicate a command-line usage error"""
def __init__(self, *a, **kw):
self.print_help = kw.pop('print_help', True)
super().__init__(*a, **kw)
class ScrapyDeprecationWarning(Warning):
"""Warning category for deprecated features, since the default
DeprecationWarning is silenced on Python 2.7+
"""
pass
class ContractFail(AssertionError):
"""Error raised in case of a failing contract"""
pass
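For illustration, a sketch of how the crawling-related exceptions above are typically raised from user code; the spider, pipeline, and conditions are hypothetical.

from scrapy import Spider
from scrapy.exceptions import CloseSpider, DropItem

class LimitedSpider(Spider):
    name = 'limited'                             # hypothetical spider

    def parse(self, response):
        if response.status == 403:
            # stops the whole crawl; the reason ends up as 'finish_reason' in the stats
            raise CloseSpider('forbidden')
        yield {'url': response.url}

class RequirePricePipeline:
    def process_item(self, item, spider):
        if not item.get('price'):
            raise DropItem('missing price')      # item is discarded by the pipeline manager
        return item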

View file

@ -0,0 +1,338 @@
"""
Item Exporters are used to export/serialize items into different formats.
"""
import csv
import io
import marshal
import pickle
import pprint
import warnings
from xml.sax.saxutils import XMLGenerator
from itemadapter import is_item, ItemAdapter
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.item import _BaseItem
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from scrapy.utils.serialize import ScrapyJSONEncoder
__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
'JsonItemExporter', 'MarshalItemExporter']
class BaseItemExporter:
def __init__(self, *, dont_fail=False, **kwargs):
self._kwargs = kwargs
self._configure(kwargs, dont_fail=dont_fail)
def _configure(self, options, dont_fail=False):
"""Configure the exporter by poping options from the ``options`` dict.
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses ``__init__`` methods)
"""
self.encoding = options.pop('encoding', None)
self.fields_to_export = options.pop('fields_to_export', None)
self.export_empty_fields = options.pop('export_empty_fields', False)
self.indent = options.pop('indent', None)
if not dont_fail and options:
raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
def export_item(self, item):
raise NotImplementedError
def serialize_field(self, field, name, value):
serializer = field.get('serializer', lambda x: x)
return serializer(value)
def start_exporting(self):
pass
def finish_exporting(self):
pass
def _get_serialized_fields(self, item, default_value=None, include_empty=None):
"""Return the fields to export as an iterable of tuples
(name, serialized_value)
"""
item = ItemAdapter(item)
if include_empty is None:
include_empty = self.export_empty_fields
if self.fields_to_export is None:
if include_empty:
field_iter = item.field_names()
else:
field_iter = item.keys()
else:
if include_empty:
field_iter = self.fields_to_export
else:
field_iter = (x for x in self.fields_to_export if x in item)
for field_name in field_iter:
if field_name in item:
field_meta = item.get_field_meta(field_name)
value = self.serialize_field(field_meta, field_name, item[field_name])
else:
value = default_value
yield field_name, value
class JsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
self._kwargs.setdefault('ensure_ascii', not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
data = self.encoder.encode(itemdict) + '\n'
self.file.write(to_bytes(data, self.encoding))
class JsonItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
# there is a small difference between the behaviour of JsonItemExporter.indent
# and ScrapyJSONEncoder.indent: ScrapyJSONEncoder.indent=None is needed to prevent
# the addition of newlines everywhere
json_indent = self.indent if self.indent is not None and self.indent > 0 else None
self._kwargs.setdefault('indent', json_indent)
self._kwargs.setdefault('ensure_ascii', not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
self.first_item = True
def _beautify_newline(self):
if self.indent is not None:
self.file.write(b'\n')
def start_exporting(self):
self.file.write(b"[")
self._beautify_newline()
def finish_exporting(self):
self._beautify_newline()
self.file.write(b"]")
def export_item(self, item):
if self.first_item:
self.first_item = False
else:
self.file.write(b',')
self._beautify_newline()
itemdict = dict(self._get_serialized_fields(item))
data = self.encoder.encode(itemdict)
self.file.write(to_bytes(data, self.encoding))
class XmlItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
self.item_element = kwargs.pop('item_element', 'item')
self.root_element = kwargs.pop('root_element', 'items')
super().__init__(**kwargs)
if not self.encoding:
self.encoding = 'utf-8'
self.xg = XMLGenerator(file, encoding=self.encoding)
def _beautify_newline(self, new_item=False):
if self.indent is not None and (self.indent > 0 or new_item):
self.xg.characters('\n')
def _beautify_indent(self, depth=1):
if self.indent:
self.xg.characters(' ' * self.indent * depth)
def start_exporting(self):
self.xg.startDocument()
self.xg.startElement(self.root_element, {})
self._beautify_newline(new_item=True)
def export_item(self, item):
self._beautify_indent(depth=1)
self.xg.startElement(self.item_element, {})
self._beautify_newline()
for name, value in self._get_serialized_fields(item, default_value=''):
self._export_xml_field(name, value, depth=2)
self._beautify_indent(depth=1)
self.xg.endElement(self.item_element)
self._beautify_newline(new_item=True)
def finish_exporting(self):
self.xg.endElement(self.root_element)
self.xg.endDocument()
def _export_xml_field(self, name, serialized_value, depth):
self._beautify_indent(depth=depth)
self.xg.startElement(name, {})
if hasattr(serialized_value, 'items'):
self._beautify_newline()
for subname, value in serialized_value.items():
self._export_xml_field(subname, value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif is_listlike(serialized_value):
self._beautify_newline()
for value in serialized_value:
self._export_xml_field('value', value, depth=depth + 1)
self._beautify_indent(depth=depth)
elif isinstance(serialized_value, str):
self.xg.characters(serialized_value)
else:
self.xg.characters(str(serialized_value))
self.xg.endElement(name)
self._beautify_newline()
class CsvItemExporter(BaseItemExporter):
def __init__(self, file, include_headers_line=True, join_multivalued=',', errors=None, **kwargs):
super().__init__(dont_fail=True, **kwargs)
if not self.encoding:
self.encoding = 'utf-8'
self.include_headers_line = include_headers_line
self.stream = io.TextIOWrapper(
file,
line_buffering=False,
write_through=True,
encoding=self.encoding,
newline='', # Windows needs this https://github.com/scrapy/scrapy/issues/3034
errors=errors,
)
self.csv_writer = csv.writer(self.stream, **self._kwargs)
self._headers_not_written = True
self._join_multivalued = join_multivalued
def serialize_field(self, field, name, value):
serializer = field.get('serializer', self._join_if_needed)
return serializer(value)
def _join_if_needed(self, value):
if isinstance(value, (list, tuple)):
try:
return self._join_multivalued.join(value)
except TypeError:  # the list may contain non-string values
pass
return value
def export_item(self, item):
if self._headers_not_written:
self._headers_not_written = False
self._write_headers_and_set_fields_to_export(item)
fields = self._get_serialized_fields(item, default_value='',
include_empty=True)
values = list(self._build_row(x for _, x in fields))
self.csv_writer.writerow(values)
def _build_row(self, values):
for s in values:
try:
yield to_unicode(s, self.encoding)
except TypeError:
yield s
def _write_headers_and_set_fields_to_export(self, item):
if self.include_headers_line:
if not self.fields_to_export:
# use declared field names, or keys if the item is a dict
self.fields_to_export = ItemAdapter(item).field_names()
row = list(self._build_row(self.fields_to_export))
self.csv_writer.writerow(row)
class PickleItemExporter(BaseItemExporter):
def __init__(self, file, protocol=4, **kwargs):
super().__init__(**kwargs)
self.file = file
self.protocol = protocol
def export_item(self, item):
d = dict(self._get_serialized_fields(item))
pickle.dump(d, self.file, self.protocol)
class MarshalItemExporter(BaseItemExporter):
"""Exports items in a Python-specific binary format (see
:mod:`marshal`).
:param file: The file-like object to use for exporting the data. Its
``write`` method should accept :class:`bytes` (a disk file
opened in binary mode, a :class:`~io.BytesIO` object, etc)
"""
def __init__(self, file, **kwargs):
super().__init__(**kwargs)
self.file = file
def export_item(self, item):
marshal.dump(dict(self._get_serialized_fields(item)), self.file)
class PprintItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(**kwargs)
self.file = file
def export_item(self, item):
itemdict = dict(self._get_serialized_fields(item))
self.file.write(to_bytes(pprint.pformat(itemdict) + '\n'))
class PythonItemExporter(BaseItemExporter):
"""This is a base class for item exporters that extends
:class:`BaseItemExporter` with support for nested items.
It serializes items to built-in Python types, so that any serialization
library (e.g. :mod:`json` or msgpack_) can be used on top of it.
.. _msgpack: https://pypi.org/project/msgpack/
"""
def _configure(self, options, dont_fail=False):
self.binary = options.pop('binary', True)
super()._configure(options, dont_fail)
if self.binary:
warnings.warn(
"PythonItemExporter will drop support for binary export in the future",
ScrapyDeprecationWarning)
if not self.encoding:
self.encoding = 'utf-8'
def serialize_field(self, field, name, value):
serializer = field.get('serializer', self._serialize_value)
return serializer(value)
def _serialize_value(self, value):
if isinstance(value, _BaseItem):
return self.export_item(value)
elif is_item(value):
return dict(self._serialize_item(value))
elif is_listlike(value):
return [self._serialize_value(v) for v in value]
encode_func = to_bytes if self.binary else to_unicode
if isinstance(value, (str, bytes)):
return encode_func(value, encoding=self.encoding)
return value
def _serialize_item(self, item):
for key, value in ItemAdapter(item).items():
key = to_bytes(key) if self.binary else key
yield key, self._serialize_value(value)
def export_item(self, item):
result = dict(self._get_serialized_fields(item))
if self.binary:
result = dict(self._serialize_item(result))
return result
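A minimal sketch of driving one of the exporters above directly, outside the feed-export machinery; the item dict is illustrative.

import io
from scrapy.exporters import JsonLinesItemExporter   # assumed import path

buf = io.BytesIO()
exporter = JsonLinesItemExporter(buf, encoding='utf-8')
exporter.start_exporting()
exporter.export_item({'title': 'Example', 'price': 9.99})
exporter.finish_exporting()
print(buf.getvalue())    # b'{"title": "Example", "price": 9.99}\n'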

View file

@ -0,0 +1,16 @@
"""
The Extension Manager
See documentation in docs/topics/extensions.rst
"""
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
class ExtensionManager(MiddlewareManager):
component_name = 'extension'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('EXTENSIONS'))
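The manager builds its component list from the EXTENSIONS setting (merged with the base setting via getwithbase). A settings sketch; the custom extension path is hypothetical.

# settings.py (sketch)
EXTENSIONS = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'myproject.extensions.MyExtension': 500,          # hypothetical extension; value is load order
    'scrapy.extensions.telnet.TelnetConsole': None,   # None disables a base extension
}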

View file

@ -0,0 +1,68 @@
"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.
See documentation in docs/topics/extensions.rst
"""
from collections import defaultdict
from scrapy import signals
from scrapy.exceptions import NotConfigured
class CloseSpider:
def __init__(self, crawler):
self.crawler = crawler
self.close_on = {
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
}
if not any(self.close_on.values()):
raise NotConfigured
self.counter = defaultdict(int)
if self.close_on.get('errorcount'):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
if self.close_on.get('pagecount'):
crawler.signals.connect(self.page_count, signal=signals.response_received)
if self.close_on.get('timeout'):
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
if self.close_on.get('itemcount'):
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def error_count(self, failure, response, spider):
self.counter['errorcount'] += 1
if self.counter['errorcount'] == self.close_on['errorcount']:
self.crawler.engine.close_spider(spider, 'closespider_errorcount')
def page_count(self, response, request, spider):
self.counter['pagecount'] += 1
if self.counter['pagecount'] == self.close_on['pagecount']:
self.crawler.engine.close_spider(spider, 'closespider_pagecount')
def spider_opened(self, spider):
from twisted.internet import reactor
self.task = reactor.callLater(self.close_on['timeout'],
self.crawler.engine.close_spider, spider,
reason='closespider_timeout')
def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
def spider_closed(self, spider):
task = getattr(self, 'task', False)
if task and task.active():
task.cancel()
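The thresholds read in __init__ above come straight from settings; a sketch enabling three of them (the values are illustrative).

# settings.py (sketch)
CLOSESPIDER_ITEMCOUNT = 500           # close after 500 scraped items
CLOSESPIDER_TIMEOUT = 2 * 60 * 60     # or after two hours, whichever comes first
CLOSESPIDER_ERRORCOUNT = 10           # or after 10 spider errors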

View file

@ -0,0 +1,46 @@
"""
Extension for collecting core stats like items scraped and start/finish times
"""
from datetime import datetime
from scrapy import signals
class CoreStats:
def __init__(self, stats):
self.stats = stats
self.start_time = None
@classmethod
def from_crawler(cls, crawler):
o = cls(crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
crawler.signals.connect(o.response_received, signal=signals.response_received)
return o
def spider_opened(self, spider):
self.start_time = datetime.utcnow()
self.stats.set_value('start_time', self.start_time, spider=spider)
def spider_closed(self, spider, reason):
finish_time = datetime.utcnow()
elapsed_time = finish_time - self.start_time
elapsed_time_seconds = elapsed_time.total_seconds()
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
self.stats.set_value('finish_time', finish_time, spider=spider)
self.stats.set_value('finish_reason', reason, spider=spider)
def item_scraped(self, item, spider):
self.stats.inc_value('item_scraped_count', spider=spider)
def response_received(self, spider):
self.stats.inc_value('response_received_count', spider=spider)
def item_dropped(self, item, spider, exception):
reason = exception.__class__.__name__
self.stats.inc_value('item_dropped_count', spider=spider)
self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider)

View file

@ -0,0 +1,64 @@
"""
Extensions for debugging Scrapy
See documentation in docs/topics/extensions.rst
"""
import sys
import signal
import logging
import traceback
import threading
from pdb import Pdb
from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs
logger = logging.getLogger(__name__)
class StackTraceDump:
def __init__(self, crawler=None):
self.crawler = crawler
try:
signal.signal(signal.SIGUSR2, self.dump_stacktrace)
signal.signal(signal.SIGQUIT, self.dump_stacktrace)
except AttributeError:
# win32 platforms don't support SIGUSR signals
pass
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def dump_stacktrace(self, signum, frame):
log_args = {
'stackdumps': self._thread_stacks(),
'enginestatus': format_engine_status(self.crawler.engine),
'liverefs': format_live_refs(),
}
logger.info("Dumping stack trace and engine status\n"
"%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
log_args, extra={'crawler': self.crawler})
def _thread_stacks(self):
id2name = dict((th.ident, th.name) for th in threading.enumerate())
dumps = ''
for id_, frame in sys._current_frames().items():
name = id2name.get(id_, '')
dump = ''.join(traceback.format_stack(frame))
dumps += f"# Thread: {name}({id_})\n{dump}\n"
return dumps
class Debugger:
def __init__(self):
try:
signal.signal(signal.SIGUSR2, self._enter_debugger)
except AttributeError:
# win32 platforms don't support SIGUSR signals
pass
def _enter_debugger(self, signum, frame):
Pdb().set_trace(frame.f_back)
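A sketch of triggering StackTraceDump from outside a running crawl on a POSIX system; how you locate the process id (an environment variable here) is purely illustrative.

import os
import signal

pid = int(os.environ.get('SCRAPY_PID', os.getpid()))   # hypothetical way to locate the process
os.kill(pid, signal.SIGQUIT)   # StackTraceDump logs thread stacks, engine status and live refs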

View file

@ -0,0 +1,480 @@
"""
Feed Exports extension
See documentation in docs/topics/feed-exports.rst
"""
import logging
import os
import re
import sys
import warnings
from datetime import datetime
from tempfile import NamedTemporaryFile
from urllib.parse import unquote, urlparse
from twisted.internet import defer, threads
from w3lib.url import file_uri_to_path
from zope.interface import implementer, Interface
from scrapy import signals
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.conf import feed_complete_default_values_from_settings
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import get_func_args, without_none_values
logger = logging.getLogger(__name__)
def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
argument_names = get_func_args(builder)
if 'feed_options' in argument_names:
kwargs['feed_options'] = feed_options
else:
warnings.warn(
"{} does not support the 'feed_options' keyword argument. Add a "
"'feed_options' parameter to its signature to remove this "
"warning. This parameter will become mandatory in a future "
"version of Scrapy."
.format(builder.__qualname__),
category=ScrapyDeprecationWarning
)
return builder(*preargs, uri, *args, **kwargs)
class IFeedStorage(Interface):
"""Interface that all Feed Storages must implement"""
def __init__(uri, *, feed_options=None):
"""Initialize the storage with the parameters given in the URI and the
feed-specific options (see :setting:`FEEDS`)"""
def open(spider):
"""Open the storage for the given spider. It must return a file-like
object that will be used for the exporters"""
def store(file):
"""Store the given file stream"""
@implementer(IFeedStorage)
class BlockingFeedStorage:
def open(self, spider):
path = spider.crawler.settings['FEED_TEMPDIR']
if path and not os.path.isdir(path):
raise OSError('Not a Directory: ' + str(path))
return NamedTemporaryFile(prefix='feed-', dir=path)
def store(self, file):
return threads.deferToThread(self._store_in_thread, file)
def _store_in_thread(self, file):
raise NotImplementedError
@implementer(IFeedStorage)
class StdoutFeedStorage:
def __init__(self, uri, _stdout=None, *, feed_options=None):
if not _stdout:
_stdout = sys.stdout.buffer
self._stdout = _stdout
if feed_options and feed_options.get('overwrite', False) is True:
logger.warning('Standard output (stdout) storage does not support '
'overwriting. To suppress this warning, remove the '
'overwrite option from your FEEDS setting, or set '
'it to False.')
def open(self, spider):
return self._stdout
def store(self, file):
pass
@implementer(IFeedStorage)
class FileFeedStorage:
def __init__(self, uri, *, feed_options=None):
self.path = file_uri_to_path(uri)
feed_options = feed_options or {}
self.write_mode = 'wb' if feed_options.get('overwrite', False) else 'ab'
def open(self, spider):
dirname = os.path.dirname(self.path)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
return open(self.path, self.write_mode)
def store(self, file):
file.close()
class S3FeedStorage(BlockingFeedStorage):
def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
feed_options=None):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
u = urlparse(uri)
self.bucketname = u.hostname
self.access_key = u.username or access_key
self.secret_key = u.password or secret_key
self.keyname = u.path[1:] # remove first "/"
self.acl = acl
import botocore.session
session = botocore.session.get_session()
self.s3_client = session.create_client(
's3', aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key)
if feed_options and feed_options.get('overwrite', True) is False:
logger.warning('S3 does not support appending to files. To '
'suppress this warning, remove the overwrite '
'option from your FEEDS setting or set it to True.')
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
access_key=crawler.settings['AWS_ACCESS_KEY_ID'],
secret_key=crawler.settings['AWS_SECRET_ACCESS_KEY'],
acl=crawler.settings['FEED_STORAGE_S3_ACL'] or None,
feed_options=feed_options,
)
def _store_in_thread(self, file):
file.seek(0)
kwargs = {'ACL': self.acl} if self.acl else {}
self.s3_client.put_object(
Bucket=self.bucketname, Key=self.keyname, Body=file,
**kwargs)
file.close()
class GCSFeedStorage(BlockingFeedStorage):
def __init__(self, uri, project_id, acl):
self.project_id = project_id
self.acl = acl
u = urlparse(uri)
self.bucket_name = u.hostname
self.blob_name = u.path[1:] # remove first "/"
@classmethod
def from_crawler(cls, crawler, uri):
return cls(
uri,
crawler.settings['GCS_PROJECT_ID'],
crawler.settings['FEED_STORAGE_GCS_ACL'] or None
)
def _store_in_thread(self, file):
file.seek(0)
from google.cloud.storage import Client
client = Client(project=self.project_id)
bucket = client.get_bucket(self.bucket_name)
blob = bucket.blob(self.blob_name)
blob.upload_from_file(file, predefined_acl=self.acl)
class FTPFeedStorage(BlockingFeedStorage):
def __init__(self, uri, use_active_mode=False, *, feed_options=None):
u = urlparse(uri)
self.host = u.hostname
self.port = int(u.port or '21')
self.username = u.username
self.password = unquote(u.password or '')
self.path = u.path
self.use_active_mode = use_active_mode
self.overwrite = not feed_options or feed_options.get('overwrite', True)
@classmethod
def from_crawler(cls, crawler, uri, *, feed_options=None):
return build_storage(
cls,
uri,
crawler.settings.getbool('FEED_STORAGE_FTP_ACTIVE'),
feed_options=feed_options,
)
def _store_in_thread(self, file):
ftp_store_file(
path=self.path, file=file, host=self.host,
port=self.port, username=self.username,
password=self.password, use_active_mode=self.use_active_mode,
overwrite=self.overwrite,
)
class _FeedSlot:
def __init__(self, file, exporter, storage, uri, format, store_empty, batch_id, uri_template):
self.file = file
self.exporter = exporter
self.storage = storage
# feed params
self.batch_id = batch_id
self.format = format
self.store_empty = store_empty
self.uri_template = uri_template
self.uri = uri
# flags
self.itemcount = 0
self._exporting = False
def start_exporting(self):
if not self._exporting:
self.exporter.start_exporting()
self._exporting = True
def finish_exporting(self):
if self._exporting:
self.exporter.finish_exporting()
self._exporting = False
class FeedExporter:
@classmethod
def from_crawler(cls, crawler):
exporter = cls(crawler)
crawler.signals.connect(exporter.open_spider, signals.spider_opened)
crawler.signals.connect(exporter.close_spider, signals.spider_closed)
crawler.signals.connect(exporter.item_scraped, signals.item_scraped)
return exporter
def __init__(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
self.feeds = {}
self.slots = []
if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
raise NotConfigured
# Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
if self.settings['FEED_URI']:
warnings.warn(
'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of '
'the `FEEDS` setting. Please see the `FEEDS` setting docs for more details',
category=ScrapyDeprecationWarning, stacklevel=2,
)
uri = str(self.settings['FEED_URI']) # handle pathlib.Path objects
feed_options = {'format': self.settings.get('FEED_FORMAT', 'jsonlines')}
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
# End: Backward compatibility for FEED_URI and FEED_FORMAT settings
# 'FEEDS' setting takes precedence over 'FEED_URI'
for uri, feed_options in self.settings.getdict('FEEDS').items():
uri = str(uri) # handle pathlib.Path objects
self.feeds[uri] = feed_complete_default_values_from_settings(feed_options, self.settings)
self.storages = self._load_components('FEED_STORAGES')
self.exporters = self._load_components('FEED_EXPORTERS')
for uri, feed_options in self.feeds.items():
if not self._storage_supported(uri, feed_options):
raise NotConfigured
if not self._settings_are_valid():
raise NotConfigured
if not self._exporter_supported(feed_options['format']):
raise NotConfigured
def open_spider(self, spider):
for uri, feed_options in self.feeds.items():
uri_params = self._get_uri_params(spider, feed_options['uri_params'])
self.slots.append(self._start_new_batch(
batch_id=1,
uri=uri % uri_params,
feed_options=feed_options,
spider=spider,
uri_template=uri,
))
def close_spider(self, spider):
deferred_list = []
for slot in self.slots:
d = self._close_slot(slot, spider)
deferred_list.append(d)
return defer.DeferredList(deferred_list) if deferred_list else None
def _close_slot(self, slot, spider):
if not slot.itemcount and not slot.store_empty:
# We need to call slot.storage.store nonetheless to get the file
# properly closed.
return defer.maybeDeferred(slot.storage.store, slot.file)
slot.finish_exporting()
logfmt = "%s %%(format)s feed (%%(itemcount)d items) in: %%(uri)s"
log_args = {'format': slot.format,
'itemcount': slot.itemcount,
'uri': slot.uri}
d = defer.maybeDeferred(slot.storage.store, slot.file)
# Use `largs=log_args` to copy log_args into function's scope
# instead of using `log_args` from the outer scope
d.addCallback(
lambda _, largs=log_args: logger.info(
logfmt % "Stored", largs, extra={'spider': spider}
)
)
d.addErrback(
lambda f, largs=log_args: logger.error(
logfmt % "Error storing", largs,
exc_info=failure_to_exc_info(f), extra={'spider': spider}
)
)
return d
def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
"""
Redirect the output data stream to a new file.
Executed multiple times if the FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
:param batch_id: sequence number of current batch
:param uri: uri of the new batch to start
:param feed_options: dict with parameters of feed
:param spider: user spider
:param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri
"""
storage = self._get_storage(uri, feed_options)
file = storage.open(spider)
exporter = self._get_exporter(
file=file,
format=feed_options['format'],
fields_to_export=feed_options['fields'],
encoding=feed_options['encoding'],
indent=feed_options['indent'],
**feed_options['item_export_kwargs'],
)
slot = _FeedSlot(
file=file,
exporter=exporter,
storage=storage,
uri=uri,
format=feed_options['format'],
store_empty=feed_options['store_empty'],
batch_id=batch_id,
uri_template=uri_template,
)
if slot.store_empty:
slot.start_exporting()
return slot
def item_scraped(self, item, spider):
slots = []
for slot in self.slots:
slot.start_exporting()
slot.exporter.export_item(item)
slot.itemcount += 1
# create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
if (
self.feeds[slot.uri_template]['batch_item_count']
and slot.itemcount >= self.feeds[slot.uri_template]['batch_item_count']
):
uri_params = self._get_uri_params(spider, self.feeds[slot.uri_template]['uri_params'], slot)
self._close_slot(slot, spider)
slots.append(self._start_new_batch(
batch_id=slot.batch_id + 1,
uri=slot.uri_template % uri_params,
feed_options=self.feeds[slot.uri_template],
spider=spider,
uri_template=slot.uri_template,
))
else:
slots.append(slot)
self.slots = slots
def _load_components(self, setting_prefix):
conf = without_none_values(self.settings.getwithbase(setting_prefix))
d = {}
for k, v in conf.items():
try:
d[k] = load_object(v)
except NotConfigured:
pass
return d
def _exporter_supported(self, format):
if format in self.exporters:
return True
logger.error("Unknown feed format: %(format)s", {'format': format})
def _settings_are_valid(self):
"""
If the FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified, the URI has to contain
%(batch_time)s or %(batch_id)d to distinguish the different files of partial output
"""
for uri_template, values in self.feeds.items():
if values['batch_item_count'] and not re.search(r'%\(batch_time\)s|%\(batch_id\)', uri_template):
logger.error(
'%(batch_time)s or %(batch_id)d must be in the feed URI ({}) if FEED_EXPORT_BATCH_ITEM_COUNT '
'setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: '
'https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count'
''.format(uri_template)
)
return False
return True
def _storage_supported(self, uri, feed_options):
scheme = urlparse(uri).scheme
if scheme in self.storages:
try:
self._get_storage(uri, feed_options)
return True
except NotConfigured as e:
logger.error("Disabled feed storage scheme: %(scheme)s. "
"Reason: %(reason)s",
{'scheme': scheme, 'reason': str(e)})
else:
logger.error("Unknown feed storage scheme: %(scheme)s",
{'scheme': scheme})
def _get_instance(self, objcls, *args, **kwargs):
return create_instance(
objcls, self.settings, getattr(self, 'crawler', None),
*args, **kwargs)
def _get_exporter(self, file, format, *args, **kwargs):
return self._get_instance(self.exporters[format], file, *args, **kwargs)
def _get_storage(self, uri, feed_options):
"""Fork of create_instance specific to feed storage classes
It supports not passing the *feed_options* parameter to classes that
do not support it, issuing a deprecation warning instead.
"""
feedcls = self.storages[urlparse(uri).scheme]
crawler = getattr(self, 'crawler', None)
def build_instance(builder, *preargs):
return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
if crawler and hasattr(feedcls, 'from_crawler'):
instance = build_instance(feedcls.from_crawler, crawler)
method_name = 'from_crawler'
elif hasattr(feedcls, 'from_settings'):
instance = build_instance(feedcls.from_settings, self.settings)
method_name = 'from_settings'
else:
instance = build_instance(feedcls)
method_name = '__new__'
if instance is None:
raise TypeError("%s.%s returned None" % (feedcls.__qualname__, method_name))
return instance
def _get_uri_params(self, spider, uri_params, slot=None):
params = {}
for k in dir(spider):
params[k] = getattr(spider, k)
utc_now = datetime.utcnow()
params['time'] = utc_now.replace(microsecond=0).isoformat().replace(':', '-')
params['batch_time'] = utc_now.isoformat().replace(':', '-')
params['batch_id'] = slot.batch_id + 1 if slot is not None else 1
uripar_function = load_object(uri_params) if uri_params else lambda x, y: None
uripar_function(params, spider)
return params
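The extension above is driven by the FEEDS setting: each key is a URI template, each value a feed_options dict. A sketch using the %(batch_id)d placeholder that _settings_are_valid requires whenever batch_item_count is set; the bucket name and field names are hypothetical.

# settings.py (sketch)
FEEDS = {
    'file:///tmp/items-%(batch_id)d.jsonl': {
        'format': 'jsonlines',
        'encoding': 'utf8',
        'batch_item_count': 1000,                # start a new file every 1000 items
    },
    's3://my-bucket/%(name)s/%(time)s.csv': {    # hypothetical bucket
        'format': 'csv',
        'fields': ['title', 'price'],
        'overwrite': True,
    },
}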

View file

@ -0,0 +1,372 @@
import gzip
import logging
import os
import pickle
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from weakref import WeakKeyDictionary
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
from scrapy.utils.request import request_fingerprint
logger = logging.getLogger(__name__)
class DummyPolicy:
def __init__(self, settings):
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]
def should_cache_request(self, request):
return urlparse_cached(request).scheme not in self.ignore_schemes
def should_cache_response(self, response, request):
return response.status not in self.ignore_http_codes
def is_cached_response_fresh(self, cachedresponse, request):
return True
def is_cached_response_valid(self, cachedresponse, response, request):
return True
class RFC2616Policy:
MAXAGE = 3600 * 24 * 365 # one year
def __init__(self, settings):
self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
self._cc_parsed = WeakKeyDictionary()
self.ignore_response_cache_controls = [
to_bytes(cc) for cc in settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')
]
def _parse_cachecontrol(self, r):
if r not in self._cc_parsed:
cch = r.headers.get(b'Cache-Control', b'')
parsed = parse_cachecontrol(cch)
if isinstance(r, Response):
for key in self.ignore_response_cache_controls:
parsed.pop(key, None)
self._cc_parsed[r] = parsed
return self._cc_parsed[r]
def should_cache_request(self, request):
if urlparse_cached(request).scheme in self.ignore_schemes:
return False
cc = self._parse_cachecontrol(request)
# obey user-agent directive "Cache-Control: no-store"
if b'no-store' in cc:
return False
# Any other is eligible for caching
return True
def should_cache_response(self, response, request):
# What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
# Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
# Status code 206 is not included because the cache cannot deal with partial content
cc = self._parse_cachecontrol(response)
# obey directive "Cache-Control: no-store"
if b'no-store' in cc:
return False
# Never cache 304 (Not Modified) responses
elif response.status == 304:
return False
# Cache unconditionally if configured to do so
elif self.always_store:
return True
# Any hint on response expiration is good
elif b'max-age' in cc or b'Expires' in response.headers:
return True
# Firefox falls back to a one-year expiration for these statuses if none is set
elif response.status in (300, 301, 308):
return True
# Other statuses without expiration info require at least one validator
elif response.status in (200, 203, 401):
return b'Last-Modified' in response.headers or b'ETag' in response.headers
# Anything else is probably not eligible for caching: it makes no sense
# to cache responses that contain no expiration info and cannot be
# revalidated
else:
return False
def is_cached_response_fresh(self, cachedresponse, request):
cc = self._parse_cachecontrol(cachedresponse)
ccreq = self._parse_cachecontrol(request)
if b'no-cache' in cc or b'no-cache' in ccreq:
return False
now = time()
freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
currentage = self._compute_current_age(cachedresponse, request, now)
reqmaxage = self._get_max_age(ccreq)
if reqmaxage is not None:
freshnesslifetime = min(freshnesslifetime, reqmaxage)
if currentage < freshnesslifetime:
return True
if b'max-stale' in ccreq and b'must-revalidate' not in cc:
# From RFC2616: "Indicates that the client is willing to
# accept a response that has exceeded its expiration time.
# If max-stale is assigned a value, then the client is
# willing to accept a response that has exceeded its
# expiration time by no more than the specified number of
# seconds. If no value is assigned to max-stale, then the
# client is willing to accept a stale response of any age."
staleage = ccreq[b'max-stale']
if staleage is None:
return True
try:
if currentage < freshnesslifetime + max(0, int(staleage)):
return True
except ValueError:
pass
# Cached response is stale, try to set validators if any
self._set_conditional_validators(request, cachedresponse)
return False
def is_cached_response_valid(self, cachedresponse, response, request):
# Use the cached response if the new response is a server error,
# as long as the old response didn't specify must-revalidate.
if response.status >= 500:
cc = self._parse_cachecontrol(cachedresponse)
if b'must-revalidate' not in cc:
return True
# Use the cached response if the server says it hasn't changed.
return response.status == 304
def _set_conditional_validators(self, request, cachedresponse):
if b'Last-Modified' in cachedresponse.headers:
request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']
if b'ETag' in cachedresponse.headers:
request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
def _get_max_age(self, cc):
try:
return max(0, int(cc[b'max-age']))
except (KeyError, ValueError):
return None
def _compute_freshness_lifetime(self, response, request, now):
# Reference nsHttpResponseHead::ComputeFreshnessLifetime
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
cc = self._parse_cachecontrol(response)
maxage = self._get_max_age(cc)
if maxage is not None:
return maxage
# Parse date header or synthesize it if none exists
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
# Try HTTP/1.0 Expires header
if b'Expires' in response.headers:
expires = rfc1123_to_epoch(response.headers[b'Expires'])
# When parsing the Expires header fails, RFC 2616 section 14.21 says we
# should treat it as an expiration time in the past.
return max(0, expires - date) if expires else 0
# Fall back to a heuristic using the Last-Modified header.
# This is not in the RFC, but it is what Firefox's caching implementation does
lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
if lastmodified and lastmodified <= date:
return (date - lastmodified) / 10
# These responses can be cached indefinitely
if response.status in (300, 301, 308):
return self.MAXAGE
# Insufficient information to compute the freshness lifetime
return 0
def _compute_current_age(self, response, request, now):
# Reference nsHttpResponseHead::ComputeCurrentAge
# https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
currentage = 0
# If the Date header is not set we assume a fast connection and a
# clock in sync with the server
date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
if now > date:
currentage = now - date
if b'Age' in response.headers:
try:
age = int(response.headers[b'Age'])
currentage = max(currentage, age)
except ValueError:
pass
return currentage
class DbmCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
self.db = None
def open_spider(self, spider):
dbpath = os.path.join(self.cachedir, f'{spider.name}.db')
self.db = self.dbmodule.open(dbpath, 'c')
logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})
def close_spider(self, spider):
self.db.close()
def retrieve_response(self, spider, request):
data = self._read_data(spider, request)
if data is None:
return # not cached
url = data['url']
status = data['status']
headers = Headers(data['headers'])
body = data['body']
respcls = responsetypes.from_args(headers=headers, url=url)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
def store_response(self, spider, request, response):
key = self._request_key(request)
data = {
'status': response.status,
'url': response.url,
'headers': dict(response.headers),
'body': response.body,
}
self.db[f'{key}_data'] = pickle.dumps(data, protocol=4)
self.db[f'{key}_time'] = str(time())
def _read_data(self, spider, request):
key = self._request_key(request)
db = self.db
tkey = f'{key}_time'
if tkey not in db:
return # not found
ts = db[tkey]
if 0 < self.expiration_secs < time() - float(ts):
return # expired
return pickle.loads(db[f'{key}_data'])
def _request_key(self, request):
return request_fingerprint(request)
class FilesystemCacheStorage:
def __init__(self, settings):
self.cachedir = data_path(settings['HTTPCACHE_DIR'])
self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
self._open = gzip.open if self.use_gzip else open
def open_spider(self, spider):
logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
extra={'spider': spider})
def close_spider(self, spider):
pass
def retrieve_response(self, spider, request):
"""Return response if present in cache, or None otherwise."""
metadata = self._read_meta(spider, request)
if metadata is None:
return # not cached
rpath = self._get_request_path(spider, request)
with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
body = f.read()
with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
rawheaders = f.read()
url = metadata.get('response_url')
status = metadata['status']
headers = Headers(headers_raw_to_dict(rawheaders))
respcls = responsetypes.from_args(headers=headers, url=url)
response = respcls(url=url, headers=headers, status=status, body=body)
return response
def store_response(self, spider, request, response):
"""Store the given response in the cache."""
rpath = self._get_request_path(spider, request)
if not os.path.exists(rpath):
os.makedirs(rpath)
metadata = {
'url': request.url,
'method': request.method,
'status': response.status,
'response_url': response.url,
'timestamp': time(),
}
with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
f.write(to_bytes(repr(metadata)))
with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
pickle.dump(metadata, f, protocol=4)
with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
f.write(headers_dict_to_raw(response.headers))
with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
f.write(response.body)
with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
f.write(headers_dict_to_raw(request.headers))
with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
f.write(request.body)
def _get_request_path(self, spider, request):
key = request_fingerprint(request)
return os.path.join(self.cachedir, spider.name, key[0:2], key)
def _read_meta(self, spider, request):
rpath = self._get_request_path(spider, request)
metapath = os.path.join(rpath, 'pickled_meta')
if not os.path.exists(metapath):
return # not found
mtime = os.stat(metapath).st_mtime
if 0 < self.expiration_secs < time() - mtime:
return # expired
with self._open(metapath, 'rb') as f:
return pickle.load(f)
def parse_cachecontrol(header):
"""Parse Cache-Control header
https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9
>>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
... b'max-age': b'3600'}
True
>>> parse_cachecontrol(b'') == {}
True
"""
directives = {}
for directive in header.split(b','):
key, sep, val = directive.strip().partition(b'=')
if key:
directives[key.lower()] = val if sep else None
return directives
def rfc1123_to_epoch(date_str):
try:
date_str = to_unicode(date_str, encoding='ascii')
return mktime_tz(parsedate_tz(date_str))
except Exception:
return None
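A sketch of wiring the policy and storage classes above into a project. HTTPCACHE_ENABLED, HTTPCACHE_POLICY and HTTPCACHE_STORAGE are read by the cache downloader middleware (not shown in this file); the remaining settings are read by the classes above.

# settings.py (sketch)
HTTPCACHE_ENABLED = True
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_EXPIRATION_SECS = 0    # 0 means entries never expire by age
HTTPCACHE_GZIP = True            # FilesystemCacheStorage then uses gzip.open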

View file

@ -0,0 +1,52 @@
import logging
from twisted.internet import task
from scrapy.exceptions import NotConfigured
from scrapy import signals
logger = logging.getLogger(__name__)
class LogStats:
"""Log basic scraping stats periodically"""
def __init__(self, stats, interval=60.0):
self.stats = stats
self.interval = interval
self.multiplier = 60.0 / self.interval
self.task = None
@classmethod
def from_crawler(cls, crawler):
interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
if not interval:
raise NotConfigured
o = cls(crawler.stats, interval)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_opened(self, spider):
self.pagesprev = 0
self.itemsprev = 0
self.task = task.LoopingCall(self.log, spider)
self.task.start(self.interval)
def log(self, spider):
items = self.stats.get_value('item_scraped_count', 0)
pages = self.stats.get_value('response_received_count', 0)
irate = (items - self.itemsprev) * self.multiplier
prate = (pages - self.pagesprev) * self.multiplier
self.pagesprev, self.itemsprev = pages, items
msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
"scraped %(items)d items (at %(itemrate)d items/min)")
log_args = {'pages': pages, 'pagerate': prate,
'items': items, 'itemrate': irate}
logger.info(msg, log_args, extra={'spider': spider})
def spider_closed(self, spider, reason):
if self.task and self.task.running:
self.task.stop()

View file

@ -0,0 +1,33 @@
"""
MemoryDebugger extension
See documentation in docs/topics/extensions.rst
"""
import gc
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs
class MemoryDebugger:
def __init__(self, stats):
self.stats = stats
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('MEMDEBUG_ENABLED'):
raise NotConfigured
o = cls(crawler.stats)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider, reason):
gc.collect()
self.stats.set_value('memdebug/gc_garbage_count', len(gc.garbage), spider=spider)
for cls, wdict in live_refs.items():
if not wdict:
continue
self.stats.set_value(f'memdebug/live_refs/{cls.__name__}', len(wdict), spider=spider)

View file

@ -0,0 +1,126 @@
"""
MemoryUsage extension
See documentation in docs/topics/extensions.rst
"""
import sys
import socket
import logging
from pprint import pformat
from importlib import import_module
from twisted.internet import task
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status
logger = logging.getLogger(__name__)
class MemoryUsage:
def __init__(self, crawler):
if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
raise NotConfigured
try:
# stdlib's resource module is only available on unix platforms.
self.resource = import_module('resource')
except ImportError:
raise NotConfigured
self.crawler = crawler
self.warned = False
self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB')*1024*1024
self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB')*1024*1024
self.check_interval = crawler.settings.getfloat('MEMUSAGE_CHECK_INTERVAL_SECONDS')
self.mail = MailSender.from_settings(crawler.settings)
crawler.signals.connect(self.engine_started, signal=signals.engine_started)
crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def get_virtual_size(self):
size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
if sys.platform != 'darwin':
# on macOS ru_maxrss is in bytes, on Linux it is in KB
size *= 1024
return size
def engine_started(self):
self.crawler.stats.set_value('memusage/startup', self.get_virtual_size())
self.tasks = []
tsk = task.LoopingCall(self.update)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
if self.limit:
tsk = task.LoopingCall(self._check_limit)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
if self.warning:
tsk = task.LoopingCall(self._check_warning)
self.tasks.append(tsk)
tsk.start(self.check_interval, now=True)
def engine_stopped(self):
for tsk in self.tasks:
if tsk.running:
tsk.stop()
def update(self):
self.crawler.stats.max_value('memusage/max', self.get_virtual_size())
def _check_limit(self):
if self.get_virtual_size() > self.limit:
self.crawler.stats.set_value('memusage/limit_reached', 1)
mem = self.limit/1024/1024
logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} terminated: "
f"memory usage exceeded {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/limit_notified', 1)
open_spiders = self.crawler.engine.open_spiders
if open_spiders:
for spider in open_spiders:
self.crawler.engine.close_spider(spider, 'memusage_exceeded')
else:
self.crawler.stop()
def _check_warning(self):
if self.warned: # warn only once
return
if self.get_virtual_size() > self.warning:
self.crawler.stats.set_value('memusage/warning_reached', 1)
mem = self.warning/1024/1024
logger.warning("Memory usage reached %(memusage)dM",
{'memusage': mem}, extra={'crawler': self.crawler})
if self.notify_mails:
subj = (
f"{self.crawler.settings['BOT_NAME']} warning: "
f"memory usage reached {mem}M at {socket.gethostname()}"
)
self._send_report(self.notify_mails, subj)
self.crawler.stats.set_value('memusage/warning_notified', 1)
self.warned = True
def _send_report(self, rcpts, subject):
"""send notification mail with some additional useful info"""
stats = self.crawler.stats
s = f"Memory usage at engine startup : {stats.get_value('memusage/startup')/1024/1024}M\r\n"
s += f"Maximum memory usage : {stats.get_value('memusage/max')/1024/1024}M\r\n"
s += f"Current memory usage : {self.get_virtual_size()/1024/1024}M\r\n"
s += "ENGINE STATUS ------------------------------------------------------- \r\n"
s += "\r\n"
s += pformat(get_engine_status(self.crawler.engine))
s += "\r\n"
self.mail.send(rcpts, subject, s)
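The extension above is entirely settings-driven; a sketch of a typical configuration (the limits and address are illustrative).

# settings.py (sketch)
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 2048                       # hard limit: close spiders / stop the crawler
MEMUSAGE_WARNING_MB = 1536                     # soft limit: warn (and optionally mail) once
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0
MEMUSAGE_NOTIFY_MAIL = ['ops@example.com']     # hypothetical recipient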

View file

@ -0,0 +1,40 @@
import os
import pickle
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir
class SpiderState:
"""Store and load spider state during a scraping job"""
def __init__(self, jobdir=None):
self.jobdir = jobdir
@classmethod
def from_crawler(cls, crawler):
jobdir = job_dir(crawler.settings)
if not jobdir:
raise NotConfigured
obj = cls(jobdir)
crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
return obj
def spider_closed(self, spider):
if self.jobdir:
with open(self.statefn, 'wb') as f:
pickle.dump(spider.state, f, protocol=4)
def spider_opened(self, spider):
if self.jobdir and os.path.exists(self.statefn):
with open(self.statefn, 'rb') as f:
spider.state = pickle.load(f)
else:
spider.state = {}
@property
def statefn(self):
return os.path.join(self.jobdir, 'spider.state')
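With a JOBDIR configured, spider.state is the dict persisted by the extension above; a sketch of a spider using it to survive restarts (the spider name is hypothetical).

from scrapy import Spider

class ResumableSpider(Spider):
    name = 'resumable'                  # hypothetical spider
    start_urls = ['https://example.com']

    def parse(self, response):
        # persisted to <JOBDIR>/spider.state between runs by SpiderState
        self.state['pages_seen'] = self.state.get('pages_seen', 0) + 1
        yield {'url': response.url}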

View file

@ -0,0 +1,34 @@
"""
StatsMailer extension sends an email when a spider finishes scraping.
Use the STATSMAILER_RCPTS setting to enable it and to set the recipient mail addresses
"""
from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
class StatsMailer:
def __init__(self, stats, recipients, mail):
self.stats = stats
self.recipients = recipients
self.mail = mail
@classmethod
def from_crawler(cls, crawler):
recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
if not recipients:
raise NotConfigured
mail = MailSender.from_settings(crawler.settings)
o = cls(crawler.stats, recipients, mail)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
return o
def spider_closed(self, spider):
spider_stats = self.stats.get_stats(spider)
body = "Global stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
body += f"\n\n{spider.name} stats\n\n"
body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)
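A configuration sketch for the extension above; the MAIL_* settings are consumed by MailSender (not shown in this file), and the addresses are illustrative.

# settings.py (sketch)
STATSMAILER_RCPTS = ['ops@example.com']   # enables the extension
MAIL_HOST = 'smtp.example.com'            # used by MailSender.from_settings
MAIL_FROM = 'scrapy@example.com'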

View file

@ -0,0 +1,114 @@
"""
Scrapy Telnet Console extension
See documentation in docs/topics/telnetconsole.rst
"""
import pprint
import logging
import traceback
import binascii
import os
from twisted.internet import protocol
try:
from twisted.conch import manhole, telnet
from twisted.conch.insults import insults
TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
_TWISTED_CONCH_TRACEBACK = traceback.format_exc()
TWISTED_CONCH_AVAILABLE = False
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.trackref import print_live_refs
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.decorators import defers
logger = logging.getLogger(__name__)
# signal to update telnet variables
# args: telnet_vars
update_telnet_vars = object()
class TelnetConsole(protocol.ServerFactory):
def __init__(self, crawler):
if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
raise NotConfigured
if not TWISTED_CONCH_AVAILABLE:
raise NotConfigured(
'TELNETCONSOLE_ENABLED setting is True but required twisted '
'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
self.crawler = crawler
self.noisy = False
self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
self.host = crawler.settings['TELNETCONSOLE_HOST']
self.username = crawler.settings['TELNETCONSOLE_USERNAME']
self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
if not self.password:
self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
logger.info('Telnet Password: %s', self.password)
self.crawler.signals.connect(self.start_listening, signals.engine_started)
self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def start_listening(self):
self.port = listen_tcp(self.portrange, self.host, self)
h = self.port.getHost()
logger.info("Telnet console listening on %(host)s:%(port)d",
{'host': h.host, 'port': h.port},
extra={'crawler': self.crawler})
def stop_listening(self):
self.port.stopListening()
def protocol(self):
class Portal:
"""An implementation of IPortal"""
@defers
def login(self_, credentials, mind, *interfaces):
if not (
credentials.username == self.username.encode('utf8')
and credentials.checkPassword(self.password.encode('utf8'))
):
raise ValueError("Invalid credentials")
protocol = telnet.TelnetBootstrapProtocol(
insults.ServerProtocol,
manhole.Manhole,
self._get_telnet_vars()
)
return (interfaces[0], protocol, lambda: None)
return telnet.TelnetTransport(
telnet.AuthenticatingTelnetProtocol,
Portal()
)
def _get_telnet_vars(self):
# Note: if you add entries here also update topics/telnetconsole.rst
telnet_vars = {
'engine': self.crawler.engine,
'spider': self.crawler.engine.spider,
'slot': self.crawler.engine.slot,
'crawler': self.crawler,
'extensions': self.crawler.extensions,
'stats': self.crawler.stats,
'settings': self.crawler.settings,
'est': lambda: print_engine_status(self.crawler.engine),
'p': pprint.pprint,
'prefs': print_live_refs,
'help': "This is Scrapy telnet console. For more info see: "
"https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
}
self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
return telnet_vars
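Other extensions can extend the telnet namespace by connecting to the update_telnet_vars signal defined above; a minimal sketch (the extension and variable names are hypothetical, and it would be registered via the EXTENSIONS setting).

from scrapy.extensions.telnet import update_telnet_vars   # assumed import path

class TelnetVarsExtension:
    def __init__(self, crawler):
        self.crawler = crawler
        crawler.signals.connect(self.add_vars, signal=update_telnet_vars)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def add_vars(self, telnet_vars):
        # shows up as `greeting` inside a telnet session
        telnet_vars['greeting'] = 'hello from TelnetVarsExtension'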

View file

@ -0,0 +1,93 @@
import logging
from scrapy.exceptions import NotConfigured
from scrapy import signals
logger = logging.getLogger(__name__)
class AutoThrottle:
def __init__(self, crawler):
self.crawler = crawler
if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
raise NotConfigured
self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)
@classmethod
def from_crawler(cls, crawler):
return cls(crawler)
def _spider_opened(self, spider):
self.mindelay = self._min_delay(spider)
self.maxdelay = self._max_delay(spider)
spider.download_delay = self._start_delay(spider)
def _min_delay(self, spider):
s = self.crawler.settings
return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))
def _max_delay(self, spider):
return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')
def _start_delay(self, spider):
return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))
def _response_downloaded(self, response, request, spider):
key, slot = self._get_slot(request, spider)
latency = request.meta.get('download_latency')
if latency is None or slot is None:
return
olddelay = slot.delay
self._adjust_delay(slot, latency, response)
if self.debug:
diff = slot.delay - olddelay
size = len(response.body)
conc = len(slot.transferring)
logger.info(
"slot: %(slot)s | conc:%(concurrency)2d | "
"delay:%(delay)5d ms (%(delaydiff)+d) | "
"latency:%(latency)5d ms | size:%(size)6d bytes",
{
'slot': key, 'concurrency': conc,
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
'latency': latency * 1000, 'size': size
},
extra={'spider': spider}
)
def _get_slot(self, request, spider):
key = request.meta.get('download_slot')
return key, self.crawler.engine.downloader.slots.get(key)
def _adjust_delay(self, slot, latency, response):
"""Define delay adjustment policy"""
# If a server needs `latency` seconds to respond then
# we should send a request each `latency/N` seconds
# to have N requests processed in parallel
target_delay = latency / self.target_concurrency
# Adjust the delay to make it closer to target_delay
new_delay = (slot.delay + target_delay) / 2.0
# If target delay is bigger than old delay, then use it instead of mean.
# It works better with problematic sites.
new_delay = max(target_delay, new_delay)
        # Make sure self.mindelay <= new_delay <= self.maxdelay
new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, which would provoke
        # a positive feedback loop by reducing the delay instead of increasing it.
if response.status != 200 and new_delay <= slot.delay:
return
slot.delay = new_delay
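
# Worked example of the policy above (illustrative numbers, not executed by
# Scrapy): AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0, a measured latency of 0.6 s,
# a current slot delay of 0.2 s, mindelay = 0.1 s and maxdelay = 60.0 s.
if __name__ == '__main__':
    latency, target_concurrency = 0.6, 2.0
    slot_delay, mindelay, maxdelay = 0.2, 0.1, 60.0
    target_delay = latency / target_concurrency          # 0.3 s
    new_delay = (slot_delay + target_delay) / 2.0         # 0.25 s
    new_delay = max(target_delay, new_delay)              # 0.3 s, the target wins
    new_delay = min(max(mindelay, new_delay), maxdelay)   # stays inside the bounds
    print(new_delay)                                      # 0.3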

View file

@ -0,0 +1,18 @@
"""
Module containing all HTTP related classes
Use this module (instead of the more specific ones) when importing Headers,
Request and Response outside this module.
"""
from scrapy.http.headers import Headers
from scrapy.http.request import Request
from scrapy.http.request.form import FormRequest
from scrapy.http.request.rpc import XmlRpcRequest
from scrapy.http.request.json_request import JsonRequest
from scrapy.http.response import Response
from scrapy.http.response.html import HtmlResponse
from scrapy.http.response.xml import XmlResponse
from scrapy.http.response.text import TextResponse

View file

@ -0,0 +1,6 @@
def obsolete_setter(setter, attrname):
def newsetter(self, value):
c = self.__class__.__name__
msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
raise AttributeError(msg)
return newsetter
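
# Usage sketch (illustrative): this is how Request and Response wire the helper
# into read-only properties; the Example class below is hypothetical.
if __name__ == '__main__':
    class Example:
        def _get_name(self):
            return self._name

        def _set_name(self, value):
            self._name = value

        name = property(_get_name, obsolete_setter(_set_name, 'name'))

    obj = Example()
    obj._set_name('first')
    try:
        obj.name = 'second'
    except AttributeError as error:
        print(error)  # Example.name is not modifiable, use Example.replace() instead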

View file

@ -0,0 +1,191 @@
import time
from http.cookiejar import CookieJar as _CookieJar, DefaultCookiePolicy, IPV4_RE
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
class CookieJar:
def __init__(self, policy=None, check_expired_frequency=10000):
self.policy = policy or DefaultCookiePolicy()
self.jar = _CookieJar(self.policy)
self.jar._cookies_lock = _DummyLock()
self.check_expired_frequency = check_expired_frequency
self.processed = 0
def extract_cookies(self, response, request):
wreq = WrappedRequest(request)
wrsp = WrappedResponse(response)
return self.jar.extract_cookies(wrsp, wreq)
def add_cookie_header(self, request):
wreq = WrappedRequest(request)
self.policy._now = self.jar._now = int(time.time())
# the cookiejar implementation iterates through all domains
# instead we restrict to potential matches on the domain
req_host = urlparse_cached(request).hostname
if not req_host:
return
if not IPV4_RE.search(req_host):
hosts = potential_domain_matches(req_host)
if '.' not in req_host:
hosts += [req_host + ".local"]
else:
hosts = [req_host]
cookies = []
for host in hosts:
if host in self.jar._cookies:
cookies += self.jar._cookies_for_domain(host, wreq)
attrs = self.jar._cookie_attrs(cookies)
if attrs:
if not wreq.has_header("Cookie"):
wreq.add_unredirected_header("Cookie", "; ".join(attrs))
self.processed += 1
if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for a large number of cookies
self.jar.clear_expired_cookies()
@property
def _cookies(self):
return self.jar._cookies
def clear_session_cookies(self, *args, **kwargs):
return self.jar.clear_session_cookies(*args, **kwargs)
def clear(self, domain=None, path=None, name=None):
return self.jar.clear(domain, path, name)
def __iter__(self):
return iter(self.jar)
def __len__(self):
return len(self.jar)
def set_policy(self, pol):
return self.jar.set_policy(pol)
def make_cookies(self, response, request):
wreq = WrappedRequest(request)
wrsp = WrappedResponse(response)
return self.jar.make_cookies(wrsp, wreq)
def set_cookie(self, cookie):
self.jar.set_cookie(cookie)
def set_cookie_if_ok(self, cookie, request):
self.jar.set_cookie_if_ok(cookie, WrappedRequest(request))
def potential_domain_matches(domain):
"""Potential domain matches for a cookie
>>> potential_domain_matches('www.example.com')
['www.example.com', 'example.com', '.www.example.com', '.example.com']
"""
matches = [domain]
try:
start = domain.index('.') + 1
end = domain.rindex('.')
while start < end:
matches.append(domain[start:])
start = domain.index('.', start) + 1
except ValueError:
pass
return matches + ['.' + d for d in matches]
class _DummyLock:
def acquire(self):
pass
def release(self):
pass
class WrappedRequest:
"""Wraps a scrapy Request class with methods defined by urllib2.Request class to interact with CookieJar class
see http://docs.python.org/library/urllib2.html#urllib2.Request
"""
def __init__(self, request):
self.request = request
def get_full_url(self):
return self.request.url
def get_host(self):
return urlparse_cached(self.request).netloc
def get_type(self):
return urlparse_cached(self.request).scheme
def is_unverifiable(self):
"""Unverifiable should indicate whether the request is unverifiable, as defined by RFC 2965.
It defaults to False. An unverifiable request is one whose URL the user did not have the
option to approve. For example, if the request is for an image in an
HTML document, and the user had no option to approve the automatic
fetching of the image, this should be true.
"""
return self.request.meta.get('is_unverifiable', False)
def get_origin_req_host(self):
return urlparse_cached(self.request).hostname
# python3 uses attributes instead of methods
@property
def full_url(self):
return self.get_full_url()
@property
def host(self):
return self.get_host()
@property
def type(self):
return self.get_type()
@property
def unverifiable(self):
return self.is_unverifiable()
@property
def origin_req_host(self):
return self.get_origin_req_host()
def has_header(self, name):
return name in self.request.headers
def get_header(self, name, default=None):
return to_unicode(self.request.headers.get(name, default),
errors='replace')
def header_items(self):
return [
(to_unicode(k, errors='replace'),
[to_unicode(x, errors='replace') for x in v])
for k, v in self.request.headers.items()
]
def add_unredirected_header(self, name, value):
self.request.headers.appendlist(name, value)
class WrappedResponse:
def __init__(self, response):
self.response = response
def info(self):
return self
def get_all(self, name, default=None):
return [to_unicode(v, errors='replace')
for v in self.response.headers.getlist(name)]
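
# Usage sketch (illustrative, not part of this module): round-tripping a cookie
# from a response into a follow-up request; the URL and cookie are examples.
if __name__ == '__main__':
    from scrapy.http import Request, Response

    jar = CookieJar()
    request = Request('https://example.com/login')
    response = Response(
        'https://example.com/login',
        headers={'Set-Cookie': 'session=abc123; Path=/'},
        request=request,
    )
    jar.extract_cookies(response, request)   # store cookies set by the response
    follow_up = Request('https://example.com/account')
    jar.add_cookie_header(follow_up)         # re-attach them to the next request
    print(follow_up.headers.get('Cookie'))   # expected: b'session=abc123'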

View file

@ -0,0 +1,89 @@
from w3lib.http import headers_dict_to_raw
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.python import to_unicode
class Headers(CaselessDict):
"""Case insensitive http headers dictionary"""
def __init__(self, seq=None, encoding='utf-8'):
self.encoding = encoding
super().__init__(seq)
def normkey(self, key):
"""Normalize key to bytes"""
return self._tobytes(key.title())
def normvalue(self, value):
"""Normalize values to bytes"""
if value is None:
value = []
elif isinstance(value, (str, bytes)):
value = [value]
elif not hasattr(value, '__iter__'):
value = [value]
return [self._tobytes(x) for x in value]
def _tobytes(self, x):
if isinstance(x, bytes):
return x
elif isinstance(x, str):
return x.encode(self.encoding)
elif isinstance(x, int):
return str(x).encode(self.encoding)
else:
raise TypeError(f'Unsupported value type: {type(x)}')
def __getitem__(self, key):
try:
return super().__getitem__(key)[-1]
except IndexError:
return None
def get(self, key, def_val=None):
try:
return super().get(key, def_val)[-1]
except IndexError:
return None
def getlist(self, key, def_val=None):
try:
return super().__getitem__(key)
except KeyError:
if def_val is not None:
return self.normvalue(def_val)
return []
def setlist(self, key, list_):
self[key] = list_
def setlistdefault(self, key, default_list=()):
return self.setdefault(key, default_list)
def appendlist(self, key, value):
lst = self.getlist(key)
lst.extend(self.normvalue(value))
self[key] = lst
def items(self):
return ((k, self.getlist(k)) for k in self.keys())
def values(self):
return [self[k] for k in self.keys()]
def to_string(self):
return headers_dict_to_raw(self)
def to_unicode_dict(self):
""" Return headers as a CaselessDict with unicode keys
and unicode values. Multiple values are joined with ','.
"""
return CaselessDict(
(to_unicode(key, encoding=self.encoding),
to_unicode(b','.join(value), encoding=self.encoding))
for key, value in self.items())
def __copy__(self):
return self.__class__(self)
copy = __copy__
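
# Usage sketch (illustrative): keys are case-insensitive and every value is
# stored as a list of bytes, so multi-valued headers survive intact.
if __name__ == '__main__':
    h = Headers({'Content-Type': 'text/html'})
    h.appendlist('Set-Cookie', 'a=1')
    h.appendlist('set-cookie', 'b=2')           # same header, different case
    print(h['content-type'])                    # b'text/html'
    print(h.getlist('Set-Cookie'))              # [b'a=1', b'b=2']
    print(h.to_unicode_dict()['Set-Cookie'])    # a=1,b=2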

View file

@ -0,0 +1,143 @@
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from w3lib.url import safe_url_string
from scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.http.common import obsolete_setter
from scrapy.utils.curl import curl_to_request_kwargs
class Request(object_ref):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, cb_kwargs=None):
self._encoding = encoding # this one has to be set first
self.method = str(method).upper()
self._set_url(url)
self._set_body(body)
if not isinstance(priority, int):
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
if errback is not None and not callable(errback):
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
self.callback = callback
self.errback = errback
self.cookies = cookies or {}
self.headers = Headers(headers or {}, encoding=encoding)
self.dont_filter = dont_filter
self._meta = dict(meta) if meta else None
self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
self.flags = [] if flags is None else list(flags)
@property
def cb_kwargs(self):
if self._cb_kwargs is None:
self._cb_kwargs = {}
return self._cb_kwargs
@property
def meta(self):
if self._meta is None:
self._meta = {}
return self._meta
def _get_url(self):
return self._url
def _set_url(self, url):
if not isinstance(url, str):
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
if (
'://' not in self._url
and not self._url.startswith('about:')
and not self._url.startswith('data:')
):
raise ValueError(f'Missing scheme in request url: {self._url}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
else:
self._body = to_bytes(body, self.encoding)
body = property(_get_body, obsolete_setter(_set_body, 'body'))
@property
def encoding(self):
return self._encoding
def __str__(self):
return f"<{self.method} {self.url}>"
__repr__ = __str__
def copy(self):
"""Return a copy of this Request"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Request with the same attributes except for those
given new values.
"""
for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
@classmethod
def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs):
"""Create a Request object from a string containing a `cURL
<https://curl.haxx.se/>`_ command. It populates the HTTP method, the
URL, the headers, the cookies and the body. It accepts the same
arguments as the :class:`Request` class, taking preference and
overriding the values of the same arguments contained in the cURL
command.
Unrecognized options are ignored by default. To raise an error when
finding unknown options call this method by passing
``ignore_unknown_options=False``.
.. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
subclasses, such as :class:`~scrapy.http.JSONRequest`, or
:class:`~scrapy.http.XmlRpcRequest`, as well as having
:ref:`downloader middlewares <topics-downloader-middleware>`
and
:ref:`spider middlewares <topics-spider-middleware>`
enabled, such as
:class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
:class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
or
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
may modify the :class:`~scrapy.http.Request` object.
To translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
"""
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
request_kwargs.update(kwargs)
return cls(**request_kwargs)
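
# Usage sketch (illustrative, not part of this module): the curl command below
# is an example; unrecognized curl options would be ignored by default.
if __name__ == '__main__':
    req = Request.from_curl(
        "curl 'https://httpbin.org/post' -X POST "
        "-H 'Content-Type: application/json' "
        "--data '{\"query\": \"scrapy\"}'"
    )
    print(req.method)                     # POST
    print(req.headers[b'Content-Type'])   # b'application/json'
    print(req.body)                       # b'{"query": "scrapy"}'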

View file

@ -0,0 +1,215 @@
"""
This module implements the FormRequest class which is a more convenient class
(than Request) to generate Requests based on form data.
See documentation in docs/topics/request-response.rst
"""
from urllib.parse import urljoin, urlencode
import lxml.html
from parsel.selector import create_root_node
from w3lib.html import strip_html5_whitespace
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
class FormRequest(Request):
valid_form_methods = ['GET', 'POST']
def __init__(self, *args, **kwargs):
formdata = kwargs.pop('formdata', None)
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
if formdata:
items = formdata.items() if isinstance(formdata, dict) else formdata
querystr = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
self._set_body(querystr)
else:
self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
@classmethod
def from_response(cls, response, formname=None, formid=None, formnumber=0, formdata=None,
clickdata=None, dont_click=False, formxpath=None, formcss=None, **kwargs):
kwargs.setdefault('encoding', response.encoding)
if formcss is not None:
from parsel.csstranslator import HTMLTranslator
formxpath = HTMLTranslator().css_to_xpath(formcss)
form = _get_form(response, formname, formid, formnumber, formxpath)
formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
url = _get_form_url(form, kwargs.pop('url', None))
method = kwargs.pop('method', form.method)
if method is not None:
method = method.upper()
if method not in cls.valid_form_methods:
method = 'GET'
return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form, url):
if url is None:
action = form.get('action')
if action is None:
return form.base_url
return urljoin(form.base_url, strip_html5_whitespace(action))
return urljoin(form.base_url, url)
def _urlencode(seq, enc):
values = [(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])]
return urlencode(values, doseq=1)
def _get_form(response, formname, formid, formnumber, formxpath):
"""Find the form element """
root = create_root_node(response.text, lxml.html.HTMLParser,
base_url=get_base_url(response))
forms = root.xpath('//form')
if not forms:
raise ValueError(f"No <form> element found in {response}")
if formname is not None:
f = root.xpath(f'//form[@name="{formname}"]')
if f:
return f[0]
if formid is not None:
f = root.xpath(f'//form[@id="{formid}"]')
if f:
return f[0]
# Get form element from xpath, if not found, go up
if formxpath is not None:
nodes = root.xpath(formxpath)
if nodes:
el = nodes[0]
while True:
if el.tag == 'form':
return el
el = el.getparent()
if el is None:
break
raise ValueError(f'No <form> element found with {formxpath}')
# If we get here, it means that either formname was None
# or invalid
if formnumber is not None:
try:
form = forms[formnumber]
except IndexError:
raise IndexError(f"Form number {formnumber} not found in {response}")
else:
return form
def _get_inputs(form, formdata, dont_click, clickdata, response):
try:
formdata_keys = dict(formdata or ()).keys()
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')
if not formdata:
formdata = ()
inputs = form.xpath('descendant::textarea'
'|descendant::select'
'|descendant::input[not(@type) or @type['
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
' and (../@checked or'
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={
"re": "http://exslt.org/regular-expressions"})
values = [(k, '' if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata_keys]
if not dont_click:
clickable = _get_clickable(clickdata, form)
        if clickable and clickable[0] not in formdata and clickable[0] is not None:
values.append(clickable)
if isinstance(formdata, dict):
formdata = formdata.items()
values.extend((k, v) for k, v in formdata if v is not None)
return values
def _value(ele):
n = ele.name
v = ele.value
if ele.tag == 'select':
return _select_value(ele, n, v)
return n, v
def _select_value(ele, n, v):
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
        # And for select tags without options
o = ele.value_options
return (n, o[0]) if o else (None, None)
elif v is not None and multiple:
        # This is a workaround for a bug in lxml, fixed in 2.3.1
# fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
selected_options = ele.xpath('.//option[@selected]')
v = [(o.get('value') or o.text or '').strip() for o in selected_options]
return n, v
def _get_clickable(clickdata, form):
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
clickables = list(form.xpath(
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"}
))
if not clickables:
return
# If we don't have clickdata, we just use the first clickable element
if clickdata is None:
el = clickables[0]
return (el.get('name'), el.get('value') or '')
# If clickdata is given, we compare it to the clickable elements to find a
# match. We first look to see if the number is specified in clickdata,
# because that uniquely identifies the element
nr = clickdata.get('nr', None)
if nr is not None:
try:
el = list(form.inputs)[nr]
except IndexError:
pass
else:
return (el.get('name'), el.get('value') or '')
# We didn't find it, so now we build an XPath expression out of the other
# arguments, because they can be used as such
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
el = form.xpath(xpath)
if len(el) == 1:
return (el[0].get('name'), el[0].get('value') or '')
elif len(el) > 1:
raise ValueError(f"Multiple elements found ({el!r}) matching the "
f"criteria in clickdata: {clickdata!r}")
else:
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
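
# Usage sketch (illustrative, hypothetical spider callback): from_response picks
# the form, keeps its hidden inputs and overrides the fields given in formdata.
# The form id and field names below are assumptions about the target page.
def _example_parse_login_page(response):
    return FormRequest.from_response(
        response,
        formid='login-form',
        formdata={'user': 'jane', 'password': 's3cret'},
    )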

View file

@ -0,0 +1,57 @@
"""
This module implements the JsonRequest class which is a more convenient class
(than Request) to generate JSON Requests.
See documentation in docs/topics/request-response.rst
"""
import copy
import json
import warnings
from scrapy.http.request import Request
from scrapy.utils.deprecate import create_deprecated_class
class JsonRequest(Request):
def __init__(self, *args, **kwargs):
dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {}))
dumps_kwargs.setdefault('sort_keys', True)
self._dumps_kwargs = dumps_kwargs
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
if 'method' not in kwargs:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'application/json')
self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
def replace(self, *args, **kwargs):
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
return super().replace(*args, **kwargs)
def _dumps(self, data):
"""Convert to JSON """
return json.dumps(data, **self._dumps_kwargs)
JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
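
# Usage sketch (illustrative): passing ``data`` serializes it with json.dumps
# (keys sorted) and sets the JSON Content-Type/Accept headers automatically.
if __name__ == '__main__':
    req = JsonRequest('https://api.example.com/search',
                      data={'query': 'scrapy', 'page': 1})
    print(req.method)                     # POST
    print(req.headers[b'Content-Type'])   # b'application/json'
    print(req.body)                       # b'{"page": 1, "query": "scrapy"}'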

View file

@ -0,0 +1,35 @@
"""
This module implements the XmlRpcRequest class which is a more convenient class
(than Request) to generate XML-RPC requests.
See documentation in docs/topics/request-response.rst
"""
import xmlrpc.client as xmlrpclib
from scrapy.http.request import Request
from scrapy.utils.python import get_func_args
DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
def __init__(self, *args, **kwargs):
encoding = kwargs.get('encoding', None)
if 'body' not in kwargs and 'params' in kwargs:
kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
kwargs['body'] = xmlrpclib.dumps(**kw)
# spec defines that requests must use POST method
kwargs.setdefault('method', 'POST')
        # xmlrpc queries multiple times over the same url
kwargs.setdefault('dont_filter', True)
# restore encoding
if encoding is not None:
kwargs['encoding'] = encoding
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'text/xml')
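
# Usage sketch (illustrative): ``params`` and the other xmlrpc.client.dumps
# arguments become the XML body; the URL and method name below are examples.
if __name__ == '__main__':
    req = XmlRpcRequest(
        'https://bugs.example.com/xmlrpc',
        methodname='ticket.search',
        params=('scrapy telnet',),
    )
    print(req.method)                     # POST
    print(req.headers[b'Content-Type'])   # b'text/xml'
    print(b'ticket.search' in req.body)   # True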

View file

@ -0,0 +1,196 @@
"""
This module implements the Response class which is used to represent HTTP
responses in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from typing import Generator
from urllib.parse import urljoin
from scrapy.exceptions import NotSupported
from scrapy.http.common import obsolete_setter
from scrapy.http.headers import Headers
from scrapy.http.request import Request
from scrapy.link import Link
from scrapy.utils.trackref import object_ref
class Response(object_ref):
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
request=None, certificate=None, ip_address=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
self.certificate = certificate
self.ip_address = ip_address
@property
def cb_kwargs(self):
try:
return self.request.cb_kwargs
except AttributeError:
raise AttributeError(
"Response.cb_kwargs not available, this response "
"is not tied to any request"
)
@property
def meta(self):
try:
return self.request.meta
except AttributeError:
raise AttributeError(
"Response.meta not available, this response "
"is not tied to any request"
)
def _get_url(self):
return self._url
def _set_url(self, url):
if isinstance(url, str):
self._url = url
else:
raise TypeError(f'{type(self).__name__} url must be str, '
f'got {type(url).__name__}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
elif not isinstance(body, bytes):
raise TypeError(
"Response body must be bytes. "
"If you want to pass unicode body use TextResponse "
"or HtmlResponse.")
else:
self._body = body
body = property(_get_body, obsolete_setter(_set_body, 'body'))
def __str__(self):
return f"<{self.status} {self.url}>"
__repr__ = __str__
def copy(self):
"""Return a copy of this Response"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Response with the same attributes except for those
given new values.
"""
for x in ['url', 'status', 'headers', 'body',
'request', 'flags', 'certificate', 'ip_address']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
return urljoin(self.url, url)
@property
def text(self):
"""For subclasses of TextResponse, this will return the body
as str
"""
raise AttributeError("Response content isn't text")
def css(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def xpath(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Request
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
not only an absolute URL.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
.. versionadded:: 2.0
The *flags* parameter.
"""
if isinstance(url, Link):
url = url.url
elif url is None:
raise ValueError("url can't be None")
url = self.urljoin(url)
return Request(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Generator[Request, None, None]
"""
.. versionadded:: 2.0
Return an iterable of :class:`~.Request` instances to follow all links
in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
not only absolute URLs.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow_all`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
"""
if not hasattr(urls, '__iter__'):
raise TypeError("'urls' argument must be an iterable")
return (
self.follow(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
for url in urls
)
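
# Usage sketch (illustrative, hypothetical spider callback): follow_all yields
# one Request per URL, each resolved against response.url with urljoin().
def _example_parse(response):
    yield from response.follow_all(
        ['page/2', 'page/3'],             # relative URLs are allowed
        callback=_example_parse,
    )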

View file

@ -0,0 +1,12 @@
"""
This module implements the HtmlResponse class which adds encoding
discovering through HTML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
from scrapy.http.response.text import TextResponse
class HtmlResponse(TextResponse):
pass

View file

@ -0,0 +1,265 @@
"""
This module implements the TextResponse class which adds encoding handling and
discovering (through HTTP headers) to base Response class.
See documentation in docs/topics/request-response.rst
"""
import json
import warnings
from contextlib import suppress
from typing import Generator
from urllib.parse import urljoin
import parsel
from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
http_content_type_encoding, resolve_encoding)
from w3lib.html import strip_html5_whitespace
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url
_NONE = object()
class TextResponse(Response):
_DEFAULT_ENCODING = 'ascii'
_cached_decoded_json = _NONE
def __init__(self, *args, **kwargs):
self._encoding = kwargs.pop('encoding', None)
self._cached_benc = None
self._cached_ubody = None
self._cached_selector = None
super().__init__(*args, **kwargs)
def _set_url(self, url):
if isinstance(url, str):
self._url = to_unicode(url, self.encoding)
else:
super()._set_url(url)
def _set_body(self, body):
self._body = b'' # used by encoding detection
if isinstance(body, str):
if self._encoding is None:
raise TypeError('Cannot convert unicode body - '
f'{type(self).__name__} has no encoding')
self._body = body.encode(self._encoding)
else:
super()._set_body(body)
def replace(self, *args, **kwargs):
kwargs.setdefault('encoding', self.encoding)
return Response.replace(self, *args, **kwargs)
@property
def encoding(self):
return self._declared_encoding() or self._body_inferred_encoding()
def _declared_encoding(self):
return (
self._encoding
or self._headers_encoding()
or self._body_declared_encoding()
)
def body_as_unicode(self):
"""Return body as unicode"""
warnings.warn('Response.body_as_unicode() is deprecated, '
'please use Response.text instead.',
ScrapyDeprecationWarning, stacklevel=2)
return self.text
def json(self):
"""
.. versionadded:: 2.2
Deserialize a JSON document to a Python object.
"""
if self._cached_decoded_json is _NONE:
self._cached_decoded_json = json.loads(self.text)
return self._cached_decoded_json
@property
def text(self):
""" Body as unicode """
# access self.encoding before _cached_ubody to make sure
# _body_inferred_encoding is called
benc = self.encoding
if self._cached_ubody is None:
charset = f'charset={benc}'
self._cached_ubody = html_to_unicode(charset, self.body)[1]
return self._cached_ubody
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
return urljoin(get_base_url(self), url)
@memoizemethod_noargs
def _headers_encoding(self):
content_type = self.headers.get(b'Content-Type', b'')
return http_content_type_encoding(to_unicode(content_type))
def _body_inferred_encoding(self):
if self._cached_benc is None:
content_type = to_unicode(self.headers.get(b'Content-Type', b''))
benc, ubody = html_to_unicode(content_type, self.body,
auto_detect_fun=self._auto_detect_fun,
default_encoding=self._DEFAULT_ENCODING)
self._cached_benc = benc
self._cached_ubody = ubody
return self._cached_benc
def _auto_detect_fun(self, text):
for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
try:
text.decode(enc)
except UnicodeError:
continue
return resolve_encoding(enc)
@memoizemethod_noargs
def _body_declared_encoding(self):
return html_body_declared_encoding(self.body)
@property
def selector(self):
from scrapy.selector import Selector
if self._cached_selector is None:
self._cached_selector = Selector(self)
return self._cached_selector
def xpath(self, query, **kwargs):
return self.selector.xpath(query, **kwargs)
def css(self, query):
return self.selector.css(query)
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Request
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be not only an absolute URL, but also
* a relative URL
* a :class:`~scrapy.link.Link` object, e.g. the result of
:ref:`topics-link-extractors`
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
``response.css('a.my_link')[0]``
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
``response.css('a::attr(href)')[0]`` or
``response.xpath('//img/@src')[0]``
See :ref:`response-follow-example` for usage examples.
"""
if isinstance(url, parsel.Selector):
url = _url_from_selector(url)
elif isinstance(url, parsel.SelectorList):
raise ValueError("SelectorList is not supported")
encoding = self.encoding if encoding is None else encoding
return super().follow(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
def follow_all(self, urls=None, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None,
css=None, xpath=None):
# type: (...) -> Generator[Request, None, None]
"""
A generator that produces :class:`~.Request` instances to follow all
links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
``__init__`` method, except that each ``urls`` element does not need to be
an absolute URL, it can be any of the following:
* a relative URL
* a :class:`~scrapy.link.Link` object, e.g. the result of
:ref:`topics-link-extractors`
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
``response.css('a.my_link')[0]``
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
``response.css('a::attr(href)')[0]`` or
``response.xpath('//img/@src')[0]``
In addition, ``css`` and ``xpath`` arguments are accepted to perform the link extraction
within the ``follow_all`` method (only one of ``urls``, ``css`` and ``xpath`` is accepted).
Note that when passing a ``SelectorList`` as argument for the ``urls`` parameter or
using the ``css`` or ``xpath`` parameters, this method will not produce requests for
selectors from which links cannot be obtained (for instance, anchor tags without an
``href`` attribute)
"""
arguments = [x for x in (urls, css, xpath) if x is not None]
if len(arguments) != 1:
raise ValueError(
"Please supply exactly one of the following arguments: urls, css, xpath"
)
if not urls:
if css:
urls = self.css(css)
if xpath:
urls = self.xpath(xpath)
if isinstance(urls, parsel.SelectorList):
selectors = urls
urls = []
for sel in selectors:
with suppress(_InvalidSelector):
urls.append(_url_from_selector(sel))
return super().follow_all(
urls=urls,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
class _InvalidSelector(ValueError):
"""
Raised when a URL cannot be obtained from a Selector
"""
def _url_from_selector(sel):
# type: (parsel.Selector) -> str
if isinstance(sel.root, str):
# e.g. ::attr(href) result
return strip_html5_whitespace(sel.root)
if not hasattr(sel.root, 'tag'):
raise _InvalidSelector(f"Unsupported selector: {sel}")
if sel.root.tag not in ('a', 'link'):
raise _InvalidSelector("Only <a> and <link> elements are supported; "
f"got <{sel.root.tag}>")
href = sel.root.get('href')
if href is None:
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
return strip_html5_whitespace(href)
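
# Usage sketch (illustrative, hypothetical spider callback and page structure):
# TextResponse adds the .css()/.xpath() shortcuts and can follow <a> selectors.
def _example_parse(response):
    for product in response.css('div.product'):
        yield {
            'name': product.css('h2::text').get(),
            'price': product.xpath('.//span[@class="price"]/text()').get(),
        }
    # follow every "next page" link; selectors without an href are skipped
    yield from response.follow_all(css='a.next-page', callback=_example_parse)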

View file

@ -0,0 +1,12 @@
"""
This module implements the XmlResponse class which adds encoding
discovering through XML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
from scrapy.http.response.text import TextResponse
class XmlResponse(TextResponse):
pass

View file

@ -0,0 +1,18 @@
from zope.interface import Interface
class ISpiderLoader(Interface):
def from_settings(settings):
"""Return an instance of the class for the given settings"""
def load(spider_name):
"""Return the Spider class for the given spider name. If the spider
name is not found, it must raise a KeyError."""
def list():
"""Return a list with the names of all spiders available in the
project"""
def find_by_request(request):
"""Return the list of spiders names that can handle the given request"""

View file

@ -0,0 +1,158 @@
"""
Scrapy Item
See documentation in docs/topics/item.rst
"""
from abc import ABCMeta
from collections.abc import MutableMapping
from copy import deepcopy
from pprint import pformat
from warnings import warn
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.trackref import object_ref
class _BaseItem(object_ref):
"""
Temporary class used internally to avoid the deprecation
warning raised by isinstance checks using BaseItem.
"""
pass
class _BaseItemMeta(ABCMeta):
def __instancecheck__(cls, instance):
if cls is BaseItem:
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
ScrapyDeprecationWarning, stacklevel=2)
return super().__instancecheck__(instance)
class BaseItem(_BaseItem, metaclass=_BaseItemMeta):
"""
Deprecated, please use :class:`scrapy.item.Item` instead
"""
def __new__(cls, *args, **kwargs):
if issubclass(cls, BaseItem) and not issubclass(cls, (Item, DictItem)):
warn('scrapy.item.BaseItem is deprecated, please use scrapy.item.Item instead',
ScrapyDeprecationWarning, stacklevel=2)
return super().__new__(cls, *args, **kwargs)
class Field(dict):
"""Container of field metadata"""
class ItemMeta(_BaseItemMeta):
"""Metaclass_ of :class:`Item` that handles field definitions.
.. _metaclass: https://realpython.com/python-metaclasses
"""
def __new__(mcs, class_name, bases, attrs):
classcell = attrs.pop('__classcell__', None)
new_bases = tuple(base._class for base in bases if hasattr(base, '_class'))
_class = super().__new__(mcs, 'x_' + class_name, new_bases, attrs)
fields = getattr(_class, 'fields', {})
new_attrs = {}
for n in dir(_class):
v = getattr(_class, n)
if isinstance(v, Field):
fields[n] = v
elif n in attrs:
new_attrs[n] = attrs[n]
new_attrs['fields'] = fields
new_attrs['_class'] = _class
if classcell is not None:
new_attrs['__classcell__'] = classcell
return super().__new__(mcs, class_name, bases, new_attrs)
class DictItem(MutableMapping, BaseItem):
fields = {}
def __new__(cls, *args, **kwargs):
if issubclass(cls, DictItem) and not issubclass(cls, Item):
warn('scrapy.item.DictItem is deprecated, please use scrapy.item.Item instead',
ScrapyDeprecationWarning, stacklevel=2)
return super().__new__(cls, *args, **kwargs)
def __init__(self, *args, **kwargs):
self._values = {}
if args or kwargs: # avoid creating dict for most common case
for k, v in dict(*args, **kwargs).items():
self[k] = v
def __getitem__(self, key):
return self._values[key]
def __setitem__(self, key, value):
if key in self.fields:
self._values[key] = value
else:
raise KeyError(f"{self.__class__.__name__} does not support field: {key}")
def __delitem__(self, key):
del self._values[key]
def __getattr__(self, name):
if name in self.fields:
raise AttributeError(f"Use item[{name!r}] to get field value")
raise AttributeError(name)
def __setattr__(self, name, value):
if not name.startswith('_'):
raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
super().__setattr__(name, value)
def __len__(self):
return len(self._values)
def __iter__(self):
return iter(self._values)
__hash__ = BaseItem.__hash__
def keys(self):
return self._values.keys()
def __repr__(self):
return pformat(dict(self))
def copy(self):
return self.__class__(self)
def deepcopy(self):
"""Return a :func:`~copy.deepcopy` of this item.
"""
return deepcopy(self)
class Item(DictItem, metaclass=ItemMeta):
"""
Base class for scraped items.
In Scrapy, an object is considered an ``item`` if it is an instance of either
:class:`Item` or :class:`dict`, or any subclass. For example, when the output of a
spider callback is evaluated, only instances of :class:`Item` or
:class:`dict` are passed to :ref:`item pipelines <topics-item-pipeline>`.
If you need instances of a custom class to be considered items by Scrapy,
you must inherit from either :class:`Item` or :class:`dict`.
Items must declare :class:`Field` attributes, which are processed and stored
in the ``fields`` attribute. This restricts the set of allowed field names
and prevents typos, raising ``KeyError`` when referring to undefined fields.
Additionally, fields can be used to define metadata and control the way
data is processed internally. Please refer to the :ref:`documentation
about fields <topics-items-fields>` for additional information.
Unlike instances of :class:`dict`, instances of :class:`Item` may be
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
"""

View file

@ -0,0 +1,53 @@
"""
This module defines the Link object used in Link extractors.
For actual link extractors implementation see scrapy.linkextractors, or
its documentation in: docs/topics/link-extractors.rst
"""
class Link:
"""Link objects represent an extracted link by the LinkExtractor.
Using the anchor tag sample below to illustrate the parameters::
<a href="https://example.com/nofollow.html#foo" rel="nofollow">Dont follow this one</a>
:param url: the absolute url being linked to in the anchor tag.
From the sample, this is ``https://example.com/nofollow.html``.
:param text: the text in the anchor tag. From the sample, this is ``Dont follow this one``.
:param fragment: the part of the url after the hash symbol. From the sample, this is ``foo``.
:param nofollow: an indication of the presence or absence of a nofollow value in the ``rel`` attribute
of the anchor tag.
"""
__slots__ = ['url', 'text', 'fragment', 'nofollow']
def __init__(self, url, text='', fragment='', nofollow=False):
if not isinstance(url, str):
got = url.__class__.__name__
raise TypeError(f"Link urls must be str objects, got {got}")
self.url = url
self.text = text
self.fragment = fragment
self.nofollow = nofollow
def __eq__(self, other):
return (
self.url == other.url
and self.text == other.text
and self.fragment == other.fragment
and self.nofollow == other.nofollow
)
def __hash__(self):
return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
def __repr__(self):
return (
f'Link(url={self.url!r}, text={self.text!r}, '
f'fragment={self.fragment!r}, nofollow={self.nofollow!r})'
)

View file

@ -0,0 +1,136 @@
"""
scrapy.linkextractors
This package contains a collection of Link Extractors.
For more info see docs/topics/link-extractors.rst
"""
import re
from urllib.parse import urlparse
from warnings import warn
from parsel.csstranslator import HTMLTranslator
from w3lib.url import canonicalize_url
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.url import (
url_is_from_any_domain, url_has_any_extension,
)
# common file extensions that are not followed if they occur in links
IGNORED_EXTENSIONS = [
# archives
'7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
# video
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
'm4a', 'm4v', 'flv', 'webm',
# office suites
'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
'odp',
# other
'css', 'pdf', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
]
_re_type = type(re.compile("", 0))
def _matches(url, regexs):
return any(r.search(url) for r in regexs)
def _is_valid_url(url):
return url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
class FilteringLinkExtractor:
_csstranslator = HTMLTranslator()
def __new__(cls, *args, **kwargs):
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
if issubclass(cls, FilteringLinkExtractor) and not issubclass(cls, LxmlLinkExtractor):
warn('scrapy.linkextractors.FilteringLinkExtractor is deprecated, '
'please use scrapy.linkextractors.LinkExtractor instead',
ScrapyDeprecationWarning, stacklevel=2)
return super().__new__(cls)
def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
restrict_xpaths, canonicalize, deny_extensions, restrict_css, restrict_text):
self.link_extractor = link_extractor
self.allow_res = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(allow)]
self.deny_res = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(deny)]
self.allow_domains = set(arg_to_iter(allow_domains))
self.deny_domains = set(arg_to_iter(deny_domains))
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
arg_to_iter(restrict_css)))
self.canonicalize = canonicalize
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
self.restrict_text = [x if isinstance(x, _re_type) else re.compile(x)
for x in arg_to_iter(restrict_text)]
def _link_allowed(self, link):
if not _is_valid_url(link.url):
return False
if self.allow_res and not _matches(link.url, self.allow_res):
return False
if self.deny_res and _matches(link.url, self.deny_res):
return False
parsed_url = urlparse(link.url)
if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
return False
if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
return False
if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
return False
if self.restrict_text and not _matches(link.text, self.restrict_text):
return False
return True
def matches(self, url):
if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
return False
if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
return False
allowed = (regex.search(url) for regex in self.allow_res) if self.allow_res else [True]
denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
return any(allowed) and not any(denied)
def _process_links(self, links):
links = [x for x in links if self._link_allowed(x)]
if self.canonicalize:
for link in links:
link.url = canonicalize_url(link.url)
links = self.link_extractor._process_links(links)
return links
def _extract_links(self, *args, **kwargs):
return self.link_extractor._extract_links(*args, **kwargs)
# Top-level imports
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor as LinkExtractor

View file

@ -0,0 +1,164 @@
"""
Link extractor based on lxml.html
"""
import operator
from functools import partial
from urllib.parse import urljoin
import lxml.etree as etree
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
from scrapy.link import Link
from scrapy.linkextractors import FilteringLinkExtractor
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
# from lxml/src/lxml/html/__init__.py
XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
_collect_string_content = etree.XPath("string()")
def _nons(tag):
if isinstance(tag, str):
if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
return tag.split('}')[-1]
return tag
def _identity(x):
return x
def _canonicalize_link_url(link):
return canonicalize_url(link.url, keep_fragments=True)
class LxmlParserLinkExtractor:
def __init__(
self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False
):
self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
self.process_attr = process if callable(process) else _identity
self.unique = unique
self.strip = strip
self.link_key = operator.attrgetter("url") if canonicalized else _canonicalize_link_url
def _iter_links(self, document):
for el in document.iter(etree.Element):
if not self.scan_tag(_nons(el.tag)):
continue
attribs = el.attrib
for attrib in attribs:
if not self.scan_attr(attrib):
continue
yield (el, attrib, attribs[attrib])
def _extract_links(self, selector, response_url, response_encoding, base_url):
links = []
# hacky way to get the underlying lxml parsed document
for el, attr, attr_val in self._iter_links(selector.root):
# pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
try:
if self.strip:
attr_val = strip_html5_whitespace(attr_val)
attr_val = urljoin(base_url, attr_val)
except ValueError:
continue # skipping bogus links
else:
url = self.process_attr(attr_val)
if url is None:
continue
url = safe_url_string(url, encoding=response_encoding)
# to fix relative links after process_value
url = urljoin(response_url, url)
link = Link(url, _collect_string_content(el) or '',
nofollow=rel_has_nofollow(el.get('rel')))
links.append(link)
return self._deduplicate_if_needed(links)
def extract_links(self, response):
base_url = get_base_url(response)
return self._extract_links(response.selector, response.url, response.encoding, base_url)
def _process_links(self, links):
""" Normalize and filter extracted links
        The subclass should override it if necessary
"""
return self._deduplicate_if_needed(links)
def _deduplicate_if_needed(self, links):
if self.unique:
return unique_list(links, key=self.link_key)
return links
class LxmlLinkExtractor(FilteringLinkExtractor):
def __init__(
self,
allow=(),
deny=(),
allow_domains=(),
deny_domains=(),
restrict_xpaths=(),
tags=('a', 'area'),
attrs=('href',),
canonicalize=False,
unique=True,
process_value=None,
deny_extensions=None,
restrict_css=(),
strip=True,
restrict_text=None,
):
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
lx = LxmlParserLinkExtractor(
tag=partial(operator.contains, tags),
attr=partial(operator.contains, attrs),
unique=unique,
process=process_value,
strip=strip,
canonicalized=canonicalize
)
super().__init__(
link_extractor=lx,
allow=allow,
deny=deny,
allow_domains=allow_domains,
deny_domains=deny_domains,
restrict_xpaths=restrict_xpaths,
restrict_css=restrict_css,
canonicalize=canonicalize,
deny_extensions=deny_extensions,
restrict_text=restrict_text,
)
def extract_links(self, response):
"""Returns a list of :class:`~scrapy.link.Link` objects from the
specified :class:`response <scrapy.http.Response>`.
Only links that match the settings passed to the ``__init__`` method of
the link extractor are returned.
Duplicate links are omitted.
"""
base_url = get_base_url(response)
if self.restrict_xpaths:
docs = [
subdoc
for x in self.restrict_xpaths
for subdoc in response.xpath(x)
]
else:
docs = [response.selector]
all_links = []
for doc in docs:
links = self._extract_links(doc, response.url, response.encoding, base_url)
all_links.extend(self._process_links(links))
return unique_list(all_links)
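
# Usage sketch (illustrative, hypothetical spider callback): LxmlLinkExtractor
# is what scrapy.linkextractors.LinkExtractor points to; the patterns below are
# examples, not defaults.
def _example_parse(response):
    link_extractor = LxmlLinkExtractor(allow=r'/articles/',
                                       allow_domains=['example.com'])
    for link in link_extractor.extract_links(response):
        yield response.follow(link, callback=_example_parse)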

View file

@ -0,0 +1,88 @@
"""
Item Loader
See documentation in docs/topics/loaders.rst
"""
import itemloaders
from scrapy.item import Item
from scrapy.selector import Selector
class ItemLoader(itemloaders.ItemLoader):
"""
A user-friendly abstraction to populate an :ref:`item <topics-items>` with data
by applying :ref:`field processors <topics-loaders-processors>` to scraped data.
When instantiated with a ``selector`` or a ``response`` it supports
data extraction from web pages using :ref:`selectors <topics-selectors>`.
:param item: The item instance to populate using subsequent calls to
:meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
or :meth:`~ItemLoader.add_value`.
:type item: scrapy.item.Item
:param selector: The selector to extract data from, when using the
:meth:`add_xpath`, :meth:`add_css`, :meth:`replace_xpath`, or
:meth:`replace_css` method.
:type selector: :class:`~scrapy.selector.Selector` object
:param response: The response used to construct the selector using the
:attr:`default_selector_class`, unless the selector argument is given,
in which case this argument is ignored.
:type response: :class:`~scrapy.http.Response` object
If no item is given, one is instantiated automatically using the class in
:attr:`default_item_class`.
The item, selector, response and remaining keyword arguments are
assigned to the Loader context (accessible through the :attr:`context` attribute).
.. attribute:: item
The item object being parsed by this Item Loader.
This is mostly used as a property so, when attempting to override this
value, you may want to check out :attr:`default_item_class` first.
.. attribute:: context
The currently active :ref:`Context <loaders-context>` of this Item Loader.
.. attribute:: default_item_class
An :ref:`item <topics-items>` class (or factory), used to instantiate
items when not given in the ``__init__`` method.
.. attribute:: default_input_processor
The default input processor to use for those fields which don't specify
one.
.. attribute:: default_output_processor
The default output processor to use for those fields which don't specify
one.
.. attribute:: default_selector_class
The class used to construct the :attr:`selector` of this
:class:`ItemLoader`, if only a response is given in the ``__init__`` method.
If a selector is given in the ``__init__`` method this attribute is ignored.
This attribute is sometimes overridden in subclasses.
.. attribute:: selector
The :class:`~scrapy.selector.Selector` object to extract data from.
It's either the selector given in the ``__init__`` method or one created from
the response given in the ``__init__`` method using the
:attr:`default_selector_class`. This attribute is meant to be
read-only.
"""
default_item_class = Item
default_selector_class = Selector
def __init__(self, item=None, selector=None, response=None, parent=None, **context):
if selector is None and response is not None:
selector = self.default_selector_class(response)
context.update(response=response)
super().__init__(item=item, selector=selector, parent=parent, **context)
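
# Usage sketch (illustrative, hypothetical spider callback and selectors): the
# loader gathers values through its processors and assembles the item at the
# end; a plain dict is used as the item to keep the example self-contained.
def _example_parse(response):
    loader = ItemLoader(item={}, response=response)
    loader.add_css('name', 'h1::text')
    loader.add_xpath('price', '//span[@class="price"]/text()')
    loader.add_value('url', response.url)
    return loader.load_item()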

View file

@ -0,0 +1,21 @@
"""Common functions used in Item Loaders code"""
import warnings
from itemloaders import common
from scrapy.utils.deprecate import ScrapyDeprecationWarning
def wrap_loader_context(function, context):
"""Wrap functions that receive loader_context to contain the context
"pre-loaded" and expose a interface that receives only one argument
"""
warnings.warn(
"scrapy.loader.common.wrap_loader_context has moved to a new library."
"Please update your reference to itemloaders.common.wrap_loader_context",
ScrapyDeprecationWarning,
stacklevel=2
)
return common.wrap_loader_context(function, context)

View file

@ -0,0 +1,21 @@
"""
This module provides some commonly used processors for Item Loaders.
See documentation in docs/topics/loaders.rst
"""
from itemloaders import processors
from scrapy.utils.deprecate import create_deprecated_class
MapCompose = create_deprecated_class('MapCompose', processors.MapCompose)
Compose = create_deprecated_class('Compose', processors.Compose)
TakeFirst = create_deprecated_class('TakeFirst', processors.TakeFirst)
Identity = create_deprecated_class('Identity', processors.Identity)
SelectJmes = create_deprecated_class('SelectJmes', processors.SelectJmes)
Join = create_deprecated_class('Join', processors.Join)

View file

@ -0,0 +1,147 @@
import os
import logging
from twisted.python.failure import Failure
from scrapy.utils.request import referer_str
SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
ITEMERRORMSG = "Error processing %(item)s"
SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class LogFormatter:
"""Class for generating log messages for different actions.
All methods must return a dictionary listing the parameters ``level``, ``msg``
and ``args`` which are going to be used for constructing the log message when
calling ``logging.log``.
Dictionary keys for the method outputs:
* ``level`` is the log level for that action. You can use the levels from the
`python logging library <https://docs.python.org/3/library/logging.html>`_:
``logging.DEBUG``, ``logging.INFO``, ``logging.WARNING``, ``logging.ERROR``
and ``logging.CRITICAL``.
* ``msg`` should be a string that can contain different formatting placeholders.
This string, formatted with the provided ``args``, is going to be the long message
for that action.
* ``args`` should be a tuple or dict with the formatting placeholders for ``msg``.
The final log message is computed as ``msg % args``.
Users can define their own ``LogFormatter`` class if they want to customize how
each action is logged or if they want to omit it entirely. In order to omit
logging an action the method must return ``None``.
Here is an example of how to create a custom log formatter to lower the severity level of
the log message when an item is dropped from the pipeline::
class PoliteLogFormatter(logformatter.LogFormatter):
def dropped(self, item, exception, response, spider):
return {
'level': logging.INFO, # lowering the level from logging.WARNING
'msg': "Dropped: %(exception)s" + os.linesep + "%(item)s",
'args': {
'exception': exception,
'item': item,
}
}
"""
def crawled(self, request, response, spider):
"""Logs a message when the crawler finds a webpage."""
request_flags = f' {str(request.flags)}' if request.flags else ''
response_flags = f' {str(response.flags)}' if response.flags else ''
return {
'level': logging.DEBUG,
'msg': CRAWLEDMSG,
'args': {
'status': response.status,
'request': request,
'request_flags': request_flags,
'referer': referer_str(request),
'response_flags': response_flags,
# backward compatibility with Scrapy log formatters before version 1.4
'flags': response_flags
}
}
def scraped(self, item, response, spider):
"""Logs a message when an item is scraped by a spider."""
if isinstance(response, Failure):
src = response.getErrorMessage()
else:
src = response
return {
'level': logging.DEBUG,
'msg': SCRAPEDMSG,
'args': {
'src': src,
'item': item,
}
}
def dropped(self, item, exception, response, spider):
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
return {
'level': logging.WARNING,
'msg': DROPPEDMSG,
'args': {
'exception': exception,
'item': item,
}
}
def item_error(self, item, exception, response, spider):
"""Logs a message when an item causes an error while it is passing
through the item pipeline.
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': ITEMERRORMSG,
'args': {
'item': item,
}
}
def spider_error(self, failure, request, response, spider):
"""Logs an error message from a spider.
.. versionadded:: 2.0
"""
return {
'level': logging.ERROR,
'msg': SPIDERERRORMSG,
'args': {
'request': request,
'referer': referer_str(request),
}
}
def download_error(self, failure, request, spider, errmsg=None):
"""Logs a download error message from a spider (typically coming from
the engine).
.. versionadded:: 2.0
"""
args = {'request': request}
if errmsg:
msg = DOWNLOADERRORMSG_LONG
args['errmsg'] = errmsg
else:
msg = DOWNLOADERRORMSG_SHORT
return {
'level': logging.ERROR,
'msg': msg,
'args': args,
}
@classmethod
def from_crawler(cls, crawler):
return cls()
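A custom formatter such as the ``PoliteLogFormatter`` from the docstring above is usually enabled through the ``LOG_FORMATTER`` setting; a minimal sketch (the ``myproject`` module path is hypothetical):

# myproject/logformatter.py
import logging
import os

from scrapy import logformatter


class PoliteLogFormatter(logformatter.LogFormatter):
    def dropped(self, item, exception, response, spider):
        # lower dropped-item messages from WARNING to INFO
        return {
            'level': logging.INFO,
            'msg': "Dropped: %(exception)s" + os.linesep + "%(item)s",
            'args': {'exception': exception, 'item': item},
        }


# myproject/settings.py
LOG_FORMATTER = 'myproject.logformatter.PoliteLogFormatter'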


@ -0,0 +1,140 @@
"""
Mail sending helpers
See documentation in docs/topics/email.rst
"""
import logging
from email import encoders as Encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.utils import COMMASPACE, formatdate
from io import BytesIO
from twisted.internet import defer, ssl
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import to_bytes
logger = logging.getLogger(__name__)
def _to_bytes_or_none(text):
if text is None:
return None
return to_bytes(text)
class MailSender:
def __init__(
self, smtphost='localhost', mailfrom='scrapy@localhost', smtpuser=None,
smtppass=None, smtpport=25, smtptls=False, smtpssl=False, debug=False
):
self.smtphost = smtphost
self.smtpport = smtpport
self.smtpuser = _to_bytes_or_none(smtpuser)
self.smtppass = _to_bytes_or_none(smtppass)
self.smtptls = smtptls
self.smtpssl = smtpssl
self.mailfrom = mailfrom
self.debug = debug
@classmethod
def from_settings(cls, settings):
return cls(
smtphost=settings['MAIL_HOST'],
mailfrom=settings['MAIL_FROM'],
smtpuser=settings['MAIL_USER'],
smtppass=settings['MAIL_PASS'],
smtpport=settings.getint('MAIL_PORT'),
smtptls=settings.getbool('MAIL_TLS'),
smtpssl=settings.getbool('MAIL_SSL'),
)
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', charset=None, _callback=None):
from twisted.internet import reactor
if attachs:
msg = MIMEMultipart()
else:
msg = MIMENonMultipart(*mimetype.split('/', 1))
to = list(arg_to_iter(to))
cc = list(arg_to_iter(cc))
msg['From'] = self.mailfrom
msg['To'] = COMMASPACE.join(to)
msg['Date'] = formatdate(localtime=True)
msg['Subject'] = subject
rcpts = to[:]
if cc:
rcpts.extend(cc)
msg['Cc'] = COMMASPACE.join(cc)
if charset:
msg.set_charset(charset)
if attachs:
msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
for attach_name, mimetype, f in attachs:
part = MIMEBase(*mimetype.split('/'))
part.set_payload(f.read())
Encoders.encode_base64(part)
part.add_header('Content-Disposition', 'attachment', filename=attach_name)
msg.attach(part)
else:
msg.set_payload(body)
if _callback:
_callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
if self.debug:
logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': len(attachs)})
return
dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
dfd.addCallbacks(
callback=self._sent_ok,
errback=self._sent_failed,
callbackArgs=[to, cc, subject, len(attachs)],
errbackArgs=[to, cc, subject, len(attachs)],
)
reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
return dfd
def _sent_ok(self, result, to, cc, subject, nattachs):
logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs})
def _sent_failed(self, failure, to, cc, subject, nattachs):
errstr = str(failure.value)
logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s '
'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
'- %(mailerr)s',
{'mailto': to, 'mailcc': cc, 'mailsubject': subject,
'mailattachs': nattachs, 'mailerr': errstr})
def _sendmail(self, to_addrs, msg):
# twisted.mail is imported here, lazily, so this module can be imported even if it is not installed
from twisted.internet import reactor
from twisted.mail.smtp import ESMTPSenderFactory
msg = BytesIO(msg)
d = defer.Deferred()
factory = ESMTPSenderFactory(
self.smtpuser, self.smtppass, self.mailfrom, to_addrs, msg, d,
heloFallback=True, requireAuthentication=False, requireTransportSecurity=self.smtptls,
)
factory.noisy = False
if self.smtpssl:
reactor.connectSSL(self.smtphost, self.smtpport, factory, ssl.ClientContextFactory())
else:
reactor.connectTCP(self.smtphost, self.smtpport, factory)
return d
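A brief usage sketch for ``MailSender`` (host, addresses and credentials below are placeholders; inside a running crawl the instance is usually built with ``MailSender.from_settings(crawler.settings)`` instead):

from scrapy.mail import MailSender

mailer = MailSender(
    smtphost='smtp.example.com', mailfrom='scrapy@example.com',
    smtpuser='user', smtppass='secret', smtpport=587, smtptls=True,
)

# send() returns a Deferred; attachments are (name, mimetype, file-like) tuples
dfd = mailer.send(
    to=['someone@example.com'],
    subject='Crawl finished',
    body='All items scraped.',
    cc=['team@example.com'],
)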


@ -0,0 +1,75 @@
from collections import defaultdict, deque
import logging
import pprint
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both
logger = logging.getLogger(__name__)
class MiddlewareManager:
"""Base class for implementing middleware managers"""
component_name = 'foo middleware'
def __init__(self, *middlewares):
self.middlewares = middlewares
self.methods = defaultdict(deque)
for mw in middlewares:
self._add_middleware(mw)
@classmethod
def _get_mwlist_from_settings(cls, settings):
raise NotImplementedError
@classmethod
def from_settings(cls, settings, crawler=None):
mwlist = cls._get_mwlist_from_settings(settings)
middlewares = []
enabled = []
for clspath in mwlist:
try:
mwcls = load_object(clspath)
mw = create_instance(mwcls, settings, crawler)
middlewares.append(mw)
enabled.append(clspath)
except NotConfigured as e:
if e.args:
clsname = clspath.split('.')[-1]
logger.warning("Disabled %(clsname)s: %(eargs)s",
{'clsname': clsname, 'eargs': e.args[0]},
extra={'crawler': crawler})
logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
{'componentname': cls.component_name,
'enabledlist': pprint.pformat(enabled)},
extra={'crawler': crawler})
return cls(*middlewares)
@classmethod
def from_crawler(cls, crawler):
return cls.from_settings(crawler.settings, crawler)
def _add_middleware(self, mw):
if hasattr(mw, 'open_spider'):
self.methods['open_spider'].append(mw.open_spider)
if hasattr(mw, 'close_spider'):
self.methods['close_spider'].appendleft(mw.close_spider)
def _process_parallel(self, methodname, obj, *args):
return process_parallel(self.methods[methodname], obj, *args)
def _process_chain(self, methodname, obj, *args):
return process_chain(self.methods[methodname], obj, *args)
def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args):
return process_chain_both(self.methods[cb_methodname],
self.methods[eb_methodname], obj, *args)
def open_spider(self, spider):
return self._process_parallel('open_spider', spider)
def close_spider(self, spider):
return self._process_parallel('close_spider', spider)
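For reference, a concrete manager only has to declare where its component list comes from and which extra hook methods to collect; a minimal sketch modeled on the item pipeline manager further below (``MY_COMPONENTS`` and ``process_thing`` are made-up names):

from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list


class MyComponentManager(MiddlewareManager):
    component_name = 'my component'  # used in the "Enabled ..." log line

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        # MY_COMPONENTS is a hypothetical setting mapping class paths to orders
        return build_component_list(settings.getdict('MY_COMPONENTS'))

    def _add_middleware(self, mw):
        super()._add_middleware(mw)  # collects open_spider/close_spider hooks
        if hasattr(mw, 'process_thing'):
            self.methods['process_thing'].append(mw.process_thing)

    def process_thing(self, thing, spider):
        # run every component's process_thing in a deferred chain
        return self._process_chain('process_thing', thing, spider)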


@ -0,0 +1,750 @@
###############################################################################
#
# MIME-TYPES and the extensions that represent them
#
###############################################################################
application/activemessage
application/andrew-inset ez
application/annodex anx
application/applefile
application/atom+xml atom
application/atomcat+xml atomcat
application/atomserv+xml atomsrv
application/atomicmail
application/batch-SMTP
application/beep+xml
application/bbolin lin
application/cals-1840
application/cap cap pcap
application/commonground
application/cu-seeme cu
application/cybercash
application/davmount+xml davmount
application/dca-rft
application/dec-dx
application/docbook+xml
application/dsptype tsp
application/dvcs
application/ecmascript es
application/edi-consent
application/edi-x12
application/edifact
application/eshop
application/font-tdpfr
application/futuresplash spl
application/ghostview
application/hta hta
application/http
application/hyperstudio
application/iges
application/index
application/index.cmd
application/index.obj
application/index.response
application/index.vnd
application/iotp
application/ipp
application/isup
application/java-archive jar
application/java-serialized-object ser
application/java-vm class
application/javascript js
application/m3g m3g
application/mac-binhex40 hqx
application/mac-compactpro cpt
application/macwriteii
application/marc
application/mathematica nb nbp
application/ms-tnef
application/msaccess mdb
application/msword doc dot
application/news-message-id
application/news-transmission
application/ocsp-request
application/ocsp-response
application/octet-stream bin
application/oda oda
application/ogg ogx
application/parityfec
application/pdf pdf
application/pgp-encrypted
application/pgp-keys key
application/pgp-signature pgp
application/pics-rules prf
application/pkcs10
application/pkcs7-mime
application/pkcs7-signature
application/pkix-cert
application/pkix-crl
application/pkixcmp
application/postscript ps ai eps espi epsf eps2 eps3
application/prs.alvestrand.titrax-sheet
application/prs.cww
application/prs.nprend
application/qsig
application/rar rar
application/rdf+xml rdf
application/remote-printing
application/riscos
application/rss+xml rss
application/rtf rtf
application/sdp
application/set-payment
application/set-payment-initiation
application/set-registration
application/set-registration-initiation
application/sgml
application/sgml-open-catalog
application/sieve
application/slate
application/smil smi smil
application/timestamp-query
application/timestamp-reply
application/vemmi
application/whoispp-query
application/whoispp-response
application/wita
application/x400-bp
application/xhtml+xml xhtml xht
application/xml xml xsl xsd
application/xml-dtd
application/xml-external-parsed-entity
application/xspf+xml xspf
application/zip zip
application/vnd.3M.Post-it-Notes
application/vnd.accpac.simply.aso
application/vnd.accpac.simply.imp
application/vnd.acucobol
application/vnd.aether.imp
application/vnd.anser-web-certificate-issue-initiation
application/vnd.anser-web-funds-transfer-initiation
application/vnd.audiograph
application/vnd.bmi
application/vnd.businessobjects
application/vnd.canon-cpdl
application/vnd.canon-lips
application/vnd.cinderella cdy
application/vnd.claymore
application/vnd.commerce-battelle
application/vnd.commonspace
application/vnd.comsocaller
application/vnd.contact.cmsg
application/vnd.cosmocaller
application/vnd.ctc-posml
application/vnd.cups-postscript
application/vnd.cups-raster
application/vnd.cups-raw
application/vnd.cybank
application/vnd.dna
application/vnd.dpgraph
application/vnd.dxr
application/vnd.ecdis-update
application/vnd.ecowin.chart
application/vnd.ecowin.filerequest
application/vnd.ecowin.fileupdate
application/vnd.ecowin.series
application/vnd.ecowin.seriesrequest
application/vnd.ecowin.seriesupdate
application/vnd.enliven
application/vnd.epson.esf
application/vnd.epson.msf
application/vnd.epson.quickanime
application/vnd.epson.salt
application/vnd.epson.ssf
application/vnd.ericsson.quickcall
application/vnd.eudora.data
application/vnd.fdf
application/vnd.ffsns
application/vnd.flographit
application/vnd.framemaker
application/vnd.fsc.weblaunch
application/vnd.fujitsu.oasys
application/vnd.fujitsu.oasys2
application/vnd.fujitsu.oasys3
application/vnd.fujitsu.oasysgp
application/vnd.fujitsu.oasysprs
application/vnd.fujixerox.ddd
application/vnd.fujixerox.docuworks
application/vnd.fujixerox.docuworks.binder
application/vnd.fut-misnet
application/vnd.google-earth.kml+xml kml
application/vnd.google-earth.kmz kmz
application/vnd.grafeq
application/vnd.groove-account
application/vnd.groove-identity-message
application/vnd.groove-injector
application/vnd.groove-tool-message
application/vnd.groove-tool-template
application/vnd.groove-vcard
application/vnd.hhe.lesson-player
application/vnd.hp-HPGL
application/vnd.hp-PCL
application/vnd.hp-PCLXL
application/vnd.hp-hpid
application/vnd.hp-hps
application/vnd.httphone
application/vnd.hzn-3d-crossword
application/vnd.ibm.MiniPay
application/vnd.ibm.afplinedata
application/vnd.ibm.modcap
application/vnd.informix-visionary
application/vnd.intercon.formnet
application/vnd.intertrust.digibox
application/vnd.intertrust.nncp
application/vnd.intu.qbo
application/vnd.intu.qfx
application/vnd.irepository.package+xml
application/vnd.is-xpr
application/vnd.japannet-directory-service
application/vnd.japannet-jpnstore-wakeup
application/vnd.japannet-payment-wakeup
application/vnd.japannet-registration
application/vnd.japannet-registration-wakeup
application/vnd.japannet-setstore-wakeup
application/vnd.japannet-verification
application/vnd.japannet-verification-wakeup
application/vnd.koan
application/vnd.lotus-1-2-3
application/vnd.lotus-approach
application/vnd.lotus-freelance
application/vnd.lotus-notes
application/vnd.lotus-organizer
application/vnd.lotus-screencam
application/vnd.lotus-wordpro
application/vnd.mcd
application/vnd.mediastation.cdkey
application/vnd.meridian-slingshot
application/vnd.mif
application/vnd.minisoft-hp3000-save
application/vnd.mitsubishi.misty-guard.trustweb
application/vnd.mobius.daf
application/vnd.mobius.dis
application/vnd.mobius.msl
application/vnd.mobius.plc
application/vnd.mobius.txf
application/vnd.motorola.flexsuite
application/vnd.motorola.flexsuite.adsi
application/vnd.motorola.flexsuite.fis
application/vnd.motorola.flexsuite.gotap
application/vnd.motorola.flexsuite.kmr
application/vnd.motorola.flexsuite.ttc
application/vnd.motorola.flexsuite.wem
application/vnd.mozilla.xul+xml xul
application/vnd.ms-artgalry
application/vnd.ms-asf
application/vnd.ms-excel xls xlb xlt
application/vnd.ms-lrm
application/vnd.ms-pki.seccat cat
application/vnd.ms-pki.stl stl
application/vnd.ms-powerpoint ppt pps
application/vnd.ms-project
application/vnd.ms-tnef
application/vnd.ms-works
application/vnd.mseq
application/vnd.msign
application/vnd.music-niff
application/vnd.musician
application/vnd.netfpx
application/vnd.noblenet-directory
application/vnd.noblenet-sealer
application/vnd.noblenet-web
application/vnd.novadigm.EDM
application/vnd.novadigm.EDX
application/vnd.novadigm.EXT
application/vnd.oasis.opendocument.chart odc
application/vnd.oasis.opendocument.database odb
application/vnd.oasis.opendocument.formula odf
application/vnd.oasis.opendocument.graphics odg
application/vnd.oasis.opendocument.graphics-template otg
application/vnd.oasis.opendocument.image odi
application/vnd.oasis.opendocument.presentation odp
application/vnd.oasis.opendocument.presentation-template otp
application/vnd.oasis.opendocument.spreadsheet ods
application/vnd.oasis.opendocument.spreadsheet-template ots
application/vnd.oasis.opendocument.text odt
application/vnd.oasis.opendocument.text-master odm
application/vnd.oasis.opendocument.text-template ott
application/vnd.oasis.opendocument.text-web oth
application/vnd.osa.netdeploy
application/vnd.palm
application/vnd.pg.format
application/vnd.pg.osasli
application/vnd.powerbuilder6
application/vnd.powerbuilder6-s
application/vnd.powerbuilder7
application/vnd.powerbuilder7-s
application/vnd.powerbuilder75
application/vnd.powerbuilder75-s
application/vnd.previewsystems.box
application/vnd.publishare-delta-tree
application/vnd.pvi.ptid1
application/vnd.pwg-xhtml-print+xml
application/vnd.rapid
application/vnd.rim.cod cod
application/vnd.s3sms
application/vnd.seemail
application/vnd.shana.informed.formdata
application/vnd.shana.informed.formtemplate
application/vnd.shana.informed.interchange
application/vnd.shana.informed.package
application/vnd.smaf mmf
application/vnd.sss-cod
application/vnd.sss-dtf
application/vnd.sss-ntf
application/vnd.stardivision.calc sdc
application/vnd.stardivision.chart sds
application/vnd.stardivision.draw sda
application/vnd.stardivision.impress sdd
application/vnd.stardivision.math sdf
application/vnd.stardivision.writer sdw
application/vnd.stardivision.writer-global sgl
application/vnd.street-stream
application/vnd.sun.xml.calc sxc
application/vnd.sun.xml.calc.template stc
application/vnd.sun.xml.draw sxd
application/vnd.sun.xml.draw.template std
application/vnd.sun.xml.impress sxi
application/vnd.sun.xml.impress.template sti
application/vnd.sun.xml.math sxm
application/vnd.sun.xml.writer sxw
application/vnd.sun.xml.writer.global sxg
application/vnd.sun.xml.writer.template stw
application/vnd.svd
application/vnd.swiftview-ics
application/vnd.symbian.install sis
application/vnd.triscape.mxs
application/vnd.trueapp
application/vnd.truedoc
application/vnd.tve-trigger
application/vnd.ufdl
application/vnd.uplanet.alert
application/vnd.uplanet.alert-wbxml
application/vnd.uplanet.bearer-choice
application/vnd.uplanet.bearer-choice-wbxml
application/vnd.uplanet.cacheop
application/vnd.uplanet.cacheop-wbxml
application/vnd.uplanet.channel
application/vnd.uplanet.channel-wbxml
application/vnd.uplanet.list
application/vnd.uplanet.list-wbxml
application/vnd.uplanet.listcmd
application/vnd.uplanet.listcmd-wbxml
application/vnd.uplanet.signal
application/vnd.vcx
application/vnd.vectorworks
application/vnd.vidsoft.vidconference
application/vnd.visio vsd
application/vnd.vividence.scriptfile
application/vnd.wap.sic
application/vnd.wap.slc
application/vnd.wap.wbxml wbxml
application/vnd.wap.wmlc wmlc
application/vnd.wap.wmlscriptc wmlsc
application/vnd.webturbo
application/vnd.wordperfect wpd
application/vnd.wordperfect5.1 wp5
application/vnd.wrq-hp3000-labelled
application/vnd.wt.stf
application/vnd.xara
application/vnd.xfdl
application/vnd.yellowriver-custom-menu
application/x-123 wk
application/x-7z-compressed 7z
application/x-abiword abw
application/x-apple-diskimage dmg
application/x-bcpio bcpio
application/x-bittorrent torrent
application/x-cab cab
application/x-cbr cbr
application/x-cbz cbz
application/x-cdf cdf cda
application/x-cdlink vcd
application/x-chess-pgn pgn
application/x-core
application/x-cpio cpio
application/x-csh csh
application/x-debian-package deb udeb
application/x-director dcr dir dxr
application/x-dms dms
application/x-doom wad
application/x-dvi dvi
application/x-httpd-eruby rhtml
application/x-executable
application/x-font pfa pfb gsf pcf pcf.Z
application/x-freemind mm
application/x-futuresplash spl
application/x-gnumeric gnumeric
application/x-go-sgf sgf
application/x-graphing-calculator gcf
application/x-gtar gtar tgz taz
application/x-hdf hdf
application/x-httpd-php phtml pht php
application/x-httpd-php-source phps
application/x-httpd-php3 php3
application/x-httpd-php3-preprocessed php3p
application/x-httpd-php4 php4
application/x-ica ica
application/x-info info
application/x-internet-signup ins isp
application/x-iphone iii
application/x-iso9660-image iso
application/x-jam jam
application/x-java-applet
application/x-java-bean
application/x-java-jnlp-file jnlp
application/x-jmol jmz
application/x-kchart chrt
application/x-kdelnk
application/x-killustrator kil
application/x-koan skp skd skt skm
application/x-kpresenter kpr kpt
application/x-kspread ksp
application/x-kword kwd kwt
application/x-latex latex
application/x-lha lha
application/x-lyx lyx
application/x-lzh lzh
application/x-lzx lzx
application/x-maker frm maker frame fm fb book fbdoc
application/x-mif mif
application/x-ms-wmd wmd
application/x-ms-wmz wmz
application/x-msdos-program com exe bat dll
application/x-msi msi
application/x-netcdf nc
application/x-ns-proxy-autoconfig pac dat
application/x-nwc nwc
application/x-object o
application/x-oz-application oza
application/x-pkcs7-certreqresp p7r
application/x-pkcs7-crl crl
application/x-python-code pyc pyo
application/x-qgis qgs shp shx
application/x-quicktimeplayer qtl
application/x-redhat-package-manager rpm
application/x-ruby rb
application/x-rx
application/x-sh sh
application/x-shar shar
application/x-shellscript
application/x-shockwave-flash swf swfl
application/x-stuffit sit sitx
application/x-sv4cpio sv4cpio
application/x-sv4crc sv4crc
application/x-tar tar
application/x-tcl tcl
application/x-tex-gf gf
application/x-tex-pk pk
application/x-texinfo texinfo texi
application/x-trash ~ % bak old sik
application/x-troff t tr roff
application/x-troff-man man
application/x-troff-me me
application/x-troff-ms ms
application/x-ustar ustar
application/x-videolan
application/x-wais-source src
application/x-wingz wz
application/x-x509-ca-cert crt
application/x-xcf xcf
application/x-xfig fig
application/x-xpinstall xpi
audio/32kadpcm
audio/3gpp
audio/amr amr
audio/amr-wb awb
audio/annodex axa
audio/basic au snd
audio/flac flac
audio/g.722.1
audio/l16
audio/midi mid midi kar
audio/mp4a-latm
audio/mpa-robust
audio/mpeg mpga mpega mp2 mp3 m4a
audio/mpegurl m3u
audio/ogg oga ogg spx
audio/parityfec
audio/prs.sid sid
audio/telephone-event
audio/tone
audio/vnd.cisco.nse
audio/vnd.cns.anp1
audio/vnd.cns.inf1
audio/vnd.digital-winds
audio/vnd.everad.plj
audio/vnd.lucent.voice
audio/vnd.nortel.vbk
audio/vnd.nuera.ecelp4800
audio/vnd.nuera.ecelp7470
audio/vnd.nuera.ecelp9600
audio/vnd.octel.sbc
audio/vnd.qcelp
audio/vnd.rhetorex.32kadpcm
audio/vnd.vmx.cvsd
audio/x-aiff aif aiff aifc
audio/x-gsm gsm
audio/x-mpegurl m3u
audio/x-ms-wma wma
audio/x-ms-wax wax
audio/x-pn-realaudio-plugin
audio/x-pn-realaudio ra rm ram
audio/x-realaudio ra
audio/x-scpls pls
audio/x-sd2 sd2
audio/x-wav wav
chemical/x-alchemy alc
chemical/x-cache cac cache
chemical/x-cache-csf csf
chemical/x-cactvs-binary cbin cascii ctab
chemical/x-cdx cdx
chemical/x-cerius cer
chemical/x-chem3d c3d
chemical/x-chemdraw chm
chemical/x-cif cif
chemical/x-cmdf cmdf
chemical/x-cml cml
chemical/x-compass cpa
chemical/x-crossfire bsd
chemical/x-csml csml csm
chemical/x-ctx ctx
chemical/x-cxf cxf cef
#chemical/x-daylight-smiles smi
chemical/x-embl-dl-nucleotide emb embl
chemical/x-galactic-spc spc
chemical/x-gamess-input inp gam gamin
chemical/x-gaussian-checkpoint fch fchk
chemical/x-gaussian-cube cub
chemical/x-gaussian-input gau gjc gjf
chemical/x-gaussian-log gal
chemical/x-gcg8-sequence gcg
chemical/x-genbank gen
chemical/x-hin hin
chemical/x-isostar istr ist
chemical/x-jcamp-dx jdx dx
chemical/x-kinemage kin
chemical/x-macmolecule mcm
chemical/x-macromodel-input mmd mmod
chemical/x-mdl-molfile mol
chemical/x-mdl-rdfile rd
chemical/x-mdl-rxnfile rxn
chemical/x-mdl-sdfile sd sdf
chemical/x-mdl-tgf tgf
#chemical/x-mif mif
chemical/x-mmcif mcif
chemical/x-mol2 mol2
chemical/x-molconn-Z b
chemical/x-mopac-graph gpt
chemical/x-mopac-input mop mopcrt mpc zmt
chemical/x-mopac-out moo
chemical/x-mopac-vib mvb
chemical/x-ncbi-asn1 asn
chemical/x-ncbi-asn1-ascii prt ent
chemical/x-ncbi-asn1-binary val aso
chemical/x-ncbi-asn1-spec asn
chemical/x-pdb pdb ent
chemical/x-rosdal ros
chemical/x-swissprot sw
chemical/x-vamas-iso14976 vms
chemical/x-vmd vmd
chemical/x-xtel xtel
chemical/x-xyz xyz
image/cgm
image/g3fax
image/gif gif
image/ief ief
image/jpeg jpeg jpg jpe
image/naplps
image/pcx pcx
image/png png
image/prs.btif
image/prs.pti
image/svg+xml svg svgz
image/tiff tiff tif
image/vnd.cns.inf2
image/vnd.djvu djvu djv
image/vnd.dwg
image/vnd.dxf
image/vnd.fastbidsheet
image/vnd.fpx
image/vnd.fst
image/vnd.fujixerox.edmics-mmr
image/vnd.fujixerox.edmics-rlc
image/vnd.mix
image/vnd.net-fpx
image/vnd.svf
image/vnd.wap.wbmp wbmp
image/vnd.xiff
image/x-cmu-raster ras
image/x-coreldraw cdr
image/x-coreldrawpattern pat
image/x-coreldrawtemplate cdt
image/x-corelphotopaint cpt
image/x-icon ico
image/x-jg art
image/x-jng jng
image/x-ms-bmp bmp
image/x-photoshop psd
image/x-portable-anymap pnm
image/x-portable-bitmap pbm
image/x-portable-graymap pgm
image/x-portable-pixmap ppm
image/x-rgb rgb
image/x-xbitmap xbm
image/x-xpixmap xpm
image/x-xwindowdump xwd
inode/chardevice
inode/blockdevice
inode/directory-locked
inode/directory
inode/fifo
inode/socket
message/delivery-status
message/disposition-notification
message/external-body
message/http
message/s-http
message/news
message/partial
message/rfc822 eml
model/iges igs iges
model/mesh msh mesh silo
model/vnd.dwf
model/vnd.flatland.3dml
model/vnd.gdl
model/vnd.gs-gdl
model/vnd.gtw
model/vnd.mts
model/vnd.vtu
model/vrml wrl vrml
multipart/alternative
multipart/appledouble
multipart/byteranges
multipart/digest
multipart/encrypted
multipart/form-data
multipart/header-set
multipart/mixed
multipart/parallel
multipart/related
multipart/report
multipart/signed
multipart/voice-message
text/calendar ics icz
text/css css
text/csv csv
text/directory
text/english
text/enriched
text/h323 323
text/html html htm shtml
text/iuls uls
text/mathml mml
text/parityfec
text/plain asc txt text pot brf
text/prs.lines.tag
text/rfc822-headers
text/richtext rtx
text/rtf
text/scriptlet sct wsc
text/t140
text/texmacs tm ts
text/tab-separated-values tsv
text/uri-list
text/vnd.abc
text/vnd.curl
text/vnd.DMClientScript
text/vnd.flatland.3dml
text/vnd.fly
text/vnd.fmi.flexstor
text/vnd.in3d.3dml
text/vnd.in3d.spot
text/vnd.IPTC.NewsML
text/vnd.IPTC.NITF
text/vnd.latex-z
text/vnd.motorola.reflex
text/vnd.ms-mediapackage
text/vnd.sun.j2me.app-descriptor jad
text/vnd.wap.si
text/vnd.wap.sl
text/vnd.wap.wml wml
text/vnd.wap.wmlscript wmls
text/x-bibtex bib
text/x-boo boo
text/x-c++hdr h++ hpp hxx hh
text/x-c++src c++ cpp cxx cc
text/x-chdr h
text/x-component htc
text/x-crontab
text/x-csh csh
text/x-csrc c
text/x-dsrc d
text/x-diff diff patch
text/x-haskell hs
text/x-java java
text/x-literate-haskell lhs
text/x-makefile
text/x-moc moc
text/x-pascal p pas
text/x-pcs-gcd gcd
text/x-perl pl pm
text/x-python py
text/x-scala scala
text/x-server-parsed-html
text/x-setext etx
text/x-sh sh
text/x-tcl tcl tk
text/x-tex tex ltx sty cls
text/x-vcalendar vcs
text/x-vcard vcf
video/3gpp 3gp
video/annodex axv
video/dl dl
video/dv dif dv
video/fli fli
video/gl gl
video/mpeg mpeg mpg mpe
video/mp4 mp4
video/quicktime qt mov
video/mp4v-es
video/ogg ogv
video/parityfec
video/pointer
video/vnd.fvt
video/vnd.motorola.video
video/vnd.motorola.videop
video/vnd.mpegurl mxu
video/vnd.mts
video/vnd.nokia.interleaved-multimedia
video/vnd.vivo
video/x-flv flv
video/x-la-asf lsf lsx
video/x-mng mng
video/x-ms-asf asf asx
video/x-ms-wm wm
video/x-ms-wmv wmv
video/x-ms-wmx wmx
video/x-ms-wvx wvx
video/x-msvideo avi
video/x-sgi-movie movie
video/x-matroska mpv
x-conference/x-cooltalk ice
x-epoc/x-sisx-app sisx
x-world/x-vrml vrm vrml wrl
x-scrapy/test scrapytest


@ -0,0 +1,26 @@
"""
Item pipeline
See documentation in docs/item-pipeline.rst
"""
from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import deferred_f_from_coro_f
class ItemPipelineManager(MiddlewareManager):
component_name = 'item pipeline'
@classmethod
def _get_mwlist_from_settings(cls, settings):
return build_component_list(settings.getwithbase('ITEM_PIPELINES'))
def _add_middleware(self, pipe):
super()._add_middleware(pipe)
if hasattr(pipe, 'process_item'):
self.methods['process_item'].append(deferred_f_from_coro_f(pipe.process_item))
def process_item(self, item, spider):
return self._process_chain('process_item', item, spider)
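Pipelines managed by this class are plain objects exposing ``process_item`` and are enabled through the ``ITEM_PIPELINES`` setting; a small sketch (module path and the ``price`` field are assumptions):

# myproject/pipelines.py
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class PriceValidationPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter.get('price') is None:
            raise DropItem(f"Missing price in {item!r}")
        return item


# myproject/settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.PriceValidationPipeline': 300,
}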


@ -0,0 +1,514 @@
"""
Files Pipeline
See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
import logging
import mimetypes
import os
import time
from collections import defaultdict
from contextlib import suppress
from ftplib import FTP
from io import BytesIO
from urllib.parse import urlparse
from itemadapter import ItemAdapter
from twisted.internet import defer, threads
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str
logger = logging.getLogger(__name__)
class FileException(Exception):
"""General media error exception"""
class FSFilesStore:
def __init__(self, basedir):
if '://' in basedir:
basedir = basedir.split('://', 1)[1]
self.basedir = basedir
self._mkdir(self.basedir)
self.created_directories = defaultdict(set)
def persist_file(self, path, buf, info, meta=None, headers=None):
absolute_path = self._get_filesystem_path(path)
self._mkdir(os.path.dirname(absolute_path), info)
with open(absolute_path, 'wb') as f:
f.write(buf.getvalue())
def stat_file(self, path, info):
absolute_path = self._get_filesystem_path(path)
try:
last_modified = os.path.getmtime(absolute_path)
except os.error:
return {}
with open(absolute_path, 'rb') as f:
checksum = md5sum(f)
return {'last_modified': last_modified, 'checksum': checksum}
def _get_filesystem_path(self, path):
path_comps = path.split('/')
return os.path.join(self.basedir, *path_comps)
def _mkdir(self, dirname, domain=None):
seen = self.created_directories[domain] if domain else set()
if dirname not in seen:
if not os.path.exists(dirname):
os.makedirs(dirname)
seen.add(dirname)
class S3FilesStore:
AWS_ACCESS_KEY_ID = None
AWS_SECRET_ACCESS_KEY = None
AWS_ENDPOINT_URL = None
AWS_REGION_NAME = None
AWS_USE_SSL = None
AWS_VERIFY = None
POLICY = 'private' # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
HEADERS = {
'Cache-Control': 'max-age=172800',
}
def __init__(self, uri):
if not is_botocore_available():
raise NotConfigured('missing botocore library')
import botocore.session
session = botocore.session.get_session()
self.s3_client = session.create_client(
's3',
aws_access_key_id=self.AWS_ACCESS_KEY_ID,
aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
endpoint_url=self.AWS_ENDPOINT_URL,
region_name=self.AWS_REGION_NAME,
use_ssl=self.AWS_USE_SSL,
verify=self.AWS_VERIFY
)
if not uri.startswith("s3://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
self.bucket, self.prefix = uri[5:].split('/', 1)
def stat_file(self, path, info):
def _onsuccess(boto_key):
checksum = boto_key['ETag'].strip('"')
last_modified = boto_key['LastModified']
modified_stamp = time.mktime(last_modified.timetuple())
return {'checksum': checksum, 'last_modified': modified_stamp}
return self._get_boto_key(path).addCallback(_onsuccess)
def _get_boto_key(self, path):
key_name = f'{self.prefix}{path}'
return threads.deferToThread(
self.s3_client.head_object,
Bucket=self.bucket,
Key=key_name)
def persist_file(self, path, buf, info, meta=None, headers=None):
"""Upload file to S3 storage"""
key_name = f'{self.prefix}{path}'
buf.seek(0)
extra = self._headers_to_botocore_kwargs(self.HEADERS)
if headers:
extra.update(self._headers_to_botocore_kwargs(headers))
return threads.deferToThread(
self.s3_client.put_object,
Bucket=self.bucket,
Key=key_name,
Body=buf,
Metadata={k: str(v) for k, v in (meta or {}).items()},
ACL=self.POLICY,
**extra)
def _headers_to_botocore_kwargs(self, headers):
""" Convert headers to botocore keyword agruments.
"""
# This is required while we need to support both boto and botocore.
mapping = CaselessDict({
'Content-Type': 'ContentType',
'Cache-Control': 'CacheControl',
'Content-Disposition': 'ContentDisposition',
'Content-Encoding': 'ContentEncoding',
'Content-Language': 'ContentLanguage',
'Content-Length': 'ContentLength',
'Content-MD5': 'ContentMD5',
'Expires': 'Expires',
'X-Amz-Grant-Full-Control': 'GrantFullControl',
'X-Amz-Grant-Read': 'GrantRead',
'X-Amz-Grant-Read-ACP': 'GrantReadACP',
'X-Amz-Grant-Write-ACP': 'GrantWriteACP',
'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus',
'X-Amz-Object-Lock-Mode': 'ObjectLockMode',
'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate',
'X-Amz-Request-Payer': 'RequestPayer',
'X-Amz-Server-Side-Encryption': 'ServerSideEncryption',
'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId',
'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext',
'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm',
'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey',
'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5',
'X-Amz-Storage-Class': 'StorageClass',
'X-Amz-Tagging': 'Tagging',
'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation',
})
extra = {}
for key, value in headers.items():
try:
kwarg = mapping[key]
except KeyError:
raise TypeError(f'Header "{key}" is not supported by botocore')
else:
extra[kwarg] = value
return extra
class GCSFilesStore:
GCS_PROJECT_ID = None
CACHE_CONTROL = 'max-age=172800'
# The bucket's default object ACL will be applied to the object.
# Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
POLICY = None
def __init__(self, uri):
from google.cloud import storage
client = storage.Client(project=self.GCS_PROJECT_ID)
bucket, prefix = uri[5:].split('/', 1)
self.bucket = client.bucket(bucket)
self.prefix = prefix
permissions = self.bucket.test_iam_permissions(
['storage.objects.get', 'storage.objects.create']
)
if 'storage.objects.get' not in permissions:
logger.warning(
"No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
"Checking if files are up to date will be impossible. Files will be downloaded every time.",
{'bucket': bucket}
)
if 'storage.objects.create' not in permissions:
logger.error(
"No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
{'bucket': bucket}
)
def stat_file(self, path, info):
def _onsuccess(blob):
if blob:
checksum = blob.md5_hash
last_modified = time.mktime(blob.updated.timetuple())
return {'checksum': checksum, 'last_modified': last_modified}
else:
return {}
return threads.deferToThread(self.bucket.get_blob, path).addCallback(_onsuccess)
def _get_content_type(self, headers):
if headers and 'Content-Type' in headers:
return headers['Content-Type']
else:
return 'application/octet-stream'
def persist_file(self, path, buf, info, meta=None, headers=None):
blob = self.bucket.blob(self.prefix + path)
blob.cache_control = self.CACHE_CONTROL
blob.metadata = {k: str(v) for k, v in (meta or {}).items()}
return threads.deferToThread(
blob.upload_from_string,
data=buf.getvalue(),
content_type=self._get_content_type(headers),
predefined_acl=self.POLICY
)
class FTPFilesStore:
FTP_USERNAME = None
FTP_PASSWORD = None
USE_ACTIVE_MODE = None
def __init__(self, uri):
if not uri.startswith("ftp://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
u = urlparse(uri)
self.host = u.hostname
self.port = int(u.port or 21)
self.username = u.username or self.FTP_USERNAME
self.password = u.password or self.FTP_PASSWORD
self.basedir = u.path.rstrip('/')
def persist_file(self, path, buf, info, meta=None, headers=None):
path = f'{self.basedir}/{path}'
return threads.deferToThread(
ftp_store_file, path=path, file=buf,
host=self.host, port=self.port, username=self.username,
password=self.password, use_active_mode=self.USE_ACTIVE_MODE
)
def stat_file(self, path, info):
def _stat_file(path):
try:
ftp = FTP()
ftp.connect(self.host, self.port)
ftp.login(self.username, self.password)
if self.USE_ACTIVE_MODE:
ftp.set_pasv(False)
file_path = f"{self.basedir}/{path}"
last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
m = hashlib.md5()
ftp.retrbinary(f'RETR {file_path}', m.update)
return {'last_modified': last_modified, 'checksum': m.hexdigest()}
# The file doesn't exist
except Exception:
return {}
return threads.deferToThread(_stat_file, path)
class FilesPipeline(MediaPipeline):
"""Abstract pipeline that implement the file downloading
This pipeline tries to minimize network transfers and file processing,
doing stat of the files and determining if file is new, uptodate or
expired.
``new`` files are those that pipeline never processed and needs to be
downloaded from supplier site the first time.
``uptodate`` files are the ones that the pipeline processed and are still
valid files.
``expired`` files are those that pipeline already processed but the last
modification was made long time ago, so a reprocessing is recommended to
refresh it in case of change.
"""
MEDIA_NAME = "file"
EXPIRES = 90
STORE_SCHEMES = {
'': FSFilesStore,
'file': FSFilesStore,
's3': S3FilesStore,
'gs': GCSFilesStore,
'ftp': FTPFilesStore
}
DEFAULT_FILES_URLS_FIELD = 'file_urls'
DEFAULT_FILES_RESULT_FIELD = 'files'
def __init__(self, store_uri, download_func=None, settings=None):
if not store_uri:
raise NotConfigured
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
cls_name = "FilesPipeline"
self.store = self._get_store(store_uri)
resolve = functools.partial(self._key_for_pipe,
base_class_name=cls_name,
settings=settings)
self.expires = settings.getint(
resolve('FILES_EXPIRES'), self.EXPIRES
)
if not hasattr(self, "FILES_URLS_FIELD"):
self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
if not hasattr(self, "FILES_RESULT_FIELD"):
self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
self.files_urls_field = settings.get(
resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
)
self.files_result_field = settings.get(
resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
)
super().__init__(download_func=download_func, settings=settings)
@classmethod
def from_settings(cls, settings):
s3store = cls.STORE_SCHEMES['s3']
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
s3store.AWS_VERIFY = settings['AWS_VERIFY']
s3store.POLICY = settings['FILES_STORE_S3_ACL']
gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None
ftp_store = cls.STORE_SCHEMES['ftp']
ftp_store.FTP_USERNAME = settings['FTP_USER']
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
store_uri = settings['FILES_STORE']
return cls(store_uri, settings=settings)
def _get_store(self, uri):
if os.path.isabs(uri): # to support win32 paths like: C:\\some\dir
scheme = 'file'
else:
scheme = urlparse(uri).scheme
store_cls = self.STORE_SCHEMES[scheme]
return store_cls(uri)
def media_to_download(self, request, info, *, item=None):
def _onsuccess(result):
if not result:
return # returning None forces download
last_modified = result.get('last_modified', None)
if not last_modified:
return # returning None forces download
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.expires:
return # returning None forces download
referer = referer_str(request)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer},
extra={'spider': info.spider}
)
self.inc_stats(info.spider, 'uptodate')
checksum = result.get('checksum', None)
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}
path = self.file_path(request, info=info, item=item)
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
dfd.addCallbacks(_onsuccess, lambda _: None)
dfd.addErrback(
lambda f:
logger.error(self.__class__.__name__ + '.store.stat_file',
exc_info=failure_to_exc_info(f),
extra={'spider': info.spider})
)
return dfd
def media_failed(self, failure, request, info):
if not isinstance(failure.value, IgnoreRequest):
referer = referer_str(request)
logger.warning(
'File (unknown-error): Error downloading %(medianame)s from '
'%(request)s referred in <%(referer)s>: %(exception)s',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer, 'exception': failure.value},
extra={'spider': info.spider}
)
raise FileException
def media_downloaded(self, response, request, info, *, item=None):
referer = referer_str(request)
if response.status != 200:
logger.warning(
'File (code: %(status)s): Error downloading file from '
'%(request)s referred in <%(referer)s>',
{'status': response.status,
'request': request, 'referer': referer},
extra={'spider': info.spider}
)
raise FileException('download-error')
if not response.body:
logger.warning(
'File (empty-content): Empty file from %(request)s referred '
'in <%(referer)s>: no-content',
{'request': request, 'referer': referer},
extra={'spider': info.spider}
)
raise FileException('empty-content')
status = 'cached' if 'cached' in response.flags else 'downloaded'
logger.debug(
'File (%(status)s): Downloaded file from %(request)s referred in '
'<%(referer)s>',
{'status': status, 'request': request, 'referer': referer},
extra={'spider': info.spider}
)
self.inc_stats(info.spider, status)
try:
path = self.file_path(request, response=response, info=info, item=item)
checksum = self.file_downloaded(response, request, info, item=item)
except FileException as exc:
logger.warning(
'File (error): Error processing file from %(request)s '
'referred in <%(referer)s>: %(errormsg)s',
{'request': request, 'referer': referer, 'errormsg': str(exc)},
extra={'spider': info.spider}, exc_info=True
)
raise
except Exception as exc:
logger.error(
'File (unknown-error): Error processing file from %(request)s '
'referred in <%(referer)s>',
{'request': request, 'referer': referer},
exc_info=True, extra={'spider': info.spider}
)
raise FileException(str(exc))
return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}
def inc_stats(self, spider, status):
spider.crawler.stats.inc_value('file_count', spider=spider)
spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)
# Overridable Interface
def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u) for u in urls]
def file_downloaded(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
buf = BytesIO(response.body)
checksum = md5sum(buf)
buf.seek(0)
self.store.persist_file(path, buf, info)
return checksum
def item_completed(self, results, item, info):
with suppress(KeyError):
ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
return item
def file_path(self, request, response=None, info=None, *, item=None):
media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
media_ext = os.path.splitext(request.url)[1]
# Handle empty and wildcard extensions by trying to guess the
# mime type and, from it, the extension; default to an empty string otherwise
if media_ext not in mimetypes.types_map:
media_ext = ''
media_type = mimetypes.guess_type(request.url)[0]
if media_type:
media_ext = mimetypes.guess_extension(media_type)
return f'full/{media_guid}{media_ext}'
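Enabling the pipeline above only takes a storage URI and items carrying ``file_urls``/``files`` fields; a configuration sketch (paths and values are examples):

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = '/data/scrapy/files'   # local dir; 's3://bucket/prefix' or 'ftp://...' URIs also work
FILES_EXPIRES = 120                  # days before a stored file counts as expired (default: 90)

# items.py
import scrapy

class DocumentItem(scrapy.Item):
    file_urls = scrapy.Field()   # input: list of URLs for the pipeline to fetch
    files = scrapy.Field()       # output: dicts with url, path, checksum and status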


@ -0,0 +1,176 @@
"""
Images Pipeline
See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
from contextlib import suppress
from io import BytesIO
from itemadapter import ItemAdapter
from PIL import Image
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
class NoimagesDrop(DropItem):
"""Product with no images exception"""
class ImageException(FileException):
"""General image error exception"""
class ImagesPipeline(FilesPipeline):
"""Abstract pipeline that implement the image thumbnail generation logic
"""
MEDIA_NAME = 'image'
# Uppercase attributes kept for backward compatibility with code that subclasses
# ImagesPipeline. They may be overridden by settings.
MIN_WIDTH = 0
MIN_HEIGHT = 0
EXPIRES = 90
THUMBS = {}
DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
DEFAULT_IMAGES_RESULT_FIELD = 'images'
def __init__(self, store_uri, download_func=None, settings=None):
super().__init__(store_uri, settings=settings, download_func=download_func)
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
resolve = functools.partial(self._key_for_pipe,
base_class_name="ImagesPipeline",
settings=settings)
self.expires = settings.getint(
resolve("IMAGES_EXPIRES"), self.EXPIRES
)
if not hasattr(self, "IMAGES_RESULT_FIELD"):
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
if not hasattr(self, "IMAGES_URLS_FIELD"):
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
self.images_urls_field = settings.get(
resolve('IMAGES_URLS_FIELD'),
self.IMAGES_URLS_FIELD
)
self.images_result_field = settings.get(
resolve('IMAGES_RESULT_FIELD'),
self.IMAGES_RESULT_FIELD
)
self.min_width = settings.getint(
resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
)
self.min_height = settings.getint(
resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
)
self.thumbs = settings.get(
resolve('IMAGES_THUMBS'), self.THUMBS
)
@classmethod
def from_settings(cls, settings):
s3store = cls.STORE_SCHEMES['s3']
s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
s3store.AWS_VERIFY = settings['AWS_VERIFY']
s3store.POLICY = settings['IMAGES_STORE_S3_ACL']
gcs_store = cls.STORE_SCHEMES['gs']
gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None
ftp_store = cls.STORE_SCHEMES['ftp']
ftp_store.FTP_USERNAME = settings['FTP_USER']
ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')
store_uri = settings['IMAGES_STORE']
return cls(store_uri, settings=settings)
def file_downloaded(self, response, request, info, *, item=None):
return self.image_downloaded(response, request, info, item=item)
def image_downloaded(self, response, request, info, *, item=None):
checksum = None
for path, image, buf in self.get_images(response, request, info, item=item):
if checksum is None:
buf.seek(0)
checksum = md5sum(buf)
width, height = image.size
self.store.persist_file(
path, buf, info,
meta={'width': width, 'height': height},
headers={'Content-Type': 'image/jpeg'})
return checksum
def get_images(self, response, request, info, *, item=None):
path = self.file_path(request, response=response, info=info, item=item)
orig_image = Image.open(BytesIO(response.body))
width, height = orig_image.size
if width < self.min_width or height < self.min_height:
raise ImageException("Image too small "
f"({width}x{height} < "
f"{self.min_width}x{self.min_height})")
image, buf = self.convert_image(orig_image)
yield path, image, buf
for thumb_id, size in self.thumbs.items():
thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
thumb_image, thumb_buf = self.convert_image(image, size)
yield thumb_path, thumb_image, thumb_buf
def convert_image(self, image, size=None):
if image.format == 'PNG' and image.mode == 'RGBA':
background = Image.new('RGBA', image.size, (255, 255, 255))
background.paste(image, image)
image = background.convert('RGB')
elif image.mode == 'P':
image = image.convert("RGBA")
background = Image.new('RGBA', image.size, (255, 255, 255))
background.paste(image, image)
image = background.convert('RGB')
elif image.mode != 'RGB':
image = image.convert('RGB')
if size:
image = image.copy()
image.thumbnail(size, Image.ANTIALIAS)
buf = BytesIO()
image.save(buf, 'JPEG')
return image, buf
def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.images_urls_field, [])
return [Request(u) for u in urls]
def item_completed(self, results, item, info):
with suppress(KeyError):
ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
return item
def file_path(self, request, response=None, info=None, *, item=None):
image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return f'full/{image_guid}.jpg'
def thumb_path(self, request, thumb_id, response=None, info=None):
thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
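Typical configuration mirroring the ``IMAGES_*`` settings read in ``__init__`` above (all values are examples); items need ``image_urls``/``images`` fields analogous to the files pipeline:

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/data/scrapy/images'
IMAGES_EXPIRES = 30        # days before a stored image is considered expired
IMAGES_MIN_WIDTH = 110     # images smaller than 110x110 are rejected (ImageException)
IMAGES_MIN_HEIGHT = 110
IMAGES_THUMBS = {          # extra downscaled copies stored under thumbs/<name>/
    'small': (50, 50),
    'big': (270, 270),
}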


@ -0,0 +1,251 @@
import functools
import logging
from collections import defaultdict
from inspect import signature
from warnings import warn
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure
from scrapy.settings import Settings
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.defer import mustbe_deferred, defer_result
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.request import request_fingerprint
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.log import failure_to_exc_info
logger = logging.getLogger(__name__)
class MediaPipeline:
LOG_FAILED_RESULTS = True
class SpiderInfo:
def __init__(self, spider):
self.spider = spider
self.downloading = set()
self.downloaded = {}
self.waiting = defaultdict(list)
def __init__(self, download_func=None, settings=None):
self.download_func = download_func
self._expects_item = {}
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
resolve = functools.partial(self._key_for_pipe,
base_class_name="MediaPipeline",
settings=settings)
self.allow_redirects = settings.getbool(
resolve('MEDIA_ALLOW_REDIRECTS'), False
)
self._handle_statuses(self.allow_redirects)
# Check if deprecated methods are being used and make them compatible
self._make_compatible()
def _handle_statuses(self, allow_redirects):
self.handle_httpstatus_list = None
if allow_redirects:
self.handle_httpstatus_list = SequenceExclude(range(300, 400))
def _key_for_pipe(self, key, base_class_name=None, settings=None):
"""
>>> MediaPipeline()._key_for_pipe("IMAGES")
'IMAGES'
>>> class MyPipe(MediaPipeline):
... pass
>>> MyPipe()._key_for_pipe("IMAGES", base_class_name="MediaPipeline")
'MYPIPE_IMAGES'
"""
class_name = self.__class__.__name__
formatted_key = f"{class_name.upper()}_{key}"
if (
not base_class_name
or class_name == base_class_name
or settings and not settings.get(formatted_key)
):
return key
return formatted_key
@classmethod
def from_crawler(cls, crawler):
try:
pipe = cls.from_settings(crawler.settings)
except AttributeError:
pipe = cls()
pipe.crawler = crawler
return pipe
def open_spider(self, spider):
self.spiderinfo = self.SpiderInfo(spider)
def process_item(self, item, spider):
info = self.spiderinfo
requests = arg_to_iter(self.get_media_requests(item, info))
dlist = [self._process_request(r, info, item) for r in requests]
dfd = DeferredList(dlist, consumeErrors=1)
return dfd.addCallback(self.item_completed, item, info)
def _process_request(self, request, info, item):
fp = request_fingerprint(request)
cb = request.callback or (lambda _: _)
eb = request.errback
request.callback = None
request.errback = None
# Return cached result if request was already seen
if fp in info.downloaded:
return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)
# Otherwise, wait for result
wad = Deferred().addCallbacks(cb, eb)
info.waiting[fp].append(wad)
# Check if request is downloading right now to avoid doing it twice
if fp in info.downloading:
return wad
# Download request checking media_to_download hook output first
info.downloading.add(fp)
dfd = mustbe_deferred(self.media_to_download, request, info, item=item)
dfd.addCallback(self._check_media_to_download, request, info, item=item)
dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
dfd.addErrback(lambda f: logger.error(
f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
)
return dfd.addBoth(lambda _: wad) # it must return wad at last
def _make_compatible(self):
"""Make overridable methods of MediaPipeline and subclasses backwards compatible"""
methods = [
"file_path", "media_to_download", "media_downloaded",
"file_downloaded", "image_downloaded", "get_images"
]
for method_name in methods:
method = getattr(self, method_name, None)
if callable(method):
setattr(self, method_name, self._compatible(method))
def _compatible(self, func):
"""Wrapper for overridable methods to allow backwards compatibility"""
self._check_signature(func)
@functools.wraps(func)
def wrapper(*args, **kwargs):
if self._expects_item[func.__name__]:
return func(*args, **kwargs)
kwargs.pop('item', None)
return func(*args, **kwargs)
return wrapper
def _check_signature(self, func):
sig = signature(func)
self._expects_item[func.__name__] = True
if 'item' not in sig.parameters:
old_params = str(sig)[1:-1]
new_params = old_params + ", *, item=None"
warn(f'{func.__name__}(self, {old_params}) is deprecated, '
f'please use {func.__name__}(self, {new_params})',
ScrapyDeprecationWarning, stacklevel=2)
self._expects_item[func.__name__] = False
def _modify_media_request(self, request):
if self.handle_httpstatus_list:
request.meta['handle_httpstatus_list'] = self.handle_httpstatus_list
else:
request.meta['handle_httpstatus_all'] = True
def _check_media_to_download(self, result, request, info, item):
if result is not None:
return result
if self.download_func:
# this ugly code was left only to support tests. TODO: remove
dfd = mustbe_deferred(self.download_func, request, info.spider)
dfd.addCallbacks(
callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
errback=self.media_failed, errbackArgs=(request, info))
else:
self._modify_media_request(request)
dfd = self.crawler.engine.download(request, info.spider)
dfd.addCallbacks(
callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
errback=self.media_failed, errbackArgs=(request, info))
return dfd
def _cache_result_and_execute_waiters(self, result, fp, info):
if isinstance(result, Failure):
# minimize cached information for failure
result.cleanFailure()
result.frames = []
result.stack = None
# This code fixes a memory leak by avoiding to keep references to
# the Request and Response objects on the Media Pipeline cache.
#
# What happens when the media_downloaded callback raises an
# exception, for example a FileException('download-error') when
# the Response status code is not 200 OK, is that the original
# StopIteration exception (which in turn contains the failed
# Response and by extension, the original Request) gets encapsulated
# within the FileException context.
#
# Originally, Scrapy was using twisted.internet.defer.returnValue
# inside functions decorated with twisted.internet.defer.inlineCallbacks,
# encapsulating the returned Response in a _DefGen_Return exception
# instead of a StopIteration.
#
# To avoid keeping references to the Response and therefore Request
# objects on the Media Pipeline cache, we should wipe the context of
# the encapsulated exception when it is a StopIteration instance
#
# This problem does not occur in Python 2.7 since we don't have
# Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
context = getattr(result.value, '__context__', None)
if isinstance(context, StopIteration):
setattr(result.value, '__context__', None)
info.downloading.remove(fp)
info.downloaded[fp] = result # cache result
for wad in info.waiting.pop(fp):
defer_result(result).chainDeferred(wad)
# Overridable Interface
def media_to_download(self, request, info, *, item=None):
"""Check request before starting download"""
pass
def get_media_requests(self, item, info):
"""Returns the media requests to download"""
pass
def media_downloaded(self, response, request, info, *, item=None):
"""Handler for success downloads"""
return response
def media_failed(self, failure, request, info):
"""Handler for failed downloads"""
return failure
def item_completed(self, results, item, info):
"""Called per item when all media requests has been processed"""
if self.LOG_FAILED_RESULTS:
for ok, value in results:
if not ok:
logger.error(
'%(class)s found errors processing %(item)s',
{'class': self.__class__.__name__, 'item': item},
exc_info=failure_to_exc_info(value),
extra={'spider': info.spider}
)
return item
def file_path(self, request, response=None, info=None, *, item=None):
"""Returns the path where downloaded media should be stored"""
pass
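A small subclass sketch showing the item-aware signature that the compatibility shim above checks for; the category-based layout and the ``category`` field are purely illustrative:

from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline


class PerCategoryFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # accepting the keyword-only `item` matches the new signature and avoids the deprecation warning
        default_path = super().file_path(request, response=response, info=info, item=item)
        category = ItemAdapter(item).get('category', 'misc') if item else 'misc'
        return f'{category}/{default_path}'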

Some files were not shown because too many files have changed in this diff.