Output of the new DB entries

This commit is contained in:
hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@ -0,0 +1,44 @@
import random
from urllib.parse import urlencode
from twisted.web.server import Site
from twisted.web.resource import Resource
class Root(Resource):
isLeaf = True
def getChild(self, name, request):
return self
def render(self, request):
total = _getarg(request, b'total', 100, int)
show = _getarg(request, b'show', 10, int)
nlist = [random.randint(1, total) for _ in range(show)]
request.write(b"<html><head></head><body>")
args = request.args.copy()
for nl in nlist:
args['n'] = nl
argstr = urlencode(args, doseq=True)
request.write(f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
.encode('utf8'))
request.write(b"</body></html>")
return b''
def _getarg(request, name, default=None, type=str):
return type(request.args[name][0]) if name in request.args else default
if __name__ == '__main__':
from twisted.internet import reactor
root = Root()
factory = Site(root)
httpPort = reactor.listenTCP(8998, factory)
def _print_listening():
httpHost = httpPort.getHost()
print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
reactor.callWhenRunning(_print_listening)
reactor.run()
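
Assuming the server above is running locally on port 8998, a minimal smoke test could look like the following sketch; urllib is used purely for illustration, and the query parameters match the handler's `total` and `show` arguments.

    import urllib.request

    # Request a page with 5 links whose numbers are drawn from 1..50.
    with urllib.request.urlopen("http://localhost:8998/?total=50&show=5") as resp:
        print(resp.read().decode("utf8"))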


@ -0,0 +1,32 @@
"""Boto/botocore helpers"""
import warnings
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
def is_botocore():
""" Returns True if botocore is available, otherwise raises NotConfigured. Never returns False.
Previously, when boto was supported in addition to botocore, this returned False if boto was available
but botocore wasn't.
"""
message = (
'is_botocore() is deprecated and always returns True or raises an Exception, '
'so it cannot be used for checking if boto is available instead of botocore. '
'You can use scrapy.utils.boto.is_botocore_available() to check if botocore '
'is available.'
)
warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
try:
import botocore # noqa: F401
return True
except ImportError:
raise NotConfigured('missing botocore library')
def is_botocore_available():
try:
import botocore # noqa: F401
return True
except ImportError:
return False
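
A minimal sketch of how a caller might use the non-deprecated helper to guard an S3-dependent code path (the error message is illustrative):

    from scrapy.exceptions import NotConfigured
    from scrapy.utils.boto import is_botocore_available

    if not is_botocore_available():
        raise NotConfigured("botocore is required for S3 support")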


@ -0,0 +1,195 @@
import numbers
import os
import sys
import warnings
from configparser import ConfigParser
from operator import itemgetter
from scrapy.exceptions import ScrapyDeprecationWarning, UsageError
from scrapy.settings import BaseSettings
from scrapy.utils.deprecate import update_classpath
from scrapy.utils.python import without_none_values
def build_component_list(compdict, custom=None, convert=update_classpath):
"""Compose a component list from a { class: order } dictionary."""
def _check_components(complist):
if len({convert(c) for c in complist}) != len(complist):
raise ValueError(f'Some paths in {complist!r} convert to the same object, '
'please update your settings')
def _map_keys(compdict):
if isinstance(compdict, BaseSettings):
compbs = BaseSettings()
for k, v in compdict.items():
prio = compdict.getpriority(k)
if compbs.getpriority(convert(k)) == prio:
raise ValueError(f'Some paths in {list(compdict.keys())!r} '
'convert to the same '
'object, please update your settings'
)
else:
compbs.set(convert(k), v, priority=prio)
return compbs
else:
_check_components(compdict)
return {convert(k): v for k, v in compdict.items()}
def _validate_values(compdict):
"""Fail if a value in the components dict is not a real number or None."""
for name, value in compdict.items():
if value is not None and not isinstance(value, numbers.Real):
raise ValueError(f'Invalid value {value} for component {name}, '
'please provide a real number or None instead')
# BEGIN Backward compatibility for old (base, custom) call signature
if isinstance(custom, (list, tuple)):
_check_components(custom)
return type(custom)(convert(c) for c in custom)
if custom is not None:
compdict.update(custom)
# END Backward compatibility
_validate_values(compdict)
compdict = without_none_values(_map_keys(compdict))
return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
def arglist_to_dict(arglist):
"""Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
dict
"""
return dict(x.split('=', 1) for x in arglist)
def closest_scrapy_cfg(path='.', prevpath=None):
"""Return the path to the closest scrapy.cfg file by traversing the current
directory and its parents
"""
if path == prevpath:
return ''
path = os.path.abspath(path)
cfgfile = os.path.join(path, 'scrapy.cfg')
if os.path.exists(cfgfile):
return cfgfile
return closest_scrapy_cfg(os.path.dirname(path), path)
def init_env(project='default', set_syspath=True):
"""Initialize environment to use command-line tool from inside a project
dir. This sets the Scrapy settings module and modifies the Python path to
be able to locate the project module.
"""
cfg = get_config()
if cfg.has_option('settings', project):
os.environ['SCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
closest = closest_scrapy_cfg()
if closest:
projdir = os.path.dirname(closest)
if set_syspath and projdir not in sys.path:
sys.path.append(projdir)
def get_config(use_closest=True):
"""Get Scrapy config file as a ConfigParser"""
sources = get_sources(use_closest)
cfg = ConfigParser()
cfg.read(sources)
return cfg
def get_sources(use_closest=True):
xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
sources = [
'/etc/scrapy.cfg',
r'c:\scrapy\scrapy.cfg',
xdg_config_home + '/scrapy.cfg',
os.path.expanduser('~/.scrapy.cfg'),
]
if use_closest:
sources.append(closest_scrapy_cfg())
return sources
def feed_complete_default_values_from_settings(feed, settings):
out = feed.copy()
out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT'))
out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None)
out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
out.setdefault("item_export_kwargs", dict())
if settings["FEED_EXPORT_INDENT"] is None:
out.setdefault("indent", None)
else:
out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
return out
def feed_process_params_from_cli(settings, output, output_format=None,
overwrite_output=None):
"""
Receives feed export params (from the 'crawl' or 'runspider' commands),
checks for inconsistencies in their quantities and returns a dictionary
suitable to be used as the FEEDS setting.
"""
valid_output_formats = without_none_values(
settings.getwithbase('FEED_EXPORTERS')
).keys()
def check_valid_format(output_format):
if output_format not in valid_output_formats:
raise UsageError(
f"Unrecognized output format '{output_format}'. "
f"Set a supported one ({tuple(valid_output_formats)}) "
"after a colon at the end of the output URI (i.e. -o/-O "
"<URI>:<FORMAT>) or as a file extension."
)
overwrite = False
if overwrite_output:
if output:
raise UsageError(
"Please use only one of -o/--output and -O/--overwrite-output"
)
output = overwrite_output
overwrite = True
if output_format:
if len(output) == 1:
check_valid_format(output_format)
message = (
'The -t command line option is deprecated in favor of '
'specifying the output format within the output URI. See the '
'documentation of the -o and -O options for more information.'
)
warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
return {output[0]: {'format': output_format}}
else:
raise UsageError(
'The -t command-line option cannot be used if multiple output '
'URIs are specified'
)
result = {}
for element in output:
try:
feed_uri, feed_format = element.rsplit(':', 1)
except ValueError:
feed_uri = element
feed_format = os.path.splitext(element)[1].replace('.', '')
else:
if feed_uri == '-':
feed_uri = 'stdout:'
check_valid_format(feed_format)
result[feed_uri] = {'format': feed_format}
if overwrite:
result[feed_uri]['overwrite'] = True
# FEEDS setting should take precedence over the matching CLI options
result.update(settings.getdict('FEEDS'))
return result
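
As a quick illustration of build_component_list (assuming the usual scrapy.utils.conf import path; the component paths below are made up): entries are sorted by their numeric order, and a value of None disables a component.

    from scrapy.utils.conf import build_component_list

    components = {
        'myproject.middlewares.Bar': 200,
        'myproject.middlewares.Foo': 100,
        'myproject.middlewares.Disabled': None,  # None drops the component
    }
    print(build_component_list(components))
    # ['myproject.middlewares.Foo', 'myproject.middlewares.Bar']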


@ -0,0 +1,104 @@
from functools import wraps
from collections import OrderedDict
def _embed_ipython_shell(namespace={}, banner=''):
"""Start an IPython Shell"""
try:
from IPython.terminal.embed import InteractiveShellEmbed
from IPython.terminal.ipapp import load_default_config
except ImportError:
from IPython.frontend.terminal.embed import InteractiveShellEmbed
from IPython.frontend.terminal.ipapp import load_default_config
@wraps(_embed_ipython_shell)
def wrapper(namespace=namespace, banner=''):
config = load_default_config()
# Always use .instance() to ensure _instance propagation to all parents;
# this is needed for <TAB> completion to work well for new imports.
# Also clear the instance to always have a fresh environment
# on repeated breaks, as with inspect_response()
InteractiveShellEmbed.clear_instance()
shell = InteractiveShellEmbed.instance(
banner1=banner, user_ns=namespace, config=config)
shell()
return wrapper
def _embed_bpython_shell(namespace={}, banner=''):
"""Start a bpython shell"""
import bpython
@wraps(_embed_bpython_shell)
def wrapper(namespace=namespace, banner=''):
bpython.embed(locals_=namespace, banner=banner)
return wrapper
def _embed_ptpython_shell(namespace={}, banner=''):
"""Start a ptpython shell"""
import ptpython.repl
@wraps(_embed_ptpython_shell)
def wrapper(namespace=namespace, banner=''):
print(banner)
ptpython.repl.embed(locals=namespace)
return wrapper
def _embed_standard_shell(namespace={}, banner=''):
"""Start a standard python shell"""
import code
try: # readline module is only available on unix systems
import readline
except ImportError:
pass
else:
import rlcompleter # noqa: F401
readline.parse_and_bind("tab:complete")
@wraps(_embed_standard_shell)
def wrapper(namespace=namespace, banner=''):
code.interact(banner=banner, local=namespace)
return wrapper
DEFAULT_PYTHON_SHELLS = OrderedDict([
('ptpython', _embed_ptpython_shell),
('ipython', _embed_ipython_shell),
('bpython', _embed_bpython_shell),
('python', _embed_standard_shell),
])
def get_shell_embed_func(shells=None, known_shells=None):
"""Return the first acceptable shell-embed function
from a given list of shell names.
"""
if shells is None: # list, preference order of shells
shells = DEFAULT_PYTHON_SHELLS.keys()
if known_shells is None: # available embeddable shells
known_shells = DEFAULT_PYTHON_SHELLS.copy()
for shell in shells:
if shell in known_shells:
try:
# function test: run all setup code (imports),
# but don't fall into the shell
return known_shells[shell]()
except ImportError:
continue
def start_python_console(namespace=None, banner='', shells=None):
"""Start Python console bound to the given namespace.
Readline support and tab completion will be used on Unix, if available.
"""
if namespace is None:
namespace = {}
try:
shell = get_shell_embed_func(shells)
if shell is not None:
shell(namespace=namespace, banner=banner)
except SystemExit: # raised when using exit() in python code.interact
pass
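
For example, a debugging hook could drop into whichever embeddable shell is installed; the namespace contents here are arbitrary:

    from scrapy.utils.console import start_python_console

    start_python_console(namespace={'answer': 42},
                         banner='inspect `answer`, then exit() to continue')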


@ -0,0 +1,100 @@
import argparse
import warnings
from shlex import split
from http.cookies import SimpleCookie
from urllib.parse import urlparse
from w3lib.http import basic_auth_header
class CurlParser(argparse.ArgumentParser):
def error(self, message):
error_msg = f'There was an error parsing the curl command: {message}'
raise ValueError(error_msg)
curl_parser = CurlParser()
curl_parser.add_argument('url')
curl_parser.add_argument('-H', '--header', dest='headers', action='append')
curl_parser.add_argument('-X', '--request', dest='method')
curl_parser.add_argument('-d', '--data', '--data-raw', dest='data')
curl_parser.add_argument('-u', '--user', dest='auth')
safe_to_ignore_arguments = [
['--compressed'],
# `--compressed` argument is not safe to ignore, but it's included here
# because the `HttpCompressionMiddleware` is enabled by default
['-s', '--silent'],
['-v', '--verbose'],
['-#', '--progress-bar']
]
for argument in safe_to_ignore_arguments:
curl_parser.add_argument(*argument, action='store_true')
def curl_to_request_kwargs(curl_command, ignore_unknown_options=True):
"""Convert a cURL command syntax to Request kwargs.
:param str curl_command: string containing the curl command
:param bool ignore_unknown_options: If true, only a warning is emitted when
cURL options are unknown. Otherwise
raises an error. (default: True)
:return: dictionary of Request kwargs
"""
curl_args = split(curl_command)
if curl_args[0] != 'curl':
raise ValueError('A curl command must start with "curl"')
parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])
if argv:
msg = f'Unrecognized options: {", ".join(argv)}'
if ignore_unknown_options:
warnings.warn(msg)
else:
raise ValueError(msg)
url = parsed_args.url
# curl automatically prepends 'http' if the scheme is missing, but Request
# needs the scheme to work
parsed_url = urlparse(url)
if not parsed_url.scheme:
url = 'http://' + url
method = parsed_args.method or 'GET'
result = {'method': method.upper(), 'url': url}
headers = []
cookies = {}
for header in parsed_args.headers or ():
name, val = header.split(':', 1)
name = name.strip()
val = val.strip()
if name.title() == 'Cookie':
for name, morsel in SimpleCookie(val).items():
cookies[name] = morsel.value
else:
headers.append((name, val))
if parsed_args.auth:
user, password = parsed_args.auth.split(':', 1)
headers.append(('Authorization', basic_auth_header(user, password)))
if headers:
result['headers'] = headers
if cookies:
result['cookies'] = cookies
if parsed_args.data:
result['body'] = parsed_args.data
if not parsed_args.method:
# if the "data" is specified but the "method" is not specified,
# the default method is 'POST'
result['method'] = 'POST'
return result
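
A short conversion example (the URL, header and payload are made up):

    from scrapy.utils.curl import curl_to_request_kwargs

    kwargs = curl_to_request_kwargs(
        "curl -X POST -H 'X-Token: abc' -d 'a=1' https://example.com/api")
    # {'method': 'POST', 'url': 'https://example.com/api',
    #  'headers': [('X-Token', 'abc')], 'body': 'a=1'}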


@ -0,0 +1,119 @@
"""
This module contains data types used by Scrapy which are not included in the
Python Standard Library.
This module must not depend on any module outside the Standard Library.
"""
import collections
import weakref
from collections.abc import Mapping
class CaselessDict(dict):
__slots__ = ()
def __init__(self, seq=None):
super().__init__()
if seq:
self.update(seq)
def __getitem__(self, key):
return dict.__getitem__(self, self.normkey(key))
def __setitem__(self, key, value):
dict.__setitem__(self, self.normkey(key), self.normvalue(value))
def __delitem__(self, key):
dict.__delitem__(self, self.normkey(key))
def __contains__(self, key):
return dict.__contains__(self, self.normkey(key))
has_key = __contains__
def __copy__(self):
return self.__class__(self)
copy = __copy__
def normkey(self, key):
"""Method to normalize dictionary key access"""
return key.lower()
def normvalue(self, value):
"""Method to normalize values prior to be setted"""
return value
def get(self, key, def_val=None):
return dict.get(self, self.normkey(key), self.normvalue(def_val))
def setdefault(self, key, def_val=None):
return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))
def update(self, seq):
seq = seq.items() if isinstance(seq, Mapping) else seq
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
super().update(iseq)
@classmethod
def fromkeys(cls, keys, value=None):
return cls((k, value) for k in keys)
def pop(self, key, *args):
return dict.pop(self, self.normkey(key), *args)
class LocalCache(collections.OrderedDict):
"""Dictionary with a finite number of keys.
Older items expire first.
"""
def __init__(self, limit=None):
super().__init__()
self.limit = limit
def __setitem__(self, key, value):
if self.limit:
while len(self) >= self.limit:
self.popitem(last=False)
super().__setitem__(key, value)
class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
"""
A weakref.WeakKeyDictionary implementation that uses LocalCache as its
underlying data structure, making it ordered and capable of being size-limited.
Useful for memoization, while avoiding keeping received
arguments in memory only because of the cached references.
Note: like LocalCache and unlike weakref.WeakKeyDictionary,
it cannot be instantiated with an initial dictionary.
"""
def __init__(self, limit=None):
super().__init__()
self.data = LocalCache(limit=limit)
def __setitem__(self, key, value):
try:
super().__setitem__(key, value)
except TypeError:
pass # key is not weak-referenceable, skip caching
def __getitem__(self, key):
try:
return super().__getitem__(key)
except (TypeError, KeyError):
return None # key is either not weak-referenceable or not cached
class SequenceExclude:
"""Object to test if an item is NOT within some sequence."""
def __init__(self, seq):
self.seq = seq
def __contains__(self, item):
return item not in self.seq
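
Two quick illustrations of these types, assuming the usual scrapy.utils.datatypes import path:

    from scrapy.utils.datatypes import CaselessDict, SequenceExclude

    headers = CaselessDict({'Content-Type': 'text/html'})
    print(headers['content-type'])           # text/html

    ok_codes = SequenceExclude(range(400, 600))
    print(200 in ok_codes, 404 in ok_codes)  # True False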


@ -0,0 +1,45 @@
import warnings
from functools import wraps
from twisted.internet import defer, threads
from scrapy.exceptions import ScrapyDeprecationWarning
def deprecated(use_instead=None):
"""This is a decorator which can be used to mark functions
as deprecated. It will result in a warning being emitted
when the function is used."""
def deco(func):
@wraps(func)
def wrapped(*args, **kwargs):
message = f"Call to deprecated function {func.__name__}."
if use_instead:
message += f" Use {use_instead} instead."
warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
return func(*args, **kwargs)
return wrapped
if callable(use_instead):
deco = deco(use_instead)
use_instead = None
return deco
def defers(func):
"""Decorator to make sure a function always returns a deferred"""
@wraps(func)
def wrapped(*a, **kw):
return defer.maybeDeferred(func, *a, **kw)
return wrapped
def inthread(func):
"""Decorator to call a function in a thread and return a deferred with the
result
"""
@wraps(func)
def wrapped(*a, **kw):
return threads.deferToThread(func, *a, **kw)
return wrapped
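
For instance (the function names below are placeholders):

    from scrapy.utils.decorators import deprecated

    @deprecated('new_helper')
    def old_helper():
        return 42

    old_helper()  # emits ScrapyDeprecationWarning pointing at new_helper, then returns 42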


@ -0,0 +1,168 @@
"""
Helper functions for dealing with Twisted deferreds
"""
import asyncio
import inspect
from functools import wraps
from twisted.internet import defer, task
from twisted.python import failure
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.reactor import is_asyncio_reactor_installed
def defer_fail(_failure):
"""Same as twisted.internet.defer.fail but delay calling errback until
next reactor loop
It delays by 100ms so reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
from twisted.internet import reactor
d = defer.Deferred()
reactor.callLater(0.1, d.errback, _failure)
return d
def defer_succeed(result):
"""Same as twisted.internet.defer.succeed but delay calling callback until
next reactor loop
It delays by 100ms so reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
from twisted.internet import reactor
d = defer.Deferred()
reactor.callLater(0.1, d.callback, result)
return d
def defer_result(result):
if isinstance(result, defer.Deferred):
return result
elif isinstance(result, failure.Failure):
return defer_fail(result)
else:
return defer_succeed(result)
def mustbe_deferred(f, *args, **kw):
"""Same as twisted.internet.defer.maybeDeferred, but delay calling
callback/errback to next reactor loop
"""
try:
result = f(*args, **kw)
# FIXME: Hack to avoid introspecting tracebacks. This to speed up
# processing of IgnoreRequest errors which are, by far, the most common
# exception in Scrapy - see #125
except IgnoreRequest as e:
return defer_fail(failure.Failure(e))
except Exception:
return defer_fail(failure.Failure())
else:
return defer_result(result)
def parallel(iterable, count, callable, *args, **named):
"""Execute a callable over the objects in the given iterable, in parallel,
using no more than ``count`` concurrent calls.
Taken from: https://jcalderone.livejournal.com/24285.html
"""
coop = task.Cooperator()
work = (callable(elem, *args, **named) for elem in iterable)
return defer.DeferredList([coop.coiterate(work) for _ in range(count)])
def process_chain(callbacks, input, *a, **kw):
"""Return a Deferred built by chaining the given callbacks"""
d = defer.Deferred()
for x in callbacks:
d.addCallback(x, *a, **kw)
d.callback(input)
return d
def process_chain_both(callbacks, errbacks, input, *a, **kw):
"""Return a Deferred built by chaining the given callbacks and errbacks"""
d = defer.Deferred()
for cb, eb in zip(callbacks, errbacks):
d.addCallbacks(
callback=cb, errback=eb,
callbackArgs=a, callbackKeywords=kw,
errbackArgs=a, errbackKeywords=kw,
)
if isinstance(input, failure.Failure):
d.errback(input)
else:
d.callback(input)
return d
def process_parallel(callbacks, input, *a, **kw):
"""Return a Deferred with the output of all successful calls to the given
callbacks
"""
dfds = [defer.succeed(input).addCallback(x, *a, **kw) for x in callbacks]
d = defer.DeferredList(dfds, fireOnOneErrback=1, consumeErrors=1)
d.addCallbacks(lambda r: [x[1] for x in r], lambda f: f.value.subFailure)
return d
def iter_errback(iterable, errback, *a, **kw):
"""Wraps an iterable calling an errback if an error is caught while
iterating it.
"""
it = iter(iterable)
while True:
try:
yield next(it)
except StopIteration:
break
except Exception:
errback(failure.Failure(), *a, **kw)
def deferred_from_coro(o):
"""Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
if isinstance(o, defer.Deferred):
return o
if asyncio.isfuture(o) or inspect.isawaitable(o):
if not is_asyncio_reactor_installed():
# wrapping the coroutine directly into a Deferred, this doesn't work correctly with coroutines
# that use asyncio, e.g. "await asyncio.sleep(1)"
return defer.ensureDeferred(o)
else:
# wrapping the coroutine into a Future and then into a Deferred, this requires AsyncioSelectorReactor
return defer.Deferred.fromFuture(asyncio.ensure_future(o))
return o
def deferred_f_from_coro_f(coro_f):
""" Converts a coroutine function into a function that returns a Deferred.
The coroutine function will be called at the time when the wrapper is called. Wrapper args will be passed to it.
This is useful for callback chains, as callback functions are called with the previous callback result.
"""
@wraps(coro_f)
def f(*coro_args, **coro_kwargs):
return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
return f
def maybeDeferred_coro(f, *args, **kw):
""" Copy of defer.maybeDeferred that also converts coroutines to Deferreds. """
try:
result = f(*args, **kw)
except: # noqa: E722
return defer.fail(failure.Failure(captureVars=defer.Deferred.debug))
if isinstance(result, defer.Deferred):
return result
elif asyncio.isfuture(result) or inspect.isawaitable(result):
return deferred_from_coro(result)
elif isinstance(result, failure.Failure):
return defer.fail(result)
else:
return defer.succeed(result)
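
A small, reactor-free illustration of process_chain with synchronous callbacks:

    from scrapy.utils.defer import process_chain

    d = process_chain([lambda x: x + 1, lambda x: x * 2], 10)
    d.addCallback(print)  # prints 22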


@ -0,0 +1,174 @@
"""Some helpers for deprecation messages"""
import warnings
import inspect
from scrapy.exceptions import ScrapyDeprecationWarning
def attribute(obj, oldattr, newattr, version='0.12'):
cname = obj.__class__.__name__
warnings.warn(
f"{cname}.{oldattr} attribute is deprecated and will be no longer supported "
f"in Scrapy {version}, use {cname}.{newattr} attribute instead",
ScrapyDeprecationWarning,
stacklevel=3)
def create_deprecated_class(
name,
new_class,
clsdict=None,
warn_category=ScrapyDeprecationWarning,
warn_once=True,
old_class_path=None,
new_class_path=None,
subclass_warn_message="{cls} inherits from deprecated class {old}, please inherit from {new}.",
instance_warn_message="{cls} is deprecated, instantiate {new} instead."
):
"""
Return a "deprecated" class that causes its subclasses to issue a warning.
Subclasses of ``new_class`` are considered subclasses of this class.
It also warns when the deprecated class is instantiated, but not when
its subclasses are instantiated.
It can be used to rename a base class in a library. For example, if we
have
class OldName(SomeClass):
# ...
and we want to rename it to NewName, we can do the following::
class NewName(SomeClass):
# ...
OldName = create_deprecated_class('OldName', NewName)
Then, if a user class inherits from OldName, a warning is issued. Also, if
some code uses ``issubclass(sub, OldName)`` or ``isinstance(sub(), OldName)``
checks they'll still return True if sub is a subclass of NewName instead of
OldName.
"""
class DeprecatedClass(new_class.__class__):
deprecated_class = None
warned_on_subclass = False
def __new__(metacls, name, bases, clsdict_):
cls = super().__new__(metacls, name, bases, clsdict_)
if metacls.deprecated_class is None:
metacls.deprecated_class = cls
return cls
def __init__(cls, name, bases, clsdict_):
meta = cls.__class__
old = meta.deprecated_class
if old in bases and not (warn_once and meta.warned_on_subclass):
meta.warned_on_subclass = True
msg = subclass_warn_message.format(cls=_clspath(cls),
old=_clspath(old, old_class_path),
new=_clspath(new_class, new_class_path))
if warn_once:
msg += ' (warning only on first subclass, there may be others)'
warnings.warn(msg, warn_category, stacklevel=2)
super().__init__(name, bases, clsdict_)
# see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass
# and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks
# for implementation details
def __instancecheck__(cls, inst):
return any(cls.__subclasscheck__(c)
for c in {type(inst), inst.__class__})
def __subclasscheck__(cls, sub):
if cls is not DeprecatedClass.deprecated_class:
# we should do the magic only if second `issubclass` argument
# is the deprecated class itself - subclasses of the
# deprecated class should not use custom `__subclasscheck__`
# method.
return super().__subclasscheck__(sub)
if not inspect.isclass(sub):
raise TypeError("issubclass() arg 1 must be a class")
mro = getattr(sub, '__mro__', ())
return any(c in {cls, new_class} for c in mro)
def __call__(cls, *args, **kwargs):
old = DeprecatedClass.deprecated_class
if cls is old:
msg = instance_warn_message.format(cls=_clspath(cls, old_class_path),
new=_clspath(new_class, new_class_path))
warnings.warn(msg, warn_category, stacklevel=2)
return super().__call__(*args, **kwargs)
deprecated_cls = DeprecatedClass(name, (new_class,), clsdict or {})
try:
frm = inspect.stack()[1]
parent_module = inspect.getmodule(frm[0])
if parent_module is not None:
deprecated_cls.__module__ = parent_module.__name__
except Exception as e:
# Sometimes inspect.stack() fails (e.g. when the first import of
# deprecated class is in jinja2 template). __module__ attribute is not
# important enough to raise an exception as users may be unable
# to fix inspect.stack() errors.
warnings.warn(f"Error detecting parent module: {e!r}")
return deprecated_cls
def _clspath(cls, forced=None):
if forced is not None:
return forced
return f'{cls.__module__}.{cls.__name__}'
DEPRECATION_RULES = [
('scrapy.telnet.', 'scrapy.extensions.telnet.'),
]
def update_classpath(path):
"""Update a deprecated path from an object with its new location"""
for prefix, replacement in DEPRECATION_RULES:
if isinstance(path, str) and path.startswith(prefix):
new_path = path.replace(prefix, replacement, 1)
warnings.warn(f"`{path}` class is deprecated, use `{new_path}` instead",
ScrapyDeprecationWarning)
return new_path
return path
def method_is_overridden(subclass, base_class, method_name):
"""
Return True if a method named ``method_name`` of a ``base_class``
is overridden in a ``subclass``.
>>> class Base:
... def foo(self):
... pass
>>> class Sub1(Base):
... pass
>>> class Sub2(Base):
... def foo(self):
... pass
>>> class Sub3(Sub1):
... def foo(self):
... pass
>>> class Sub4(Sub2):
... pass
>>> method_is_overridden(Sub1, Base, 'foo')
False
>>> method_is_overridden(Sub2, Base, 'foo')
True
>>> method_is_overridden(Sub3, Base, 'foo')
True
>>> method_is_overridden(Sub4, Base, 'foo')
True
"""
base_method = getattr(base_class, method_name)
sub_method = getattr(subclass, method_name)
return base_method.__code__ is not sub_method.__code__
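
A compact usage sketch of create_deprecated_class (class names are illustrative):

    from scrapy.utils.deprecate import create_deprecated_class

    class NewName:
        pass

    OldName = create_deprecated_class('OldName', NewName)

    class UserClass(OldName):  # triggers the subclass deprecation warning
        pass

    print(issubclass(UserClass, OldName), isinstance(NewName(), OldName))  # True True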


@ -0,0 +1,48 @@
"""
pprint and pformat wrappers with colorization support
"""
import ctypes
import platform
import sys
from distutils.version import LooseVersion as parse_version
from pprint import pformat as pformat_
def _enable_windows_terminal_processing():
# https://stackoverflow.com/a/36760881
kernel32 = ctypes.windll.kernel32
return bool(kernel32.SetConsoleMode(kernel32.GetStdHandle(-11), 7))
def _tty_supports_color():
if sys.platform != "win32":
return True
if parse_version(platform.version()) < parse_version("10.0.14393"):
return False
# Windows >= 10.0.14393 interprets ANSI escape sequences providing terminal
# processing is enabled.
return _enable_windows_terminal_processing()
def _colorize(text, colorize=True):
if not colorize or not sys.stdout.isatty() or not _tty_supports_color():
return text
try:
from pygments import highlight
except ImportError:
return text
else:
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonLexer
return highlight(text, PythonLexer(), TerminalFormatter())
def pformat(obj, *args, **kwargs):
return _colorize(pformat_(obj), kwargs.pop('colorize', True))
def pprint(obj, *args, **kwargs):
print(pformat(obj, *args, **kwargs))
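
For example, colorized output is only attempted on a capable TTY with pygments installed:

    from scrapy.utils.display import pprint

    pprint({'status': 200, 'reason': 'OK'}, colorize=True)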


@ -0,0 +1,48 @@
"""Some debugging functions for working with the Scrapy engine"""
# used in global tests code
from time import time # noqa: F401
def get_engine_status(engine):
"""Return a report of the current engine status"""
tests = [
"time()-engine.start_time",
"engine.has_capacity()",
"len(engine.downloader.active)",
"engine.scraper.is_idle()",
"engine.spider.name",
"engine.spider_is_idle(engine.spider)",
"engine.slot.closing",
"len(engine.slot.inprogress)",
"len(engine.slot.scheduler.dqs or [])",
"len(engine.slot.scheduler.mqs)",
"len(engine.scraper.slot.queue)",
"len(engine.scraper.slot.active)",
"engine.scraper.slot.active_size",
"engine.scraper.slot.itemproc_size",
"engine.scraper.slot.needs_backout()",
]
checks = []
for test in tests:
try:
checks += [(test, eval(test))]
except Exception as e:
checks += [(test, f"{type(e).__name__} (exception)")]
return checks
def format_engine_status(engine=None):
checks = get_engine_status(engine)
s = "Execution engine status\n\n"
for test, result in checks:
s += f"{test:<47} : {result}\n"
s += "\n"
return s
def print_engine_status(engine):
print(format_engine_status(engine))
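
These helpers expect a live execution engine, e.g. from inside an extension; the `crawler` reference below is a hypothetical running Crawler assumed to be in scope:

    from scrapy.utils.engine import format_engine_status

    print(format_engine_status(crawler.engine))  # `crawler` is assumed, not defined here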


@ -0,0 +1,37 @@
import posixpath
from ftplib import error_perm, FTP
from posixpath import dirname
def ftp_makedirs_cwd(ftp, path, first_call=True):
"""Set the current directory of the FTP connection given in the ``ftp``
argument (as a ftplib.FTP object), creating all parent directories if they
don't exist. The ftplib.FTP object must be already connected and logged in.
"""
try:
ftp.cwd(path)
except error_perm:
ftp_makedirs_cwd(ftp, dirname(path), False)
ftp.mkd(path)
if first_call:
ftp.cwd(path)
def ftp_store_file(
*, path, file, host, port,
username, password, use_active_mode=False, overwrite=True):
"""Opens a FTP connection with passed credentials,sets current directory
to the directory extracted from given path, then uploads the file to server
"""
with FTP() as ftp:
ftp.connect(host, port)
ftp.login(username, password)
if use_active_mode:
ftp.set_pasv(False)
file.seek(0)
dirname, filename = posixpath.split(path)
ftp_makedirs_cwd(ftp, dirname)
command = 'STOR' if overwrite else 'APPE'
ftp.storbinary(f'{command} {filename}', file)
file.close()
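
A minimal invocation sketch (host, port and credentials are placeholders):

    from io import BytesIO
    from scrapy.utils.ftp import ftp_store_file

    ftp_store_file(
        path='/uploads/items.json', file=BytesIO(b'[]'),
        host='ftp.example.com', port=21,
        username='anonymous', password='guest',
    )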


@ -0,0 +1,58 @@
from gzip import GzipFile
from io import BytesIO
import re
import struct
from scrapy.utils.decorators import deprecated
# - GzipFile's read() has issues returning leftover uncompressed data when
# input is corrupted
# - read1(), which fetches data before raising EOFError on next call
# works here
@deprecated('GzipFile.read1')
def read1(gzf, size=-1):
return gzf.read1(size)
def gunzip(data):
"""Gunzip the given data and return as much data as possible.
This is resilient to CRC checksum errors.
"""
f = GzipFile(fileobj=BytesIO(data))
output_list = []
chunk = b'.'
while chunk:
try:
chunk = f.read1(8196)
output_list.append(chunk)
except (IOError, EOFError, struct.error):
# complete only if there is some data, otherwise re-raise
# see issue 87 about catching struct.error
# some pages are quite small so output_list is empty and f.extrabuf
# contains the whole page content
if output_list or getattr(f, 'extrabuf', None):
try:
output_list.append(f.extrabuf[-f.extrasize:])
finally:
break
else:
raise
return b''.join(output_list)
_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search
@deprecated
def is_gzipped(response):
"""Return True if the response is gzipped, or False otherwise"""
ctype = response.headers.get('Content-Type', b'')
cenc = response.headers.get('Content-Encoding', b'').lower()
return _is_gzipped(ctype) or _is_octetstream(ctype) and cenc in (b'gzip', b'x-gzip')
def gzip_magic_number(response):
return response.body[:3] == b'\x1f\x8b\x08'
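
A round-trip example of gunzip:

    import gzip
    from scrapy.utils.gz import gunzip

    data = gzip.compress(b'<html>ok</html>')
    print(gunzip(data))  # b'<html>ok</html>'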


@ -0,0 +1,36 @@
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.http instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.decorators import deprecated
from w3lib.http import * # noqa: F401
warnings.warn("Module `scrapy.utils.http` is deprecated, "
"Please import from `w3lib.http` instead.",
ScrapyDeprecationWarning, stacklevel=2)
@deprecated
def decode_chunked_transfer(chunked_body):
"""Parsed body received with chunked transfer encoding, and return the
decoded body.
For more info see:
https://en.wikipedia.org/wiki/Chunked_transfer_encoding
"""
body, h, t = '', '', chunked_body
while t:
h, t = t.split('\r\n', 1)
if h == '0':
break
size = int(h, 16)
body += t[:size]
t = t[size + 2:]
return body


@ -0,0 +1,16 @@
"""Helper functions for scrapy.http objects (Request, Response)"""
import weakref
from urllib.parse import urlparse
_urlparse_cache = weakref.WeakKeyDictionary()
def urlparse_cached(request_or_response):
"""Return urlparse.urlparse caching the result, where the argument can be a
Request or Response object
"""
if request_or_response not in _urlparse_cache:
_urlparse_cache[request_or_response] = urlparse(request_or_response.url)
return _urlparse_cache[request_or_response]
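
For example:

    from scrapy.http import Request
    from scrapy.utils.httpobj import urlparse_cached

    request = Request('https://example.com/page?q=1')
    print(urlparse_cached(request).hostname)  # example.com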


@ -0,0 +1,162 @@
import csv
import logging
import re
from io import StringIO
from scrapy.http import TextResponse, Response
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
logger = logging.getLogger(__name__)
def xmliter(obj, nodename):
"""Return a iterator of Selector's over all nodes of a XML document,
given the name of the node to iterate. Useful for parsing XML feeds.
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
"""
nodename_patt = re.escape(nodename)
DOCUMENT_HEADER_RE = re.compile(r'<\?xml[^>]+>\s*', re.S)
HEADER_END_RE = re.compile(fr'<\s*/{nodename_patt}\s*>', re.S)
END_TAG_RE = re.compile(r'<\s*/([^\s>]+)\s*>', re.S)
NAMESPACE_RE = re.compile(r'((xmlns[:A-Za-z]*)=[^>\s]+)', re.S)
text = _body_or_str(obj)
document_header = re.search(DOCUMENT_HEADER_RE, text)
document_header = document_header.group().strip() if document_header else ''
header_end_idx = re_rsearch(HEADER_END_RE, text)
header_end = text[header_end_idx[1]:].strip() if header_end_idx else ''
namespaces = {}
if header_end:
for tagname in reversed(re.findall(END_TAG_RE, header_end)):
tag = re.search(fr'<\s*{tagname}.*?xmlns[:=][^>]*>', text[:header_end_idx[1]], re.S)
if tag:
namespaces.update(reversed(x) for x in re.findall(NAMESPACE_RE, tag.group()))
r = re.compile(fr'<{nodename_patt}[\s>].*?</{nodename_patt}>', re.DOTALL)
for match in r.finditer(text):
nodetext = (
document_header
+ match.group().replace(
nodename,
f'{nodename} {" ".join(namespaces.values())}',
1
)
+ header_end
)
yield Selector(text=nodetext, type='xml')
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
from lxml import etree
reader = _StreamReader(obj)
tag = f'{{{namespace}}}{nodename}' if namespace else nodename
iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
selxpath = '//' + (f'{prefix}:{nodename}' if namespace else nodename)
for _, node in iterable:
nodetext = etree.tostring(node, encoding='unicode')
node.clear()
xs = Selector(text=nodetext, type='xml')
if namespace:
xs.register_namespace(prefix, namespace)
yield xs.xpath(selxpath)[0]
class _StreamReader:
def __init__(self, obj):
self._ptr = 0
if isinstance(obj, Response):
self._text, self.encoding = obj.body, obj.encoding
else:
self._text, self.encoding = obj, 'utf-8'
self._is_unicode = isinstance(self._text, str)
def read(self, n=65535):
self.read = self._read_unicode if self._is_unicode else self._read_string
return self.read(n).lstrip()
def _read_string(self, n=65535):
s, e = self._ptr, self._ptr + n
self._ptr = e
return self._text[s:e]
def _read_unicode(self, n=65535):
s, e = self._ptr, self._ptr + n
self._ptr = e
return self._text[s:e].encode('utf-8')
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
""" Returns an iterator of dictionaries from the given csv object
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
delimiter is the character used to separate fields on the given obj.
headers is an iterable that, when provided, supplies the keys
for the returned dictionaries; if not given, the first row is used.
quotechar is the character used to enclose fields on the given obj.
"""
encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'
def row_to_unicode(row_):
return [to_unicode(field, encoding) for field in row_]
lines = StringIO(_body_or_str(obj, unicode=True))
kwargs = {}
if delimiter:
kwargs["delimiter"] = delimiter
if quotechar:
kwargs["quotechar"] = quotechar
csv_r = csv.reader(lines, **kwargs)
if not headers:
try:
row = next(csv_r)
except StopIteration:
return
headers = row_to_unicode(row)
for row in csv_r:
row = row_to_unicode(row)
if len(row) != len(headers):
logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
"should be: %(csvheader)d)",
{'csvlnum': csv_r.line_num, 'csvrow': len(row),
'csvheader': len(headers)})
continue
else:
yield dict(zip(headers, row))
def _body_or_str(obj, unicode=True):
expected_types = (Response, str, bytes)
if not isinstance(obj, expected_types):
expected_types_str = " or ".join(t.__name__ for t in expected_types)
raise TypeError(
f"Object {obj!r} must be {expected_types_str}, not {type(obj).__name__}"
)
if isinstance(obj, Response):
if not unicode:
return obj.body
elif isinstance(obj, TextResponse):
return obj.text
else:
return obj.body.decode('utf-8')
elif isinstance(obj, str):
return obj if unicode else obj.encode('utf-8')
else:
return obj.decode('utf-8') if unicode else obj
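
A quick csviter example over an in-memory document:

    from scrapy.utils.iterators import csviter

    rows = csviter("id,name\r\n1,foo\r\n2,bar\r\n")
    print(list(rows))  # [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': 'bar'}]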


@ -0,0 +1,8 @@
import os
def job_dir(settings):
path = settings['JOBDIR']
if path and not os.path.exists(path):
os.makedirs(path)
return path


@ -0,0 +1,215 @@
import logging
import sys
import warnings
from logging.config import dictConfig
from twisted.python import log as twisted_log
from twisted.python.failure import Failure
import scrapy
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.settings import Settings
from scrapy.utils.versions import scrapy_components_versions
logger = logging.getLogger(__name__)
def failure_to_exc_info(failure):
"""Extract exc_info from Failure instances"""
if isinstance(failure, Failure):
return (failure.type, failure.value, failure.getTracebackObject())
class TopLevelFormatter(logging.Filter):
"""Keep only top level loggers's name (direct children from root) from
records.
This filter will replace Scrapy loggers' names with 'scrapy'. This mimics
the old Scrapy log behaviour and helps shortening long names.
Since it can't be set for just one logger (it won't propagate for its
children), it's going to be set in the root handler, with a parametrized
``loggers`` list where it should act.
"""
def __init__(self, loggers=None):
self.loggers = loggers or []
def filter(self, record):
if any(record.name.startswith(logger + '.') for logger in self.loggers):
record.name = record.name.split('.', 1)[0]
return True
DEFAULT_LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'loggers': {
'scrapy': {
'level': 'DEBUG',
},
'twisted': {
'level': 'ERROR',
},
}
}
def configure_logging(settings=None, install_root_handler=True):
"""
Initialize logging defaults for Scrapy.
:param settings: settings used to create and configure a handler for the
root logger (default: None).
:type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``
:param install_root_handler: whether to install root logging handler
(default: True)
:type install_root_handler: bool
This function does:
- Route warnings and twisted logging through Python standard logging
- Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
- Route stdout to log if LOG_STDOUT setting is True
When ``install_root_handler`` is True (default), this function also
creates a handler for the root logger according to given settings
(see :ref:`topics-logging-settings`). You can override default options
using ``settings`` argument. When ``settings`` is empty or None, defaults
are used.
"""
if not sys.warnoptions:
# Route warnings through python logging
logging.captureWarnings(True)
observer = twisted_log.PythonLoggingObserver('twisted')
observer.start()
dictConfig(DEFAULT_LOGGING)
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
if settings.getbool('LOG_STDOUT'):
sys.stdout = StreamLogger(logging.getLogger('stdout'))
if install_root_handler:
install_scrapy_root_handler(settings)
def install_scrapy_root_handler(settings):
global _scrapy_root_handler
if (_scrapy_root_handler is not None
and _scrapy_root_handler in logging.root.handlers):
logging.root.removeHandler(_scrapy_root_handler)
logging.root.setLevel(logging.NOTSET)
_scrapy_root_handler = _get_handler(settings)
logging.root.addHandler(_scrapy_root_handler)
def get_scrapy_root_handler():
return _scrapy_root_handler
_scrapy_root_handler = None
def _get_handler(settings):
""" Return a log handler object according to settings """
filename = settings.get('LOG_FILE')
if filename:
encoding = settings.get('LOG_ENCODING')
handler = logging.FileHandler(filename, encoding=encoding)
elif settings.getbool('LOG_ENABLED'):
handler = logging.StreamHandler()
else:
handler = logging.NullHandler()
formatter = logging.Formatter(
fmt=settings.get('LOG_FORMAT'),
datefmt=settings.get('LOG_DATEFORMAT')
)
handler.setFormatter(formatter)
handler.setLevel(settings.get('LOG_LEVEL'))
if settings.getbool('LOG_SHORT_NAMES'):
handler.addFilter(TopLevelFormatter(['scrapy']))
return handler
def log_scrapy_info(settings):
logger.info("Scrapy %(version)s started (bot: %(bot)s)",
{'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
versions = [
f"{name} {version}"
for name, version in scrapy_components_versions()
if name != "Scrapy"
]
logger.info("Versions: %(versions)s", {'versions': ", ".join(versions)})
from twisted.internet import reactor
logger.debug("Using reactor: %s.%s", reactor.__module__, reactor.__class__.__name__)
from twisted.internet import asyncioreactor
if isinstance(reactor, asyncioreactor.AsyncioSelectorReactor):
logger.debug(
"Using asyncio event loop: %s.%s",
reactor._asyncioEventloop.__module__,
reactor._asyncioEventloop.__class__.__name__,
)
class StreamLogger:
"""Fake file-like stream object that redirects writes to a logger instance
Taken from:
https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
"""
def __init__(self, logger, log_level=logging.INFO):
self.logger = logger
self.log_level = log_level
self.linebuf = ''
def write(self, buf):
for line in buf.rstrip().splitlines():
self.logger.log(self.log_level, line.rstrip())
def flush(self):
for h in self.logger.handlers:
h.flush()
class LogCounterHandler(logging.Handler):
"""Record log levels count into a crawler stats"""
def __init__(self, crawler, *args, **kwargs):
super().__init__(*args, **kwargs)
self.crawler = crawler
def emit(self, record):
sname = f'log_count/{record.levelname}'
self.crawler.stats.inc_value(sname)
def logformatter_adapter(logkws):
"""
Helper that takes the dictionary output from the methods in LogFormatter
and adapts it into a tuple of positional arguments for logger.log calls,
handling backward compatibility as well.
"""
if not {'level', 'msg', 'args'} <= set(logkws):
warnings.warn('Missing keys in LogFormatter method',
ScrapyDeprecationWarning)
if 'format' in logkws:
warnings.warn('`format` key in LogFormatter methods has been '
'deprecated, use `msg` instead',
ScrapyDeprecationWarning)
level = logkws.get('level', logging.INFO)
message = logkws.get('format', logkws.get('msg'))
# NOTE: This also handles 'args' being an empty dict, that case doesn't
# play well in logger.log calls
args = logkws if not logkws.get('args') else logkws['args']
return (level, message, args)
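
Outside a crawler process, logging can be configured directly; the settings values below are only an example:

    from scrapy.utils.log import configure_logging

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_LEVEL': 'INFO'})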


@ -0,0 +1,14 @@
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.html instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from w3lib.html import * # noqa: F401
warnings.warn("Module `scrapy.utils.markup` is deprecated. "
"Please import from `w3lib.html` instead.",
ScrapyDeprecationWarning, stacklevel=2)


@ -0,0 +1,253 @@
"""Helper functions which don't fit anywhere else"""
import ast
import inspect
import os
import re
import hashlib
import warnings
from collections import deque
from contextlib import contextmanager
from importlib import import_module
from pkgutil import iter_modules
from textwrap import dedent
from w3lib.html import replace_entities
from scrapy.utils.datatypes import LocalWeakReferencedCache
from scrapy.utils.python import flatten, to_unicode
from scrapy.item import _BaseItem
from scrapy.utils.deprecate import ScrapyDeprecationWarning
_ITERABLE_SINGLE_VALUES = dict, _BaseItem, str, bytes
def arg_to_iter(arg):
"""Convert an argument to an iterable. The argument can be a None, single
value, or an iterable.
Exception: if arg is a dict, [arg] will be returned
"""
if arg is None:
return []
elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'):
return arg
else:
return [arg]
def load_object(path):
"""Load an object given its absolute object path, and return it.
The object can be the import path of a class, function, variable or an
instance, e.g. 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'.
If ``path`` is not a string, but is a callable object, such as a class or
a function, then return it as is.
"""
if not isinstance(path, str):
if callable(path):
return path
else:
raise TypeError("Unexpected argument type, expected string "
"or object, got: %s" % type(path))
try:
dot = path.rindex('.')
except ValueError:
raise ValueError(f"Error loading object '{path}': not a full path")
module, name = path[:dot], path[dot + 1:]
mod = import_module(module)
try:
obj = getattr(mod, name)
except AttributeError:
raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
return obj
def walk_modules(path):
"""Loads a module and all its submodules from the given module path and
returns them. If *any* module raises an exception while importing, that
exception is propagated.
For example: walk_modules('scrapy.utils')
"""
mods = []
mod = import_module(path)
mods.append(mod)
if hasattr(mod, '__path__'):
for _, subpath, ispkg in iter_modules(mod.__path__):
fullpath = path + '.' + subpath
if ispkg:
mods += walk_modules(fullpath)
else:
submod = import_module(fullpath)
mods.append(submod)
return mods
def extract_regex(regex, text, encoding='utf-8'):
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group, the entire regex match is returned
"""
warnings.warn(
"scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
ScrapyDeprecationWarning,
stacklevel=2
)
if isinstance(regex, str):
regex = re.compile(regex, re.UNICODE)
try:
strings = [regex.search(text).group('extract')] # named group
except Exception:
strings = regex.findall(text) # full regex or numbered groups
strings = flatten(strings)
if isinstance(text, str):
return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
else:
return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
for s in strings]
def md5sum(file):
"""Calculate the md5 checksum of a file-like object without reading its
whole content in memory.
>>> from io import BytesIO
>>> md5sum(BytesIO(b'file content to hash'))
'784406af91dd5a54fbb9c84c2236595a'
"""
m = hashlib.md5()
while True:
d = file.read(8096)
if not d:
break
m.update(d)
return m.hexdigest()
def rel_has_nofollow(rel):
"""Return True if link rel attribute has nofollow type"""
return rel is not None and 'nofollow' in rel.split()
def create_instance(objcls, settings, crawler, *args, **kwargs):
"""Construct a class instance using its ``from_crawler`` or
``from_settings`` constructors, if available.
At least one of ``settings`` and ``crawler`` needs to be different from
``None``. If ``settings`` is ``None``, ``crawler.settings`` will be used.
If ``crawler`` is ``None``, only the ``from_settings`` constructor will be
tried.
``*args`` and ``**kwargs`` are forwarded to the constructors.
Raises ``ValueError`` if both ``settings`` and ``crawler`` are ``None``.
.. versionchanged:: 2.2
Raises ``TypeError`` if the resulting instance is ``None`` (e.g. if an
extension has not been implemented correctly).
"""
if settings is None:
if crawler is None:
raise ValueError("Specify at least one of settings and crawler.")
settings = crawler.settings
if crawler and hasattr(objcls, 'from_crawler'):
instance = objcls.from_crawler(crawler, *args, **kwargs)
method_name = 'from_crawler'
elif hasattr(objcls, 'from_settings'):
instance = objcls.from_settings(settings, *args, **kwargs)
method_name = 'from_settings'
else:
instance = objcls(*args, **kwargs)
method_name = '__new__'
if instance is None:
raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
return instance
@contextmanager
def set_environ(**kwargs):
"""Temporarily set environment variables inside the context manager and
fully restore previous environment afterwards
"""
original_env = {k: os.environ.get(k) for k in kwargs}
os.environ.update(kwargs)
try:
yield
finally:
for k, v in original_env.items():
if v is None:
del os.environ[k]
else:
os.environ[k] = v
def walk_callable(node):
"""Similar to ``ast.walk``, but walks only function body and skips nested
functions defined within the node.
"""
todo = deque([node])
walked_func_def = False
while todo:
node = todo.popleft()
if isinstance(node, ast.FunctionDef):
if walked_func_def:
continue
walked_func_def = True
todo.extend(ast.iter_child_nodes(node))
yield node
_generator_callbacks_cache = LocalWeakReferencedCache(limit=128)
def is_generator_with_return_value(callable):
"""
Returns True if a callable is a generator function which includes a
'return' statement with a value different than None, False otherwise
"""
if callable in _generator_callbacks_cache:
return _generator_callbacks_cache[callable]
def returns_none(return_node):
value = return_node.value
return value is None or isinstance(value, ast.NameConstant) and value.value is None
if inspect.isgeneratorfunction(callable):
tree = ast.parse(dedent(inspect.getsource(callable)))
for node in walk_callable(tree):
if isinstance(node, ast.Return) and not returns_none(node):
_generator_callbacks_cache[callable] = True
return _generator_callbacks_cache[callable]
_generator_callbacks_cache[callable] = False
return _generator_callbacks_cache[callable]
def warn_on_generator_with_return_value(spider, callable):
"""
Logs a warning if a callable is a generator function and includes
a 'return' statement with a value different than None
"""
if is_generator_with_return_value(callable):
warnings.warn(
f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
'a generator and includes a "return" statement with a value '
'different than None. This could lead to unexpected behaviour. Please see '
'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
'for details about the semantics of the "return" statement within generators',
stacklevel=2,
)
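
Two small examples of the loading helpers:

    from scrapy.utils.misc import arg_to_iter, load_object

    print(load_object('scrapy.utils.misc.arg_to_iter') is arg_to_iter)  # True
    print(list(arg_to_iter(None)), list(arg_to_iter('x')), list(arg_to_iter([1, 2])))
    # [] ['x'] [1, 2]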


@ -0,0 +1,15 @@
"""
Transitional module for moving to the w3lib library.
For new code, always import from w3lib.form instead of this module
"""
import warnings
from scrapy.exceptions import ScrapyDeprecationWarning
from w3lib.form import * # noqa: F401
warnings.warn("Module `scrapy.utils.multipart` is deprecated. "
"If you're using `encode_multipart` function, please use "
"`urllib3.filepost.encode_multipart_formdata` instead",
ScrapyDeprecationWarning, stacklevel=2)


@ -0,0 +1,25 @@
import signal
signal_names = {}
for signame in dir(signal):
if signame.startswith('SIG') and not signame.startswith('SIG_'):
signum = getattr(signal, signame)
if isinstance(signum, int):
signal_names[signum] = signame
def install_shutdown_handlers(function, override_sigint=True):
"""Install the given function as a signal handler for all common shutdown
signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
SIGINT handler won't be installed if there is already a handler in place
(e.g. Pdb)
"""
from twisted.internet import reactor
reactor._handleSignals()
signal.signal(signal.SIGTERM, function)
if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
signal.signal(signal.SIGINT, function)
# Catch Ctrl-Break in windows
if hasattr(signal, 'SIGBREAK'):
signal.signal(signal.SIGBREAK, function)


@ -0,0 +1,98 @@
import os
import pickle
import warnings
from importlib import import_module
from os.path import join, dirname, abspath, isabs, exists
from scrapy.utils.conf import closest_scrapy_cfg, get_config, init_env
from scrapy.settings import Settings
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
ENVVAR = 'SCRAPY_SETTINGS_MODULE'
DATADIR_CFG_SECTION = 'datadir'
def inside_project():
scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
if scrapy_module is not None:
try:
import_module(scrapy_module)
except ImportError as exc:
warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
else:
return True
return bool(closest_scrapy_cfg())
def project_data_dir(project='default'):
"""Return the current project data dir, creating it if it doesn't exist"""
if not inside_project():
raise NotConfigured("Not inside a project")
cfg = get_config()
if cfg.has_option(DATADIR_CFG_SECTION, project):
d = cfg.get(DATADIR_CFG_SECTION, project)
else:
scrapy_cfg = closest_scrapy_cfg()
if not scrapy_cfg:
raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
if not exists(d):
os.makedirs(d)
return d
def data_path(path, createdir=False):
"""
Return the given path joined with the .scrapy data directory.
If given an absolute path, return it unmodified.
"""
if not isabs(path):
if inside_project():
path = join(project_data_dir(), path)
else:
path = join('.scrapy', path)
if createdir and not exists(path):
os.makedirs(path)
return path
def get_project_settings():
if ENVVAR not in os.environ:
project = os.environ.get('SCRAPY_PROJECT', 'default')
init_env(project)
settings = Settings()
settings_module_path = os.environ.get(ENVVAR)
if settings_module_path:
settings.setmodule(settings_module_path, priority='project')
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
if pickled_settings:
warnings.warn("Use of environment variable "
"'SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE' "
"is deprecated.", ScrapyDeprecationWarning)
settings.setdict(pickle.loads(pickled_settings), priority='project')
scrapy_envvars = {k[7:]: v for k, v in os.environ.items() if
k.startswith('SCRAPY_')}
valid_envvars = {
'CHECK',
'PICKLED_SETTINGS_TO_OVERRIDE',
'PROJECT',
'PYTHON_SHELL',
'SETTINGS_MODULE',
}
setting_envvars = {k for k in scrapy_envvars if k not in valid_envvars}
if setting_envvars:
setting_envvar_list = ', '.join(sorted(setting_envvars))
warnings.warn(
'Use of environment variables prefixed with SCRAPY_ to override '
'settings is deprecated. The following environment variables are '
f'currently defined: {setting_envvar_list}',
ScrapyDeprecationWarning
)
settings.setdict(scrapy_envvars, priority='project')
return settings
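
Typical use from a standalone script:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get('BOT_NAME'))  # the project's bot name, or the default outside a project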


@ -0,0 +1,10 @@
"""
Helpers using Python 3.6+ syntax (ignore SyntaxError on import).
"""
async def collect_asyncgen(result):
results = []
async for x in result:
results.append(x)
return results
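
A small illustration, assuming this helper is importable as scrapy.utils.py36.collect_asyncgen:

    import asyncio
    from scrapy.utils.py36 import collect_asyncgen

    async def numbers():
        yield 1
        yield 2

    print(asyncio.run(collect_asyncgen(numbers())))  # [1, 2]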


@ -0,0 +1,357 @@
"""
This module contains essential stuff that should've come with Python itself ;)
"""
import errno
import gc
import inspect
import re
import sys
import warnings
import weakref
from functools import partial, wraps
from itertools import chain
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.decorators import deprecated
def flatten(x):
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
from the sequence and all recursively contained sub-sequences
(iterables).
Examples:
>>> flatten([1, 2, [3,4], (5,6)])
[1, 2, 3, 4, 5, 6]
>>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
[1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
>>> flatten(["foo", "bar"])
['foo', 'bar']
>>> flatten(["foo", ["baz", 42], "bar"])
['foo', 'baz', 42, 'bar']
"""
return list(iflatten(x))
def iflatten(x):
"""iflatten(sequence) -> iterator
Similar to ``flatten()``, but returns an iterator instead"""
for el in x:
if is_listlike(el):
for el_ in iflatten(el):
yield el_
else:
yield el
def is_listlike(x):
"""
>>> is_listlike("foo")
False
>>> is_listlike(5)
False
>>> is_listlike(b"foo")
False
>>> is_listlike([b"foo"])
True
>>> is_listlike((b"foo",))
True
>>> is_listlike({})
True
>>> is_listlike(set())
True
>>> is_listlike((x for x in range(3)))
True
>>> is_listlike(range(5))
True
"""
return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
def unique(list_, key=lambda x: x):
"""efficient function to uniquify a list preserving item order"""
seen = set()
result = []
for item in list_:
seenkey = key(item)
if seenkey in seen:
continue
seen.add(seenkey)
result.append(item)
return result
def to_unicode(text, encoding=None, errors='strict'):
"""Return the unicode representation of a bytes object ``text``. If
``text`` is already an unicode object, return it as-is."""
if isinstance(text, str):
return text
if not isinstance(text, (bytes, str)):
raise TypeError('to_unicode must receive a bytes or str '
f'object, got {type(text).__name__}')
if encoding is None:
encoding = 'utf-8'
return text.decode(encoding, errors)
def to_bytes(text, encoding=None, errors='strict'):
"""Return the binary representation of ``text``. If ``text``
is already a bytes object, return it as-is."""
if isinstance(text, bytes):
return text
if not isinstance(text, str):
raise TypeError('to_bytes must receive a str or bytes '
f'object, got {type(text).__name__}')
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors)
@deprecated('to_unicode')
def to_native_str(text, encoding=None, errors='strict'):
""" Return str representation of ``text``. """
return to_unicode(text, encoding, errors)
def re_rsearch(pattern, text, chunk_size=1024):
"""
This function does a reverse search in a text using the regular expression
given in the 'pattern' argument.
Since the re module does not provide this functionality, the expression is
searched for in chunks of text extracted from the end (for the sake of efficiency).
At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and searched for
the pattern. If the pattern is not found, another chunk is extracted, and another
search is performed.
This process continues until a match is found, or until the whole file is read.
If the pattern is not found, None is returned; otherwise, a tuple is returned
containing the start and end positions of the match relative to the entire text.
"""
def _chunk_iter():
offset = len(text)
while True:
offset -= (chunk_size * 1024)
if offset <= 0:
break
yield (text[offset:], offset)
yield (text, 0)
if isinstance(pattern, str):
pattern = re.compile(pattern)
for chunk, offset in _chunk_iter():
matches = [match for match in pattern.finditer(chunk)]
if matches:
start, end = matches[-1].span()
return offset + start, offset + end
return None
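# Usage sketch (assuming this module is importable as scrapy.utils.python):
# re_rsearch() returns the span of the *last* match, searching backwards in
# chunks so large texts don't need to be scanned from the start.
from scrapy.utils.python import re_rsearch

text = "<item>1</item><item>2</item><item>3</item>"
span = re_rsearch(r"<item>\d+</item>", text)
assert span is not None
start, end = span
assert text[start:end] == "<item>3</item>"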
def memoizemethod_noargs(method):
"""Decorator to cache the result of a method (without arguments) using a
weak reference to its object
"""
cache = weakref.WeakKeyDictionary()
@wraps(method)
def new_method(self, *args, **kwargs):
if self not in cache:
cache[self] = method(self, *args, **kwargs)
return cache[self]
return new_method
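# Usage sketch (assuming this module is importable as scrapy.utils.python):
# the decorated method runs once per instance; later calls return the cached
# value, and the cache entry disappears with the instance (WeakKeyDictionary).
from scrapy.utils.python import memoizemethod_noargs

class Page:
    def __init__(self, body):
        self.body = body

    @memoizemethod_noargs
    def word_count(self):
        print("computing...")         # printed only once per Page instance
        return len(self.body.split())

page = Page("a small example body")
assert page.word_count() == 4         # computed
assert page.word_count() == 4         # served from the cache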
_BINARYCHARS = {to_bytes(chr(i)) for i in range(32)} - {b"\0", b"\t", b"\n", b"\r"}
_BINARYCHARS |= {ord(ch) for ch in _BINARYCHARS}
def binary_is_text(data):
""" Returns ``True`` if the given ``data`` argument (a ``bytes`` object)
does not contain unprintable control characters.
"""
if not isinstance(data, bytes):
raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
return all(c not in _BINARYCHARS for c in data)
def _getargspec_py23(func):
"""_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
defaults)
Behaves like inspect.getargspec() did in Python 2, but uses
inspect.getfullargspec() behind the scenes in Python 3 to avoid the
DeprecationWarning.
>>> def f(a, b=2, *ar, **kw):
... pass
>>> _getargspec_py23(f)
ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
"""
return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])
def get_func_args(func, stripself=False):
"""Return the argument name list of a callable"""
if inspect.isfunction(func):
spec = inspect.getfullargspec(func)
func_args = spec.args + spec.kwonlyargs
elif inspect.isclass(func):
return get_func_args(func.__init__, True)
elif inspect.ismethod(func):
return get_func_args(func.__func__, True)
elif inspect.ismethoddescriptor(func):
return []
elif isinstance(func, partial):
return [x for x in get_func_args(func.func)[len(func.args):]
if not (func.keywords and x in func.keywords)]
elif hasattr(func, '__call__'):
if inspect.isroutine(func):
return []
elif getattr(func, '__name__', None) == '__call__':
return []
else:
return get_func_args(func.__call__, True)
else:
raise TypeError(f'{type(func)} is not callable')
if stripself:
func_args.pop(0)
return func_args
def get_spec(func):
"""Returns (args, kwargs) tuple for a function
>>> import re
>>> get_spec(re.match)
(['pattern', 'string'], {'flags': 0})
>>> class Test:
... def __call__(self, val):
... pass
... def method(self, val, flags=0):
... pass
>>> get_spec(Test)
(['self', 'val'], {})
>>> get_spec(Test.method)
(['self', 'val'], {'flags': 0})
>>> get_spec(Test().method)
(['self', 'val'], {'flags': 0})
"""
if inspect.isfunction(func) or inspect.ismethod(func):
spec = _getargspec_py23(func)
elif hasattr(func, '__call__'):
spec = _getargspec_py23(func.__call__)
else:
raise TypeError(f'{type(func)} is not callable')
defaults = spec.defaults or []
firstdefault = len(spec.args) - len(defaults)
args = spec.args[:firstdefault]
kwargs = dict(zip(spec.args[firstdefault:], defaults))
return args, kwargs
def equal_attributes(obj1, obj2, attributes):
"""Compare two objects attributes"""
# not attributes given return False by default
if not attributes:
return False
temp1, temp2 = object(), object()
for attr in attributes:
# support callables like itemgetter
if callable(attr):
if attr(obj1) != attr(obj2):
return False
elif getattr(obj1, attr, temp1) != getattr(obj2, attr, temp2):
return False
# all attributes equal
return True
class WeakKeyCache:
def __init__(self, default_factory):
warnings.warn("The WeakKeyCache class is deprecated", category=ScrapyDeprecationWarning, stacklevel=2)
self.default_factory = default_factory
self._weakdict = weakref.WeakKeyDictionary()
def __getitem__(self, key):
if key not in self._weakdict:
self._weakdict[key] = self.default_factory(key)
return self._weakdict[key]
@deprecated
def retry_on_eintr(function, *args, **kw):
"""Run a function and retry it while getting EINTR errors"""
while True:
try:
return function(*args, **kw)
except IOError as e:
if e.errno != errno.EINTR:
raise
def without_none_values(iterable):
"""Return a copy of ``iterable`` with all ``None`` entries removed.
If ``iterable`` is a mapping, return a dictionary where all pairs that have
value ``None`` have been removed.
"""
try:
return {k: v for k, v in iterable.items() if v is not None}
except AttributeError:
return type(iterable)((v for v in iterable if v is not None))
def global_object_name(obj):
"""
Return full name of a global object.
>>> from scrapy import Request
>>> global_object_name(Request)
'scrapy.http.request.Request'
"""
return f"{obj.__module__}.{obj.__name__}"
if hasattr(sys, "pypy_version_info"):
def garbage_collect():
# Collecting weakreferences can take two collections on PyPy.
gc.collect()
gc.collect()
else:
def garbage_collect():
gc.collect()
class MutableChain:
"""
Thin wrapper around itertools.chain, allowing iterables to be added "in-place"
"""
def __init__(self, *args):
self.data = chain.from_iterable(args)
def extend(self, *iterables):
self.data = chain(self.data, chain.from_iterable(iterables))
def __iter__(self):
return self
def __next__(self):
return next(self.data)
@deprecated("scrapy.utils.python.MutableChain.__next__")
def next(self):
return self.__next__()
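# Usage sketch (assuming this module is importable as scrapy.utils.python):
# MutableChain behaves like itertools.chain but accepts more iterables after
# construction, so results can keep being appended while iterating.
from scrapy.utils.python import MutableChain

results = MutableChain([1, 2], (3,))
results.extend([4, 5], iter([6]))
assert list(results) == [1, 2, 3, 4, 5, 6]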

View file

@ -0,0 +1,90 @@
import asyncio
from contextlib import suppress
from twisted.internet import asyncioreactor, error
from scrapy.utils.misc import load_object
def listen_tcp(portrange, host, factory):
"""Like reactor.listenTCP but tries different ports in a range."""
from twisted.internet import reactor
if len(portrange) > 2:
raise ValueError(f"invalid portrange: {portrange}")
if not portrange:
return reactor.listenTCP(0, factory, interface=host)
if not hasattr(portrange, '__iter__'):
return reactor.listenTCP(portrange, factory, interface=host)
if len(portrange) == 1:
return reactor.listenTCP(portrange[0], factory, interface=host)
for x in range(portrange[0], portrange[1] + 1):
try:
return reactor.listenTCP(x, factory, interface=host)
except error.CannotListenError:
if x == portrange[1]:
raise
class CallLaterOnce:
"""Schedule a function to be called in the next reactor loop, but only if
it hasn't been already scheduled since the last time it ran.
"""
def __init__(self, func, *a, **kw):
self._func = func
self._a = a
self._kw = kw
self._call = None
def schedule(self, delay=0):
from twisted.internet import reactor
if self._call is None:
self._call = reactor.callLater(delay, self)
def cancel(self):
if self._call:
self._call.cancel()
def __call__(self):
self._call = None
return self._func(*self._a, **self._kw)
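# Usage sketch (assuming this module is importable as scrapy.utils.reactor):
# no matter how many times schedule() is called before the reactor spins,
# the wrapped function runs only once per scheduling round.
from twisted.internet import reactor

from scrapy.utils.reactor import CallLaterOnce

def work():
    print("ran once")

call = CallLaterOnce(work)
call.schedule()
call.schedule()          # coalesced with the first schedule()
call.schedule()
reactor.callLater(0.1, reactor.stop)
reactor.run()            # prints "ran once" a single time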
def install_reactor(reactor_path, event_loop_path=None):
"""Installs the :mod:`~twisted.internet.reactor` with the specified
import path. Also installs the asyncio event loop with the specified import
path if the asyncio reactor is enabled"""
reactor_class = load_object(reactor_path)
if reactor_class is asyncioreactor.AsyncioSelectorReactor:
with suppress(error.ReactorAlreadyInstalledError):
if event_loop_path is not None:
event_loop_class = load_object(event_loop_path)
event_loop = event_loop_class()
asyncio.set_event_loop(event_loop)
else:
event_loop = asyncio.get_event_loop()
asyncioreactor.install(eventloop=event_loop)
else:
*module, _ = reactor_path.split(".")
installer_path = module + ["install"]
installer = load_object(".".join(installer_path))
with suppress(error.ReactorAlreadyInstalledError):
installer()
def verify_installed_reactor(reactor_path):
"""Raises :exc:`Exception` if the installed
:mod:`~twisted.internet.reactor` does not match the specified import
path."""
from twisted.internet import reactor
reactor_class = load_object(reactor_path)
if not isinstance(reactor, reactor_class):
msg = ("The installed reactor "
f"({reactor.__module__}.{reactor.__class__.__name__}) does not "
f"match the requested one ({reactor_path})")
raise Exception(msg)
def is_asyncio_reactor_installed():
from twisted.internet import reactor
return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
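# Usage sketch (assuming this module is importable as scrapy.utils.reactor):
# installing the asyncio reactor must happen before twisted.internet.reactor
# is imported anywhere else in the process.
from scrapy.utils.reactor import (
    install_reactor,
    is_asyncio_reactor_installed,
    verify_installed_reactor,
)

reactor_path = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
install_reactor(reactor_path)
verify_installed_reactor(reactor_path)   # raises if a different reactor is installed
assert is_asyncio_reactor_installed()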

View file

@ -0,0 +1,95 @@
"""
Helper functions for serializing (and deserializing) requests.
"""
import inspect
from scrapy.http import Request
from scrapy.utils.python import to_unicode
from scrapy.utils.misc import load_object
def request_to_dict(request, spider=None):
"""Convert Request object to a dict.
If a spider is given, it will try to find out the name of the spider method
used in the callback and store that as the callback.
"""
cb = request.callback
if callable(cb):
cb = _find_method(spider, cb)
eb = request.errback
if callable(eb):
eb = _find_method(spider, eb)
d = {
'url': to_unicode(request.url), # urls should be safe (safe_string_url)
'callback': cb,
'errback': eb,
'method': request.method,
'headers': dict(request.headers),
'body': request.body,
'cookies': request.cookies,
'meta': request.meta,
'_encoding': request._encoding,
'priority': request.priority,
'dont_filter': request.dont_filter,
'flags': request.flags,
'cb_kwargs': request.cb_kwargs,
}
if type(request) is not Request:
d['_class'] = request.__module__ + '.' + request.__class__.__name__
return d
def request_from_dict(d, spider=None):
"""Create Request object from a dict.
If a spider is given, it will try to resolve the callbacks by looking at the
spider for methods with the same name.
"""
cb = d['callback']
if cb and spider:
cb = _get_method(spider, cb)
eb = d['errback']
if eb and spider:
eb = _get_method(spider, eb)
request_cls = load_object(d['_class']) if '_class' in d else Request
return request_cls(
url=to_unicode(d['url']),
callback=cb,
errback=eb,
method=d['method'],
headers=d['headers'],
body=d['body'],
cookies=d['cookies'],
meta=d['meta'],
encoding=d['_encoding'],
priority=d['priority'],
dont_filter=d['dont_filter'],
flags=d.get('flags'),
cb_kwargs=d.get('cb_kwargs'),
)
def _find_method(obj, func):
# Only instance methods contain ``__func__``
if obj and hasattr(func, '__func__'):
members = inspect.getmembers(obj, predicate=inspect.ismethod)
for name, obj_func in members:
# We need to use __func__ to access the original
# function object because instance method objects
# are generated each time attribute is retrieved from
# instance.
#
# Reference: The standard type hierarchy
# https://docs.python.org/3/reference/datamodel.html
if obj_func.__func__ is func.__func__:
return name
raise ValueError(f"Function {func} is not an instance method in: {obj}")
def _get_method(obj, name):
name = str(name)
try:
return getattr(obj, name)
except AttributeError:
raise ValueError(f"Method {name!r} not found in: {obj}")

View file

@ -0,0 +1,100 @@
"""
This module provides some useful functions for working with
scrapy.http.Request objects
"""
import hashlib
import weakref
from urllib.parse import urlunparse
from w3lib.http import basic_auth_header
from w3lib.url import canonicalize_url
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes, to_unicode
_fingerprint_cache = weakref.WeakKeyDictionary()
def request_fingerprint(request, include_headers=None, keep_fragments=False):
"""
Return the request fingerprint.
The request fingerprint is a hash that uniquely identifies the resource the
request points to. For example, take the following two urls:
http://www.example.com/query?id=111&cat=222
http://www.example.com/query?cat=222&id=111
Even though those are two different URLs, both point to the same resource
and are equivalent (i.e. they should return the same response).
Another example is cookies used to store session ids. Suppose the
following page is only accessible to authenticated users:
http://www.example.com/members/offers.html
Lots of sites use a cookie to store the session id, which adds a random
component to the HTTP Request and thus should be ignored when calculating
the fingerprint.
For this reason, request headers are ignored by default when calculating
the fingerprint. If you want to include specific headers use the
include_headers argument, which is a list of Request headers to include.
Also, servers usually ignore fragments in urls when handling requests,
so they are also ignored by default when calculating the fingerprint.
If you want to include them, set the keep_fragments argument to True
(for instance when handling requests with a headless browser).
"""
if include_headers:
include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
cache = _fingerprint_cache.setdefault(request, {})
cache_key = (include_headers, keep_fragments)
if cache_key not in cache:
fp = hashlib.sha1()
fp.update(to_bytes(request.method))
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
fp.update(request.body or b'')
if include_headers:
for hdr in include_headers:
if hdr in request.headers:
fp.update(hdr)
for v in request.headers.getlist(hdr):
fp.update(v)
cache[cache_key] = fp.hexdigest()
return cache[cache_key]
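# Usage sketch (assuming this module is importable as scrapy.utils.request):
# query-string order does not change the fingerprint, and headers only count
# when explicitly listed in include_headers.
from scrapy import Request

from scrapy.utils.request import request_fingerprint

r1 = Request("http://www.example.com/query?id=111&cat=222")
r2 = Request("http://www.example.com/query?cat=222&id=111")
assert request_fingerprint(r1) == request_fingerprint(r2)

r3 = Request("http://www.example.com/query?id=111&cat=222",
             headers={"X-Session": "abc"})
assert request_fingerprint(r1) == request_fingerprint(r3)
assert (request_fingerprint(r1, include_headers=["X-Session"])
        != request_fingerprint(r3, include_headers=["X-Session"]))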
def request_authenticate(request, username, password):
"""Autenticate the given request (in place) using the HTTP basic access
authentication mechanism (RFC 2617) and the given username and password
"""
request.headers['Authorization'] = basic_auth_header(username, password)
def request_httprepr(request):
"""Return the raw HTTP representation (as bytes) of the given request.
This is provided only for reference since it's not the actual stream of
bytes that will be sent when performing the request (that's controlled
by Twisted).
"""
parsed = urlparse_cached(request)
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
if request.headers:
s += request.headers.to_string() + b"\r\n"
s += b"\r\n"
s += request.body
return s
def referer_str(request):
""" Return Referer HTTP header suitable for logging. """
referrer = request.headers.get('Referer')
if referrer is None:
return referrer
return to_unicode(referrer, errors='replace')

View file

@ -0,0 +1,83 @@
"""
This module provides some useful functions for working with
scrapy.http.Response objects
"""
import os
import weakref
import webbrowser
import tempfile
from twisted.web import http
from scrapy.utils.python import to_bytes, to_unicode
from w3lib import html
_baseurl_cache = weakref.WeakKeyDictionary()
def get_base_url(response):
"""Return the base url of the given response, joined with the response url"""
if response not in _baseurl_cache:
text = response.text[0:4096]
_baseurl_cache[response] = html.get_base_url(text, response.url, response.encoding)
return _baseurl_cache[response]
_metaref_cache = weakref.WeakKeyDictionary()
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
"""Parse the http-equiv refrsh parameter from the given response"""
if response not in _metaref_cache:
text = response.text[0:4096]
_metaref_cache[response] = html.get_meta_refresh(
text, response.url, response.encoding, ignore_tags=ignore_tags)
return _metaref_cache[response]
def response_status_message(status):
"""Return status code plus status text descriptive message
"""
message = http.RESPONSES.get(int(status), "Unknown Status")
return f'{status} {to_unicode(message)}'
def response_httprepr(response):
"""Return raw HTTP representation (as bytes) of the given response. This
is provided only for reference, since it's not the exact stream of bytes
that was received (that's not exposed by Twisted).
"""
values = [
b"HTTP/1.1 ",
to_bytes(str(response.status)),
b" ",
to_bytes(http.RESPONSES.get(response.status, b'')),
b"\r\n",
]
if response.headers:
values.extend([response.headers.to_string(), b"\r\n"])
values.extend([b"\r\n", response.body])
return b"".join(values)
def open_in_browser(response, _openfunc=webbrowser.open):
"""Open the given response in a local web browser, populating the <base>
tag for external links to work
"""
from scrapy.http import HtmlResponse, TextResponse
# XXX: this implementation is a bit dirty and could be improved
body = response.body
if isinstance(response, HtmlResponse):
if b'<base' not in body:
repl = f'<head><base href="{response.url}">'
body = body.replace(b'<head>', to_bytes(repl))
ext = '.html'
elif isinstance(response, TextResponse):
ext = '.txt'
else:
raise TypeError("Unsupported response type: "
f"{response.__class__.__name__}")
fd, fname = tempfile.mkstemp(ext)
os.write(fd, body)
os.close(fd)
return _openfunc(f"file://{fname}")

View file

@ -0,0 +1,40 @@
import json
import datetime
import decimal
from itemadapter import is_item, ItemAdapter
from twisted.internet import defer
from scrapy.http import Request, Response
class ScrapyJSONEncoder(json.JSONEncoder):
DATE_FORMAT = "%Y-%m-%d"
TIME_FORMAT = "%H:%M:%S"
def default(self, o):
if isinstance(o, set):
return list(o)
elif isinstance(o, datetime.datetime):
return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
elif isinstance(o, datetime.date):
return o.strftime(self.DATE_FORMAT)
elif isinstance(o, datetime.time):
return o.strftime(self.TIME_FORMAT)
elif isinstance(o, decimal.Decimal):
return str(o)
elif isinstance(o, defer.Deferred):
return str(o)
elif is_item(o):
return ItemAdapter(o).asdict()
elif isinstance(o, Request):
return f"<{type(o).__name__} {o.method} {o.url}>"
elif isinstance(o, Response):
return f"<{type(o).__name__} {o.status} {o.url}>"
else:
return super().default(o)
class ScrapyJSONDecoder(json.JSONDecoder):
pass
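# Usage sketch (assuming this module is importable as scrapy.utils.serialize):
# the encoder handles the extra types listed above on top of the stdlib ones.
import datetime
import json
from decimal import Decimal

from scrapy.utils.serialize import ScrapyJSONEncoder

item = {
    "title": "example",
    "tags": {"a", "b"},                                   # sets become lists
    "price": Decimal("9.99"),                             # Decimals become strings
    "crawled": datetime.datetime(2022, 1, 2, 21, 50, 48),
}
print(json.dumps(item, cls=ScrapyJSONEncoder, sort_keys=True))
# e.g. {"crawled": "2022-01-02 21:50:48", "price": "9.99", "tags": ["a", "b"], ...}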

View file

@ -0,0 +1,80 @@
"""Helper functions for working with signals"""
import logging
from twisted.internet.defer import DeferredList, Deferred
from twisted.python.failure import Failure
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
from pydispatch.robustapply import robustApply
from scrapy.exceptions import StopDownload
from scrapy.utils.defer import maybeDeferred_coro
from scrapy.utils.log import failure_to_exc_info
logger = logging.getLogger(__name__)
class _IgnoredException(Exception):
pass
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
"""Like pydispatcher.robust.sendRobust but it also logs errors and returns
Failures instead of exceptions.
"""
dont_log = (named.pop('dont_log', _IgnoredException), StopDownload)
spider = named.get('spider', None)
responses = []
for receiver in liveReceivers(getAllReceivers(sender, signal)):
try:
response = robustApply(receiver, signal=signal, sender=sender, *arguments, **named)
if isinstance(response, Deferred):
logger.error("Cannot return deferreds from signal handler: %(receiver)s",
{'receiver': receiver}, extra={'spider': spider})
except dont_log:
result = Failure()
except Exception:
result = Failure()
logger.error("Error caught on signal handler: %(receiver)s",
{'receiver': receiver},
exc_info=True, extra={'spider': spider})
else:
result = response
responses.append((receiver, result))
return responses
def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named):
"""Like send_catch_log but supports returning deferreds on signal handlers.
Returns a deferred that gets fired once all signal handlers deferreds were
fired.
"""
def logerror(failure, recv):
if dont_log is None or not isinstance(failure.value, dont_log):
logger.error("Error caught on signal handler: %(receiver)s",
{'receiver': recv},
exc_info=failure_to_exc_info(failure),
extra={'spider': spider})
return failure
dont_log = named.pop('dont_log', None)
spider = named.get('spider', None)
dfds = []
for receiver in liveReceivers(getAllReceivers(sender, signal)):
d = maybeDeferred_coro(robustApply, receiver, signal=signal, sender=sender, *arguments, **named)
d.addErrback(logerror, receiver)
d.addBoth(lambda result: (receiver, result))
dfds.append(d)
d = DeferredList(dfds)
d.addCallback(lambda out: [x[1] for x in out])
return d
def disconnect_all(signal=Any, sender=Any):
"""Disconnect all signal handlers. Useful for cleaning up after running
tests
"""
for receiver in liveReceivers(getAllReceivers(sender, signal)):
disconnect(receiver, signal=signal, sender=sender)
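# Usage sketch (assuming this module is importable as scrapy.utils.signal):
# any hashable object can serve as a signal; failing receivers are logged and
# returned as Failure instances instead of raising.
from pydispatch import dispatcher

from scrapy.utils.signal import send_catch_log

item_scraped = object()          # hypothetical ad-hoc signal

def ok_handler(item, **kwargs):
    return f"handled {item!r}"

def broken_handler(item, **kwargs):
    raise ValueError("boom")     # logged, returned as a Failure

dispatcher.connect(ok_handler, signal=item_scraped)
dispatcher.connect(broken_handler, signal=item_scraped)

for receiver, result in send_catch_log(signal=item_scraped, item={"name": "x"}):
    print(receiver.__name__, result)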

View file

@ -0,0 +1,47 @@
"""
Module for processing Sitemaps.
Note: The main purpose of this module is to provide support for the
SitemapSpider; its API is subject to change without notice.
"""
from urllib.parse import urljoin
import lxml.etree
class Sitemap:
"""Class to parse Sitemap (type=urlset) and Sitemap Index
(type=sitemapindex) files"""
def __init__(self, xmltext):
xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
rt = self._root.tag
self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt
def __iter__(self):
for elem in self._root.getchildren():
d = {}
for el in elem.getchildren():
tag = el.tag
name = tag.split('}', 1)[1] if '}' in tag else tag
if name == 'link':
if 'href' in el.attrib:
d.setdefault('alternate', []).append(el.get('href'))
else:
d[name] = el.text.strip() if el.text else ''
if 'loc' in d:
yield d
def sitemap_urls_from_robots(robots_text, base_url=None):
"""Return an iterator over all sitemap urls contained in the given
robots.txt file
"""
for line in robots_text.splitlines():
if line.lstrip().lower().startswith('sitemap:'):
url = line.split(':', 1)[1].strip()
yield urljoin(base_url, url)
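# Usage sketch: parsing a minimal urlset document and a robots.txt line
# (assuming this module is importable as scrapy.utils.sitemap).
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots

xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.com/</loc><lastmod>2022-01-02</lastmod></url>
  <url><loc>http://example.com/about</loc></url>
</urlset>"""

sitemap = Sitemap(xml)
assert sitemap.type == "urlset"
for entry in sitemap:
    print(entry)   # e.g. {'loc': 'http://example.com/', 'lastmod': '2022-01-02'}

robots = "User-agent: *\nSitemap: http://example.com/sitemap.xml"
assert list(sitemap_urls_from_robots(robots, base_url="http://example.com")) == [
    "http://example.com/sitemap.xml"]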

View file

@ -0,0 +1,74 @@
import inspect
import logging
from scrapy.spiders import Spider
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.misc import arg_to_iter
try:
from scrapy.utils.py36 import collect_asyncgen
except SyntaxError:
collect_asyncgen = None
logger = logging.getLogger(__name__)
def iterate_spider_output(result):
if collect_asyncgen and hasattr(inspect, 'isasyncgen') and inspect.isasyncgen(result):
d = deferred_from_coro(collect_asyncgen(result))
d.addCallback(iterate_spider_output)
return d
elif inspect.iscoroutine(result):
d = deferred_from_coro(result)
d.addCallback(iterate_spider_output)
return d
return arg_to_iter(result)
def iter_spider_classes(module):
"""Return an iterator over all spider classes defined in the given module
that can be instantiated (i.e. which have name)
"""
# this needs to be imported here until we get rid of the spider manager
# singleton in scrapy.spider.spiders
from scrapy.spiders import Spider
for obj in vars(module).values():
if (
inspect.isclass(obj)
and issubclass(obj, Spider)
and obj.__module__ == module.__name__
and getattr(obj, 'name', None)
):
yield obj
def spidercls_for_request(spider_loader, request, default_spidercls=None,
log_none=False, log_multiple=False):
"""Return a spider class that handles the given Request.
This will look for the spiders that can handle the given request (using
the spider loader) and return a Spider class if (and only if) there is
only one Spider able to handle the Request.
If multiple spiders (or no spider) are found, it will return the
default_spidercls passed. It can optionally log if multiple or no spiders
are found.
"""
snames = spider_loader.find_by_request(request)
if len(snames) == 1:
return spider_loader.load(snames[0])
if len(snames) > 1 and log_multiple:
logger.error('More than one spider can handle: %(request)s - %(snames)s',
{'request': request, 'snames': ', '.join(snames)})
if len(snames) == 0 and log_none:
logger.error('Unable to find spider that handles: %(request)s',
{'request': request})
return default_spidercls
class DefaultSpider(Spider):
name = 'default'

View file

@ -0,0 +1,61 @@
import OpenSSL
import OpenSSL._util as pyOpenSSLutil
from scrapy.utils.python import to_unicode
# The OpenSSL symbol has been present since OpenSSL 1.1.1, but it is not currently exposed by any version of pyOpenSSL.
# Using the binding directly, as this code does, requires cryptography 2.4.
SSL_OP_NO_TLSv1_3 = getattr(pyOpenSSLutil.lib, 'SSL_OP_NO_TLSv1_3', 0)
def ffi_buf_to_string(buf):
return to_unicode(pyOpenSSLutil.ffi.string(buf))
def x509name_to_string(x509name):
# from OpenSSL.crypto.X509Name.__repr__
result_buffer = pyOpenSSLutil.ffi.new("char[]", 512)
pyOpenSSLutil.lib.X509_NAME_oneline(x509name._name, result_buffer, len(result_buffer))
return ffi_buf_to_string(result_buffer)
def get_temp_key_info(ssl_object):
if not hasattr(pyOpenSSLutil.lib, 'SSL_get_server_tmp_key'): # requires OpenSSL 1.0.2
return None
# adapted from OpenSSL apps/s_cb.c::ssl_print_tmp_key()
temp_key_p = pyOpenSSLutil.ffi.new("EVP_PKEY **")
if not pyOpenSSLutil.lib.SSL_get_server_tmp_key(ssl_object, temp_key_p):
return None
temp_key = temp_key_p[0]
if temp_key == pyOpenSSLutil.ffi.NULL:
return None
temp_key = pyOpenSSLutil.ffi.gc(temp_key, pyOpenSSLutil.lib.EVP_PKEY_free)
key_info = []
key_type = pyOpenSSLutil.lib.EVP_PKEY_id(temp_key)
if key_type == pyOpenSSLutil.lib.EVP_PKEY_RSA:
key_info.append('RSA')
elif key_type == pyOpenSSLutil.lib.EVP_PKEY_DH:
key_info.append('DH')
elif key_type == pyOpenSSLutil.lib.EVP_PKEY_EC:
key_info.append('ECDH')
ec_key = pyOpenSSLutil.lib.EVP_PKEY_get1_EC_KEY(temp_key)
ec_key = pyOpenSSLutil.ffi.gc(ec_key, pyOpenSSLutil.lib.EC_KEY_free)
nid = pyOpenSSLutil.lib.EC_GROUP_get_curve_name(pyOpenSSLutil.lib.EC_KEY_get0_group(ec_key))
cname = pyOpenSSLutil.lib.EC_curve_nid2nist(nid)
if cname == pyOpenSSLutil.ffi.NULL:
cname = pyOpenSSLutil.lib.OBJ_nid2sn(nid)
key_info.append(ffi_buf_to_string(cname))
else:
key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
key_info.append(f'{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits')
return ', '.join(key_info)
def get_openssl_version():
system_openssl = OpenSSL.SSL.SSLeay_version(
OpenSSL.SSL.SSLEAY_VERSION
).decode('ascii', errors='replace')
return f'{OpenSSL.version.__version__} ({system_openssl})'

View file

@ -0,0 +1,36 @@
"""Helper functions for working with templates"""
import os
import re
import string
def render_templatefile(path, **kwargs):
with open(path, 'rb') as fp:
raw = fp.read().decode('utf8')
content = string.Template(raw).substitute(**kwargs)
render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
if path.endswith('.tmpl'):
os.rename(path, render_path)
with open(render_path, 'wb') as fp:
fp.write(content.encode('utf8'))
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
def string_camelcase(string):
""" Convert a word to its CamelCase version and remove invalid chars
>>> string_camelcase('lost-pound')
'LostPound'
>>> string_camelcase('missing_images')
'MissingImages'
"""
return CAMELCASE_INVALID_CHARS.sub('', string.title())
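# Usage sketch (assuming this module is importable as scrapy.utils.template):
# render_templatefile() substitutes $-style placeholders and drops the .tmpl
# suffix, renaming the template file in place.
import os
import tempfile

from scrapy.utils.template import render_templatefile, string_camelcase

with tempfile.TemporaryDirectory() as tmpdir:
    template_path = os.path.join(tmpdir, "spider.py.tmpl")
    with open(template_path, "w", encoding="utf8") as fp:
        fp.write("class ${classname}(Spider):\n    name = '${name}'\n")

    render_templatefile(template_path,
                        classname=string_camelcase("quotes-spider"),  # QuotesSpider
                        name="quotes-spider")

    with open(os.path.join(tmpdir, "spider.py"), encoding="utf8") as fp:
        print(fp.read())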

View file

@ -0,0 +1,112 @@
"""
This module contains some assorted functions used in tests
"""
import asyncio
import os
from posixpath import split
from unittest import mock
from importlib import import_module
from twisted.trial.unittest import SkipTest
from scrapy.utils.boto import is_botocore_available
def assert_gcs_environ():
if 'GCS_PROJECT_ID' not in os.environ:
raise SkipTest("GCS_PROJECT_ID not found")
def skip_if_no_boto():
if not is_botocore_available():
raise SkipTest('missing botocore library')
def get_gcs_content_and_delete(bucket, path):
from google.cloud import storage
client = storage.Client(project=os.environ.get('GCS_PROJECT_ID'))
bucket = client.get_bucket(bucket)
blob = bucket.get_blob(path)
content = blob.download_as_string()
acl = list(blob.acl)  # load the ACL before the blob is deleted
bucket.delete_blob(path)
return content, acl, blob
def get_ftp_content_and_delete(
path, host, port, username,
password, use_active_mode=False):
from ftplib import FTP
ftp = FTP()
ftp.connect(host, port)
ftp.login(username, password)
if use_active_mode:
ftp.set_pasv(False)
ftp_data = []
def buffer_data(data):
ftp_data.append(data)
ftp.retrbinary(f'RETR {path}', buffer_data)
dirname, filename = split(path)
ftp.cwd(dirname)
ftp.delete(filename)
return "".join(ftp_data)
def get_crawler(spidercls=None, settings_dict=None):
"""Return an unconfigured Crawler object. If settings_dict is given, it
will be used to populate the crawler settings with a project level
priority.
"""
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import Spider
runner = CrawlerRunner(settings_dict)
return runner.create_crawler(spidercls or Spider)
def get_pythonpath():
"""Return a PYTHONPATH suitable to use in processes so that they find this
installation of Scrapy"""
scrapy_path = import_module('scrapy').__path__[0]
return os.path.dirname(scrapy_path) + os.pathsep + os.environ.get('PYTHONPATH', '')
def get_testenv():
"""Return a OS environment dict suitable to fork processes that need to import
this installation of Scrapy, instead of a system installed one.
"""
env = os.environ.copy()
env['PYTHONPATH'] = get_pythonpath()
return env
def assert_samelines(testcase, text1, text2, msg=None):
"""Asserts text1 and text2 have the same lines, ignoring differences in
line endings between platforms
"""
testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)
def get_from_asyncio_queue(value):
q = asyncio.Queue()
getter = q.get()
q.put_nowait(value)
return getter
def mock_google_cloud_storage():
"""Creates autospec mocks for google-cloud-storage Client, Bucket and Blob
classes and set their proper return values.
"""
from google.cloud.storage import Client, Bucket, Blob
client_mock = mock.create_autospec(Client)
bucket_mock = mock.create_autospec(Bucket)
client_mock.get_bucket.return_value = bucket_mock
blob_mock = mock.create_autospec(Blob)
bucket_mock.blob.return_value = blob_mock
return (client_mock, bucket_mock, blob_mock)

View file

@ -0,0 +1,50 @@
import sys
import os
from twisted.internet import defer, protocol
class ProcessTest:
command = None
prefix = [sys.executable, '-m', 'scrapy.cmdline']
cwd = os.getcwd() # trial chdirs to temp dir
def execute(self, args, check_code=True, settings=None):
from twisted.internet import reactor
env = os.environ.copy()
if settings is not None:
env['SCRAPY_SETTINGS_MODULE'] = settings
cmd = self.prefix + [self.command] + list(args)
pp = TestProcessProtocol()
pp.deferred.addBoth(self._process_finished, cmd, check_code)
reactor.spawnProcess(pp, cmd[0], cmd, env=env, path=self.cwd)
return pp.deferred
def _process_finished(self, pp, cmd, check_code):
if pp.exitcode and check_code:
msg = f"process {cmd} exit with code {pp.exitcode}"
msg += f"\n>>> stdout <<<\n{pp.out}"
msg += "\n"
msg += f"\n>>> stderr <<<\n{pp.err}"
raise RuntimeError(msg)
return pp.exitcode, pp.out, pp.err
class TestProcessProtocol(protocol.ProcessProtocol):
def __init__(self):
self.deferred = defer.Deferred()
self.out = b''
self.err = b''
self.exitcode = None
def outReceived(self, data):
self.out += data
def errReceived(self, data):
self.err += data
def processEnded(self, status):
self.exitcode = status.value.exitCode
self.deferred.callback(self)

View file

@ -0,0 +1,44 @@
from urllib.parse import urljoin
from twisted.web import server, resource, static, util
class SiteTest:
def setUp(self):
from twisted.internet import reactor
super().setUp()
self.site = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
self.baseurl = f"http://localhost:{self.site.getHost().port}/"
def tearDown(self):
super().tearDown()
self.site.stopListening()
def url(self, path):
return urljoin(self.baseurl, path)
class NoMetaRefreshRedirect(util.Redirect):
def render(self, request):
content = util.Redirect.render(self, request)
return content.replace(b'http-equiv=\"refresh\"',
b'http-no-equiv=\"do-not-refresh-me\"')
def test_site():
r = resource.Resource()
r.putChild(b"text", static.Data(b"Works", "text/plain"))
r.putChild(b"html", static.Data(b"<body><p class='one'>Works</p><p class='two'>World</p></body>", "text/html"))
r.putChild(b"enc-gb18030", static.Data(b"<p>gb18030 encoding</p>", "text/html; charset=gb18030"))
r.putChild(b"redirect", util.Redirect(b"/redirected"))
r.putChild(b"redirect-no-meta-refresh", NoMetaRefreshRedirect(b"/redirected"))
r.putChild(b"redirected", static.Data(b"Redirected here", "text/plain"))
return server.Site(r)
if __name__ == '__main__':
from twisted.internet import reactor
port = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
print(f"http://localhost:{port.getHost().port}/")
reactor.run()

View file

@ -0,0 +1,66 @@
"""This module provides some functions and classes to record and report
references to live object instances.
If you want live objects for a particular class to be tracked, you only have to
subclass from object_ref (instead of object).
About performance: This library has a minimal performance impact when enabled,
and no performance penalty at all when disabled (as object_ref becomes just an
alias to object in that case).
"""
import weakref
from time import time
from operator import itemgetter
from collections import defaultdict
NoneType = type(None)
live_refs = defaultdict(weakref.WeakKeyDictionary)
class object_ref:
"""Inherit from this class to a keep a record of live instances"""
__slots__ = ()
def __new__(cls, *args, **kwargs):
obj = object.__new__(cls)
live_refs[cls][obj] = time()
return obj
def format_live_refs(ignore=NoneType):
"""Return a tabular representation of tracked objects"""
s = "Live References\n\n"
now = time()
for cls, wdict in sorted(live_refs.items(),
key=lambda x: x[0].__name__):
if not wdict:
continue
if issubclass(cls, ignore):
continue
oldest = min(wdict.values())
s += f"{cls.__name__:<30} {len(wdict):6} oldest: {int(now - oldest)}s ago\n"
return s
def print_live_refs(*a, **kw):
"""Print tracked objects"""
print(format_live_refs(*a, **kw))
def get_oldest(class_name):
"""Get the oldest object for a specific class name"""
for cls, wdict in live_refs.items():
if cls.__name__ == class_name:
if not wdict:
break
return min(wdict.items(), key=itemgetter(1))[0]
def iter_all(class_name):
"""Iterate over all objects of the same class by its class name"""
for cls, wdict in live_refs.items():
if cls.__name__ == class_name:
return wdict.keys()
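# Usage sketch (assuming this module is importable as scrapy.utils.trackref):
# subclassing object_ref is enough for instances to show up in the report.
from scrapy.utils.trackref import get_oldest, iter_all, object_ref, print_live_refs

class Record(object_ref):
    def __init__(self, value):
        self.value = value

records = [Record(i) for i in range(3)]

print_live_refs()                          # e.g. "Record    3   oldest: 0s ago"
assert get_oldest("Record") is records[0]  # first instance created
assert len(list(iter_all("Record"))) == 3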

View file

@ -0,0 +1,164 @@
"""
This module contains general purpose URL functions not found in the standard
library.
Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
"""
import posixpath
import re
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
# scrapy.utils.url was moved to w3lib.url and import * ensures this
# move doesn't break old code
from w3lib.url import *
from w3lib.url import _safe_chars, _unquotepath # noqa: F401
from scrapy.utils.python import to_unicode
def url_is_from_any_domain(url, domains):
"""Return True if the url belongs to any of the given domains"""
host = parse_url(url).netloc.lower()
if not host:
return False
domains = [d.lower() for d in domains]
return any((host == d) or (host.endswith(f'.{d}')) for d in domains)
def url_is_from_spider(url, spider):
"""Return True if the url belongs to the given spider"""
return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))
def url_has_any_extension(url, extensions):
return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
def parse_url(url, encoding=None):
"""Return urlparsed url from the given argument (which could be an already
parsed url)
"""
if isinstance(url, ParseResult):
return url
return urlparse(to_unicode(url, encoding))
def escape_ajax(url):
"""
Return the crawlable URL according to:
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
>>> escape_ajax("www.example.com/ajax.html#!key=value")
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html?#!key=value")
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
>>> escape_ajax("www.example.com/ajax.html#!")
'www.example.com/ajax.html?_escaped_fragment_='
URLs that are not "AJAX crawlable" (according to Google) are returned as-is:
>>> escape_ajax("www.example.com/ajax.html#key=value")
'www.example.com/ajax.html#key=value'
>>> escape_ajax("www.example.com/ajax.html#")
'www.example.com/ajax.html#'
>>> escape_ajax("www.example.com/ajax.html")
'www.example.com/ajax.html'
"""
defrag, frag = urldefrag(url)
if not frag.startswith('!'):
return url
return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
def add_http_if_no_scheme(url):
"""Add http as the default scheme if it is missing from the url."""
match = re.match(r"^\w+://", url, flags=re.I)
if not match:
parts = urlparse(url)
scheme = "http:" if parts.netloc else "http://"
url = scheme + url
return url
def _is_posix_path(string):
return bool(
re.match(
r'''
^ # start with...
(
\. # ...a single dot,
(
\. | [^/\.]+ # optionally followed by
)? # either a second dot or some characters
|
~ # $HOME
)? # optional match of ".", ".." or ".blabla"
/ # at least one "/" for a file path,
. # and something after the "/"
''',
string,
flags=re.VERBOSE,
)
)
def _is_windows_path(string):
return bool(
re.match(
r'''
^
(
[a-z]:\\
| \\\\
)
''',
string,
flags=re.IGNORECASE | re.VERBOSE,
)
)
def _is_filesystem_path(string):
return _is_posix_path(string) or _is_windows_path(string)
def guess_scheme(url):
"""Add an URL scheme if missing: file:// for filepath-like input or
http:// otherwise."""
if _is_filesystem_path(url):
return any_to_uri(url)
return add_http_if_no_scheme(url)
def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):
"""Strip URL string from some of its components:
- ``strip_credentials`` removes "user:password@"
- ``strip_default_port`` removes ":80" (resp. ":443", ":21")
from http:// (resp. https://, ftp://) URLs
- ``origin_only`` replaces path component with "/", also dropping
query and fragment components ; it also strips credentials
- ``strip_fragment`` drops any #fragment component
"""
parsed_url = urlparse(url)
netloc = parsed_url.netloc
if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
netloc = netloc.split('@')[-1]
if strip_default_port and parsed_url.port:
if (parsed_url.scheme, parsed_url.port) in (('http', 80),
('https', 443),
('ftp', 21)):
netloc = netloc.replace(f':{parsed_url.port}', '')
return urlunparse((
parsed_url.scheme,
netloc,
'/' if origin_only else parsed_url.path,
'' if origin_only else parsed_url.params,
'' if origin_only else parsed_url.query,
'' if strip_fragment else parsed_url.fragment
))
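# Usage sketch (assuming this module is importable as scrapy.utils.url):
from scrapy.utils.url import (
    add_http_if_no_scheme, guess_scheme, strip_url, url_is_from_any_domain)

assert url_is_from_any_domain("http://sub.example.com/page", ["example.com"])
assert add_http_if_no_scheme("www.example.com/path") == "http://www.example.com/path"
assert guess_scheme("./downloads/file.html").startswith("file://")
assert strip_url(
    "http://user:pass@www.example.com:80/index.html?q=1#top"
) == "http://www.example.com/index.html?q=1"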

View file

@ -0,0 +1,31 @@
import platform
import sys
import cryptography
import cssselect
import lxml.etree
import parsel
import twisted
import w3lib
import scrapy
from scrapy.utils.ssl import get_openssl_version
def scrapy_components_versions():
lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION))
libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION))
return [
("Scrapy", scrapy.__version__),
("lxml", lxml_version),
("libxml2", libxml2_version),
("cssselect", cssselect.__version__),
("parsel", parsel.__version__),
("w3lib", w3lib.__version__),
("Twisted", twisted.version.short()),
("Python", sys.version.replace("\n", "- ")),
("pyOpenSSL", get_openssl_version()),
("cryptography", cryptography.__version__),
("Platform", platform.platform()),
]