Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d

2399 changed files with 843193 additions and 43 deletions

44  venv/lib/python3.9/site-packages/scrapy/utils/benchserver.py  Normal file
@@ -0,0 +1,44 @@
import random
from urllib.parse import urlencode

from twisted.web.server import Site
from twisted.web.resource import Resource


class Root(Resource):

    isLeaf = True

    def getChild(self, name, request):
        return self

    def render(self, request):
        total = _getarg(request, b'total', 100, int)
        show = _getarg(request, b'show', 10, int)
        nlist = [random.randint(1, total) for _ in range(show)]
        request.write(b"<html><head></head><body>")
        args = request.args.copy()
        for nl in nlist:
            args['n'] = nl
            argstr = urlencode(args, doseq=True)
            request.write(f"<a href='/follow?{argstr}'>follow {nl}</a><br>"
                          .encode('utf8'))
        request.write(b"</body></html>")
        return b''


def _getarg(request, name, default=None, type=str):
    return type(request.args[name][0]) if name in request.args else default


if __name__ == '__main__':
    from twisted.internet import reactor
    root = Root()
    factory = Site(root)
    httpPort = reactor.listenTCP(8998, Site(root))

    def _print_listening():
        httpHost = httpPort.getHost()
        print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
    reactor.callWhenRunning(_print_listening)
    reactor.run()

32  venv/lib/python3.9/site-packages/scrapy/utils/boto.py  Normal file
@@ -0,0 +1,32 @@
"""Boto/botocore helpers"""
import warnings

from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning


def is_botocore():
    """ Returns True if botocore is available, otherwise raises NotConfigured. Never returns False.

    Previously, when boto was supported in addition to botocore, this returned False if boto was available
    but botocore wasn't.
    """
    message = (
        'is_botocore() is deprecated and always returns True or raises an Exception, '
        'so it cannot be used for checking if boto is available instead of botocore. '
        'You can use scrapy.utils.boto.is_botocore_available() to check if botocore '
        'is available.'
    )
    warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
    try:
        import botocore  # noqa: F401
        return True
    except ImportError:
        raise NotConfigured('missing botocore library')


def is_botocore_available():
    try:
        import botocore  # noqa: F401
        return True
    except ImportError:
        return False
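
A quick sketch of how the non-deprecated helper can be used from calling code (assumes a Scrapy environment; whether botocore is installed depends on the setup):

from scrapy.utils.boto import is_botocore_available

# Prefer this helper over the deprecated is_botocore(): it reports
# availability as a plain boolean instead of raising NotConfigured.
if is_botocore_available():
    print("botocore is installed; s3:// storage backends can work")
else:
    print("botocore is missing; install it to use s3:// URIs")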

195  venv/lib/python3.9/site-packages/scrapy/utils/conf.py  Normal file
@@ -0,0 +1,195 @@
import numbers
import os
import sys
import warnings
from configparser import ConfigParser
from operator import itemgetter

from scrapy.exceptions import ScrapyDeprecationWarning, UsageError

from scrapy.settings import BaseSettings
from scrapy.utils.deprecate import update_classpath
from scrapy.utils.python import without_none_values


def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary."""

    def _check_components(complist):
        if len({convert(c) for c in complist}) != len(complist):
            raise ValueError(f'Some paths in {complist!r} convert to the same object, '
                             'please update your settings')

    def _map_keys(compdict):
        if isinstance(compdict, BaseSettings):
            compbs = BaseSettings()
            for k, v in compdict.items():
                prio = compdict.getpriority(k)
                if compbs.getpriority(convert(k)) == prio:
                    raise ValueError(f'Some paths in {list(compdict.keys())!r} '
                                     'convert to the same '
                                     'object, please update your settings'
                                     )
                else:
                    compbs.set(convert(k), v, priority=prio)
            return compbs
        else:
            _check_components(compdict)
            return {convert(k): v for k, v in compdict.items()}

    def _validate_values(compdict):
        """Fail if a value in the components dict is not a real number or None."""
        for name, value in compdict.items():
            if value is not None and not isinstance(value, numbers.Real):
                raise ValueError(f'Invalid value {value} for component {name}, '
                                 'please provide a real number or None instead')

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(c) for c in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _validate_values(compdict)
    compdict = without_none_values(_map_keys(compdict))
    return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]


def arglist_to_dict(arglist):
    """Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
    dict
    """
    return dict(x.split('=', 1) for x in arglist)


def closest_scrapy_cfg(path='.', prevpath=None):
    """Return the path to the closest scrapy.cfg file by traversing the current
    directory and its parents
    """
    if path == prevpath:
        return ''
    path = os.path.abspath(path)
    cfgfile = os.path.join(path, 'scrapy.cfg')
    if os.path.exists(cfgfile):
        return cfgfile
    return closest_scrapy_cfg(os.path.dirname(path), path)


def init_env(project='default', set_syspath=True):
    """Initialize environment to use command-line tool from inside a project
    dir. This sets the Scrapy settings module and modifies the Python path to
    be able to locate the project module.
    """
    cfg = get_config()
    if cfg.has_option('settings', project):
        os.environ['SCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
    closest = closest_scrapy_cfg()
    if closest:
        projdir = os.path.dirname(closest)
        if set_syspath and projdir not in sys.path:
            sys.path.append(projdir)


def get_config(use_closest=True):
    """Get Scrapy config file as a ConfigParser"""
    sources = get_sources(use_closest)
    cfg = ConfigParser()
    cfg.read(sources)
    return cfg


def get_sources(use_closest=True):
    xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
    sources = [
        '/etc/scrapy.cfg',
        r'c:\scrapy\scrapy.cfg',
        xdg_config_home + '/scrapy.cfg',
        os.path.expanduser('~/.scrapy.cfg'),
    ]
    if use_closest:
        sources.append(closest_scrapy_cfg())
    return sources


def feed_complete_default_values_from_settings(feed, settings):
    out = feed.copy()
    out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT'))
    out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
    out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None)
    out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
    out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
    out.setdefault("item_export_kwargs", dict())
    if settings["FEED_EXPORT_INDENT"] is None:
        out.setdefault("indent", None)
    else:
        out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
    return out


def feed_process_params_from_cli(settings, output, output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')
    ).keys()

    def check_valid_format(output_format):
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension."
            )

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output"
            )
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.',
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified'
            )

    result = {}
    for element in output:
        try:
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))

    return result
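
An illustrative sketch of the two small helpers above; the component paths are made up for the example:

from scrapy.utils.conf import arglist_to_dict, build_component_list

# 'key=value' strings as passed with -a on the command line
print(arglist_to_dict(['category=books', 'depth=2']))
# -> {'category': 'books', 'depth': '2'}

# A {class path: order} dict; entries with order None are dropped and the
# rest come back sorted by their numeric order (paths here are hypothetical).
components = {
    'myproject.middlewares.RotateUserAgent': 100,
    'myproject.middlewares.RetryLess': 200,
    'myproject.middlewares.Disabled': None,
}
print(build_component_list(components))
# -> ['myproject.middlewares.RotateUserAgent', 'myproject.middlewares.RetryLess']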

104  venv/lib/python3.9/site-packages/scrapy/utils/console.py  Normal file
@@ -0,0 +1,104 @@
from functools import wraps
from collections import OrderedDict


def _embed_ipython_shell(namespace={}, banner=''):
    """Start an IPython Shell"""
    try:
        from IPython.terminal.embed import InteractiveShellEmbed
        from IPython.terminal.ipapp import load_default_config
    except ImportError:
        from IPython.frontend.terminal.embed import InteractiveShellEmbed
        from IPython.frontend.terminal.ipapp import load_default_config

    @wraps(_embed_ipython_shell)
    def wrapper(namespace=namespace, banner=''):
        config = load_default_config()
        # Always use .instance() to ensure _instance propagation to all parents
        # this is needed so <TAB> completion works well for new imports
        # and clear the instance to always have the fresh env
        # on repeated breaks like with inspect_response()
        InteractiveShellEmbed.clear_instance()
        shell = InteractiveShellEmbed.instance(
            banner1=banner, user_ns=namespace, config=config)
        shell()
    return wrapper


def _embed_bpython_shell(namespace={}, banner=''):
    """Start a bpython shell"""
    import bpython

    @wraps(_embed_bpython_shell)
    def wrapper(namespace=namespace, banner=''):
        bpython.embed(locals_=namespace, banner=banner)
    return wrapper


def _embed_ptpython_shell(namespace={}, banner=''):
    """Start a ptpython shell"""
    import ptpython.repl

    @wraps(_embed_ptpython_shell)
    def wrapper(namespace=namespace, banner=''):
        print(banner)
        ptpython.repl.embed(locals=namespace)
    return wrapper


def _embed_standard_shell(namespace={}, banner=''):
    """Start a standard python shell"""
    import code
    try:  # readline module is only available on unix systems
        import readline
    except ImportError:
        pass
    else:
        import rlcompleter  # noqa: F401
        readline.parse_and_bind("tab:complete")

    @wraps(_embed_standard_shell)
    def wrapper(namespace=namespace, banner=''):
        code.interact(banner=banner, local=namespace)
    return wrapper


DEFAULT_PYTHON_SHELLS = OrderedDict([
    ('ptpython', _embed_ptpython_shell),
    ('ipython', _embed_ipython_shell),
    ('bpython', _embed_bpython_shell),
    ('python', _embed_standard_shell),
])


def get_shell_embed_func(shells=None, known_shells=None):
    """Return the first acceptable shell-embed function
    from a given list of shell names.
    """
    if shells is None:  # list, preference order of shells
        shells = DEFAULT_PYTHON_SHELLS.keys()
    if known_shells is None:  # available embeddable shells
        known_shells = DEFAULT_PYTHON_SHELLS.copy()
    for shell in shells:
        if shell in known_shells:
            try:
                # function test: run all setup code (imports),
                # but don't fall into the shell
                return known_shells[shell]()
            except ImportError:
                continue


def start_python_console(namespace=None, banner='', shells=None):
    """Start Python console bound to the given namespace.
    Readline support and tab completion will be used on Unix, if available.
    """
    if namespace is None:
        namespace = {}

    try:
        shell = get_shell_embed_func(shells)
        if shell is not None:
            shell(namespace=namespace, banner=banner)
    except SystemExit:  # raised when using exit() in python code.interact
        pass
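
A minimal sketch of picking and starting a shell; which embed function is returned depends on what is installed locally:

from scrapy.utils.console import get_shell_embed_func, start_python_console

# Ask for bpython first and fall back to the stdlib interpreter.
shell = get_shell_embed_func(shells=['bpython', 'python'])
print(shell)  # whichever embed wrapper imported successfully

# Open an interactive console with a pre-populated namespace.
start_python_console(namespace={'answer': 42}, banner='try: answer')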

100  venv/lib/python3.9/site-packages/scrapy/utils/curl.py  Normal file
@@ -0,0 +1,100 @@
import argparse
import warnings
from shlex import split
from http.cookies import SimpleCookie
from urllib.parse import urlparse

from w3lib.http import basic_auth_header


class CurlParser(argparse.ArgumentParser):
    def error(self, message):
        error_msg = f'There was an error parsing the curl command: {message}'
        raise ValueError(error_msg)


curl_parser = CurlParser()
curl_parser.add_argument('url')
curl_parser.add_argument('-H', '--header', dest='headers', action='append')
curl_parser.add_argument('-X', '--request', dest='method')
curl_parser.add_argument('-d', '--data', '--data-raw', dest='data')
curl_parser.add_argument('-u', '--user', dest='auth')


safe_to_ignore_arguments = [
    ['--compressed'],
    # `--compressed` argument is not safe to ignore, but it's included here
    # because the `HttpCompressionMiddleware` is enabled by default
    ['-s', '--silent'],
    ['-v', '--verbose'],
    ['-#', '--progress-bar']
]

for argument in safe_to_ignore_arguments:
    curl_parser.add_argument(*argument, action='store_true')


def curl_to_request_kwargs(curl_command, ignore_unknown_options=True):
    """Convert a cURL command syntax to Request kwargs.

    :param str curl_command: string containing the curl command
    :param bool ignore_unknown_options: If true, only a warning is emitted when
                                        cURL options are unknown. Otherwise
                                        raises an error. (default: True)
    :return: dictionary of Request kwargs
    """

    curl_args = split(curl_command)

    if curl_args[0] != 'curl':
        raise ValueError('A curl command must start with "curl"')

    parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])

    if argv:
        msg = f'Unrecognized options: {", ".join(argv)}'
        if ignore_unknown_options:
            warnings.warn(msg)
        else:
            raise ValueError(msg)

    url = parsed_args.url

    # curl automatically prepends 'http' if the scheme is missing, but Request
    # needs the scheme to work
    parsed_url = urlparse(url)
    if not parsed_url.scheme:
        url = 'http://' + url

    method = parsed_args.method or 'GET'

    result = {'method': method.upper(), 'url': url}

    headers = []
    cookies = {}
    for header in parsed_args.headers or ():
        name, val = header.split(':', 1)
        name = name.strip()
        val = val.strip()
        if name.title() == 'Cookie':
            for name, morsel in SimpleCookie(val).items():
                cookies[name] = morsel.value
        else:
            headers.append((name, val))

    if parsed_args.auth:
        user, password = parsed_args.auth.split(':', 1)
        headers.append(('Authorization', basic_auth_header(user, password)))

    if headers:
        result['headers'] = headers
    if cookies:
        result['cookies'] = cookies
    if parsed_args.data:
        result['body'] = parsed_args.data
        if not parsed_args.method:
            # if the "data" is specified but the "method" is not specified,
            # the default method is 'POST'
            result['method'] = 'POST'

    return result
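
A sketch of converting a curl invocation into Request kwargs (the command string is illustrative):

from scrapy.utils.curl import curl_to_request_kwargs

cmd = ("curl 'https://httpbin.org/post' -H 'Accept: application/json' "
       "-H 'Cookie: session=abc123' -d 'q=scrapy' -u user:secret")
kwargs = curl_to_request_kwargs(cmd)
print(kwargs['method'])    # POST (implied by -d without an explicit -X)
print(kwargs['cookies'])   # {'session': 'abc123'}
print(kwargs['headers'])   # [('Accept', 'application/json'), ('Authorization', b'Basic ...')]
# The resulting dict can be fed into scrapy.Request(**kwargs).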

119  venv/lib/python3.9/site-packages/scrapy/utils/datatypes.py  Normal file
@@ -0,0 +1,119 @@
"""
This module contains data types used by Scrapy which are not included in the
Python Standard Library.

This module must not depend on any module outside the Standard Library.
"""

import collections
import weakref
from collections.abc import Mapping


class CaselessDict(dict):

    __slots__ = ()

    def __init__(self, seq=None):
        super().__init__()
        if seq:
            self.update(seq)

    def __getitem__(self, key):
        return dict.__getitem__(self, self.normkey(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, self.normkey(key), self.normvalue(value))

    def __delitem__(self, key):
        dict.__delitem__(self, self.normkey(key))

    def __contains__(self, key):
        return dict.__contains__(self, self.normkey(key))
    has_key = __contains__

    def __copy__(self):
        return self.__class__(self)
    copy = __copy__

    def normkey(self, key):
        """Method to normalize dictionary key access"""
        return key.lower()

    def normvalue(self, value):
        """Method to normalize values prior to being set"""
        return value

    def get(self, key, def_val=None):
        return dict.get(self, self.normkey(key), self.normvalue(def_val))

    def setdefault(self, key, def_val=None):
        return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))

    def update(self, seq):
        seq = seq.items() if isinstance(seq, Mapping) else seq
        iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
        super().update(iseq)

    @classmethod
    def fromkeys(cls, keys, value=None):
        return cls((k, value) for k in keys)

    def pop(self, key, *args):
        return dict.pop(self, self.normkey(key), *args)


class LocalCache(collections.OrderedDict):
    """Dictionary with a finite number of keys.

    Older items expire first.
    """

    def __init__(self, limit=None):
        super().__init__()
        self.limit = limit

    def __setitem__(self, key, value):
        if self.limit:
            while len(self) >= self.limit:
                self.popitem(last=False)
        super().__setitem__(key, value)


class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
    """
    A weakref.WeakKeyDictionary implementation that uses LocalCache as its
    underlying data structure, making it ordered and capable of being size-limited.

    Useful for memoization, while avoiding keeping received
    arguments in memory only because of the cached references.

    Note: like LocalCache and unlike weakref.WeakKeyDictionary,
    it cannot be instantiated with an initial dictionary.
    """

    def __init__(self, limit=None):
        super().__init__()
        self.data = LocalCache(limit=limit)

    def __setitem__(self, key, value):
        try:
            super().__setitem__(key, value)
        except TypeError:
            pass  # key is not weak-referenceable, skip caching

    def __getitem__(self, key):
        try:
            return super().__getitem__(key)
        except (TypeError, KeyError):
            return None  # key is either not weak-referenceable or not cached


class SequenceExclude:
    """Object to test if an item is NOT within some sequence."""

    def __init__(self, seq):
        self.seq = seq

    def __contains__(self, item):
        return item not in self.seq
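
A small usage sketch for the container types above:

from scrapy.utils.datatypes import CaselessDict, LocalCache, SequenceExclude

headers = CaselessDict({'Content-Type': 'text/html'})
print(headers['content-type'])            # lookups ignore case -> 'text/html'

cache = LocalCache(limit=2)               # oldest entries are evicted first
for i, key in enumerate(['a', 'b', 'c']):
    cache[key] = i
print(list(cache))                        # ['b', 'c']

print(300 in SequenceExclude(range(100, 200)))   # True: 300 is outside the range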

45  venv/lib/python3.9/site-packages/scrapy/utils/decorators.py  Normal file
@@ -0,0 +1,45 @@
import warnings
from functools import wraps

from twisted.internet import defer, threads

from scrapy.exceptions import ScrapyDeprecationWarning


def deprecated(use_instead=None):
    """This is a decorator which can be used to mark functions
    as deprecated. It will result in a warning being emitted
    when the function is used."""

    def deco(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            message = f"Call to deprecated function {func.__name__}."
            if use_instead:
                message += f" Use {use_instead} instead."
            warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)
        return wrapped

    if callable(use_instead):
        deco = deco(use_instead)
        use_instead = None
    return deco


def defers(func):
    """Decorator to make sure a function always returns a deferred"""
    @wraps(func)
    def wrapped(*a, **kw):
        return defer.maybeDeferred(func, *a, **kw)
    return wrapped


def inthread(func):
    """Decorator to call a function in a thread and return a deferred with the
    result
    """
    @wraps(func)
    def wrapped(*a, **kw):
        return threads.deferToThread(func, *a, **kw)
    return wrapped
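
A brief sketch of the deprecation decorator; the function names are made up:

import warnings
from scrapy.utils.decorators import deprecated

@deprecated(use_instead='new_parse()')
def old_parse(text):
    return text.strip()

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    old_parse('  hello  ')
print(caught[0].message)
# Call to deprecated function old_parse. Use new_parse() instead.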

168  venv/lib/python3.9/site-packages/scrapy/utils/defer.py  Normal file
@@ -0,0 +1,168 @@
"""
Helper functions for dealing with Twisted deferreds
"""
import asyncio
import inspect
from functools import wraps

from twisted.internet import defer, task
from twisted.python import failure

from scrapy.exceptions import IgnoreRequest
from scrapy.utils.reactor import is_asyncio_reactor_installed


def defer_fail(_failure):
    """Same as twisted.internet.defer.fail but delay calling errback until
    next reactor loop

    It delays by 100ms so reactor has a chance to go through readers and writers
    before attending pending delayed calls, so do not set delay to zero.
    """
    from twisted.internet import reactor
    d = defer.Deferred()
    reactor.callLater(0.1, d.errback, _failure)
    return d


def defer_succeed(result):
    """Same as twisted.internet.defer.succeed but delay calling callback until
    next reactor loop

    It delays by 100ms so reactor has a chance to go through readers and writers
    before attending pending delayed calls, so do not set delay to zero.
    """
    from twisted.internet import reactor
    d = defer.Deferred()
    reactor.callLater(0.1, d.callback, result)
    return d


def defer_result(result):
    if isinstance(result, defer.Deferred):
        return result
    elif isinstance(result, failure.Failure):
        return defer_fail(result)
    else:
        return defer_succeed(result)


def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This is to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)


def parallel(iterable, count, callable, *args, **named):
    """Execute a callable over the objects in the given iterable, in parallel,
    using no more than ``count`` concurrent calls.

    Taken from: https://jcalderone.livejournal.com/24285.html
    """
    coop = task.Cooperator()
    work = (callable(elem, *args, **named) for elem in iterable)
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])


def process_chain(callbacks, input, *a, **kw):
    """Return a Deferred built by chaining the given callbacks"""
    d = defer.Deferred()
    for x in callbacks:
        d.addCallback(x, *a, **kw)
    d.callback(input)
    return d


def process_chain_both(callbacks, errbacks, input, *a, **kw):
    """Return a Deferred built by chaining the given callbacks and errbacks"""
    d = defer.Deferred()
    for cb, eb in zip(callbacks, errbacks):
        d.addCallbacks(
            callback=cb, errback=eb,
            callbackArgs=a, callbackKeywords=kw,
            errbackArgs=a, errbackKeywords=kw,
        )
    if isinstance(input, failure.Failure):
        d.errback(input)
    else:
        d.callback(input)
    return d


def process_parallel(callbacks, input, *a, **kw):
    """Return a Deferred with the output of all successful calls to the given
    callbacks
    """
    dfds = [defer.succeed(input).addCallback(x, *a, **kw) for x in callbacks]
    d = defer.DeferredList(dfds, fireOnOneErrback=1, consumeErrors=1)
    d.addCallbacks(lambda r: [x[1] for x in r], lambda f: f.value.subFailure)
    return d


def iter_errback(iterable, errback, *a, **kw):
    """Wraps an iterable calling an errback if an error is caught while
    iterating it.
    """
    it = iter(iterable)
    while True:
        try:
            yield next(it)
        except StopIteration:
            break
        except Exception:
            errback(failure.Failure(), *a, **kw)


def deferred_from_coro(o):
    """Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
    if isinstance(o, defer.Deferred):
        return o
    if asyncio.isfuture(o) or inspect.isawaitable(o):
        if not is_asyncio_reactor_installed():
            # wrapping the coroutine directly into a Deferred, this doesn't work correctly with coroutines
            # that use asyncio, e.g. "await asyncio.sleep(1)"
            return defer.ensureDeferred(o)
        else:
            # wrapping the coroutine into a Future and then into a Deferred, this requires AsyncioSelectorReactor
            return defer.Deferred.fromFuture(asyncio.ensure_future(o))
    return o


def deferred_f_from_coro_f(coro_f):
    """ Converts a coroutine function into a function that returns a Deferred.

    The coroutine function will be called at the time when the wrapper is called. Wrapper args will be passed to it.
    This is useful for callback chains, as callback functions are called with the previous callback result.
    """
    @wraps(coro_f)
    def f(*coro_args, **coro_kwargs):
        return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
    return f


def maybeDeferred_coro(f, *args, **kw):
    """ Copy of defer.maybeDeferred that also converts coroutines to Deferreds. """
    try:
        result = f(*args, **kw)
    except:  # noqa: E722
        return defer.fail(failure.Failure(captureVars=defer.Deferred.debug))

    if isinstance(result, defer.Deferred):
        return result
    elif asyncio.isfuture(result) or inspect.isawaitable(result):
        return deferred_from_coro(result)
    elif isinstance(result, failure.Failure):
        return defer.fail(result)
    else:
        return defer.succeed(result)
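
A small sketch of process_chain; no running reactor is needed here because the Deferred is fired synchronously with plain callbacks:

from scrapy.utils.defer import process_chain

d = process_chain([lambda x: x + 10, lambda x: x * 2], 1)
d.addCallback(print)   # prints 22: (1 + 10) * 2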

174  venv/lib/python3.9/site-packages/scrapy/utils/deprecate.py  Normal file
@@ -0,0 +1,174 @@
"""Some helpers for deprecation messages"""

import warnings
import inspect
from scrapy.exceptions import ScrapyDeprecationWarning


def attribute(obj, oldattr, newattr, version='0.12'):
    cname = obj.__class__.__name__
    warnings.warn(
        f"{cname}.{oldattr} attribute is deprecated and will be no longer supported "
        f"in Scrapy {version}, use {cname}.{newattr} attribute instead",
        ScrapyDeprecationWarning,
        stacklevel=3)


def create_deprecated_class(
        name,
        new_class,
        clsdict=None,
        warn_category=ScrapyDeprecationWarning,
        warn_once=True,
        old_class_path=None,
        new_class_path=None,
        subclass_warn_message="{cls} inherits from deprecated class {old}, please inherit from {new}.",
        instance_warn_message="{cls} is deprecated, instantiate {new} instead."
):
    """
    Return a "deprecated" class that causes its subclasses to issue a warning.
    Subclasses of ``new_class`` are considered subclasses of this class.
    It also warns when the deprecated class is instantiated, but does not warn
    when its subclasses are instantiated.

    It can be used to rename a base class in a library. For example, if we
    have

        class OldName(SomeClass):
            # ...

    and we want to rename it to NewName, we can do the following::

        class NewName(SomeClass):
            # ...

        OldName = create_deprecated_class('OldName', NewName)

    Then, if a user class inherits from OldName, a warning is issued. Also, if
    some code uses ``issubclass(sub, OldName)`` or ``isinstance(sub(), OldName)``
    checks they'll still return True if sub is a subclass of NewName instead of
    OldName.
    """

    class DeprecatedClass(new_class.__class__):

        deprecated_class = None
        warned_on_subclass = False

        def __new__(metacls, name, bases, clsdict_):
            cls = super().__new__(metacls, name, bases, clsdict_)
            if metacls.deprecated_class is None:
                metacls.deprecated_class = cls
            return cls

        def __init__(cls, name, bases, clsdict_):
            meta = cls.__class__
            old = meta.deprecated_class
            if old in bases and not (warn_once and meta.warned_on_subclass):
                meta.warned_on_subclass = True
                msg = subclass_warn_message.format(cls=_clspath(cls),
                                                   old=_clspath(old, old_class_path),
                                                   new=_clspath(new_class, new_class_path))
                if warn_once:
                    msg += ' (warning only on first subclass, there may be others)'
                warnings.warn(msg, warn_category, stacklevel=2)
            super().__init__(name, bases, clsdict_)

        # see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass
        # and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks
        # for implementation details
        def __instancecheck__(cls, inst):
            return any(cls.__subclasscheck__(c)
                       for c in {type(inst), inst.__class__})

        def __subclasscheck__(cls, sub):
            if cls is not DeprecatedClass.deprecated_class:
                # we should do the magic only if second `issubclass` argument
                # is the deprecated class itself - subclasses of the
                # deprecated class should not use custom `__subclasscheck__`
                # method.
                return super().__subclasscheck__(sub)

            if not inspect.isclass(sub):
                raise TypeError("issubclass() arg 1 must be a class")

            mro = getattr(sub, '__mro__', ())
            return any(c in {cls, new_class} for c in mro)

        def __call__(cls, *args, **kwargs):
            old = DeprecatedClass.deprecated_class
            if cls is old:
                msg = instance_warn_message.format(cls=_clspath(cls, old_class_path),
                                                   new=_clspath(new_class, new_class_path))
                warnings.warn(msg, warn_category, stacklevel=2)
            return super().__call__(*args, **kwargs)

    deprecated_cls = DeprecatedClass(name, (new_class,), clsdict or {})

    try:
        frm = inspect.stack()[1]
        parent_module = inspect.getmodule(frm[0])
        if parent_module is not None:
            deprecated_cls.__module__ = parent_module.__name__
    except Exception as e:
        # Sometimes inspect.stack() fails (e.g. when the first import of
        # deprecated class is in jinja2 template). __module__ attribute is not
        # important enough to raise an exception as users may be unable
        # to fix inspect.stack() errors.
        warnings.warn(f"Error detecting parent module: {e!r}")

    return deprecated_cls


def _clspath(cls, forced=None):
    if forced is not None:
        return forced
    return f'{cls.__module__}.{cls.__name__}'


DEPRECATION_RULES = [
    ('scrapy.telnet.', 'scrapy.extensions.telnet.'),
]


def update_classpath(path):
    """Update a deprecated path from an object with its new location"""
    for prefix, replacement in DEPRECATION_RULES:
        if isinstance(path, str) and path.startswith(prefix):
            new_path = path.replace(prefix, replacement, 1)
            warnings.warn(f"`{path}` class is deprecated, use `{new_path}` instead",
                          ScrapyDeprecationWarning)
            return new_path
    return path


def method_is_overridden(subclass, base_class, method_name):
    """
    Return True if a method named ``method_name`` of a ``base_class``
    is overridden in a ``subclass``.

    >>> class Base:
    ...     def foo(self):
    ...         pass
    >>> class Sub1(Base):
    ...     pass
    >>> class Sub2(Base):
    ...     def foo(self):
    ...         pass
    >>> class Sub3(Sub1):
    ...     def foo(self):
    ...         pass
    >>> class Sub4(Sub2):
    ...     pass
    >>> method_is_overridden(Sub1, Base, 'foo')
    False
    >>> method_is_overridden(Sub2, Base, 'foo')
    True
    >>> method_is_overridden(Sub3, Base, 'foo')
    True
    >>> method_is_overridden(Sub4, Base, 'foo')
    True
    """
    base_method = getattr(base_class, method_name)
    sub_method = getattr(subclass, method_name)
    return base_method.__code__ is not sub_method.__code__

48  venv/lib/python3.9/site-packages/scrapy/utils/display.py  Normal file
@@ -0,0 +1,48 @@
"""
pprint and pformat wrappers with colorization support
"""

import ctypes
import platform
import sys
from distutils.version import LooseVersion as parse_version
from pprint import pformat as pformat_


def _enable_windows_terminal_processing():
    # https://stackoverflow.com/a/36760881
    kernel32 = ctypes.windll.kernel32
    return bool(kernel32.SetConsoleMode(kernel32.GetStdHandle(-11), 7))


def _tty_supports_color():
    if sys.platform != "win32":
        return True

    if parse_version(platform.version()) < parse_version("10.0.14393"):
        return True

    # Windows >= 10.0.14393 interprets ANSI escape sequences provided terminal
    # processing is enabled.
    return _enable_windows_terminal_processing()


def _colorize(text, colorize=True):
    if not colorize or not sys.stdout.isatty() or not _tty_supports_color():
        return text
    try:
        from pygments import highlight
    except ImportError:
        return text
    else:
        from pygments.formatters import TerminalFormatter
        from pygments.lexers import PythonLexer
        return highlight(text, PythonLexer(), TerminalFormatter())


def pformat(obj, *args, **kwargs):
    return _colorize(pformat_(obj), kwargs.pop('colorize', True))


def pprint(obj, *args, **kwargs):
    print(pformat(obj, *args, **kwargs))

48  venv/lib/python3.9/site-packages/scrapy/utils/engine.py  Normal file
@@ -0,0 +1,48 @@
"""Some debugging functions for working with the Scrapy engine"""

# used in global tests code
from time import time  # noqa: F401


def get_engine_status(engine):
    """Return a report of the current engine status"""
    tests = [
        "time()-engine.start_time",
        "engine.has_capacity()",
        "len(engine.downloader.active)",
        "engine.scraper.is_idle()",
        "engine.spider.name",
        "engine.spider_is_idle(engine.spider)",
        "engine.slot.closing",
        "len(engine.slot.inprogress)",
        "len(engine.slot.scheduler.dqs or [])",
        "len(engine.slot.scheduler.mqs)",
        "len(engine.scraper.slot.queue)",
        "len(engine.scraper.slot.active)",
        "engine.scraper.slot.active_size",
        "engine.scraper.slot.itemproc_size",
        "engine.scraper.slot.needs_backout()",
    ]

    checks = []
    for test in tests:
        try:
            checks += [(test, eval(test))]
        except Exception as e:
            checks += [(test, f"{type(e).__name__} (exception)")]

    return checks


def format_engine_status(engine=None):
    checks = get_engine_status(engine)
    s = "Execution engine status\n\n"
    for test, result in checks:
        s += f"{test:<47} : {result}\n"
    s += "\n"

    return s


def print_engine_status(engine):
    print(format_engine_status(engine))

37  venv/lib/python3.9/site-packages/scrapy/utils/ftp.py  Normal file
@@ -0,0 +1,37 @@
import posixpath

from ftplib import error_perm, FTP
from posixpath import dirname


def ftp_makedirs_cwd(ftp, path, first_call=True):
    """Set the current directory of the FTP connection given in the ``ftp``
    argument (as a ftplib.FTP object), creating all parent directories if they
    don't exist. The ftplib.FTP object must be already connected and logged in.
    """
    try:
        ftp.cwd(path)
    except error_perm:
        ftp_makedirs_cwd(ftp, dirname(path), False)
        ftp.mkd(path)
        if first_call:
            ftp.cwd(path)


def ftp_store_file(
        *, path, file, host, port,
        username, password, use_active_mode=False, overwrite=True):
    """Opens an FTP connection with the given credentials, sets the current
    directory to the directory extracted from the given path, then uploads
    the file to the server
    """
    with FTP() as ftp:
        ftp.connect(host, port)
        ftp.login(username, password)
        if use_active_mode:
            ftp.set_pasv(False)
        file.seek(0)
        dirname, filename = posixpath.split(path)
        ftp_makedirs_cwd(ftp, dirname)
        command = 'STOR' if overwrite else 'APPE'
        ftp.storbinary(f'{command} {filename}', file)
        file.close()

58  venv/lib/python3.9/site-packages/scrapy/utils/gz.py  Normal file
@@ -0,0 +1,58 @@
from gzip import GzipFile
from io import BytesIO
import re
import struct

from scrapy.utils.decorators import deprecated


# - GzipFile's read() has issues returning leftover uncompressed data when
#   input is corrupted
# - read1(), which fetches data before raising EOFError on next call
#   works here
@deprecated('GzipFile.read1')
def read1(gzf, size=-1):
    return gzf.read1(size)


def gunzip(data):
    """Gunzip the given data and return as much data as possible.

    This is resilient to CRC checksum errors.
    """
    f = GzipFile(fileobj=BytesIO(data))
    output_list = []
    chunk = b'.'
    while chunk:
        try:
            chunk = f.read1(8196)
            output_list.append(chunk)
        except (IOError, EOFError, struct.error):
            # complete only if there is some data, otherwise re-raise
            # see issue 87 about catching struct.error
            # some pages are quite small so output_list is empty and f.extrabuf
            # contains the whole page content
            if output_list or getattr(f, 'extrabuf', None):
                try:
                    output_list.append(f.extrabuf[-f.extrasize:])
                finally:
                    break
            else:
                raise
    return b''.join(output_list)


_is_gzipped = re.compile(br'^application/(x-)?gzip\b', re.I).search
_is_octetstream = re.compile(br'^(application|binary)/octet-stream\b', re.I).search


@deprecated
def is_gzipped(response):
    """Return True if the response is gzipped, or False otherwise"""
    ctype = response.headers.get('Content-Type', b'')
    cenc = response.headers.get('Content-Encoding', b'').lower()
    return _is_gzipped(ctype) or _is_octetstream(ctype) and cenc in (b'gzip', b'x-gzip')


def gzip_magic_number(response):
    return response.body[:3] == b'\x1f\x8b\x08'
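
A quick sketch of gunzip and the magic-number check, using a stand-in object that only has the body attribute the helper reads:

import gzip
from scrapy.utils.gz import gunzip, gzip_magic_number

payload = gzip.compress(b'<html><body>ok</body></html>')
print(gunzip(payload))                       # b'<html><body>ok</body></html>'

class FakeResponse:                          # stand-in; only .body is needed
    body = payload

print(gzip_magic_number(FakeResponse()))     # True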

36  venv/lib/python3.9/site-packages/scrapy/utils/http.py  Normal file
@@ -0,0 +1,36 @@
"""
Transitional module for moving to the w3lib library.

For new code, always import from w3lib.http instead of this module
"""

import warnings

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.decorators import deprecated
from w3lib.http import *  # noqa: F401


warnings.warn("Module `scrapy.utils.http` is deprecated, "
              "please import from `w3lib.http` instead.",
              ScrapyDeprecationWarning, stacklevel=2)


@deprecated
def decode_chunked_transfer(chunked_body):
    """Parse a body received with chunked transfer encoding, and return the
    decoded body.

    For more info see:
    https://en.wikipedia.org/wiki/Chunked_transfer_encoding

    """
    body, h, t = '', '', chunked_body
    while t:
        h, t = t.split('\r\n', 1)
        if h == '0':
            break
        size = int(h, 16)
        body += t[:size]
        t = t[size + 2:]
    return body

16  venv/lib/python3.9/site-packages/scrapy/utils/httpobj.py  Normal file
@@ -0,0 +1,16 @@
"""Helper functions for scrapy.http objects (Request, Response)"""

import weakref
from urllib.parse import urlparse


_urlparse_cache = weakref.WeakKeyDictionary()


def urlparse_cached(request_or_response):
    """Return urlparse.urlparse caching the result, where the argument can be a
    Request or Response object
    """
    if request_or_response not in _urlparse_cache:
        _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
    return _urlparse_cache[request_or_response]
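
A minimal sketch of the per-object urlparse cache:

from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

req = Request('https://example.com/path?q=1')
parsed = urlparse_cached(req)
print(parsed.netloc, parsed.path)        # example.com /path
print(urlparse_cached(req) is parsed)    # True: the ParseResult is cached per object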

162  venv/lib/python3.9/site-packages/scrapy/utils/iterators.py  Normal file
@@ -0,0 +1,162 @@
import csv
import logging
import re
from io import StringIO

from scrapy.http import TextResponse, Response
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode


logger = logging.getLogger(__name__)


def xmliter(obj, nodename):
    """Return an iterator of Selectors over all nodes of an XML document,
    given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    DOCUMENT_HEADER_RE = re.compile(r'<\?xml[^>]+>\s*', re.S)
    HEADER_END_RE = re.compile(fr'<\s*/{nodename_patt}\s*>', re.S)
    END_TAG_RE = re.compile(r'<\s*/([^\s>]+)\s*>', re.S)
    NAMESPACE_RE = re.compile(r'((xmlns[:A-Za-z]*)=[^>\s]+)', re.S)
    text = _body_or_str(obj)

    document_header = re.search(DOCUMENT_HEADER_RE, text)
    document_header = document_header.group().strip() if document_header else ''
    header_end_idx = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end_idx[1]:].strip() if header_end_idx else ''
    namespaces = {}
    if header_end:
        for tagname in reversed(re.findall(END_TAG_RE, header_end)):
            tag = re.search(fr'<\s*{tagname}.*?xmlns[:=][^>]*>', text[:header_end_idx[1]], re.S)
            if tag:
                namespaces.update(reversed(x) for x in re.findall(NAMESPACE_RE, tag.group()))

    r = re.compile(fr'<{nodename_patt}[\s>].*?</{nodename_patt}>', re.DOTALL)
    for match in r.finditer(text):
        nodetext = (
            document_header
            + match.group().replace(
                nodename,
                f'{nodename} {" ".join(namespaces.values())}',
                1
            )
            + header_end
        )
        yield Selector(text=nodetext, type='xml')


def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = f'{{{namespace}}}{nodename}' if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + (f'{prefix}:{nodename}' if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace(prefix, namespace)
        yield xs.xpath(selxpath)[0]


class _StreamReader:

    def __init__(self, obj):
        self._ptr = 0
        if isinstance(obj, Response):
            self._text, self.encoding = obj.body, obj.encoding
        else:
            self._text, self.encoding = obj, 'utf-8'
        self._is_unicode = isinstance(self._text, str)

    def read(self, n=65535):
        self.read = self._read_unicode if self._is_unicode else self._read_string
        return self.read(n).lstrip()

    def _read_string(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e]

    def _read_unicode(self, n=65535):
        s, e = self._ptr, self._ptr + n
        self._ptr = e
        return self._text[s:e].encode('utf-8')


def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Returns an iterator of dictionaries from the given csv object

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields on the given obj.

    headers is an iterable that, when provided, supplies the keys for the
    returned dictionaries; if not given, the first row is used.

    quotechar is the character used to enclose fields on the given obj.
    """

    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def row_to_unicode(row_):
        return [to_unicode(field, encoding) for field in row_]

    lines = StringIO(_body_or_str(obj, unicode=True))

    kwargs = {}
    if delimiter:
        kwargs["delimiter"] = delimiter
    if quotechar:
        kwargs["quotechar"] = quotechar
    csv_r = csv.reader(lines, **kwargs)

    if not headers:
        try:
            row = next(csv_r)
        except StopIteration:
            return
        headers = row_to_unicode(row)

    for row in csv_r:
        row = row_to_unicode(row)
        if len(row) != len(headers):
            logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                           "should be: %(csvheader)d)",
                           {'csvlnum': csv_r.line_num, 'csvrow': len(row),
                            'csvheader': len(headers)})
            continue
        else:
            yield dict(zip(headers, row))


def _body_or_str(obj, unicode=True):
    expected_types = (Response, str, bytes)
    if not isinstance(obj, expected_types):
        expected_types_str = " or ".join(t.__name__ for t in expected_types)
        raise TypeError(
            f"Object {obj!r} must be {expected_types_str}, not {type(obj).__name__}"
        )
    if isinstance(obj, Response):
        if not unicode:
            return obj.body
        elif isinstance(obj, TextResponse):
            return obj.text
        else:
            return obj.body.decode('utf-8')
    elif isinstance(obj, str):
        return obj if unicode else obj.encode('utf-8')
    else:
        return obj.decode('utf-8') if unicode else obj
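
A short sketch of the iterators on plain strings (they accept Response objects as well); the sample data is made up:

from scrapy.utils.iterators import csviter, xmliter

csv_text = "name,price\nbook,10\npen,2"
print(list(csviter(csv_text)))
# [{'name': 'book', 'price': '10'}, {'name': 'pen', 'price': '2'}]

xml_text = '<?xml version="1.0"?><items><item>a</item><item>b</item></items>'
print([node.xpath('text()').get() for node in xmliter(xml_text, 'item')])
# ['a', 'b']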

8  venv/lib/python3.9/site-packages/scrapy/utils/job.py  Normal file
@@ -0,0 +1,8 @@
import os


def job_dir(settings):
    path = settings['JOBDIR']
    if path and not os.path.exists(path):
        os.makedirs(path)
    return path
215
venv/lib/python3.9/site-packages/scrapy/utils/log.py
Normal file
215
venv/lib/python3.9/site-packages/scrapy/utils/log.py
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
import logging
|
||||
import sys
|
||||
import warnings
|
||||
from logging.config import dictConfig
|
||||
|
||||
from twisted.python import log as twisted_log
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
import scrapy
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.utils.versions import scrapy_components_versions
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def failure_to_exc_info(failure):
|
||||
"""Extract exc_info from Failure instances"""
|
||||
if isinstance(failure, Failure):
|
||||
return (failure.type, failure.value, failure.getTracebackObject())
|
||||
|
||||
|
||||
class TopLevelFormatter(logging.Filter):
|
||||
"""Keep only top level loggers's name (direct children from root) from
|
||||
records.
|
||||
|
||||
This filter will replace Scrapy loggers' names with 'scrapy'. This mimics
|
||||
the old Scrapy log behaviour and helps shortening long names.
|
||||
|
||||
Since it can't be set for just one logger (it won't propagate for its
|
||||
children), it's going to be set in the root handler, with a parametrized
|
||||
``loggers`` list where it should act.
|
||||
"""
|
||||
|
||||
def __init__(self, loggers=None):
|
||||
self.loggers = loggers or []
|
||||
|
||||
def filter(self, record):
|
||||
if any(record.name.startswith(logger + '.') for logger in self.loggers):
|
||||
record.name = record.name.split('.', 1)[0]
|
||||
return True
|
||||
|
||||
|
||||
DEFAULT_LOGGING = {
|
||||
'version': 1,
|
||||
'disable_existing_loggers': False,
|
||||
'loggers': {
|
||||
'scrapy': {
|
||||
'level': 'DEBUG',
|
||||
},
|
||||
'twisted': {
|
||||
'level': 'ERROR',
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def configure_logging(settings=None, install_root_handler=True):
|
||||
"""
|
||||
Initialize logging defaults for Scrapy.
|
||||
|
||||
:param settings: settings used to create and configure a handler for the
|
||||
root logger (default: None).
|
||||
:type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``
|
||||
|
||||
:param install_root_handler: whether to install root logging handler
|
||||
(default: True)
|
||||
:type install_root_handler: bool
|
||||
    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)


def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)


def get_scrapy_root_handler():
    return _scrapy_root_handler


_scrapy_root_handler = None


def _get_handler(settings):
    """ Return a log handler object according to settings """
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler


def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    versions = [
        f"{name} {version}"
        for name, version in scrapy_components_versions()
        if name != "Scrapy"
    ]
    logger.info("Versions: %(versions)s", {'versions': ", ".join(versions)})
    from twisted.internet import reactor
    logger.debug("Using reactor: %s.%s", reactor.__module__, reactor.__class__.__name__)
    from twisted.internet import asyncioreactor
    if isinstance(reactor, asyncioreactor.AsyncioSelectorReactor):
        logger.debug(
            "Using asyncio event loop: %s.%s",
            reactor._asyncioEventloop.__module__,
            reactor._asyncioEventloop.__class__.__name__,
        )


class StreamLogger:
    """Fake file-like stream object that redirects writes to a logger instance

    Taken from:
        https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
    """
    def __init__(self, logger, log_level=logging.INFO):
        self.logger = logger
        self.log_level = log_level
        self.linebuf = ''

    def write(self, buf):
        for line in buf.rstrip().splitlines():
            self.logger.log(self.log_level, line.rstrip())

    def flush(self):
        for h in self.logger.handlers:
            h.flush()


class LogCounterHandler(logging.Handler):
    """Record log levels count into a crawler stats"""

    def __init__(self, crawler, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.crawler = crawler

    def emit(self, record):
        sname = f'log_count/{record.levelname}'
        self.crawler.stats.inc_value(sname)


def logformatter_adapter(logkws):
    """
    Helper that takes the dictionary output from the methods in LogFormatter
    and adapts it into a tuple of positional arguments for logger.log calls,
    handling backward compatibility as well.
    """
    if not {'level', 'msg', 'args'} <= set(logkws):
        warnings.warn('Missing keys in LogFormatter method',
                      ScrapyDeprecationWarning)

    if 'format' in logkws:
        warnings.warn('`format` key in LogFormatter methods has been '
                      'deprecated, use `msg` instead',
                      ScrapyDeprecationWarning)

    level = logkws.get('level', logging.INFO)
    message = logkws.get('format', logkws.get('msg'))
    # NOTE: This also handles 'args' being an empty dict, that case doesn't
    # play well in logger.log calls
    args = logkws if not logkws.get('args') else logkws['args']

    return (level, message, args)
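A minimal usage sketch for the logging helpers above, assuming only standard Scrapy settings keys; illustrative only, not part of the vendored file.

# Illustrative: run Scrapy's logging setup by hand with custom settings.
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging
import logging

settings = Settings({
    'LOG_LEVEL': 'INFO',
    'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s',
})
configure_logging(settings)  # installs the root handler described above
logging.getLogger('example').info('log records now go through the Scrapy root handler')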
14
venv/lib/python3.9/site-packages/scrapy/utils/markup.py
Normal file
14
venv/lib/python3.9/site-packages/scrapy/utils/markup.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
"""
|
||||
Transitional module for moving to the w3lib library.
|
||||
|
||||
For new code, always import from w3lib.html instead of this module
|
||||
"""
|
||||
import warnings
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from w3lib.html import * # noqa: F401
|
||||
|
||||
|
||||
warnings.warn("Module `scrapy.utils.markup` is deprecated. "
|
||||
"Please import from `w3lib.html` instead.",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
253
venv/lib/python3.9/site-packages/scrapy/utils/misc.py
Normal file
253
venv/lib/python3.9/site-packages/scrapy/utils/misc.py
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
"""Helper functions which don't fit anywhere else"""
|
||||
import ast
|
||||
import inspect
|
||||
import os
|
||||
import re
|
||||
import hashlib
|
||||
import warnings
|
||||
from collections import deque
|
||||
from contextlib import contextmanager
|
||||
from importlib import import_module
|
||||
from pkgutil import iter_modules
|
||||
from textwrap import dedent
|
||||
|
||||
from w3lib.html import replace_entities
|
||||
|
||||
from scrapy.utils.datatypes import LocalWeakReferencedCache
|
||||
from scrapy.utils.python import flatten, to_unicode
|
||||
from scrapy.item import _BaseItem
|
||||
from scrapy.utils.deprecate import ScrapyDeprecationWarning
|
||||
|
||||
|
||||
_ITERABLE_SINGLE_VALUES = dict, _BaseItem, str, bytes
|
||||
|
||||
|
||||
def arg_to_iter(arg):
|
||||
"""Convert an argument to an iterable. The argument can be a None, single
|
||||
value, or an iterable.
|
||||
|
||||
Exception: if arg is a dict, [arg] will be returned
|
||||
"""
|
||||
if arg is None:
|
||||
return []
|
||||
elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'):
|
||||
return arg
|
||||
else:
|
||||
return [arg]
|
||||
|
||||
|
||||
def load_object(path):
|
||||
"""Load an object given its absolute object path, and return it.
|
||||
|
||||
The object can be the import path of a class, function, variable or an
|
||||
instance, e.g. 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'.
|
||||
|
||||
If ``path`` is not a string, but is a callable object, such as a class or
|
||||
a function, then return it as is.
|
||||
"""
|
||||
|
||||
if not isinstance(path, str):
|
||||
if callable(path):
|
||||
return path
|
||||
else:
|
||||
raise TypeError("Unexpected argument type, expected string "
|
||||
"or object, got: %s" % type(path))
|
||||
|
||||
try:
|
||||
dot = path.rindex('.')
|
||||
except ValueError:
|
||||
raise ValueError(f"Error loading object '{path}': not a full path")
|
||||
|
||||
module, name = path[:dot], path[dot + 1:]
|
||||
mod = import_module(module)
|
||||
|
||||
try:
|
||||
obj = getattr(mod, name)
|
||||
except AttributeError:
|
||||
raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
|
||||
|
||||
return obj
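As a quick illustration of the dotted-path contract documented above (a sketch, not part of the upstream file):

from scrapy.utils.misc import load_object

cls = load_object('scrapy.downloadermiddlewares.redirect.RedirectMiddleware')
func = load_object('scrapy.utils.python.to_bytes')
assert func('text') == b'text'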
|
||||
|
||||
|
||||
def walk_modules(path):
|
||||
"""Loads a module and all its submodules from the given module path and
|
||||
returns them. If *any* module throws an exception while importing, that
|
||||
exception is thrown back.
|
||||
|
||||
For example: walk_modules('scrapy.utils')
|
||||
"""
|
||||
|
||||
mods = []
|
||||
mod = import_module(path)
|
||||
mods.append(mod)
|
||||
if hasattr(mod, '__path__'):
|
||||
for _, subpath, ispkg in iter_modules(mod.__path__):
|
||||
fullpath = path + '.' + subpath
|
||||
if ispkg:
|
||||
mods += walk_modules(fullpath)
|
||||
else:
|
||||
submod = import_module(fullpath)
|
||||
mods.append(submod)
|
||||
return mods
|
||||
|
||||
|
||||
def extract_regex(regex, text, encoding='utf-8'):
|
||||
"""Extract a list of unicode strings from the given text/encoding using the following policies:
|
||||
|
||||
* if the regex contains a named group called "extract" that will be returned
|
||||
* if the regex contains multiple numbered groups, all those will be returned (flattened)
|
||||
* if the regex doesn't contain any group the entire regex matching is returned
|
||||
"""
|
||||
warnings.warn(
|
||||
"scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
|
||||
ScrapyDeprecationWarning,
|
||||
stacklevel=2
|
||||
)
|
||||
|
||||
if isinstance(regex, str):
|
||||
regex = re.compile(regex, re.UNICODE)
|
||||
|
||||
try:
|
||||
strings = [regex.search(text).group('extract')] # named group
|
||||
except Exception:
|
||||
strings = regex.findall(text) # full regex or numbered groups
|
||||
strings = flatten(strings)
|
||||
|
||||
if isinstance(text, str):
|
||||
return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
|
||||
else:
|
||||
return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
|
||||
for s in strings]
|
||||
|
||||
|
||||
def md5sum(file):
|
||||
"""Calculate the md5 checksum of a file-like object without reading its
|
||||
whole content in memory.
|
||||
|
||||
>>> from io import BytesIO
|
||||
>>> md5sum(BytesIO(b'file content to hash'))
|
||||
'784406af91dd5a54fbb9c84c2236595a'
|
||||
"""
|
||||
m = hashlib.md5()
|
||||
while True:
|
||||
d = file.read(8096)
|
||||
if not d:
|
||||
break
|
||||
m.update(d)
|
||||
return m.hexdigest()
|
||||
|
||||
|
||||
def rel_has_nofollow(rel):
|
||||
"""Return True if link rel attribute has nofollow type"""
|
||||
return rel is not None and 'nofollow' in rel.split()
|
||||
|
||||
|
||||
def create_instance(objcls, settings, crawler, *args, **kwargs):
|
||||
"""Construct a class instance using its ``from_crawler`` or
|
||||
``from_settings`` constructors, if available.
|
||||
|
||||
At least one of ``settings`` and ``crawler`` needs to be different from
|
||||
``None``. If ``settings`` is ``None``, ``crawler.settings`` will be used.
|
||||
If ``crawler`` is ``None``, only the ``from_settings`` constructor will be
|
||||
tried.
|
||||
|
||||
``*args`` and ``**kwargs`` are forwarded to the constructors.
|
||||
|
||||
Raises ``ValueError`` if both ``settings`` and ``crawler`` are ``None``.
|
||||
|
||||
.. versionchanged:: 2.2
|
||||
Raises ``TypeError`` if the resulting instance is ``None`` (e.g. if an
|
||||
extension has not been implemented correctly).
|
||||
"""
|
||||
if settings is None:
|
||||
if crawler is None:
|
||||
raise ValueError("Specify at least one of settings and crawler.")
|
||||
settings = crawler.settings
|
||||
if crawler and hasattr(objcls, 'from_crawler'):
|
||||
instance = objcls.from_crawler(crawler, *args, **kwargs)
|
||||
method_name = 'from_crawler'
|
||||
elif hasattr(objcls, 'from_settings'):
|
||||
instance = objcls.from_settings(settings, *args, **kwargs)
|
||||
method_name = 'from_settings'
|
||||
else:
|
||||
instance = objcls(*args, **kwargs)
|
||||
method_name = '__new__'
|
||||
if instance is None:
|
||||
raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
|
||||
return instance
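A hedged sketch of the construction paths above, using a made-up ``Component`` class; when no crawler is passed, ``from_settings`` wins over the plain constructor.

from scrapy.settings import Settings
from scrapy.utils.misc import create_instance

class Component:
    def __init__(self, timeout):
        self.timeout = timeout

    @classmethod
    def from_settings(cls, settings, timeout):
        # settings take precedence over the passed-through default
        return cls(timeout=settings.getint('TIMEOUT', timeout))

instance = create_instance(Component, Settings({'TIMEOUT': 5}), crawler=None, timeout=1)
assert instance.timeout == 5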
|
||||
|
||||
|
||||
@contextmanager
|
||||
def set_environ(**kwargs):
|
||||
"""Temporarily set environment variables inside the context manager and
|
||||
fully restore previous environment afterwards
|
||||
"""
|
||||
|
||||
original_env = {k: os.environ.get(k) for k in kwargs}
|
||||
os.environ.update(kwargs)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
for k, v in original_env.items():
|
||||
if v is None:
|
||||
del os.environ[k]
|
||||
else:
|
||||
os.environ[k] = v
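For example (illustrative, with a made-up variable name assumed not to be set beforehand):

import os
from scrapy.utils.misc import set_environ

with set_environ(EXAMPLE_PROXY='http://localhost:8080'):
    assert os.environ['EXAMPLE_PROXY'] == 'http://localhost:8080'
assert 'EXAMPLE_PROXY' not in os.environ  # fully restored afterwards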
|
||||
|
||||
|
||||
def walk_callable(node):
|
||||
"""Similar to ``ast.walk``, but walks only function body and skips nested
|
||||
functions defined within the node.
|
||||
"""
|
||||
todo = deque([node])
|
||||
walked_func_def = False
|
||||
while todo:
|
||||
node = todo.popleft()
|
||||
if isinstance(node, ast.FunctionDef):
|
||||
if walked_func_def:
|
||||
continue
|
||||
walked_func_def = True
|
||||
todo.extend(ast.iter_child_nodes(node))
|
||||
yield node
|
||||
|
||||
|
||||
_generator_callbacks_cache = LocalWeakReferencedCache(limit=128)
|
||||
|
||||
|
||||
def is_generator_with_return_value(callable):
|
||||
"""
|
||||
Returns True if a callable is a generator function which includes a
|
||||
'return' statement with a value different than None, False otherwise
|
||||
"""
|
||||
if callable in _generator_callbacks_cache:
|
||||
return _generator_callbacks_cache[callable]
|
||||
|
||||
def returns_none(return_node):
|
||||
value = return_node.value
|
||||
return value is None or isinstance(value, ast.NameConstant) and value.value is None
|
||||
|
||||
if inspect.isgeneratorfunction(callable):
|
||||
tree = ast.parse(dedent(inspect.getsource(callable)))
|
||||
for node in walk_callable(tree):
|
||||
if isinstance(node, ast.Return) and not returns_none(node):
|
||||
_generator_callbacks_cache[callable] = True
|
||||
return _generator_callbacks_cache[callable]
|
||||
|
||||
_generator_callbacks_cache[callable] = False
|
||||
return _generator_callbacks_cache[callable]
|
||||
|
||||
|
||||
def warn_on_generator_with_return_value(spider, callable):
|
||||
"""
|
||||
Logs a warning if a callable is a generator function and includes
|
||||
a 'return' statement with a value different than None
|
||||
"""
|
||||
if is_generator_with_return_value(callable):
|
||||
warnings.warn(
|
||||
f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
|
||||
'a generator and includes a "return" statement with a value '
|
||||
'different than None. This could lead to unexpected behaviour. Please see '
|
||||
'https://docs.python.org/3/reference/simple_stmts.html#the-return-statement '
|
||||
'for details about the semantics of the "return" statement within generators',
|
||||
stacklevel=2,
|
||||
)
|
||||
15
venv/lib/python3.9/site-packages/scrapy/utils/multipart.py
Normal file
15
venv/lib/python3.9/site-packages/scrapy/utils/multipart.py
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Transitional module for moving to the w3lib library.
|
||||
|
||||
For new code, always import from w3lib.form instead of this module
|
||||
"""
|
||||
import warnings
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from w3lib.form import * # noqa: F401
|
||||
|
||||
|
||||
warnings.warn("Module `scrapy.utils.multipart` is deprecated. "
|
||||
"If you're using `encode_multipart` function, please use "
|
||||
"`urllib3.filepost.encode_multipart_formdata` instead",
|
||||
ScrapyDeprecationWarning, stacklevel=2)
|
||||
25
venv/lib/python3.9/site-packages/scrapy/utils/ossignal.py
Normal file
25
venv/lib/python3.9/site-packages/scrapy/utils/ossignal.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import signal


signal_names = {}
for signame in dir(signal):
    if signame.startswith('SIG') and not signame.startswith('SIG_'):
        signum = getattr(signal, signame)
        if isinstance(signum, int):
            signal_names[signum] = signame


def install_shutdown_handlers(function, override_sigint=True):
    """Install the given function as a signal handler for all common shutdown
    signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the
    SIGINT handler won't be installed if there is already a handler in place
    (e.g. Pdb)
    """
    from twisted.internet import reactor
    reactor._handleSignals()
    signal.signal(signal.SIGTERM, function)
    if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
        signal.signal(signal.SIGINT, function)
    # Catch Ctrl-Break in windows
    if hasattr(signal, 'SIGBREAK'):
        signal.signal(signal.SIGBREAK, function)
98
venv/lib/python3.9/site-packages/scrapy/utils/project.py
Normal file
98
venv/lib/python3.9/site-packages/scrapy/utils/project.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import os
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
from importlib import import_module
|
||||
from os.path import join, dirname, abspath, isabs, exists
|
||||
|
||||
from scrapy.utils.conf import closest_scrapy_cfg, get_config, init_env
|
||||
from scrapy.settings import Settings
|
||||
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
|
||||
|
||||
|
||||
ENVVAR = 'SCRAPY_SETTINGS_MODULE'
|
||||
DATADIR_CFG_SECTION = 'datadir'
|
||||
|
||||
|
||||
def inside_project():
|
||||
scrapy_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
|
||||
if scrapy_module is not None:
|
||||
try:
|
||||
import_module(scrapy_module)
|
||||
except ImportError as exc:
|
||||
warnings.warn(f"Cannot import scrapy settings module {scrapy_module}: {exc}")
|
||||
else:
|
||||
return True
|
||||
return bool(closest_scrapy_cfg())
|
||||
|
||||
|
||||
def project_data_dir(project='default'):
|
||||
"""Return the current project data dir, creating it if it doesn't exist"""
|
||||
if not inside_project():
|
||||
raise NotConfigured("Not inside a project")
|
||||
cfg = get_config()
|
||||
if cfg.has_option(DATADIR_CFG_SECTION, project):
|
||||
d = cfg.get(DATADIR_CFG_SECTION, project)
|
||||
else:
|
||||
scrapy_cfg = closest_scrapy_cfg()
|
||||
if not scrapy_cfg:
|
||||
raise NotConfigured("Unable to find scrapy.cfg file to infer project data dir")
|
||||
d = abspath(join(dirname(scrapy_cfg), '.scrapy'))
|
||||
if not exists(d):
|
||||
os.makedirs(d)
|
||||
return d
|
||||
|
||||
|
||||
def data_path(path, createdir=False):
|
||||
"""
|
||||
Return the given path joined with the .scrapy data directory.
|
||||
If given an absolute path, return it unmodified.
|
||||
"""
|
||||
if not isabs(path):
|
||||
if inside_project():
|
||||
path = join(project_data_dir(), path)
|
||||
else:
|
||||
path = join('.scrapy', path)
|
||||
if createdir and not exists(path):
|
||||
os.makedirs(path)
|
||||
return path
|
||||
|
||||
|
||||
def get_project_settings():
|
||||
if ENVVAR not in os.environ:
|
||||
project = os.environ.get('SCRAPY_PROJECT', 'default')
|
||||
init_env(project)
|
||||
|
||||
settings = Settings()
|
||||
settings_module_path = os.environ.get(ENVVAR)
|
||||
if settings_module_path:
|
||||
settings.setmodule(settings_module_path, priority='project')
|
||||
|
||||
pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
|
||||
if pickled_settings:
|
||||
warnings.warn("Use of environment variable "
|
||||
"'SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE' "
|
||||
"is deprecated.", ScrapyDeprecationWarning)
|
||||
settings.setdict(pickle.loads(pickled_settings), priority='project')
|
||||
|
||||
scrapy_envvars = {k[7:]: v for k, v in os.environ.items() if
|
||||
k.startswith('SCRAPY_')}
|
||||
valid_envvars = {
|
||||
'CHECK',
|
||||
'PICKLED_SETTINGS_TO_OVERRIDE',
|
||||
'PROJECT',
|
||||
'PYTHON_SHELL',
|
||||
'SETTINGS_MODULE',
|
||||
}
|
||||
setting_envvars = {k for k in scrapy_envvars if k not in valid_envvars}
|
||||
if setting_envvars:
|
||||
setting_envvar_list = ', '.join(sorted(setting_envvars))
|
||||
warnings.warn(
|
||||
'Use of environment variables prefixed with SCRAPY_ to override '
|
||||
'settings is deprecated. The following environment variables are '
|
||||
f'currently defined: {setting_envvar_list}',
|
||||
ScrapyDeprecationWarning
|
||||
)
|
||||
settings.setdict(scrapy_envvars, priority='project')
|
||||
|
||||
return settings
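A small usage sketch, assuming it runs from inside a Scrapy project directory (otherwise only the built-in defaults and any SCRAPY_SETTINGS_MODULE value apply):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('BOT_NAME'))          # project value if inside a project, else the default
print(settings.getbool('ROBOTSTXT_OBEY'))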
|
||||
10
venv/lib/python3.9/site-packages/scrapy/utils/py36.py
Normal file
10
venv/lib/python3.9/site-packages/scrapy/utils/py36.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
Helpers using Python 3.6+ syntax (ignore SyntaxError on import).
|
||||
"""
|
||||
|
||||
|
||||
async def collect_asyncgen(result):
|
||||
results = []
|
||||
async for x in result:
|
||||
results.append(x)
|
||||
return results
|
||||
357
venv/lib/python3.9/site-packages/scrapy/utils/python.py
Normal file
357
venv/lib/python3.9/site-packages/scrapy/utils/python.py
Normal file
|
|
@ -0,0 +1,357 @@
|
|||
"""
|
||||
This module contains essential stuff that should've come with Python itself ;)
|
||||
"""
|
||||
import errno
|
||||
import gc
|
||||
import inspect
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
import weakref
|
||||
from functools import partial, wraps
|
||||
from itertools import chain
|
||||
|
||||
from scrapy.exceptions import ScrapyDeprecationWarning
|
||||
from scrapy.utils.decorators import deprecated
|
||||
|
||||
|
||||
def flatten(x):
|
||||
"""flatten(sequence) -> list
|
||||
|
||||
Returns a single, flat list which contains all elements retrieved
|
||||
from the sequence and all recursively contained sub-sequences
|
||||
(iterables).
|
||||
|
||||
Examples:
|
||||
>>> [1, 2, [3,4], (5,6)]
|
||||
[1, 2, [3, 4], (5, 6)]
|
||||
>>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
|
||||
[1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
|
||||
>>> flatten(["foo", "bar"])
|
||||
['foo', 'bar']
|
||||
>>> flatten(["foo", ["baz", 42], "bar"])
|
||||
['foo', 'baz', 42, 'bar']
|
||||
"""
|
||||
return list(iflatten(x))
|
||||
|
||||
|
||||
def iflatten(x):
|
||||
"""iflatten(sequence) -> iterator
|
||||
|
||||
Similar to ``.flatten()``, but returns iterator instead"""
|
||||
for el in x:
|
||||
if is_listlike(el):
|
||||
for el_ in iflatten(el):
|
||||
yield el_
|
||||
else:
|
||||
yield el
|
||||
|
||||
|
||||
def is_listlike(x):
|
||||
"""
|
||||
>>> is_listlike("foo")
|
||||
False
|
||||
>>> is_listlike(5)
|
||||
False
|
||||
>>> is_listlike(b"foo")
|
||||
False
|
||||
>>> is_listlike([b"foo"])
|
||||
True
|
||||
>>> is_listlike((b"foo",))
|
||||
True
|
||||
>>> is_listlike({})
|
||||
True
|
||||
>>> is_listlike(set())
|
||||
True
|
||||
>>> is_listlike((x for x in range(3)))
|
||||
True
|
||||
>>> is_listlike(range(5))
|
||||
True
|
||||
"""
|
||||
return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
|
||||
|
||||
|
||||
def unique(list_, key=lambda x: x):
|
||||
"""efficient function to uniquify a list preserving item order"""
|
||||
seen = set()
|
||||
result = []
|
||||
for item in list_:
|
||||
seenkey = key(item)
|
||||
if seenkey in seen:
|
||||
continue
|
||||
seen.add(seenkey)
|
||||
result.append(item)
|
||||
return result
|
||||
|
||||
|
||||
def to_unicode(text, encoding=None, errors='strict'):
|
||||
"""Return the unicode representation of a bytes object ``text``. If
|
||||
``text`` is already a unicode object, return it as-is."""
|
||||
if isinstance(text, str):
|
||||
return text
|
||||
if not isinstance(text, (bytes, str)):
|
||||
raise TypeError('to_unicode must receive a bytes or str '
|
||||
f'object, got {type(text).__name__}')
|
||||
if encoding is None:
|
||||
encoding = 'utf-8'
|
||||
return text.decode(encoding, errors)
|
||||
|
||||
|
||||
def to_bytes(text, encoding=None, errors='strict'):
|
||||
"""Return the binary representation of ``text``. If ``text``
|
||||
is already a bytes object, return it as-is."""
|
||||
if isinstance(text, bytes):
|
||||
return text
|
||||
if not isinstance(text, str):
|
||||
raise TypeError('to_bytes must receive a str or bytes '
|
||||
f'object, got {type(text).__name__}')
|
||||
if encoding is None:
|
||||
encoding = 'utf-8'
|
||||
return text.encode(encoding, errors)
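The two converters above are symmetric; a short illustrative sketch:

from scrapy.utils.python import to_bytes, to_unicode

assert to_bytes('café') == b'caf\xc3\xa9'               # str -> bytes, utf-8 by default
assert to_unicode(b'caf\xc3\xa9') == 'café'             # bytes -> str
assert to_bytes(b'already-bytes') == b'already-bytes'   # bytes pass through unchanged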
|
||||
|
||||
|
||||
@deprecated('to_unicode')
|
||||
def to_native_str(text, encoding=None, errors='strict'):
|
||||
""" Return str representation of ``text``. """
|
||||
return to_unicode(text, encoding, errors)
|
||||
|
||||
|
||||
def re_rsearch(pattern, text, chunk_size=1024):
|
||||
"""
|
||||
This function does a reverse search in a text using a regular expression
|
||||
given in the attribute 'pattern'.
|
||||
Since the re module does not provide this functionality, we have to search for
the expression in chunks of text extracted from the end (for the sake of efficiency).
|
||||
At first, a chunk of 'chunk_size' kilobytes is extracted from the end, and searched for
|
||||
the pattern. If the pattern is not found, another chunk is extracted, and another
|
||||
search is performed.
|
||||
This process continues until a match is found, or until the whole file is read.
|
||||
In case the pattern wasn't found, None is returned, otherwise it returns a tuple containing
|
||||
the start position of the match, and the ending (regarding the entire text).
|
||||
"""
|
||||
|
||||
def _chunk_iter():
|
||||
offset = len(text)
|
||||
while True:
|
||||
offset -= (chunk_size * 1024)
|
||||
if offset <= 0:
|
||||
break
|
||||
yield (text[offset:], offset)
|
||||
yield (text, 0)
|
||||
|
||||
if isinstance(pattern, str):
|
||||
pattern = re.compile(pattern)
|
||||
|
||||
for chunk, offset in _chunk_iter():
|
||||
matches = [match for match in pattern.finditer(chunk)]
|
||||
if matches:
|
||||
start, end = matches[-1].span()
|
||||
return offset + start, offset + end
|
||||
return None
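A sketch of the reverse-search behaviour described in the docstring: the span of the match closest to the end is returned, relative to the whole text.

from scrapy.utils.python import re_rsearch

text = 'item=1 ... item=2 ... item=3'
span = re_rsearch(r'item=\d', text)
assert span is not None
start, end = span
assert text[start:end] == 'item=3'          # the last match wins
assert re_rsearch(r'missing', text) is None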
|
||||
|
||||
|
||||
def memoizemethod_noargs(method):
|
||||
"""Decorator to cache the result of a method (without arguments) using a
|
||||
weak reference to its object
|
||||
"""
|
||||
cache = weakref.WeakKeyDictionary()
|
||||
|
||||
@wraps(method)
|
||||
def new_method(self, *args, **kwargs):
|
||||
if self not in cache:
|
||||
cache[self] = method(self, *args, **kwargs)
|
||||
return cache[self]
|
||||
|
||||
return new_method
|
||||
|
||||
|
||||
_BINARYCHARS = {to_bytes(chr(i)) for i in range(32)} - {b"\0", b"\t", b"\n", b"\r"}
|
||||
_BINARYCHARS |= {ord(ch) for ch in _BINARYCHARS}
|
||||
|
||||
|
||||
def binary_is_text(data):
|
||||
""" Returns ``True`` if the given ``data`` argument (a ``bytes`` object)
|
||||
does not contain unprintable control characters.
|
||||
"""
|
||||
if not isinstance(data, bytes):
|
||||
raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
|
||||
return all(c not in _BINARYCHARS for c in data)
|
||||
|
||||
|
||||
def _getargspec_py23(func):
|
||||
"""_getargspec_py23(function) -> named tuple ArgSpec(args, varargs, keywords,
|
||||
defaults)
|
||||
|
||||
Was identical to inspect.getargspec() in python2, but uses
|
||||
inspect.getfullargspec() for python3 behind the scenes to avoid
|
||||
DeprecationWarning.
|
||||
|
||||
>>> def f(a, b=2, *ar, **kw):
|
||||
... pass
|
||||
|
||||
>>> _getargspec_py23(f)
|
||||
ArgSpec(args=['a', 'b'], varargs='ar', keywords='kw', defaults=(2,))
|
||||
"""
|
||||
return inspect.ArgSpec(*inspect.getfullargspec(func)[:4])
|
||||
|
||||
|
||||
def get_func_args(func, stripself=False):
|
||||
"""Return the argument name list of a callable"""
|
||||
if inspect.isfunction(func):
|
||||
spec = inspect.getfullargspec(func)
|
||||
func_args = spec.args + spec.kwonlyargs
|
||||
elif inspect.isclass(func):
|
||||
return get_func_args(func.__init__, True)
|
||||
elif inspect.ismethod(func):
|
||||
return get_func_args(func.__func__, True)
|
||||
elif inspect.ismethoddescriptor(func):
|
||||
return []
|
||||
elif isinstance(func, partial):
|
||||
return [x for x in get_func_args(func.func)[len(func.args):]
|
||||
if not (func.keywords and x in func.keywords)]
|
||||
elif hasattr(func, '__call__'):
|
||||
if inspect.isroutine(func):
|
||||
return []
|
||||
elif getattr(func, '__name__', None) == '__call__':
|
||||
return []
|
||||
else:
|
||||
return get_func_args(func.__call__, True)
|
||||
else:
|
||||
raise TypeError(f'{type(func)} is not callable')
|
||||
if stripself:
|
||||
func_args.pop(0)
|
||||
return func_args
|
||||
|
||||
|
||||
def get_spec(func):
|
||||
"""Returns (args, kwargs) tuple for a function
|
||||
>>> import re
|
||||
>>> get_spec(re.match)
|
||||
(['pattern', 'string'], {'flags': 0})
|
||||
|
||||
>>> class Test:
|
||||
... def __call__(self, val):
|
||||
... pass
|
||||
... def method(self, val, flags=0):
|
||||
... pass
|
||||
|
||||
>>> get_spec(Test)
|
||||
(['self', 'val'], {})
|
||||
|
||||
>>> get_spec(Test.method)
|
||||
(['self', 'val'], {'flags': 0})
|
||||
|
||||
>>> get_spec(Test().method)
|
||||
(['self', 'val'], {'flags': 0})
|
||||
"""
|
||||
|
||||
if inspect.isfunction(func) or inspect.ismethod(func):
|
||||
spec = _getargspec_py23(func)
|
||||
elif hasattr(func, '__call__'):
|
||||
spec = _getargspec_py23(func.__call__)
|
||||
else:
|
||||
raise TypeError(f'{type(func)} is not callable')
|
||||
|
||||
defaults = spec.defaults or []
|
||||
|
||||
firstdefault = len(spec.args) - len(defaults)
|
||||
args = spec.args[:firstdefault]
|
||||
kwargs = dict(zip(spec.args[firstdefault:], defaults))
|
||||
return args, kwargs
|
||||
|
||||
|
||||
def equal_attributes(obj1, obj2, attributes):
|
||||
"""Compare two objects attributes"""
|
||||
# not attributes given return False by default
|
||||
if not attributes:
|
||||
return False
|
||||
|
||||
temp1, temp2 = object(), object()
|
||||
for attr in attributes:
|
||||
# support callables like itemgetter
|
||||
if callable(attr):
|
||||
if attr(obj1) != attr(obj2):
|
||||
return False
|
||||
elif getattr(obj1, attr, temp1) != getattr(obj2, attr, temp2):
|
||||
return False
|
||||
# all attributes equal
|
||||
return True
|
||||
|
||||
|
||||
class WeakKeyCache:
|
||||
|
||||
def __init__(self, default_factory):
|
||||
warnings.warn("The WeakKeyCache class is deprecated", category=ScrapyDeprecationWarning, stacklevel=2)
|
||||
self.default_factory = default_factory
|
||||
self._weakdict = weakref.WeakKeyDictionary()
|
||||
|
||||
def __getitem__(self, key):
|
||||
if key not in self._weakdict:
|
||||
self._weakdict[key] = self.default_factory(key)
|
||||
return self._weakdict[key]
|
||||
|
||||
|
||||
@deprecated
|
||||
def retry_on_eintr(function, *args, **kw):
|
||||
"""Run a function and retry it while getting EINTR errors"""
|
||||
while True:
|
||||
try:
|
||||
return function(*args, **kw)
|
||||
except IOError as e:
|
||||
if e.errno != errno.EINTR:
|
||||
raise
|
||||
|
||||
|
||||
def without_none_values(iterable):
|
||||
"""Return a copy of ``iterable`` with all ``None`` entries removed.
|
||||
|
||||
If ``iterable`` is a mapping, return a dictionary where all pairs that have
|
||||
value ``None`` have been removed.
|
||||
"""
|
||||
try:
|
||||
return {k: v for k, v in iterable.items() if v is not None}
|
||||
except AttributeError:
|
||||
return type(iterable)((v for v in iterable if v is not None))
|
||||
|
||||
|
||||
def global_object_name(obj):
|
||||
"""
|
||||
Return full name of a global object.
|
||||
|
||||
>>> from scrapy import Request
|
||||
>>> global_object_name(Request)
|
||||
'scrapy.http.request.Request'
|
||||
"""
|
||||
return f"{obj.__module__}.{obj.__name__}"
|
||||
|
||||
|
||||
if hasattr(sys, "pypy_version_info"):
|
||||
def garbage_collect():
|
||||
# Collecting weakreferences can take two collections on PyPy.
|
||||
gc.collect()
|
||||
gc.collect()
|
||||
else:
|
||||
def garbage_collect():
|
||||
gc.collect()
|
||||
|
||||
|
||||
class MutableChain:
|
||||
"""
|
||||
Thin wrapper around itertools.chain, allowing to add iterables "in-place"
|
||||
"""
|
||||
|
||||
def __init__(self, *args):
|
||||
self.data = chain.from_iterable(args)
|
||||
|
||||
def extend(self, *iterables):
|
||||
self.data = chain(self.data, chain.from_iterable(iterables))
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def __next__(self):
|
||||
return next(self.data)
|
||||
|
||||
@deprecated("scrapy.utils.python.MutableChain.__next__")
|
||||
def next(self):
|
||||
return self.__next__()
|
||||
90
venv/lib/python3.9/site-packages/scrapy/utils/reactor.py
Normal file
90
venv/lib/python3.9/site-packages/scrapy/utils/reactor.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
import asyncio
|
||||
from contextlib import suppress
|
||||
|
||||
from twisted.internet import asyncioreactor, error
|
||||
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
|
||||
def listen_tcp(portrange, host, factory):
|
||||
"""Like reactor.listenTCP but tries different ports in a range."""
|
||||
from twisted.internet import reactor
|
||||
if len(portrange) > 2:
|
||||
raise ValueError(f"invalid portrange: {portrange}")
|
||||
if not portrange:
|
||||
return reactor.listenTCP(0, factory, interface=host)
|
||||
if not hasattr(portrange, '__iter__'):
|
||||
return reactor.listenTCP(portrange, factory, interface=host)
|
||||
if len(portrange) == 1:
|
||||
return reactor.listenTCP(portrange[0], factory, interface=host)
|
||||
for x in range(portrange[0], portrange[1] + 1):
|
||||
try:
|
||||
return reactor.listenTCP(x, factory, interface=host)
|
||||
except error.CannotListenError:
|
||||
if x == portrange[1]:
|
||||
raise
|
||||
|
||||
|
||||
class CallLaterOnce:
|
||||
"""Schedule a function to be called in the next reactor loop, but only if
|
||||
it hasn't been already scheduled since the last time it ran.
|
||||
"""
|
||||
|
||||
def __init__(self, func, *a, **kw):
|
||||
self._func = func
|
||||
self._a = a
|
||||
self._kw = kw
|
||||
self._call = None
|
||||
|
||||
def schedule(self, delay=0):
|
||||
from twisted.internet import reactor
|
||||
if self._call is None:
|
||||
self._call = reactor.callLater(delay, self)
|
||||
|
||||
def cancel(self):
|
||||
if self._call:
|
||||
self._call.cancel()
|
||||
|
||||
def __call__(self):
|
||||
self._call = None
|
||||
return self._func(*self._a, **self._kw)
|
||||
|
||||
|
||||
def install_reactor(reactor_path, event_loop_path=None):
|
||||
"""Installs the :mod:`~twisted.internet.reactor` with the specified
|
||||
import path. Also installs the asyncio event loop with the specified import
|
||||
path if the asyncio reactor is enabled"""
|
||||
reactor_class = load_object(reactor_path)
|
||||
if reactor_class is asyncioreactor.AsyncioSelectorReactor:
|
||||
with suppress(error.ReactorAlreadyInstalledError):
|
||||
if event_loop_path is not None:
|
||||
event_loop_class = load_object(event_loop_path)
|
||||
event_loop = event_loop_class()
|
||||
asyncio.set_event_loop(event_loop)
|
||||
else:
|
||||
event_loop = asyncio.get_event_loop()
|
||||
asyncioreactor.install(eventloop=event_loop)
|
||||
else:
|
||||
*module, _ = reactor_path.split(".")
|
||||
installer_path = module + ["install"]
|
||||
installer = load_object(".".join(installer_path))
|
||||
with suppress(error.ReactorAlreadyInstalledError):
|
||||
installer()
|
||||
|
||||
|
||||
def verify_installed_reactor(reactor_path):
|
||||
"""Raises :exc:`Exception` if the installed
|
||||
:mod:`~twisted.internet.reactor` does not match the specified import
|
||||
path."""
|
||||
from twisted.internet import reactor
|
||||
reactor_class = load_object(reactor_path)
|
||||
if not isinstance(reactor, reactor_class):
|
||||
msg = ("The installed reactor "
|
||||
f"({reactor.__module__}.{reactor.__class__.__name__}) does not "
|
||||
f"match the requested one ({reactor_path})")
|
||||
raise Exception(msg)
|
||||
|
||||
|
||||
def is_asyncio_reactor_installed():
|
||||
from twisted.internet import reactor
|
||||
return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
|
||||
95
venv/lib/python3.9/site-packages/scrapy/utils/reqser.py
Normal file
95
venv/lib/python3.9/site-packages/scrapy/utils/reqser.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""
|
||||
Helper functions for serializing (and deserializing) requests.
|
||||
"""
|
||||
import inspect
|
||||
|
||||
from scrapy.http import Request
|
||||
from scrapy.utils.python import to_unicode
|
||||
from scrapy.utils.misc import load_object
|
||||
|
||||
|
||||
def request_to_dict(request, spider=None):
|
||||
"""Convert Request object to a dict.
|
||||
|
||||
If a spider is given, it will try to find out the name of the spider method
|
||||
used in the callback and store that as the callback.
|
||||
"""
|
||||
cb = request.callback
|
||||
if callable(cb):
|
||||
cb = _find_method(spider, cb)
|
||||
eb = request.errback
|
||||
if callable(eb):
|
||||
eb = _find_method(spider, eb)
|
||||
d = {
|
||||
'url': to_unicode(request.url), # urls should be safe (safe_string_url)
|
||||
'callback': cb,
|
||||
'errback': eb,
|
||||
'method': request.method,
|
||||
'headers': dict(request.headers),
|
||||
'body': request.body,
|
||||
'cookies': request.cookies,
|
||||
'meta': request.meta,
|
||||
'_encoding': request._encoding,
|
||||
'priority': request.priority,
|
||||
'dont_filter': request.dont_filter,
|
||||
'flags': request.flags,
|
||||
'cb_kwargs': request.cb_kwargs,
|
||||
}
|
||||
if type(request) is not Request:
|
||||
d['_class'] = request.__module__ + '.' + request.__class__.__name__
|
||||
return d
|
||||
|
||||
|
||||
def request_from_dict(d, spider=None):
|
||||
"""Create Request object from a dict.
|
||||
|
||||
If a spider is given, it will try to resolve the callbacks looking at the
|
||||
spider for methods with the same name.
|
||||
"""
|
||||
cb = d['callback']
|
||||
if cb and spider:
|
||||
cb = _get_method(spider, cb)
|
||||
eb = d['errback']
|
||||
if eb and spider:
|
||||
eb = _get_method(spider, eb)
|
||||
request_cls = load_object(d['_class']) if '_class' in d else Request
|
||||
return request_cls(
|
||||
url=to_unicode(d['url']),
|
||||
callback=cb,
|
||||
errback=eb,
|
||||
method=d['method'],
|
||||
headers=d['headers'],
|
||||
body=d['body'],
|
||||
cookies=d['cookies'],
|
||||
meta=d['meta'],
|
||||
encoding=d['_encoding'],
|
||||
priority=d['priority'],
|
||||
dont_filter=d['dont_filter'],
|
||||
flags=d.get('flags'),
|
||||
cb_kwargs=d.get('cb_kwargs'),
|
||||
)
|
||||
|
||||
|
||||
def _find_method(obj, func):
|
||||
# Only instance methods contain ``__func__``
|
||||
if obj and hasattr(func, '__func__'):
|
||||
members = inspect.getmembers(obj, predicate=inspect.ismethod)
|
||||
for name, obj_func in members:
|
||||
# We need to use __func__ to access the original
|
||||
# function object because instance method objects
|
||||
# are generated each time attribute is retrieved from
|
||||
# instance.
|
||||
#
|
||||
# Reference: The standard type hierarchy
|
||||
# https://docs.python.org/3/reference/datamodel.html
|
||||
if obj_func.__func__ is func.__func__:
|
||||
return name
|
||||
raise ValueError(f"Function {func} is not an instance method in: {obj}")
|
||||
|
||||
|
||||
def _get_method(obj, name):
|
||||
name = str(name)
|
||||
try:
|
||||
return getattr(obj, name)
|
||||
except AttributeError:
|
||||
raise ValueError(f"Method {name!r} not found in: {obj}")
|
||||
100
venv/lib/python3.9/site-packages/scrapy/utils/request.py
Normal file
100
venv/lib/python3.9/site-packages/scrapy/utils/request.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
"""
|
||||
This module provides some useful functions for working with
|
||||
scrapy.http.Request objects
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import weakref
|
||||
from urllib.parse import urlunparse
|
||||
|
||||
from w3lib.http import basic_auth_header
|
||||
from w3lib.url import canonicalize_url
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes, to_unicode
|
||||
|
||||
|
||||
_fingerprint_cache = weakref.WeakKeyDictionary()
|
||||
|
||||
|
||||
def request_fingerprint(request, include_headers=None, keep_fragments=False):
|
||||
"""
|
||||
Return the request fingerprint.
|
||||
|
||||
The request fingerprint is a hash that uniquely identifies the resource the
|
||||
request points to. For example, take the following two urls:
|
||||
|
||||
http://www.example.com/query?id=111&cat=222
|
||||
http://www.example.com/query?cat=222&id=111
|
||||
|
||||
Even though those are two different URLs, both point to the same resource
|
||||
and are equivalent (i.e. they should return the same response).
|
||||
|
||||
Another example are cookies used to store session ids. Suppose the
|
||||
following page is only accessible to authenticated users:
|
||||
|
||||
http://www.example.com/members/offers.html
|
||||
|
||||
Lots of sites use a cookie to store the session id, which adds a random
|
||||
component to the HTTP Request and thus should be ignored when calculating
|
||||
the fingerprint.
|
||||
|
||||
For this reason, request headers are ignored by default when calculating
|
||||
the fingerprint. If you want to include specific headers, use the
|
||||
include_headers argument, which is a list of Request headers to include.
|
||||
|
||||
Also, servers usually ignore fragments in urls when handling requests,
|
||||
so they are also ignored by default when calculating the fingerprint.
|
||||
If you want to include them, set the keep_fragments argument to True
|
||||
(for instance when handling requests with a headless browser).
|
||||
|
||||
"""
|
||||
if include_headers:
|
||||
include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
|
||||
cache = _fingerprint_cache.setdefault(request, {})
|
||||
cache_key = (include_headers, keep_fragments)
|
||||
if cache_key not in cache:
|
||||
fp = hashlib.sha1()
|
||||
fp.update(to_bytes(request.method))
|
||||
fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
|
||||
fp.update(request.body or b'')
|
||||
if include_headers:
|
||||
for hdr in include_headers:
|
||||
if hdr in request.headers:
|
||||
fp.update(hdr)
|
||||
for v in request.headers.getlist(hdr):
|
||||
fp.update(v)
|
||||
cache[cache_key] = fp.hexdigest()
|
||||
return cache[cache_key]
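The canonicalization described above makes query-string order irrelevant; a quick sketch:

from scrapy import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/query?id=111&cat=222')
r2 = Request('http://www.example.com/query?cat=222&id=111')
assert request_fingerprint(r1) == request_fingerprint(r2)

# Headers are ignored unless explicitly included
r3 = Request('http://www.example.com/query?id=111&cat=222',
             headers={'Cookie': 'session=abc'})
assert request_fingerprint(r3) == request_fingerprint(r1)
assert request_fingerprint(r3, include_headers=['Cookie']) != request_fingerprint(r1)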
|
||||
|
||||
|
||||
def request_authenticate(request, username, password):
|
||||
"""Autenticate the given request (in place) using the HTTP basic access
|
||||
authentication mechanism (RFC 2617) and the given username and password
|
||||
"""
|
||||
request.headers['Authorization'] = basic_auth_header(username, password)
|
||||
|
||||
|
||||
def request_httprepr(request):
|
||||
"""Return the raw HTTP representation (as bytes) of the given request.
|
||||
This is provided only for reference since it's not the actual stream of
|
||||
bytes that will be sent when performing the request (that's controlled
|
||||
by Twisted).
|
||||
"""
|
||||
parsed = urlparse_cached(request)
|
||||
path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
|
||||
s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
|
||||
s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
|
||||
if request.headers:
|
||||
s += request.headers.to_string() + b"\r\n"
|
||||
s += b"\r\n"
|
||||
s += request.body
|
||||
return s
|
||||
|
||||
|
||||
def referer_str(request):
|
||||
""" Return Referer HTTP header suitable for logging. """
|
||||
referrer = request.headers.get('Referer')
|
||||
if referrer is None:
|
||||
return referrer
|
||||
return to_unicode(referrer, errors='replace')
|
||||
83
venv/lib/python3.9/site-packages/scrapy/utils/response.py
Normal file
83
venv/lib/python3.9/site-packages/scrapy/utils/response.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
"""
|
||||
This module provides some useful functions for working with
|
||||
scrapy.http.Response objects
|
||||
"""
|
||||
import os
|
||||
import weakref
|
||||
import webbrowser
|
||||
import tempfile
|
||||
|
||||
from twisted.web import http
|
||||
from scrapy.utils.python import to_bytes, to_unicode
|
||||
from w3lib import html
|
||||
|
||||
|
||||
_baseurl_cache = weakref.WeakKeyDictionary()
|
||||
|
||||
|
||||
def get_base_url(response):
|
||||
"""Return the base url of the given response, joined with the response url"""
|
||||
if response not in _baseurl_cache:
|
||||
text = response.text[0:4096]
|
||||
_baseurl_cache[response] = html.get_base_url(text, response.url, response.encoding)
|
||||
return _baseurl_cache[response]
|
||||
|
||||
|
||||
_metaref_cache = weakref.WeakKeyDictionary()
|
||||
|
||||
|
||||
def get_meta_refresh(response, ignore_tags=('script', 'noscript')):
|
||||
"""Parse the http-equiv refrsh parameter from the given response"""
|
||||
if response not in _metaref_cache:
|
||||
text = response.text[0:4096]
|
||||
_metaref_cache[response] = html.get_meta_refresh(
|
||||
text, response.url, response.encoding, ignore_tags=ignore_tags)
|
||||
return _metaref_cache[response]
|
||||
|
||||
|
||||
def response_status_message(status):
|
||||
"""Return status code plus status text descriptive message
|
||||
"""
|
||||
message = http.RESPONSES.get(int(status), "Unknown Status")
|
||||
return f'{status} {to_unicode(message)}'
|
||||
|
||||
|
||||
def response_httprepr(response):
|
||||
"""Return raw HTTP representation (as bytes) of the given response. This
|
||||
is provided only for reference, since it's not the exact stream of bytes
|
||||
that was received (that's not exposed by Twisted).
|
||||
"""
|
||||
values = [
|
||||
b"HTTP/1.1 ",
|
||||
to_bytes(str(response.status)),
|
||||
b" ",
|
||||
to_bytes(http.RESPONSES.get(response.status, b'')),
|
||||
b"\r\n",
|
||||
]
|
||||
if response.headers:
|
||||
values.extend([response.headers.to_string(), b"\r\n"])
|
||||
values.extend([b"\r\n", response.body])
|
||||
return b"".join(values)
|
||||
|
||||
|
||||
def open_in_browser(response, _openfunc=webbrowser.open):
|
||||
"""Open the given response in a local web browser, populating the <base>
|
||||
tag for external links to work
|
||||
"""
|
||||
from scrapy.http import HtmlResponse, TextResponse
|
||||
# XXX: this implementation is a bit dirty and could be improved
|
||||
body = response.body
|
||||
if isinstance(response, HtmlResponse):
|
||||
if b'<base' not in body:
|
||||
repl = f'<head><base href="{response.url}">'
|
||||
body = body.replace(b'<head>', to_bytes(repl))
|
||||
ext = '.html'
|
||||
elif isinstance(response, TextResponse):
|
||||
ext = '.txt'
|
||||
else:
|
||||
raise TypeError("Unsupported response type: "
|
||||
f"{response.__class__.__name__}")
|
||||
fd, fname = tempfile.mkstemp(ext)
|
||||
os.write(fd, body)
|
||||
os.close(fd)
|
||||
return _openfunc(f"file://{fname}")
|
||||
40
venv/lib/python3.9/site-packages/scrapy/utils/serialize.py
Normal file
40
venv/lib/python3.9/site-packages/scrapy/utils/serialize.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
import json
import datetime
import decimal

from itemadapter import is_item, ItemAdapter
from twisted.internet import defer

from scrapy.http import Request, Response


class ScrapyJSONEncoder(json.JSONEncoder):

    DATE_FORMAT = "%Y-%m-%d"
    TIME_FORMAT = "%H:%M:%S"

    def default(self, o):
        if isinstance(o, set):
            return list(o)
        elif isinstance(o, datetime.datetime):
            return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
        elif isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        elif isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        elif isinstance(o, decimal.Decimal):
            return str(o)
        elif isinstance(o, defer.Deferred):
            return str(o)
        elif is_item(o):
            return ItemAdapter(o).asdict()
        elif isinstance(o, Request):
            return f"<{type(o).__name__} {o.method} {o.url}>"
        elif isinstance(o, Response):
            return f"<{type(o).__name__} {o.status} {o.url}>"
        else:
            return super().default(o)


class ScrapyJSONDecoder(json.JSONDecoder):
    pass
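A short sketch of what the encoder produces for the non-JSON-native types it special-cases:

import datetime
from scrapy.utils.serialize import ScrapyJSONEncoder

encoder = ScrapyJSONEncoder()
payload = {
    'when': datetime.datetime(2021, 1, 2, 3, 4, 5),
    'tags': {'a', 'b'},
}
text = encoder.encode(payload)
# datetimes become "YYYY-MM-DD HH:MM:SS" strings and sets become lists
assert '"2021-01-02 03:04:05"' in text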
80
venv/lib/python3.9/site-packages/scrapy/utils/signal.py
Normal file
80
venv/lib/python3.9/site-packages/scrapy/utils/signal.py
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
"""Helper functions for working with signals"""
|
||||
|
||||
import logging
|
||||
|
||||
from twisted.internet.defer import DeferredList, Deferred
|
||||
from twisted.python.failure import Failure
|
||||
|
||||
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
|
||||
from pydispatch.robustapply import robustApply
|
||||
|
||||
from scrapy.exceptions import StopDownload
|
||||
from scrapy.utils.defer import maybeDeferred_coro
|
||||
from scrapy.utils.log import failure_to_exc_info
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _IgnoredException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
|
||||
"""Like pydispatcher.robust.sendRobust but it also logs errors and returns
|
||||
Failures instead of exceptions.
|
||||
"""
|
||||
dont_log = (named.pop('dont_log', _IgnoredException), StopDownload)
|
||||
spider = named.get('spider', None)
|
||||
responses = []
|
||||
for receiver in liveReceivers(getAllReceivers(sender, signal)):
|
||||
try:
|
||||
response = robustApply(receiver, signal=signal, sender=sender, *arguments, **named)
|
||||
if isinstance(response, Deferred):
|
||||
logger.error("Cannot return deferreds from signal handler: %(receiver)s",
|
||||
{'receiver': receiver}, extra={'spider': spider})
|
||||
except dont_log:
|
||||
result = Failure()
|
||||
except Exception:
|
||||
result = Failure()
|
||||
logger.error("Error caught on signal handler: %(receiver)s",
|
||||
{'receiver': receiver},
|
||||
exc_info=True, extra={'spider': spider})
|
||||
else:
|
||||
result = response
|
||||
responses.append((receiver, result))
|
||||
return responses
|
||||
|
||||
|
||||
def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named):
|
||||
"""Like send_catch_log but supports returning deferreds on signal handlers.
|
||||
Returns a deferred that gets fired once all signal handlers deferreds were
|
||||
fired.
|
||||
"""
|
||||
def logerror(failure, recv):
|
||||
if dont_log is None or not isinstance(failure.value, dont_log):
|
||||
logger.error("Error caught on signal handler: %(receiver)s",
|
||||
{'receiver': recv},
|
||||
exc_info=failure_to_exc_info(failure),
|
||||
extra={'spider': spider})
|
||||
return failure
|
||||
|
||||
dont_log = named.pop('dont_log', None)
|
||||
spider = named.get('spider', None)
|
||||
dfds = []
|
||||
for receiver in liveReceivers(getAllReceivers(sender, signal)):
|
||||
d = maybeDeferred_coro(robustApply, receiver, signal=signal, sender=sender, *arguments, **named)
|
||||
d.addErrback(logerror, receiver)
|
||||
d.addBoth(lambda result: (receiver, result))
|
||||
dfds.append(d)
|
||||
d = DeferredList(dfds)
|
||||
d.addCallback(lambda out: [x[1] for x in out])
|
||||
return d
|
||||
|
||||
|
||||
def disconnect_all(signal=Any, sender=Any):
|
||||
"""Disconnect all signal handlers. Useful for cleaning up after running
|
||||
tests
|
||||
"""
|
||||
for receiver in liveReceivers(getAllReceivers(sender, signal)):
|
||||
disconnect(receiver, signal=signal, sender=sender)
|
||||
47
venv/lib/python3.9/site-packages/scrapy/utils/sitemap.py
Normal file
47
venv/lib/python3.9/site-packages/scrapy/utils/sitemap.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
Module for processing Sitemaps.
|
||||
|
||||
Note: The main purpose of this module is to provide support for the
|
||||
SitemapSpider, its API is subject to change without notice.
|
||||
"""
|
||||
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import lxml.etree
|
||||
|
||||
|
||||
class Sitemap:
|
||||
"""Class to parse Sitemap (type=urlset) and Sitemap Index
|
||||
(type=sitemapindex) files"""
|
||||
|
||||
def __init__(self, xmltext):
|
||||
xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
|
||||
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
|
||||
rt = self._root.tag
|
||||
self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt
|
||||
|
||||
def __iter__(self):
|
||||
for elem in self._root.getchildren():
|
||||
d = {}
|
||||
for el in elem.getchildren():
|
||||
tag = el.tag
|
||||
name = tag.split('}', 1)[1] if '}' in tag else tag
|
||||
|
||||
if name == 'link':
|
||||
if 'href' in el.attrib:
|
||||
d.setdefault('alternate', []).append(el.get('href'))
|
||||
else:
|
||||
d[name] = el.text.strip() if el.text else ''
|
||||
|
||||
if 'loc' in d:
|
||||
yield d
|
||||
|
||||
|
||||
def sitemap_urls_from_robots(robots_text, base_url=None):
|
||||
"""Return an iterator over all sitemap urls contained in the given
|
||||
robots.txt file
|
||||
"""
|
||||
for line in robots_text.splitlines():
|
||||
if line.lstrip().lower().startswith('sitemap:'):
|
||||
url = line.split(':', 1)[1].strip()
|
||||
yield urljoin(base_url, url)
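A minimal sketch of iterating a urlset sitemap with the class above, using a toy XML document:

from scrapy.utils.sitemap import Sitemap

xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc><lastmod>2021-01-01</lastmod></url>
</urlset>"""

sitemap = Sitemap(xml)
assert sitemap.type == 'urlset'
for entry in sitemap:
    print(entry['loc'], entry.get('lastmod'))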
74
venv/lib/python3.9/site-packages/scrapy/utils/spider.py
Normal file
74
venv/lib/python3.9/site-packages/scrapy/utils/spider.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
import inspect
|
||||
import logging
|
||||
|
||||
from scrapy.spiders import Spider
|
||||
from scrapy.utils.defer import deferred_from_coro
|
||||
from scrapy.utils.misc import arg_to_iter
|
||||
try:
|
||||
from scrapy.utils.py36 import collect_asyncgen
|
||||
except SyntaxError:
|
||||
collect_asyncgen = None
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def iterate_spider_output(result):
|
||||
if collect_asyncgen and hasattr(inspect, 'isasyncgen') and inspect.isasyncgen(result):
|
||||
d = deferred_from_coro(collect_asyncgen(result))
|
||||
d.addCallback(iterate_spider_output)
|
||||
return d
|
||||
elif inspect.iscoroutine(result):
|
||||
d = deferred_from_coro(result)
|
||||
d.addCallback(iterate_spider_output)
|
||||
return d
|
||||
return arg_to_iter(result)
|
||||
|
||||
|
||||
def iter_spider_classes(module):
|
||||
"""Return an iterator over all spider classes defined in the given module
|
||||
that can be instantiated (i.e. which have a name)
|
||||
"""
|
||||
# this needs to be imported here until we get rid of the spider manager
|
||||
# singleton in scrapy.spider.spiders
|
||||
from scrapy.spiders import Spider
|
||||
|
||||
for obj in vars(module).values():
|
||||
if (
|
||||
inspect.isclass(obj)
|
||||
and issubclass(obj, Spider)
|
||||
and obj.__module__ == module.__name__
|
||||
and getattr(obj, 'name', None)
|
||||
):
|
||||
yield obj
|
||||
|
||||
|
||||
def spidercls_for_request(spider_loader, request, default_spidercls=None,
|
||||
log_none=False, log_multiple=False):
|
||||
"""Return a spider class that handles the given Request.
|
||||
|
||||
This will look for the spiders that can handle the given request (using
|
||||
the spider loader) and return a Spider class if (and only if) there is
|
||||
only one Spider able to handle the Request.
|
||||
|
||||
If multiple spiders (or no spider) are found, it will return the
|
||||
default_spidercls passed. It can optionally log if multiple or no spiders
|
||||
are found.
|
||||
"""
|
||||
snames = spider_loader.find_by_request(request)
|
||||
if len(snames) == 1:
|
||||
return spider_loader.load(snames[0])
|
||||
|
||||
if len(snames) > 1 and log_multiple:
|
||||
logger.error('More than one spider can handle: %(request)s - %(snames)s',
|
||||
{'request': request, 'snames': ', '.join(snames)})
|
||||
|
||||
if len(snames) == 0 and log_none:
|
||||
logger.error('Unable to find spider that handles: %(request)s',
|
||||
{'request': request})
|
||||
|
||||
return default_spidercls
|
||||
|
||||
|
||||
class DefaultSpider(Spider):
|
||||
name = 'default'
|
||||
61
venv/lib/python3.9/site-packages/scrapy/utils/ssl.py
Normal file
61
venv/lib/python3.9/site-packages/scrapy/utils/ssl.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import OpenSSL
|
||||
import OpenSSL._util as pyOpenSSLutil
|
||||
|
||||
from scrapy.utils.python import to_unicode
|
||||
|
||||
|
||||
# The OpenSSL symbol is present since 1.1.1 but it's not currently supported in any version of pyOpenSSL.
|
||||
# Using the binding directly, as this code does, requires cryptography 2.4.
|
||||
SSL_OP_NO_TLSv1_3 = getattr(pyOpenSSLutil.lib, 'SSL_OP_NO_TLSv1_3', 0)
|
||||
|
||||
|
||||
def ffi_buf_to_string(buf):
|
||||
return to_unicode(pyOpenSSLutil.ffi.string(buf))
|
||||
|
||||
|
||||
def x509name_to_string(x509name):
|
||||
# from OpenSSL.crypto.X509Name.__repr__
|
||||
result_buffer = pyOpenSSLutil.ffi.new("char[]", 512)
|
||||
pyOpenSSLutil.lib.X509_NAME_oneline(x509name._name, result_buffer, len(result_buffer))
|
||||
|
||||
return ffi_buf_to_string(result_buffer)
|
||||
|
||||
|
||||
def get_temp_key_info(ssl_object):
|
||||
if not hasattr(pyOpenSSLutil.lib, 'SSL_get_server_tmp_key'): # requires OpenSSL 1.0.2
|
||||
return None
|
||||
|
||||
# adapted from OpenSSL apps/s_cb.c::ssl_print_tmp_key()
|
||||
temp_key_p = pyOpenSSLutil.ffi.new("EVP_PKEY **")
|
||||
if not pyOpenSSLutil.lib.SSL_get_server_tmp_key(ssl_object, temp_key_p):
|
||||
return None
|
||||
temp_key = temp_key_p[0]
|
||||
if temp_key == pyOpenSSLutil.ffi.NULL:
|
||||
return None
|
||||
temp_key = pyOpenSSLutil.ffi.gc(temp_key, pyOpenSSLutil.lib.EVP_PKEY_free)
|
||||
key_info = []
|
||||
key_type = pyOpenSSLutil.lib.EVP_PKEY_id(temp_key)
|
||||
if key_type == pyOpenSSLutil.lib.EVP_PKEY_RSA:
|
||||
key_info.append('RSA')
|
||||
elif key_type == pyOpenSSLutil.lib.EVP_PKEY_DH:
|
||||
key_info.append('DH')
|
||||
elif key_type == pyOpenSSLutil.lib.EVP_PKEY_EC:
|
||||
key_info.append('ECDH')
|
||||
ec_key = pyOpenSSLutil.lib.EVP_PKEY_get1_EC_KEY(temp_key)
|
||||
ec_key = pyOpenSSLutil.ffi.gc(ec_key, pyOpenSSLutil.lib.EC_KEY_free)
|
||||
nid = pyOpenSSLutil.lib.EC_GROUP_get_curve_name(pyOpenSSLutil.lib.EC_KEY_get0_group(ec_key))
|
||||
cname = pyOpenSSLutil.lib.EC_curve_nid2nist(nid)
|
||||
if cname == pyOpenSSLutil.ffi.NULL:
|
||||
cname = pyOpenSSLutil.lib.OBJ_nid2sn(nid)
|
||||
key_info.append(ffi_buf_to_string(cname))
|
||||
else:
|
||||
key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
|
||||
key_info.append(f'{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits')
|
||||
return ', '.join(key_info)
|
||||
|
||||
|
||||
def get_openssl_version():
|
||||
system_openssl = OpenSSL.SSL.SSLeay_version(
|
||||
OpenSSL.SSL.SSLEAY_VERSION
|
||||
).decode('ascii', errors='replace')
|
||||
return f'{OpenSSL.version.__version__} ({system_openssl})'
|
||||
36
venv/lib/python3.9/site-packages/scrapy/utils/template.py
Normal file
36
venv/lib/python3.9/site-packages/scrapy/utils/template.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Helper functions for working with templates"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
def render_templatefile(path, **kwargs):
|
||||
with open(path, 'rb') as fp:
|
||||
raw = fp.read().decode('utf8')
|
||||
|
||||
content = string.Template(raw).substitute(**kwargs)
|
||||
|
||||
render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path
|
||||
|
||||
if path.endswith('.tmpl'):
|
||||
os.rename(path, render_path)
|
||||
|
||||
with open(render_path, 'wb') as fp:
|
||||
fp.write(content.encode('utf8'))
|
||||
|
||||
|
||||
CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]')
|
||||
|
||||
|
||||
def string_camelcase(string):
|
||||
""" Convert a word to its CamelCase version and remove invalid chars
|
||||
|
||||
>>> string_camelcase('lost-pound')
|
||||
'LostPound'
|
||||
|
||||
>>> string_camelcase('missing_images')
|
||||
'MissingImages'
|
||||
|
||||
"""
|
||||
return CAMELCASE_INVALID_CHARS.sub('', string.title())
|
||||
112
venv/lib/python3.9/site-packages/scrapy/utils/test.py
Normal file
@@ -0,0 +1,112 @@
"""
This module contains some assorted functions used in tests
"""

import asyncio
import os
from posixpath import split
from unittest import mock

from importlib import import_module
from twisted.trial.unittest import SkipTest

from scrapy.utils.boto import is_botocore_available


def assert_gcs_environ():
    if 'GCS_PROJECT_ID' not in os.environ:
        raise SkipTest("GCS_PROJECT_ID not found")


def skip_if_no_boto():
    if not is_botocore_available():
        raise SkipTest('missing botocore library')


def get_gcs_content_and_delete(bucket, path):
    from google.cloud import storage
    client = storage.Client(project=os.environ.get('GCS_PROJECT_ID'))
    bucket = client.get_bucket(bucket)
    blob = bucket.get_blob(path)
    content = blob.download_as_string()
    acl = list(blob.acl)  # load the acl before the blob is deleted
    bucket.delete_blob(path)
    return content, acl, blob


def get_ftp_content_and_delete(
        path, host, port, username,
        password, use_active_mode=False):
    from ftplib import FTP
    ftp = FTP()
    ftp.connect(host, port)
    ftp.login(username, password)
    if use_active_mode:
        ftp.set_pasv(False)
    ftp_data = []

    def buffer_data(data):
        ftp_data.append(data)
    ftp.retrbinary(f'RETR {path}', buffer_data)
    dirname, filename = split(path)
    ftp.cwd(dirname)
    ftp.delete(filename)
    return "".join(ftp_data)


def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)


def get_pythonpath():
    """Return a PYTHONPATH suitable to use in processes so that they find this
    installation of Scrapy"""
    scrapy_path = import_module('scrapy').__path__[0]
    return os.path.dirname(scrapy_path) + os.pathsep + os.environ.get('PYTHONPATH', '')


def get_testenv():
    """Return an OS environment dict suitable for forking processes that need
    to import this installation of Scrapy, instead of a system-installed one.
    """
    env = os.environ.copy()
    env['PYTHONPATH'] = get_pythonpath()
    return env


def assert_samelines(testcase, text1, text2, msg=None):
    """Assert that text1 and text2 have the same lines, ignoring differences
    in line endings between platforms
    """
    testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)


def get_from_asyncio_queue(value):
    q = asyncio.Queue()
    getter = q.get()
    q.put_nowait(value)
    return getter


def mock_google_cloud_storage():
    """Create autospec mocks for the google-cloud-storage Client, Bucket and
    Blob classes and set their proper return values.
    """
    from google.cloud.storage import Client, Bucket, Blob
    client_mock = mock.create_autospec(Client)

    bucket_mock = mock.create_autospec(Bucket)
    client_mock.get_bucket.return_value = bucket_mock

    blob_mock = mock.create_autospec(Blob)
    bucket_mock.blob.return_value = blob_mock

    return (client_mock, bucket_mock, blob_mock)
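A minimal usage sketch for get_crawler(); LOG_LEVEL is just an illustrative setting, applied with project-level priority as described in the docstring:

from scrapy.utils.test import get_crawler

crawler = get_crawler(settings_dict={'LOG_LEVEL': 'INFO'})
print(crawler.settings.get('LOG_LEVEL'))  # 'INFO'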
50
venv/lib/python3.9/site-packages/scrapy/utils/testproc.py
Normal file
@@ -0,0 +1,50 @@
import sys
import os

from twisted.internet import defer, protocol


class ProcessTest:

    command = None
    prefix = [sys.executable, '-m', 'scrapy.cmdline']
    cwd = os.getcwd()  # trial chdirs to temp dir

    def execute(self, args, check_code=True, settings=None):
        from twisted.internet import reactor
        env = os.environ.copy()
        if settings is not None:
            env['SCRAPY_SETTINGS_MODULE'] = settings
        cmd = self.prefix + [self.command] + list(args)
        pp = TestProcessProtocol()
        pp.deferred.addBoth(self._process_finished, cmd, check_code)
        reactor.spawnProcess(pp, cmd[0], cmd, env=env, path=self.cwd)
        return pp.deferred

    def _process_finished(self, pp, cmd, check_code):
        if pp.exitcode and check_code:
            msg = f"process {cmd} exited with code {pp.exitcode}"
            msg += f"\n>>> stdout <<<\n{pp.out}"
            msg += "\n"
            msg += f"\n>>> stderr <<<\n{pp.err}"
            raise RuntimeError(msg)
        return pp.exitcode, pp.out, pp.err


class TestProcessProtocol(protocol.ProcessProtocol):

    def __init__(self):
        self.deferred = defer.Deferred()
        self.out = b''
        self.err = b''
        self.exitcode = None

    def outReceived(self, data):
        self.out += data

    def errReceived(self, data):
        self.err += data

    def processEnded(self, status):
        self.exitcode = status.value.exitCode
        self.deferred.callback(self)
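A minimal usage sketch for ProcessTest, assuming the test is run under twisted.trial so a reactor is available; 'version' is simply the scrapy sub-command being exercised here:

from twisted.internet import defer
from twisted.trial import unittest

from scrapy.utils.testproc import ProcessTest


class VersionCommandTest(ProcessTest, unittest.TestCase):

    command = 'version'

    @defer.inlineCallbacks
    def test_version_output(self):
        # execute() spawns "python -m scrapy.cmdline version" and fires the
        # deferred with (exitcode, stdout, stderr) once the process ends.
        exitcode, out, err = yield self.execute([])
        self.assertEqual(exitcode, 0)
        self.assertIn(b'Scrapy', out)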
44
venv/lib/python3.9/site-packages/scrapy/utils/testsite.py
Normal file
@@ -0,0 +1,44 @@
from urllib.parse import urljoin

from twisted.web import server, resource, static, util


class SiteTest:

    def setUp(self):
        from twisted.internet import reactor
        super().setUp()
        self.site = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
        self.baseurl = f"http://localhost:{self.site.getHost().port}/"

    def tearDown(self):
        super().tearDown()
        self.site.stopListening()

    def url(self, path):
        return urljoin(self.baseurl, path)


class NoMetaRefreshRedirect(util.Redirect):
    def render(self, request):
        content = util.Redirect.render(self, request)
        return content.replace(b'http-equiv="refresh"',
                               b'http-no-equiv="do-not-refresh-me"')


def test_site():
    r = resource.Resource()
    r.putChild(b"text", static.Data(b"Works", "text/plain"))
    r.putChild(b"html", static.Data(b"<body><p class='one'>Works</p><p class='two'>World</p></body>", "text/html"))
    r.putChild(b"enc-gb18030", static.Data(b"<p>gb18030 encoding</p>", "text/html; charset=gb18030"))
    r.putChild(b"redirect", util.Redirect(b"/redirected"))
    r.putChild(b"redirect-no-meta-refresh", NoMetaRefreshRedirect(b"/redirected"))
    r.putChild(b"redirected", static.Data(b"Redirected here", "text/plain"))
    return server.Site(r)


if __name__ == '__main__':
    from twisted.internet import reactor
    port = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
    print(f"http://localhost:{port.getHost().port}/")
    reactor.run()
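A minimal usage sketch for SiteTest, again assuming twisted.trial drives the test case so setUp() can bind the local test site:

from twisted.trial import unittest

from scrapy.utils.testsite import SiteTest


class LocalSiteTest(SiteTest, unittest.TestCase):

    def test_url_helper(self):
        # url() joins a path onto the http://localhost:<random port>/ base URL.
        self.assertTrue(self.url('text').startswith('http://localhost:'))
        self.assertTrue(self.url('text').endswith('/text'))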
66
venv/lib/python3.9/site-packages/scrapy/utils/trackref.py
Normal file
@@ -0,0 +1,66 @@
"""This module provides some functions and classes to record and report
references to live object instances.

If you want live objects for a particular class to be tracked, you only have to
subclass from object_ref (instead of object).

About performance: This library has a minimal performance impact when enabled,
and no performance penalty at all when disabled (as object_ref becomes just an
alias to object in that case).
"""

import weakref
from time import time
from operator import itemgetter
from collections import defaultdict


NoneType = type(None)
live_refs = defaultdict(weakref.WeakKeyDictionary)


class object_ref:
    """Inherit from this class to keep a record of live instances"""

    __slots__ = ()

    def __new__(cls, *args, **kwargs):
        obj = object.__new__(cls)
        live_refs[cls][obj] = time()
        return obj


def format_live_refs(ignore=NoneType):
    """Return a tabular representation of tracked objects"""
    s = "Live References\n\n"
    now = time()
    for cls, wdict in sorted(live_refs.items(),
                             key=lambda x: x[0].__name__):
        if not wdict:
            continue
        if issubclass(cls, ignore):
            continue
        oldest = min(wdict.values())
        s += f"{cls.__name__:<30} {len(wdict):6}   oldest: {int(now - oldest)}s ago\n"
    return s


def print_live_refs(*a, **kw):
    """Print tracked objects"""
    print(format_live_refs(*a, **kw))


def get_oldest(class_name):
    """Get the oldest object for a specific class name"""
    for cls, wdict in live_refs.items():
        if cls.__name__ == class_name:
            if not wdict:
                break
            return min(wdict.items(), key=itemgetter(1))[0]


def iter_all(class_name):
    """Iterate over all objects of the same class by its class name"""
    for cls, wdict in live_refs.items():
        if cls.__name__ == class_name:
            return wdict.keys()
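A minimal usage sketch of the live-reference tracking; Page is a hypothetical class used only for illustration:

from scrapy.utils.trackref import object_ref, print_live_refs, get_oldest


class Page(object_ref):
    pass


pages = [Page() for _ in range(3)]
print_live_refs()            # the report lists "Page" with 3 live instances
oldest = get_oldest('Page')  # the Page instance that was created first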
164
venv/lib/python3.9/site-packages/scrapy/utils/url.py
Normal file
@@ -0,0 +1,164 @@
"""
This module contains general purpose URL functions not found in the standard
library.

Some of the functions that used to be imported from this module have been moved
to the w3lib.url module. Always import those from there instead.
"""
import posixpath
import re
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse

# scrapy.utils.url was moved to w3lib.url and import * ensures this
# move doesn't break old code
from w3lib.url import *
from w3lib.url import _safe_chars, _unquotepath  # noqa: F401
from scrapy.utils.python import to_unicode


def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if not host:
        return False
    domains = [d.lower() for d in domains]
    return any((host == d) or (host.endswith(f'.{d}')) for d in domains)


def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url, [spider.name] + list(getattr(spider, 'allowed_domains', [])))


def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions


def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


def escape_ajax(url):
    """
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url."""
    match = re.match(r"^\w+://", url, flags=re.I)
    if not match:
        parts = urlparse(url)
        scheme = "http:" if parts.netloc else "http://"
        url = scheme + url

    return url


def _is_posix_path(string):
    return bool(
        re.match(
            r'''
            ^                     # start with...
            (
                \.                # ...a single dot,
                (
                    \. | [^/\.]+  # optionally followed by
                )?                # either a second dot or some characters
                |
                ~                 # $HOME
            )?                    # optional match of ".", ".." or ".blabla"
            /                     # at least one "/" for a file path,
            .                     # and something after the "/"
            ''',
            string,
            flags=re.VERBOSE,
        )
    )


def _is_windows_path(string):
    return bool(
        re.match(
            r'''
            ^
            (
                [a-z]:\\
                | \\\\
            )
            ''',
            string,
            flags=re.IGNORECASE | re.VERBOSE,
        )
    )


def _is_filesystem_path(string):
    return _is_posix_path(string) or _is_windows_path(string)


def guess_scheme(url):
    """Add a URL scheme if missing: file:// for filepath-like input or
    http:// otherwise."""
    if _is_filesystem_path(url):
        return any_to_uri(url)
    return add_http_if_no_scheme(url)


def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=False, strip_fragment=True):
    """Strip a URL of some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces the path component with "/", also dropping
      the query and fragment components; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    """

    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
        netloc = netloc.split('@')[-1]
    if strip_default_port and parsed_url.port:
        if (parsed_url.scheme, parsed_url.port) in (('http', 80),
                                                    ('https', 443),
                                                    ('ftp', 21)):
            netloc = netloc.replace(f':{parsed_url.port}', '')
    return urlunparse((
        parsed_url.scheme,
        netloc,
        '/' if origin_only else parsed_url.path,
        '' if origin_only else parsed_url.params,
        '' if origin_only else parsed_url.query,
        '' if strip_fragment else parsed_url.fragment
    ))
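A few minimal usage sketches for the helpers above; the URLs are illustrative only, and the file:// result assumes a POSIX filesystem:

from scrapy.utils.url import url_is_from_any_domain, guess_scheme, strip_url

url_is_from_any_domain('http://sub.example.com/page', ['example.com'])
# True: subdomains of the given domains match as well

guess_scheme('example.com/index.html')   # 'http://example.com/index.html'
guess_scheme('/etc/hosts')               # 'file:///etc/hosts' (POSIX path detected)

strip_url('http://user:pass@www.example.com:80/index.html#frag')
# 'http://www.example.com/index.html' (credentials, default port and fragment removed)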
31
venv/lib/python3.9/site-packages/scrapy/utils/versions.py
Normal file
@@ -0,0 +1,31 @@
import platform
import sys

import cryptography
import cssselect
import lxml.etree
import parsel
import twisted
import w3lib

import scrapy
from scrapy.utils.ssl import get_openssl_version


def scrapy_components_versions():
    lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION))
    libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION))

    return [
        ("Scrapy", scrapy.__version__),
        ("lxml", lxml_version),
        ("libxml2", libxml2_version),
        ("cssselect", cssselect.__version__),
        ("parsel", parsel.__version__),
        ("w3lib", w3lib.__version__),
        ("Twisted", twisted.version.short()),
        ("Python", sys.version.replace("\n", "- ")),
        ("pyOpenSSL", get_openssl_version()),
        ("cryptography", cryptography.__version__),
        ("Platform", platform.platform()),
    ]
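A minimal usage sketch that prints the collected component versions:

from scrapy.utils.versions import scrapy_components_versions

for name, version in scrapy_components_versions():
    print(f"{name:<12}: {version}")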