195 lines
7 KiB
Python
195 lines
7 KiB
Python
import numbers
|
|
import os
|
|
import sys
|
|
import warnings
|
|
from configparser import ConfigParser
|
|
from operator import itemgetter
|
|
|
|
from scrapy.exceptions import ScrapyDeprecationWarning, UsageError
|
|
|
|
from scrapy.settings import BaseSettings
|
|
from scrapy.utils.deprecate import update_classpath
|
|
from scrapy.utils.python import without_none_values
|
|
|
|
|
|
def build_component_list(compdict, custom=None, convert=update_classpath):
|
|
"""Compose a component list from a { class: order } dictionary."""
|
|
|
|
def _check_components(complist):
|
|
if len({convert(c) for c in complist}) != len(complist):
|
|
raise ValueError(f'Some paths in {complist!r} convert to the same object, '
|
|
'please update your settings')
|
|
|
|
def _map_keys(compdict):
|
|
if isinstance(compdict, BaseSettings):
|
|
compbs = BaseSettings()
|
|
for k, v in compdict.items():
|
|
prio = compdict.getpriority(k)
|
|
if compbs.getpriority(convert(k)) == prio:
|
|
raise ValueError(f'Some paths in {list(compdict.keys())!r} '
|
|
'convert to the same '
|
|
'object, please update your settings'
|
|
)
|
|
else:
|
|
compbs.set(convert(k), v, priority=prio)
|
|
return compbs
|
|
else:
|
|
_check_components(compdict)
|
|
return {convert(k): v for k, v in compdict.items()}
|
|
|
|
def _validate_values(compdict):
|
|
"""Fail if a value in the components dict is not a real number or None."""
|
|
for name, value in compdict.items():
|
|
if value is not None and not isinstance(value, numbers.Real):
|
|
raise ValueError(f'Invalid value {value} for component {name}, '
|
|
'please provide a real number or None instead')
|
|
|
|
# BEGIN Backward compatibility for old (base, custom) call signature
|
|
if isinstance(custom, (list, tuple)):
|
|
_check_components(custom)
|
|
return type(custom)(convert(c) for c in custom)
|
|
|
|
if custom is not None:
|
|
compdict.update(custom)
|
|
# END Backward compatibility
|
|
|
|
_validate_values(compdict)
|
|
compdict = without_none_values(_map_keys(compdict))
|
|
return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
|
|
|
|
|
|
def arglist_to_dict(arglist):
|
|
"""Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
|
|
dict
|
|
"""
|
|
return dict(x.split('=', 1) for x in arglist)
|
|
|
|
|
|
def closest_scrapy_cfg(path='.', prevpath=None):
|
|
"""Return the path to the closest scrapy.cfg file by traversing the current
|
|
directory and its parents
|
|
"""
|
|
if path == prevpath:
|
|
return ''
|
|
path = os.path.abspath(path)
|
|
cfgfile = os.path.join(path, 'scrapy.cfg')
|
|
if os.path.exists(cfgfile):
|
|
return cfgfile
|
|
return closest_scrapy_cfg(os.path.dirname(path), path)
|
|
|
|
|
|
def init_env(project='default', set_syspath=True):
|
|
"""Initialize environment to use command-line tool from inside a project
|
|
dir. This sets the Scrapy settings module and modifies the Python path to
|
|
be able to locate the project module.
|
|
"""
|
|
cfg = get_config()
|
|
if cfg.has_option('settings', project):
|
|
os.environ['SCRAPY_SETTINGS_MODULE'] = cfg.get('settings', project)
|
|
closest = closest_scrapy_cfg()
|
|
if closest:
|
|
projdir = os.path.dirname(closest)
|
|
if set_syspath and projdir not in sys.path:
|
|
sys.path.append(projdir)
|
|
|
|
|
|
def get_config(use_closest=True):
|
|
"""Get Scrapy config file as a ConfigParser"""
|
|
sources = get_sources(use_closest)
|
|
cfg = ConfigParser()
|
|
cfg.read(sources)
|
|
return cfg
|
|
|
|
|
|
def get_sources(use_closest=True):
|
|
xdg_config_home = os.environ.get('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
|
|
sources = [
|
|
'/etc/scrapy.cfg',
|
|
r'c:\scrapy\scrapy.cfg',
|
|
xdg_config_home + '/scrapy.cfg',
|
|
os.path.expanduser('~/.scrapy.cfg'),
|
|
]
|
|
if use_closest:
|
|
sources.append(closest_scrapy_cfg())
|
|
return sources
|
|
|
|
|
|
def feed_complete_default_values_from_settings(feed, settings):
|
|
out = feed.copy()
|
|
out.setdefault("batch_item_count", settings.getint('FEED_EXPORT_BATCH_ITEM_COUNT'))
|
|
out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
|
|
out.setdefault("fields", settings.getlist("FEED_EXPORT_FIELDS") or None)
|
|
out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
|
|
out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
|
|
out.setdefault("item_export_kwargs", dict())
|
|
if settings["FEED_EXPORT_INDENT"] is None:
|
|
out.setdefault("indent", None)
|
|
else:
|
|
out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
|
|
return out
|
|
|
|
|
|
def feed_process_params_from_cli(settings, output, output_format=None,
|
|
overwrite_output=None):
|
|
"""
|
|
Receives feed export params (from the 'crawl' or 'runspider' commands),
|
|
checks for inconsistencies in their quantities and returns a dictionary
|
|
suitable to be used as the FEEDS setting.
|
|
"""
|
|
valid_output_formats = without_none_values(
|
|
settings.getwithbase('FEED_EXPORTERS')
|
|
).keys()
|
|
|
|
def check_valid_format(output_format):
|
|
if output_format not in valid_output_formats:
|
|
raise UsageError(
|
|
f"Unrecognized output format '{output_format}'. "
|
|
f"Set a supported one ({tuple(valid_output_formats)}) "
|
|
"after a colon at the end of the output URI (i.e. -o/-O "
|
|
"<URI>:<FORMAT>) or as a file extension."
|
|
)
|
|
|
|
overwrite = False
|
|
if overwrite_output:
|
|
if output:
|
|
raise UsageError(
|
|
"Please use only one of -o/--output and -O/--overwrite-output"
|
|
)
|
|
output = overwrite_output
|
|
overwrite = True
|
|
|
|
if output_format:
|
|
if len(output) == 1:
|
|
check_valid_format(output_format)
|
|
message = (
|
|
'The -t command line option is deprecated in favor of '
|
|
'specifying the output format within the output URI. See the '
|
|
'documentation of the -o and -O options for more information.',
|
|
)
|
|
warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
|
|
return {output[0]: {'format': output_format}}
|
|
else:
|
|
raise UsageError(
|
|
'The -t command-line option cannot be used if multiple output '
|
|
'URIs are specified'
|
|
)
|
|
|
|
result = {}
|
|
for element in output:
|
|
try:
|
|
feed_uri, feed_format = element.rsplit(':', 1)
|
|
except ValueError:
|
|
feed_uri = element
|
|
feed_format = os.path.splitext(element)[1].replace('.', '')
|
|
else:
|
|
if feed_uri == '-':
|
|
feed_uri = 'stdout:'
|
|
check_valid_format(feed_format)
|
|
result[feed_uri] = {'format': feed_format}
|
|
if overwrite:
|
|
result[feed_uri]['overwrite'] = True
|
|
|
|
# FEEDS setting should take precedence over the matching CLI options
|
|
result.update(settings.getdict('FEEDS'))
|
|
|
|
return result
|