Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@@ -0,0 +1,18 @@
"""
Module containing all HTTP related classes
Use this module (instead of the more specific ones) when importing Headers,
Request and Response outside this module.
"""
from scrapy.http.headers import Headers
from scrapy.http.request import Request
from scrapy.http.request.form import FormRequest
from scrapy.http.request.rpc import XmlRpcRequest
from scrapy.http.request.json_request import JsonRequest
from scrapy.http.response import Response
from scrapy.http.response.html import HtmlResponse
from scrapy.http.response.xml import XmlResponse
from scrapy.http.response.text import TextResponse
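
The docstring above recommends importing these classes from scrapy.http rather than from the deeper submodules. A minimal usage sketch (URL and body are illustrative):

# Import the public classes from the package root instead of reaching into
# scrapy.http.request / scrapy.http.response directly.
from scrapy.http import Request, HtmlResponse

req = Request("https://example.com")
resp = HtmlResponse(url=req.url, body=b"<html><body>ok</body></html>", request=req)
print(req)   # <GET https://example.com>
print(resp)  # <200 https://example.com>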

View file

@@ -0,0 +1,6 @@
def obsolete_setter(setter, attrname):
def newsetter(self, value):
c = self.__class__.__name__
msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
raise AttributeError(msg)
return newsetter
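
A brief sketch of how this helper is wired into a read-only property, following the same pattern Request uses for url and body further down in this commit; the Example class is hypothetical:

# Hypothetical illustration: pairing a private setter with obsolete_setter()
# makes the public attribute read-only and points users at .replace() instead.
# (Example does not actually define replace(); only the error message refers to it.)
class Example:
    def __init__(self, url):
        self._set_url(url)

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        self._url = url

    url = property(_get_url, obsolete_setter(_set_url, 'url'))

e = Example("https://example.com")
try:
    e.url = "https://other.example"
except AttributeError as exc:
    print(exc)  # Example.url is not modifiable, use Example.replace() instead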

View file

@@ -0,0 +1,191 @@
import time
from http.cookiejar import CookieJar as _CookieJar, DefaultCookiePolicy, IPV4_RE
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
class CookieJar:
def __init__(self, policy=None, check_expired_frequency=10000):
self.policy = policy or DefaultCookiePolicy()
self.jar = _CookieJar(self.policy)
self.jar._cookies_lock = _DummyLock()
self.check_expired_frequency = check_expired_frequency
self.processed = 0
def extract_cookies(self, response, request):
wreq = WrappedRequest(request)
wrsp = WrappedResponse(response)
return self.jar.extract_cookies(wrsp, wreq)
def add_cookie_header(self, request):
wreq = WrappedRequest(request)
self.policy._now = self.jar._now = int(time.time())
# the cookiejar implementation iterates through all domains
# instead we restrict to potential matches on the domain
req_host = urlparse_cached(request).hostname
if not req_host:
return
if not IPV4_RE.search(req_host):
hosts = potential_domain_matches(req_host)
if '.' not in req_host:
hosts += [req_host + ".local"]
else:
hosts = [req_host]
cookies = []
for host in hosts:
if host in self.jar._cookies:
cookies += self.jar._cookies_for_domain(host, wreq)
attrs = self.jar._cookie_attrs(cookies)
if attrs:
if not wreq.has_header("Cookie"):
wreq.add_unredirected_header("Cookie", "; ".join(attrs))
self.processed += 1
if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for a large number of cookies
self.jar.clear_expired_cookies()
@property
def _cookies(self):
return self.jar._cookies
def clear_session_cookies(self, *args, **kwargs):
return self.jar.clear_session_cookies(*args, **kwargs)
def clear(self, domain=None, path=None, name=None):
return self.jar.clear(domain, path, name)
def __iter__(self):
return iter(self.jar)
def __len__(self):
return len(self.jar)
def set_policy(self, pol):
return self.jar.set_policy(pol)
def make_cookies(self, response, request):
wreq = WrappedRequest(request)
wrsp = WrappedResponse(response)
return self.jar.make_cookies(wrsp, wreq)
def set_cookie(self, cookie):
self.jar.set_cookie(cookie)
def set_cookie_if_ok(self, cookie, request):
self.jar.set_cookie_if_ok(cookie, WrappedRequest(request))
def potential_domain_matches(domain):
"""Potential domain matches for a cookie
>>> potential_domain_matches('www.example.com')
['www.example.com', 'example.com', '.www.example.com', '.example.com']
"""
matches = [domain]
try:
start = domain.index('.') + 1
end = domain.rindex('.')
while start < end:
matches.append(domain[start:])
start = domain.index('.', start) + 1
except ValueError:
pass
return matches + ['.' + d for d in matches]
class _DummyLock:
def acquire(self):
pass
def release(self):
pass
class WrappedRequest:
"""Wraps a scrapy Request class with methods defined by urllib2.Request class to interact with CookieJar class
see http://docs.python.org/library/urllib2.html#urllib2.Request
"""
def __init__(self, request):
self.request = request
def get_full_url(self):
return self.request.url
def get_host(self):
return urlparse_cached(self.request).netloc
def get_type(self):
return urlparse_cached(self.request).scheme
def is_unverifiable(self):
"""Unverifiable should indicate whether the request is unverifiable, as defined by RFC 2965.
It defaults to False. An unverifiable request is one whose URL the user did not have the
option to approve. For example, if the request is for an image in an
HTML document, and the user had no option to approve the automatic
fetching of the image, this should be true.
"""
return self.request.meta.get('is_unverifiable', False)
def get_origin_req_host(self):
return urlparse_cached(self.request).hostname
# python3 uses attributes instead of methods
@property
def full_url(self):
return self.get_full_url()
@property
def host(self):
return self.get_host()
@property
def type(self):
return self.get_type()
@property
def unverifiable(self):
return self.is_unverifiable()
@property
def origin_req_host(self):
return self.get_origin_req_host()
def has_header(self, name):
return name in self.request.headers
def get_header(self, name, default=None):
return to_unicode(self.request.headers.get(name, default),
errors='replace')
def header_items(self):
return [
(to_unicode(k, errors='replace'),
[to_unicode(x, errors='replace') for x in v])
for k, v in self.request.headers.items()
]
def add_unredirected_header(self, name, value):
self.request.headers.appendlist(name, value)
class WrappedResponse:
def __init__(self, response):
self.response = response
def info(self):
return self
def get_all(self, name, default=None):
return [to_unicode(v, errors='replace')
for v in self.response.headers.getlist(name)]
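
A usage sketch of the CookieJar wrapper, roughly the way Scrapy's cookies middleware drives it; the URLs and cookie values are illustrative:

# Illustrative: store cookies from a response, then let the jar fill in the
# Cookie header of a follow-up request to the same host.
from scrapy.http import Request, Response

jar = CookieJar()
req = Request("https://www.example.com/login")
resp = Response(
    "https://www.example.com/login",
    headers={"Set-Cookie": "sessionid=abc123; Path=/"},
    request=req,
)
jar.extract_cookies(resp, req)         # remember cookies sent by the server

next_req = Request("https://www.example.com/profile")
jar.add_cookie_header(next_req)        # adds the matching Cookie header
print(next_req.headers.get("Cookie"))  # b'sessionid=abc123'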

View file

@@ -0,0 +1,89 @@
from w3lib.http import headers_dict_to_raw
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.python import to_unicode
class Headers(CaselessDict):
"""Case insensitive http headers dictionary"""
def __init__(self, seq=None, encoding='utf-8'):
self.encoding = encoding
super().__init__(seq)
def normkey(self, key):
"""Normalize key to bytes"""
return self._tobytes(key.title())
def normvalue(self, value):
"""Normalize values to bytes"""
if value is None:
value = []
elif isinstance(value, (str, bytes)):
value = [value]
elif not hasattr(value, '__iter__'):
value = [value]
return [self._tobytes(x) for x in value]
def _tobytes(self, x):
if isinstance(x, bytes):
return x
elif isinstance(x, str):
return x.encode(self.encoding)
elif isinstance(x, int):
return str(x).encode(self.encoding)
else:
raise TypeError(f'Unsupported value type: {type(x)}')
def __getitem__(self, key):
try:
return super().__getitem__(key)[-1]
except IndexError:
return None
def get(self, key, def_val=None):
try:
return super().get(key, def_val)[-1]
except IndexError:
return None
def getlist(self, key, def_val=None):
try:
return super().__getitem__(key)
except KeyError:
if def_val is not None:
return self.normvalue(def_val)
return []
def setlist(self, key, list_):
self[key] = list_
def setlistdefault(self, key, default_list=()):
return self.setdefault(key, default_list)
def appendlist(self, key, value):
lst = self.getlist(key)
lst.extend(self.normvalue(value))
self[key] = lst
def items(self):
return ((k, self.getlist(k)) for k in self.keys())
def values(self):
return [self[k] for k in self.keys()]
def to_string(self):
return headers_dict_to_raw(self)
def to_unicode_dict(self):
""" Return headers as a CaselessDict with unicode keys
and unicode values. Multiple values are joined with ','.
"""
return CaselessDict(
(to_unicode(key, encoding=self.encoding),
to_unicode(b','.join(value), encoding=self.encoding))
for key, value in self.items())
def __copy__(self):
return self.__class__(self)
copy = __copy__
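
A short sketch of the behaviour defined above: keys are case-insensitive and normalized to bytes, each key stores a list of values, and item access returns only the last value; header names and values are illustrative:

# Illustrative usage of the Headers mapping defined above.
h = Headers({'Content-Type': 'text/html'})
h.appendlist('Set-Cookie', 'a=1')
h.appendlist('Set-Cookie', 'b=2')

print(h['content-type'])                  # b'text/html' (case-insensitive lookup)
print(h.get('Set-Cookie'))                # b'b=2' (last value only)
print(h.getlist('Set-Cookie'))            # [b'a=1', b'b=2']
print(h.to_unicode_dict()['set-cookie'])  # 'a=1,b=2'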

View file

@@ -0,0 +1,143 @@
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from w3lib.url import safe_url_string
from scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.http.common import obsolete_setter
from scrapy.utils.curl import curl_to_request_kwargs
class Request(object_ref):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, cb_kwargs=None):
self._encoding = encoding # this one has to be set first
self.method = str(method).upper()
self._set_url(url)
self._set_body(body)
if not isinstance(priority, int):
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
if errback is not None and not callable(errback):
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
self.callback = callback
self.errback = errback
self.cookies = cookies or {}
self.headers = Headers(headers or {}, encoding=encoding)
self.dont_filter = dont_filter
self._meta = dict(meta) if meta else None
self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
self.flags = [] if flags is None else list(flags)
@property
def cb_kwargs(self):
if self._cb_kwargs is None:
self._cb_kwargs = {}
return self._cb_kwargs
@property
def meta(self):
if self._meta is None:
self._meta = {}
return self._meta
def _get_url(self):
return self._url
def _set_url(self, url):
if not isinstance(url, str):
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
if (
'://' not in self._url
and not self._url.startswith('about:')
and not self._url.startswith('data:')
):
raise ValueError(f'Missing scheme in request url: {self._url}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
else:
self._body = to_bytes(body, self.encoding)
body = property(_get_body, obsolete_setter(_set_body, 'body'))
@property
def encoding(self):
return self._encoding
def __str__(self):
return f"<{self.method} {self.url}>"
__repr__ = __str__
def copy(self):
"""Return a copy of this Request"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Request with the same attributes except for those
given new values.
"""
for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
@classmethod
def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs):
"""Create a Request object from a string containing a `cURL
<https://curl.haxx.se/>`_ command. It populates the HTTP method, the
URL, the headers, the cookies and the body. It accepts the same
arguments as the :class:`Request` class, taking preference and
overriding the values of the same arguments contained in the cURL
command.
Unrecognized options are ignored by default. To raise an error when
finding unknown options call this method by passing
``ignore_unknown_options=False``.
.. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
subclasses, such as :class:`~scrapy.http.JSONRequest`, or
:class:`~scrapy.http.XmlRpcRequest`, as well as having
:ref:`downloader middlewares <topics-downloader-middleware>`
and
:ref:`spider middlewares <topics-spider-middleware>`
enabled, such as
:class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
:class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
or
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
may modify the :class:`~scrapy.http.Request` object.
To translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
"""
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
request_kwargs.update(kwargs)
return cls(**request_kwargs)
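
The from_curl docstring above corresponds to usage along these lines; the cURL command, URL and callback are illustrative:

# Illustrative: build a Request from a cURL command and override the callback.
req = Request.from_curl(
    'curl -X POST "https://example.com/login" -d "user=foo&pass=bar" '
    '-H "Accept: application/json"',
    callback=lambda response: None,  # placeholder callback
)
print(req.method)                 # POST
print(req.body)                   # b'user=foo&pass=bar'
print(req.headers.get('Accept'))  # b'application/json'

# url, body and friends are read-only; derive modified copies with replace():
req2 = req.replace(url="https://example.com/logout", body=None)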

View file

@@ -0,0 +1,215 @@
"""
This module implements the FormRequest class which is a more convenient class
(than Request) to generate Requests based on form data.
See documentation in docs/topics/request-response.rst
"""
from urllib.parse import urljoin, urlencode
import lxml.html
from parsel.selector import create_root_node
from w3lib.html import strip_html5_whitespace
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
class FormRequest(Request):
valid_form_methods = ['GET', 'POST']
def __init__(self, *args, **kwargs):
formdata = kwargs.pop('formdata', None)
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
if formdata:
items = formdata.items() if isinstance(formdata, dict) else formdata
querystr = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
self._set_body(querystr)
else:
self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
@classmethod
def from_response(cls, response, formname=None, formid=None, formnumber=0, formdata=None,
clickdata=None, dont_click=False, formxpath=None, formcss=None, **kwargs):
kwargs.setdefault('encoding', response.encoding)
if formcss is not None:
from parsel.csstranslator import HTMLTranslator
formxpath = HTMLTranslator().css_to_xpath(formcss)
form = _get_form(response, formname, formid, formnumber, formxpath)
formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
url = _get_form_url(form, kwargs.pop('url', None))
method = kwargs.pop('method', form.method)
if method is not None:
method = method.upper()
if method not in cls.valid_form_methods:
method = 'GET'
return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form, url):
if url is None:
action = form.get('action')
if action is None:
return form.base_url
return urljoin(form.base_url, strip_html5_whitespace(action))
return urljoin(form.base_url, url)
def _urlencode(seq, enc):
values = [(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])]
return urlencode(values, doseq=1)
def _get_form(response, formname, formid, formnumber, formxpath):
"""Find the form element """
root = create_root_node(response.text, lxml.html.HTMLParser,
base_url=get_base_url(response))
forms = root.xpath('//form')
if not forms:
raise ValueError(f"No <form> element found in {response}")
if formname is not None:
f = root.xpath(f'//form[@name="{formname}"]')
if f:
return f[0]
if formid is not None:
f = root.xpath(f'//form[@id="{formid}"]')
if f:
return f[0]
# Get form element from xpath, if not found, go up
if formxpath is not None:
nodes = root.xpath(formxpath)
if nodes:
el = nodes[0]
while True:
if el.tag == 'form':
return el
el = el.getparent()
if el is None:
break
raise ValueError(f'No <form> element found with {formxpath}')
# If we get here, it means that either formname was None
# or invalid
if formnumber is not None:
try:
form = forms[formnumber]
except IndexError:
raise IndexError(f"Form number {formnumber} not found in {response}")
else:
return form
def _get_inputs(form, formdata, dont_click, clickdata, response):
try:
formdata_keys = dict(formdata or ()).keys()
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')
if not formdata:
formdata = ()
inputs = form.xpath('descendant::textarea'
'|descendant::select'
'|descendant::input[not(@type) or @type['
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
' and (../@checked or'
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={
"re": "http://exslt.org/regular-expressions"})
values = [(k, '' if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata_keys]
if not dont_click:
clickable = _get_clickable(clickdata, form)
        if clickable and clickable[0] not in formdata and clickable[0] is not None:
values.append(clickable)
if isinstance(formdata, dict):
formdata = formdata.items()
values.extend((k, v) for k, v in formdata if v is not None)
return values
def _value(ele):
n = ele.name
v = ele.value
if ele.tag == 'select':
return _select_value(ele, n, v)
return n, v
def _select_value(ele, n, v):
multiple = ele.multiple
if v is None and not multiple:
# Match browser behaviour on simple select tag without options selected
        # And for select tags without options
o = ele.value_options
return (n, o[0]) if o else (None, None)
elif v is not None and multiple:
        # This is a workaround for a bug in lxml, fixed in 2.3.1
# fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
selected_options = ele.xpath('.//option[@selected]')
v = [(o.get('value') or o.text or '').strip() for o in selected_options]
return n, v
def _get_clickable(clickdata, form):
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
clickables = list(form.xpath(
'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
'|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
namespaces={"re": "http://exslt.org/regular-expressions"}
))
if not clickables:
return
# If we don't have clickdata, we just use the first clickable element
if clickdata is None:
el = clickables[0]
return (el.get('name'), el.get('value') or '')
# If clickdata is given, we compare it to the clickable elements to find a
# match. We first look to see if the number is specified in clickdata,
# because that uniquely identifies the element
nr = clickdata.get('nr', None)
if nr is not None:
try:
el = list(form.inputs)[nr]
except IndexError:
pass
else:
return (el.get('name'), el.get('value') or '')
# We didn't find it, so now we build an XPath expression out of the other
# arguments, because they can be used as such
xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
el = form.xpath(xpath)
if len(el) == 1:
return (el[0].get('name'), el[0].get('value') or '')
elif len(el) > 1:
raise ValueError(f"Multiple elements found ({el!r}) matching the "
f"criteria in clickdata: {clickdata!r}")
else:
raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
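
A sketch of how from_response is typically used from a spider callback: it locates the <form> in the response, merges formdata over the form's own inputs, and returns a ready request. The form id, field names and callbacks are illustrative:

# Illustrative: fill and submit a login form found in a previously fetched page.
def parse_after_login(response):
    # hypothetical callback for the post-login page
    pass

def parse_login_page(response):
    yield FormRequest.from_response(
        response,
        formid="login-form",                      # hypothetical form id
        formdata={"user": "foo", "pass": "bar"},  # overrides/extends form inputs
        callback=parse_after_login,
    )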

View file

@@ -0,0 +1,57 @@
"""
This module implements the JsonRequest class which is a more convenient class
(than Request) to generate JSON Requests.
See documentation in docs/topics/request-response.rst
"""
import copy
import json
import warnings
from scrapy.http.request import Request
from scrapy.utils.deprecate import create_deprecated_class
class JsonRequest(Request):
def __init__(self, *args, **kwargs):
dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {}))
dumps_kwargs.setdefault('sort_keys', True)
self._dumps_kwargs = dumps_kwargs
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
if 'method' not in kwargs:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'application/json')
self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
def replace(self, *args, **kwargs):
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
return super().replace(*args, **kwargs)
def _dumps(self, data):
"""Convert to JSON """
return json.dumps(data, **self._dumps_kwargs)
JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
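
A usage sketch: the data argument is serialized with json.dumps (sort_keys=True by default) and becomes the body, the method defaults to POST, and the JSON headers are filled in; the URL and payload are illustrative:

# Illustrative usage of JsonRequest as defined above.
req = JsonRequest("https://example.com/api/items", data={"name": "foo", "id": 1})
print(req.method)                       # POST
print(req.body)                         # b'{"id": 1, "name": "foo"}'
print(req.headers.get("Content-Type"))  # b'application/json'

# replace() re-serializes a new payload the same way:
req2 = req.replace(data={"id": 2, "name": "bar"})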

View file

@@ -0,0 +1,35 @@
"""
This module implements the XmlRpcRequest class which is a more convenient class
(than Request) to generate XML-RPC requests.
See documentation in docs/topics/request-response.rst
"""
import xmlrpc.client as xmlrpclib
from scrapy.http.request import Request
from scrapy.utils.python import get_func_args
DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
def __init__(self, *args, **kwargs):
encoding = kwargs.get('encoding', None)
if 'body' not in kwargs and 'params' in kwargs:
kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
kwargs['body'] = xmlrpclib.dumps(**kw)
# spec defines that requests must use POST method
kwargs.setdefault('method', 'POST')
        # XML-RPC clients query the same url multiple times, hence dont_filter
kwargs.setdefault('dont_filter', True)
# restore encoding
if encoding is not None:
kwargs['encoding'] = encoding
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'text/xml')
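
A usage sketch: params (and any other xmlrpc.client.dumps() argument) is encoded into the XML body, while the method defaults to POST and dont_filter to True as set above; the endpoint and RPC arguments are illustrative:

# Illustrative usage of XmlRpcRequest as defined above.
req = XmlRpcRequest(
    "https://example.com/RPC2",
    params=("getStatus", 42),      # positional RPC arguments
    methodname="server.status",    # hypothetical RPC method name
)
print(req.method)                       # POST
print(req.headers.get("Content-Type"))  # b'text/xml'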

View file

@@ -0,0 +1,196 @@
"""
This module implements the Response class which is used to represent HTTP
responses in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from typing import Generator
from urllib.parse import urljoin
from scrapy.exceptions import NotSupported
from scrapy.http.common import obsolete_setter
from scrapy.http.headers import Headers
from scrapy.http.request import Request
from scrapy.link import Link
from scrapy.utils.trackref import object_ref
class Response(object_ref):
def __init__(self, url, status=200, headers=None, body=b'', flags=None,
request=None, certificate=None, ip_address=None):
self.headers = Headers(headers or {})
self.status = int(status)
self._set_body(body)
self._set_url(url)
self.request = request
self.flags = [] if flags is None else list(flags)
self.certificate = certificate
self.ip_address = ip_address
@property
def cb_kwargs(self):
try:
return self.request.cb_kwargs
except AttributeError:
raise AttributeError(
"Response.cb_kwargs not available, this response "
"is not tied to any request"
)
@property
def meta(self):
try:
return self.request.meta
except AttributeError:
raise AttributeError(
"Response.meta not available, this response "
"is not tied to any request"
)
def _get_url(self):
return self._url
def _set_url(self, url):
if isinstance(url, str):
self._url = url
else:
raise TypeError(f'{type(self).__name__} url must be str, '
f'got {type(url).__name__}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
elif not isinstance(body, bytes):
raise TypeError(
"Response body must be bytes. "
"If you want to pass unicode body use TextResponse "
"or HtmlResponse.")
else:
self._body = body
body = property(_get_body, obsolete_setter(_set_body, 'body'))
def __str__(self):
return f"<{self.status} {self.url}>"
__repr__ = __str__
def copy(self):
"""Return a copy of this Response"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Response with the same attributes except for those
given new values.
"""
for x in ['url', 'status', 'headers', 'body',
'request', 'flags', 'certificate', 'ip_address']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
return urljoin(self.url, url)
@property
def text(self):
"""For subclasses of TextResponse, this will return the body
as str
"""
raise AttributeError("Response content isn't text")
def css(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def xpath(self, *a, **kw):
"""Shortcut method implemented only by responses whose content
is text (subclasses of TextResponse).
"""
raise NotSupported("Response content isn't text")
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Request
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be a relative URL or a ``scrapy.link.Link`` object,
not only an absolute URL.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
.. versionadded:: 2.0
The *flags* parameter.
"""
if isinstance(url, Link):
url = url.url
elif url is None:
raise ValueError("url can't be None")
url = self.urljoin(url)
return Request(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
def follow_all(self, urls, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Generator[Request, None, None]
"""
.. versionadded:: 2.0
Return an iterable of :class:`~.Request` instances to follow all links
in ``urls``. It accepts the same arguments as ``Request.__init__`` method,
but elements of ``urls`` can be relative URLs or :class:`~scrapy.link.Link` objects,
not only absolute URLs.
:class:`~.TextResponse` provides a :meth:`~.TextResponse.follow_all`
method which supports selectors in addition to absolute/relative URLs
and Link objects.
"""
if not hasattr(urls, '__iter__'):
raise TypeError("'urls' argument must be an iterable")
return (
self.follow(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
for url in urls
)
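
The follow/follow_all docstrings above correspond to spider code along these lines; the URLs, callback and cb_kwargs are illustrative:

# Illustrative spider callback using the base Response API defined above.
def parse(response):
    # follow() resolves a relative URL against response.url and wraps it
    # in a Request bound to the given callback.
    yield response.follow("/next-page", callback=parse)

    # follow_all() does the same for every element of an iterable of URLs
    # or Link objects, returning a generator of Requests.
    yield from response.follow_all(
        ["/page/2", "/page/3"],
        callback=parse,
        cb_kwargs={"source": response.url},
    )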

View file

@@ -0,0 +1,12 @@
"""
This module implements the HtmlResponse class which adds encoding
discovery through HTML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
from scrapy.http.response.text import TextResponse
class HtmlResponse(TextResponse):
pass

View file

@@ -0,0 +1,265 @@
"""
This module implements the TextResponse class which adds encoding handling and
discovery (through HTTP headers) to the base Response class.
See documentation in docs/topics/request-response.rst
"""
import json
import warnings
from contextlib import suppress
from typing import Generator
from urllib.parse import urljoin
import parsel
from w3lib.encoding import (html_body_declared_encoding, html_to_unicode,
http_content_type_encoding, resolve_encoding)
from w3lib.html import strip_html5_whitespace
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url
_NONE = object()
class TextResponse(Response):
_DEFAULT_ENCODING = 'ascii'
_cached_decoded_json = _NONE
def __init__(self, *args, **kwargs):
self._encoding = kwargs.pop('encoding', None)
self._cached_benc = None
self._cached_ubody = None
self._cached_selector = None
super().__init__(*args, **kwargs)
def _set_url(self, url):
if isinstance(url, str):
self._url = to_unicode(url, self.encoding)
else:
super()._set_url(url)
def _set_body(self, body):
self._body = b'' # used by encoding detection
if isinstance(body, str):
if self._encoding is None:
raise TypeError('Cannot convert unicode body - '
f'{type(self).__name__} has no encoding')
self._body = body.encode(self._encoding)
else:
super()._set_body(body)
def replace(self, *args, **kwargs):
kwargs.setdefault('encoding', self.encoding)
return Response.replace(self, *args, **kwargs)
@property
def encoding(self):
return self._declared_encoding() or self._body_inferred_encoding()
def _declared_encoding(self):
return (
self._encoding
or self._headers_encoding()
or self._body_declared_encoding()
)
def body_as_unicode(self):
"""Return body as unicode"""
warnings.warn('Response.body_as_unicode() is deprecated, '
'please use Response.text instead.',
ScrapyDeprecationWarning, stacklevel=2)
return self.text
def json(self):
"""
.. versionadded:: 2.2
Deserialize a JSON document to a Python object.
"""
if self._cached_decoded_json is _NONE:
self._cached_decoded_json = json.loads(self.text)
return self._cached_decoded_json
@property
def text(self):
""" Body as unicode """
# access self.encoding before _cached_ubody to make sure
# _body_inferred_encoding is called
benc = self.encoding
if self._cached_ubody is None:
charset = f'charset={benc}'
self._cached_ubody = html_to_unicode(charset, self.body)[1]
return self._cached_ubody
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
return urljoin(get_base_url(self), url)
@memoizemethod_noargs
def _headers_encoding(self):
content_type = self.headers.get(b'Content-Type', b'')
return http_content_type_encoding(to_unicode(content_type))
def _body_inferred_encoding(self):
if self._cached_benc is None:
content_type = to_unicode(self.headers.get(b'Content-Type', b''))
benc, ubody = html_to_unicode(content_type, self.body,
auto_detect_fun=self._auto_detect_fun,
default_encoding=self._DEFAULT_ENCODING)
self._cached_benc = benc
self._cached_ubody = ubody
return self._cached_benc
def _auto_detect_fun(self, text):
for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
try:
text.decode(enc)
except UnicodeError:
continue
return resolve_encoding(enc)
@memoizemethod_noargs
def _body_declared_encoding(self):
return html_body_declared_encoding(self.body)
@property
def selector(self):
from scrapy.selector import Selector
if self._cached_selector is None:
self._cached_selector = Selector(self)
return self._cached_selector
def xpath(self, query, **kwargs):
return self.selector.xpath(query, **kwargs)
def css(self, query):
return self.selector.css(query)
def follow(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None):
# type: (...) -> Request
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
but ``url`` can be not only an absolute URL, but also
* a relative URL
* a :class:`~scrapy.link.Link` object, e.g. the result of
:ref:`topics-link-extractors`
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
``response.css('a.my_link')[0]``
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
``response.css('a::attr(href)')[0]`` or
``response.xpath('//img/@src')[0]``
See :ref:`response-follow-example` for usage examples.
"""
if isinstance(url, parsel.Selector):
url = _url_from_selector(url)
elif isinstance(url, parsel.SelectorList):
raise ValueError("SelectorList is not supported")
encoding = self.encoding if encoding is None else encoding
return super().follow(
url=url,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
def follow_all(self, urls=None, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding=None, priority=0,
dont_filter=False, errback=None, cb_kwargs=None, flags=None,
css=None, xpath=None):
# type: (...) -> Generator[Request, None, None]
"""
A generator that produces :class:`~.Request` instances to follow all
links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
``__init__`` method, except that each ``urls`` element does not need to be
an absolute URL, it can be any of the following:
* a relative URL
* a :class:`~scrapy.link.Link` object, e.g. the result of
:ref:`topics-link-extractors`
* a :class:`~scrapy.selector.Selector` object for a ``<link>`` or ``<a>`` element, e.g.
``response.css('a.my_link')[0]``
* an attribute :class:`~scrapy.selector.Selector` (not SelectorList), e.g.
``response.css('a::attr(href)')[0]`` or
``response.xpath('//img/@src')[0]``
In addition, ``css`` and ``xpath`` arguments are accepted to perform the link extraction
within the ``follow_all`` method (only one of ``urls``, ``css`` and ``xpath`` is accepted).
Note that when passing a ``SelectorList`` as argument for the ``urls`` parameter or
using the ``css`` or ``xpath`` parameters, this method will not produce requests for
selectors from which links cannot be obtained (for instance, anchor tags without an
``href`` attribute)
"""
arguments = [x for x in (urls, css, xpath) if x is not None]
if len(arguments) != 1:
raise ValueError(
"Please supply exactly one of the following arguments: urls, css, xpath"
)
if not urls:
if css:
urls = self.css(css)
if xpath:
urls = self.xpath(xpath)
if isinstance(urls, parsel.SelectorList):
selectors = urls
urls = []
for sel in selectors:
with suppress(_InvalidSelector):
urls.append(_url_from_selector(sel))
return super().follow_all(
urls=urls,
callback=callback,
method=method,
headers=headers,
body=body,
cookies=cookies,
meta=meta,
encoding=encoding,
priority=priority,
dont_filter=dont_filter,
errback=errback,
cb_kwargs=cb_kwargs,
flags=flags,
)
class _InvalidSelector(ValueError):
"""
Raised when a URL cannot be obtained from a Selector
"""
def _url_from_selector(sel):
# type: (parsel.Selector) -> str
if isinstance(sel.root, str):
# e.g. ::attr(href) result
return strip_html5_whitespace(sel.root)
if not hasattr(sel.root, 'tag'):
raise _InvalidSelector(f"Unsupported selector: {sel}")
if sel.root.tag not in ('a', 'link'):
raise _InvalidSelector("Only <a> and <link> elements are supported; "
f"got <{sel.root.tag}>")
href = sel.root.get('href')
if href is None:
raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
return strip_html5_whitespace(href)
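
A sketch of the TextResponse additions: a decoded body via .text/.json(), selector shortcuts, and follow/follow_all accepting selectors or css/xpath expressions directly; the selectors and URLs are illustrative:

# Illustrative spider callback built on the TextResponse API defined above.
def parse(response):
    # .json() decodes the body once and caches the decoded object.
    if response.headers.get("Content-Type", b"").startswith(b"application/json"):
        yield {"payload": response.json()}

    # follow() accepts an <a>/<link> selector or an ::attr(href) result directly.
    for link in response.css("a.next-page"):
        yield response.follow(link, callback=parse)

    # follow_all() can do the link extraction itself via the css/xpath arguments;
    # selectors without an href attribute are skipped.
    yield from response.follow_all(css="a.item-link", callback=parse)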

View file

@@ -0,0 +1,12 @@
"""
This module implements the XmlResponse class which adds encoding
discovery through XML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
from scrapy.http.response.text import TextResponse
class XmlResponse(TextResponse):
pass