Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions


@@ -0,0 +1,143 @@
"""
This module implements the Request class which is used to represent HTTP
requests in Scrapy.
See documentation in docs/topics/request-response.rst
"""
from w3lib.url import safe_url_string
from scrapy.http.headers import Headers
from scrapy.utils.python import to_bytes
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import escape_ajax
from scrapy.http.common import obsolete_setter
from scrapy.utils.curl import curl_to_request_kwargs
class Request(object_ref):
def __init__(self, url, callback=None, method='GET', headers=None, body=None,
cookies=None, meta=None, encoding='utf-8', priority=0,
dont_filter=False, errback=None, flags=None, cb_kwargs=None):
self._encoding = encoding # this one has to be set first
self.method = str(method).upper()
self._set_url(url)
self._set_body(body)
if not isinstance(priority, int):
raise TypeError(f"Request priority not an integer: {priority!r}")
self.priority = priority
if callback is not None and not callable(callback):
raise TypeError(f'callback must be a callable, got {type(callback).__name__}')
if errback is not None and not callable(errback):
raise TypeError(f'errback must be a callable, got {type(errback).__name__}')
self.callback = callback
self.errback = errback
self.cookies = cookies or {}
self.headers = Headers(headers or {}, encoding=encoding)
self.dont_filter = dont_filter
self._meta = dict(meta) if meta else None
self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
self.flags = [] if flags is None else list(flags)
@property
def cb_kwargs(self):
if self._cb_kwargs is None:
self._cb_kwargs = {}
return self._cb_kwargs
@property
def meta(self):
if self._meta is None:
self._meta = {}
return self._meta
def _get_url(self):
return self._url
def _set_url(self, url):
if not isinstance(url, str):
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
s = safe_url_string(url, self.encoding)
self._url = escape_ajax(s)
if (
'://' not in self._url
and not self._url.startswith('about:')
and not self._url.startswith('data:')
):
raise ValueError(f'Missing scheme in request url: {self._url}')
url = property(_get_url, obsolete_setter(_set_url, 'url'))
def _get_body(self):
return self._body
def _set_body(self, body):
if body is None:
self._body = b''
else:
self._body = to_bytes(body, self.encoding)
body = property(_get_body, obsolete_setter(_set_body, 'body'))
@property
def encoding(self):
return self._encoding
def __str__(self):
return f"<{self.method} {self.url}>"
__repr__ = __str__
def copy(self):
"""Return a copy of this Request"""
return self.replace()
def replace(self, *args, **kwargs):
"""Create a new Request with the same attributes except for those
given new values.
"""
for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
kwargs.setdefault(x, getattr(self, x))
cls = kwargs.pop('cls', self.__class__)
return cls(*args, **kwargs)
@classmethod
def from_curl(cls, curl_command, ignore_unknown_options=True, **kwargs):
"""Create a Request object from a string containing a `cURL
<https://curl.haxx.se/>`_ command. It populates the HTTP method, the
URL, the headers, the cookies and the body. It accepts the same
arguments as the :class:`Request` class, taking preference and
overriding the values of the same arguments contained in the cURL
command.
Unrecognized options are ignored by default. To raise an error when
finding unknown options call this method by passing
``ignore_unknown_options=False``.
.. caution:: Using :meth:`from_curl` from :class:`~scrapy.http.Request`
subclasses, such as :class:`~scrapy.http.JSONRequest`, or
:class:`~scrapy.http.XmlRpcRequest`, as well as having
:ref:`downloader middlewares <topics-downloader-middleware>`
and
:ref:`spider middlewares <topics-spider-middleware>`
enabled, such as
:class:`~scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware`,
:class:`~scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`,
or
:class:`~scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware`,
may modify the :class:`~scrapy.http.Request` object.
To translate a cURL command into a Scrapy request,
you may use `curl2scrapy <https://michael-shub.github.io/curl2scrapy/>`_.
"""
request_kwargs = curl_to_request_kwargs(curl_command, ignore_unknown_options)
request_kwargs.update(kwargs)
return cls(**request_kwargs)
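
A brief usage sketch for the Request class added above; the URL, headers, and cURL command are illustrative placeholders, not part of this commit:

from scrapy import Request

# Direct construction: the URL must carry a scheme and priority must be an
# int, otherwise _set_url()/__init__ above raise ValueError/TypeError.
req = Request(
    'https://example.com/page',
    headers={'Accept': 'text/html'},
    meta={'depth': 0},
    priority=10,
)

# replace() clones the request, overriding only the given attributes.
post_req = req.replace(method='POST', body='a=1')

# from_curl() parses a cURL command line into Request keyword arguments.
curl_req = Request.from_curl(
    "curl 'https://example.com/api' -H 'Accept: application/json'"
)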


@@ -0,0 +1,215 @@
"""
This module implements the FormRequest class which is a more convenient class
(than Request) to generate Requests based on form data.
See documentation in docs/topics/request-response.rst
"""
from urllib.parse import urljoin, urlencode
import lxml.html
from parsel.selector import create_root_node
from w3lib.html import strip_html5_whitespace
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
class FormRequest(Request):
valid_form_methods = ['GET', 'POST']
def __init__(self, *args, **kwargs):
formdata = kwargs.pop('formdata', None)
if formdata and kwargs.get('method') is None:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
if formdata:
items = formdata.items() if isinstance(formdata, dict) else formdata
querystr = _urlencode(items, self.encoding)
if self.method == 'POST':
self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
self._set_body(querystr)
else:
self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
@classmethod
def from_response(cls, response, formname=None, formid=None, formnumber=0, formdata=None,
clickdata=None, dont_click=False, formxpath=None, formcss=None, **kwargs):
kwargs.setdefault('encoding', response.encoding)
if formcss is not None:
from parsel.csstranslator import HTMLTranslator
formxpath = HTMLTranslator().css_to_xpath(formcss)
form = _get_form(response, formname, formid, formnumber, formxpath)
formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
url = _get_form_url(form, kwargs.pop('url', None))
method = kwargs.pop('method', form.method)
if method is not None:
method = method.upper()
if method not in cls.valid_form_methods:
method = 'GET'
return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form, url):
if url is None:
action = form.get('action')
if action is None:
return form.base_url
return urljoin(form.base_url, strip_html5_whitespace(action))
return urljoin(form.base_url, url)
def _urlencode(seq, enc):
values = [(to_bytes(k, enc), to_bytes(v, enc))
for k, vs in seq
for v in (vs if is_listlike(vs) else [vs])]
return urlencode(values, doseq=1)
def _get_form(response, formname, formid, formnumber, formxpath):
"""Find the form element """
root = create_root_node(response.text, lxml.html.HTMLParser,
base_url=get_base_url(response))
forms = root.xpath('//form')
if not forms:
raise ValueError(f"No <form> element found in {response}")
if formname is not None:
f = root.xpath(f'//form[@name="{formname}"]')
if f:
return f[0]
if formid is not None:
f = root.xpath(f'//form[@id="{formid}"]')
if f:
return f[0]
# Get form element from xpath, if not found, go up
if formxpath is not None:
nodes = root.xpath(formxpath)
if nodes:
el = nodes[0]
while True:
if el.tag == 'form':
return el
el = el.getparent()
if el is None:
break
raise ValueError(f'No <form> element found with {formxpath}')
# If we get here, it means that either formname was None
# or invalid
if formnumber is not None:
try:
form = forms[formnumber]
except IndexError:
raise IndexError(f"Form number {formnumber} not found in {response}")
else:
return form
def _get_inputs(form, formdata, dont_click, clickdata, response):
try:
formdata_keys = dict(formdata or ()).keys()
except (ValueError, TypeError):
raise ValueError('formdata should be a dict or iterable of tuples')
if not formdata:
formdata = ()
inputs = form.xpath('descendant::textarea'
'|descendant::select'
'|descendant::input[not(@type) or @type['
' not(re:test(., "^(?:submit|image|reset)$", "i"))'
' and (../@checked or'
' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
namespaces={
"re": "http://exslt.org/regular-expressions"})
values = [(k, '' if v is None else v)
for k, v in (_value(e) for e in inputs)
if k and k not in formdata_keys]
if not dont_click:
clickable = _get_clickable(clickdata, form)
if clickable and clickable[0] not in formdata and not clickable[0] is None:
values.append(clickable)
if isinstance(formdata, dict):
formdata = formdata.items()
values.extend((k, v) for k, v in formdata if v is not None)
return values
def _value(ele):
n = ele.name
v = ele.value
if ele.tag == 'select':
return _select_value(ele, n, v)
return n, v


def _select_value(ele, n, v):
    multiple = ele.multiple
    if v is None and not multiple:
        # Match browser behaviour on a simple select tag without options
        # selected, and for select tags without options
        o = ele.value_options
        return (n, o[0]) if o else (None, None)
    elif v is not None and multiple:
        # This is a workaround for a bug in lxml fixed in 2.3.1
        # fix https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
        selected_options = ele.xpath('.//option[@selected]')
        v = [(o.get('value') or o.text or '').strip() for o in selected_options]
    return n, v


def _get_clickable(clickdata, form):
    """
    Returns the clickable element specified in clickdata,
    if the latter is given. If not, it returns the first
    clickable element found
    """
    clickables = list(form.xpath(
        'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
        '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
        namespaces={"re": "http://exslt.org/regular-expressions"}
    ))
    if not clickables:
        return

    # If we don't have clickdata, we just use the first clickable element
    if clickdata is None:
        el = clickables[0]
        return (el.get('name'), el.get('value') or '')

    # If clickdata is given, we compare it to the clickable elements to find a
    # match. We first look to see if the number is specified in clickdata,
    # because that uniquely identifies the element
    nr = clickdata.get('nr', None)
    if nr is not None:
        try:
            el = list(form.inputs)[nr]
        except IndexError:
            pass
        else:
            return (el.get('name'), el.get('value') or '')

    # We didn't find it, so now we build an XPath expression out of the other
    # arguments, because they can be used as such
    xpath = './/*' + ''.join(f'[@{k}="{v}"]' for k, v in clickdata.items())
    el = form.xpath(xpath)
    if len(el) == 1:
        return (el[0].get('name'), el[0].get('value') or '')
    elif len(el) > 1:
        raise ValueError(f"Multiple elements found ({el!r}) matching the "
                         f"criteria in clickdata: {clickdata!r}")
    else:
        raise ValueError(f'No clickable element matching clickdata: {clickdata!r}')
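
A short sketch of the two entry points above; the login URL, field names, and the spider are hypothetical:

import scrapy

# Standalone use: passing formdata defaults the method to POST and encodes
# the fields into a urlencoded body, as implemented in __init__ above.
login = scrapy.FormRequest(
    'https://example.com/login',
    formdata={'user': 'jane', 'pass': 'secret'},
)

class LoginSpider(scrapy.Spider):  # hypothetical spider using from_response()
    name = 'login_example'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # from_response() finds the first <form> in the page, merges formdata
        # over the parsed <input> values, and honours the form action/method.
        yield scrapy.FormRequest.from_response(
            response,
            formnumber=0,
            formdata={'user': 'jane', 'pass': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('logged in: %s', response.url)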


@@ -0,0 +1,57 @@
"""
This module implements the JsonRequest class which is a more convenient class
(than Request) to generate JSON Requests.
See documentation in docs/topics/request-response.rst
"""
import copy
import json
import warnings
from scrapy.http.request import Request
from scrapy.utils.deprecate import create_deprecated_class
class JsonRequest(Request):
def __init__(self, *args, **kwargs):
dumps_kwargs = copy.deepcopy(kwargs.pop('dumps_kwargs', {}))
dumps_kwargs.setdefault('sort_keys', True)
self._dumps_kwargs = dumps_kwargs
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
if 'method' not in kwargs:
kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'application/json')
self.headers.setdefault('Accept', 'application/json, text/javascript, */*; q=0.01')
def replace(self, *args, **kwargs):
body_passed = kwargs.get('body', None) is not None
data = kwargs.pop('data', None)
data_passed = data is not None
if body_passed and data_passed:
warnings.warn('Both body and data passed. data will be ignored')
elif not body_passed and data_passed:
kwargs['body'] = self._dumps(data)
return super().replace(*args, **kwargs)
def _dumps(self, data):
"""Convert to JSON """
return json.dumps(data, **self._dumps_kwargs)
JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
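
A minimal sketch of the data shortcut implemented above; the endpoint is hypothetical:

from scrapy.http import JsonRequest

# data is serialized with json.dumps (sort_keys=True unless overridden via
# dumps_kwargs); since a body is produced and no method was given, the
# request defaults to POST and gets the JSON Content-Type/Accept headers.
req = JsonRequest(
    'https://example.com/api/items',
    data={'name': 'widget', 'qty': 3},
)
assert req.method == 'POST'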


@@ -0,0 +1,35 @@
"""
This module implements the XmlRpcRequest class which is a more convenient class
(that Request) to generate xml-rpc requests.
See documentation in docs/topics/request-response.rst
"""
import xmlrpc.client as xmlrpclib
from scrapy.http.request import Request
from scrapy.utils.python import get_func_args
DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
def __init__(self, *args, **kwargs):
encoding = kwargs.get('encoding', None)
if 'body' not in kwargs and 'params' in kwargs:
kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
kwargs['body'] = xmlrpclib.dumps(**kw)
# spec defines that requests must use POST method
kwargs.setdefault('method', 'POST')
# xmlrpc query multiples times over the same url
kwargs.setdefault('dont_filter', True)
# restore encoding
if encoding is not None:
kwargs['encoding'] = encoding
super().__init__(*args, **kwargs)
self.headers.setdefault('Content-Type', 'text/xml')
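
A minimal sketch of the class above; the endpoint and remote method name are hypothetical:

from scrapy.http import XmlRpcRequest

# params and methodname are forwarded to xmlrpc.client.dumps() to build the
# XML body; the method is forced to POST and dont_filter defaults to True,
# as set in __init__ above.
req = XmlRpcRequest(
    'https://example.com/RPC2',
    methodname='getItem',
    params=(42,),
)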