Ausgabe der neuen DB Einträge
This commit is contained in:
parent
bad48e1627
commit
cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
100
venv/lib/python3.9/site-packages/scrapy/utils/request.py
Normal file
100
venv/lib/python3.9/site-packages/scrapy/utils/request.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
"""
|
||||
This module provides some useful functions for working with
|
||||
scrapy.http.Request objects
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import weakref
|
||||
from urllib.parse import urlunparse
|
||||
|
||||
from w3lib.http import basic_auth_header
|
||||
from w3lib.url import canonicalize_url
|
||||
|
||||
from scrapy.utils.httpobj import urlparse_cached
|
||||
from scrapy.utils.python import to_bytes, to_unicode
|
||||
|
||||
|
||||
# Cache of already computed fingerprints, keyed by request object. Each value
# is a dict mapping (include_headers, keep_fragments) to the hex digest.
# Weak keys are used so the cache does not keep request objects alive.
_fingerprint_cache = weakref.WeakKeyDictionary()
|
||||
|
||||
|
||||
def request_fingerprint(request, include_headers=None, keep_fragments=False):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lot of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.

    Also, servers usually ignore fragments in urls when handling requests,
    so they are also ignored by default when calculating the fingerprint.
    If you want to include them, set the keep_fragments argument to True
    (for instance when handling requests with a headless browser).

    :param request: the request to fingerprint; its ``method``, ``url``,
        ``body`` and ``headers`` attributes are read.
    :param include_headers: iterable of header names to mix into the hash.
        ``None`` or any empty iterable means that no headers are included.
    :param keep_fragments: passed through to ``canonicalize_url``.
    :return: the hexadecimal SHA1 digest as a ``str``. Results are cached per
        request object and per (include_headers, keep_fragments) combination.
    """
    if include_headers:
        # NOTE: the names are sorted as given and lower-cased afterwards; this
        # ordering is preserved so previously computed fingerprints stay valid.
        include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
    else:
        # Collapse every "no headers" spelling ([], (), '', None) to None.
        # Otherwise an empty list would end up inside the cache key below and
        # fail with "TypeError: unhashable type: 'list'".
        include_headers = None
    cache = _fingerprint_cache.setdefault(request, {})
    cache_key = (include_headers, keep_fragments)
    if cache_key not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                # Headers that the request does not carry contribute nothing.
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[cache_key] = fp.hexdigest()
    return cache[cache_key]
|
||||
|
||||
|
||||
def request_authenticate(request, username, password):
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617).

    The ``Authorization`` header of ``request`` is overwritten with the value
    that ``basic_auth_header`` builds from ``username`` and ``password``.
    """
    authorization_value = basic_auth_header(username, password)
    request.headers['Authorization'] = authorization_value
|
||||
|
||||
|
||||
def request_httprepr(request):
    """Build the raw HTTP representation (as bytes) of the given request.

    The result is meant for reference only: it is not the actual stream of
    bytes sent on the wire when the request is performed (that is controlled
    by Twisted).
    """
    url_parts = urlparse_cached(request)
    # Request target: path (defaulting to "/"), params and query, without
    # scheme, host or fragment.
    target = urlunparse(('', '', url_parts.path or '/', url_parts.params, url_parts.query, ''))

    pieces = [
        to_bytes(request.method),
        b" ",
        to_bytes(target),
        b" HTTP/1.1\r\n",
        b"Host: ",
        to_bytes(url_parts.hostname or b''),
        b"\r\n",
    ]
    if request.headers:
        pieces.append(request.headers.to_string() + b"\r\n")
    # The blank line separates the header section from the body.
    pieces.append(b"\r\n")
    pieces.append(request.body)
    return b"".join(pieces)
|
||||
|
||||
|
||||
def referer_str(request):
    """Return the request's Referer HTTP header in a form suitable for logging.

    Gives ``None`` when the header is absent; otherwise the header value is
    converted with ``to_unicode`` using ``errors='replace'``.
    """
    raw_referer = request.headers.get('Referer')
    return None if raw_referer is None else to_unicode(raw_referer, errors='replace')
|
||||
Loading…
Add table
Add a link
Reference in a new issue