100 lines
3.7 KiB
Python
100 lines
3.7 KiB
Python
"""
|
|
This module provides some useful functions for working with
|
|
scrapy.http.Request objects
|
|
"""
|
|
|
|
import hashlib
|
|
import weakref
|
|
from urllib.parse import urlunparse
|
|
|
|
from w3lib.http import basic_auth_header
|
|
from w3lib.url import canonicalize_url
|
|
|
|
from scrapy.utils.httpobj import urlparse_cached
|
|
from scrapy.utils.python import to_bytes, to_unicode
|
|
|
|
|
|
# Cache of already-computed fingerprints, keyed weakly by request object so
# that a cache entry does not by itself keep a request alive.
# request_fingerprint() stores one inner dict per request, mapping
# (include_headers, keep_fragments) to the hex digest.
_fingerprint_cache = weakref.WeakKeyDictionary()
|
|
|
|
|
|
def request_fingerprint(request, include_headers=None, keep_fragments=False):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.
    An empty or missing include_headers means "no headers".

    Also, servers usually ignore fragments in urls when handling requests,
    so they are also ignored by default when calculating the fingerprint.
    If you want to include them, set the keep_fragments argument to True
    (for instance when handling requests with a headless browser).

    The result is the hexadecimal SHA1 digest of the request method, the
    canonicalized URL, the body and, for each requested header present on the
    request, the header name followed by all of its values. Digests are
    cached per request object and per (include_headers, keep_fragments)
    combination, so repeated calls do not recompute the hash.
    """
    if include_headers:
        # NOTE: names are sorted as given and lower-cased afterwards; this
        # order is kept as-is so previously computed fingerprints stay stable.
        include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
    else:
        # Collapse every "no headers" spelling (None, [], (), ...) to None.
        # The value becomes part of a dict key below, and an unhashable empty
        # list would otherwise raise TypeError there.
        include_headers = None

    cache = _fingerprint_cache.setdefault(request, {})
    cache_key = (include_headers, keep_fragments)
    fingerprint = cache.get(cache_key)
    if fingerprint is None:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
        fp.update(request.body or b'')
        for hdr in include_headers or ():
            # Headers absent from the request contribute nothing to the hash.
            if hdr in request.headers:
                fp.update(hdr)
                for v in request.headers.getlist(hdr):
                    fp.update(v)
        fingerprint = cache[cache_key] = fp.hexdigest()
    return fingerprint
|
|
|
|
|
|
def request_authenticate(request, username, password):
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617) and the given username and password.

    The request's ``Authorization`` header is overwritten with the value
    built by ``basic_auth_header`` from the credentials; nothing is returned.
    """
    auth_value = basic_auth_header(username, password)
    request.headers['Authorization'] = auth_value
|
|
|
|
|
|
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.

    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).

    The output consists of the request line, a ``Host`` header, the request's
    own headers (when it has any), a blank line and finally the body.
    """
    url_parts = urlparse_cached(request)
    # Only path, params and query go on the request line; an empty path
    # becomes "/".
    target = urlunparse(
        ('', '', url_parts.path or '/', url_parts.params, url_parts.query, '')
    )
    chunks = [
        to_bytes(request.method), b" ", to_bytes(target), b" HTTP/1.1\r\n",
        # The parsed hostname may be missing; fall back to an empty value.
        b"Host: ", to_bytes(url_parts.hostname or b''), b"\r\n",
    ]
    if request.headers:
        chunks.append(request.headers.to_string() + b"\r\n")
    chunks.append(b"\r\n")
    chunks.append(request.body)
    return b"".join(chunks)
|
|
|
|
|
|
def referer_str(request):
    """Return the request's Referer HTTP header in a form suitable for logging.

    Gives back ``None`` when the header is not set; otherwise the header value
    converted to text, with undecodable bytes replaced rather than raising.
    """
    raw_referer = request.headers.get('Referer')
    return None if raw_referer is None else to_unicode(raw_referer, errors='replace')
|