Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,75 @@
import base64

from urllib.parse import unquote, urlunparse
from urllib.request import getproxies, proxy_bypass, _parse_proxy

from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes


class HttpProxyMiddleware:

    def __init__(self, auth_encoding='latin-1'):
        self.auth_encoding = auth_encoding
        self.proxies = {}
        for type_, url in getproxies().items():
            try:
                self.proxies[type_] = self._get_proxy(url, type_)
            # some values such as '/var/run/docker.sock' can't be parsed
            # by _parse_proxy and as such should be skipped
            except ValueError:
                continue

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
        return cls(auth_encoding)

    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            f'{unquote(username)}:{unquote(password)}',
            encoding=self.auth_encoding)
        return base64.b64encode(user_pass)

    def _get_proxy(self, url, orig_type):
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))

        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None

        return creds, proxy_url

    def process_request(self, request, spider):
        # ignore if proxy is already set
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)

    def _set_proxy(self, request, scheme):
        creds, proxy = self.proxies[scheme]
        request.meta['proxy'] = proxy
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
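For context (not part of the diff): the middleware populates its proxy table from the standard proxy environment variables via getproxies(), and a per-request request.meta['proxy'] overrides that mapping; credentials embedded in the proxy URL are stripped by _get_proxy() and re-emitted as a Proxy-Authorization header in process_request(). A minimal usage sketch, assuming a standard Scrapy project — the HTTPPROXY_ENABLED and HTTPPROXY_AUTH_ENCODING settings names come straight from from_crawler() above; the spider name and proxy URL are hypothetical:

    # settings.py -- both settings are read in from_crawler() above
    HTTPPROXY_ENABLED = True             # middleware raises NotConfigured when False
    HTTPPROXY_AUTH_ENCODING = 'latin-1'  # encoding used by _basic_auth_header()

    # spider sketch (hypothetical names) demonstrating the per-request override
    import scrapy

    class ProxiedSpider(scrapy.Spider):
        name = 'proxied'

        def start_requests(self):
            # credentials in the URL become a Proxy-Authorization header
            yield scrapy.Request(
                'https://example.com/',
                meta={'proxy': 'http://user:pass@proxy.example.com:8080'},
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info('fetched %s via proxy', response.url)

Setting meta={'proxy': None} explicitly disables proxying for a single request, which is why process_request() returns early on a None value.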