Output of the new DB entries

parent bad48e1627
commit cfbbb9ee3d

2399 changed files with 843193 additions and 43 deletions
venv/lib/python3.9/site-packages/w3lib/__init__.py (normal file, 3 additions)
@@ -0,0 +1,3 @@
__version__ = "1.22.0"
version_info = tuple(int(v) if v.isdigit() else v
                     for v in __version__.split('.'))
venv/lib/python3.9/site-packages/w3lib/encoding.py (normal file, 274 additions)
@@ -0,0 +1,274 @@
# -*- coding: utf-8 -*-
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings
from sys import version_info

_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)


def http_content_type_encoding(content_type):
    """Extract the encoding in the content-type header

    >>> import w3lib.encoding
    >>> w3lib.encoding.http_content_type_encoding("Content-Type: text/html; charset=ISO-8859-4")
    'iso8859-4'

    """

    if content_type:
        match = _HEADER_ENCODING_RE.search(content_type)
        if match:
            return resolve_encoding(match.group(1))


# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
_SKIP_ATTRS = '''(?:\\s+
    [^=<>/\\s"'\x00-\x1f\x7f]+     # Attribute name
    (?:\\s*=\\s*
        (?:         # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
            '[^']*'    # attr in '
            |
            "[^"]*"    # attr in "
            |
            [^'"\\s]+  # attr having no ' nor "
        ))?
)*?'''  # must be used with re.VERBOSE flag
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')

# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
    _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'),
                                     re.I | re.VERBOSE)


def html_body_declared_encoding(html_body_str):
    '''Return the encoding specified in meta tags in the html body,
    or ``None`` if no suitable encoding was found

    >>> import w3lib.encoding
    >>> w3lib.encoding.html_body_declared_encoding(
    ... """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    ... "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    ... <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    ... <head>
    ...     <title>Some title</title>
    ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
    ... </head>
    ... <body>
    ...  ...
    ... </body>
    ... </html>""")
    'utf-8'
    >>>

    '''

    # html5 suggests the first 1024 bytes are sufficient, we allow for more
    chunk = html_body_str[:4096]
    if isinstance(chunk, bytes):
        match = _BODY_ENCODING_BYTES_RE.search(chunk)
    else:
        match = _BODY_ENCODING_STR_RE.search(chunk)

    if match:
        encoding = match.group('charset') or match.group('charset2') \
            or match.group('xmlcharset')
        if encoding:
            return resolve_encoding(encoding)


# Default encoding translation
# this maps canonicalized encodings to target encodings
# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
# in addition, gb18030 supersedes gb2312 & gbk
# the keys are converted using _c18n_encoding and in sorted order
DEFAULT_ENCODING_TRANSLATION = {
    'ascii': 'cp1252',
    'big5': 'big5hkscs',
    'euc_kr': 'cp949',
    'gb2312': 'gb18030',
    'gb_2312_80': 'gb18030',
    'gbk': 'gb18030',
    'iso8859_11': 'cp874',
    'iso8859_9': 'cp1254',
    'latin_1': 'cp1252',
    'macintosh': 'mac_roman',
    'shift_jis': 'cp932',
    'tis_620': 'cp874',
    'win_1251': 'cp1251',
    'windows_31j': 'cp932',
    'win_31j': 'cp932',
    'windows_874': 'cp874',
    'win_874': 'cp874',
    'x_sjis': 'cp932',
    'zh_cn': 'gb18030'
}


def _c18n_encoding(encoding):
    """Canonicalize an encoding name

    This performs normalization and translates aliases using python's
    encoding aliases
    """
    normed = encodings.normalize_encoding(encoding).lower()
    return encodings.aliases.aliases.get(normed, normed)


def resolve_encoding(encoding_alias):
    """Return the encoding that `encoding_alias` maps to, or ``None``
    if the encoding cannot be interpreted

    >>> import w3lib.encoding
    >>> w3lib.encoding.resolve_encoding('latin1')
    'cp1252'
    >>> w3lib.encoding.resolve_encoding('gb_2312-80')
    'gb18030'
    >>>

    """
    c18n_encoding = _c18n_encoding(encoding_alias)
    translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
    try:
        return codecs.lookup(translated).name
    except LookupError:
        return None


_BOM_TABLE = [
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF8, 'utf-8')
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)


def read_bom(data):
    r"""Read the byte order mark in the text, if present, and
    return the encoding represented by the BOM and the BOM.

    If no BOM can be detected, ``(None, None)`` is returned.

    >>> import w3lib.encoding
    >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
    ('utf-16-be', '\xfe\xff')
    >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
    ('utf-16-le', '\xff\xfe')
    >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
    ('utf-32-be', '\x00\x00\xfe\xff')
    >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
    ('utf-32-le', '\xff\xfe\x00\x00')
    >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
    (None, None)
    >>>

    """

    # common case is no BOM, so this is fast
    if data and data[0] in _FIRST_CHARS:
        for bom, encoding in _BOM_TABLE:
            if data.startswith(bom):
                return encoding, bom
    return None, None


# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))


def to_unicode(data_str, encoding):
    """Convert a str object to unicode using the encoding given

    Characters that cannot be converted will be converted to ``\\ufffd`` (the
    unicode replacement character).
    """
    return data_str.decode(encoding, 'replace' if version_info[0:2] >= (3, 3) else 'w3lib_replace')


def html_to_unicode(content_type_header, html_body_str,
                    default_encoding='utf8', auto_detect_fun=None):
    r'''Convert raw html bytes to unicode

    This attempts to make a reasonable guess at the content encoding of the
    html body, following a similar process to a web browser.

    It will try in order:

    * http content type header
    * BOM (byte-order mark)
    * meta or xml tag declarations
    * auto-detection, if the `auto_detect_fun` keyword argument is not ``None``
    * default encoding in keyword arg (which defaults to utf8)

    If an encoding other than the auto-detected or default encoding is used,
    overrides will be applied, converting some character encodings to more
    suitable alternatives.

    If a BOM is found matching the encoding, it will be stripped.

    The `auto_detect_fun` argument can be used to pass a function that will
    sniff the encoding of the text. This function must take the raw text as an
    argument and return the name of an encoding that python can process, or
    None. To use chardet, for example, you can define the function as::

        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')

    or to use UnicodeDammit (shipped with the BeautifulSoup library)::

        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding

    If the locale of the website or user language preference is known, then a
    better default encoding can be supplied.

    If `content_type_header` is not present, ``None`` can be passed signifying
    that the header was not present.

    This method will not fail; if characters cannot be converted to unicode,
    ``\\ufffd`` (the unicode replacement character) will be inserted instead.

    Returns a tuple of ``(<encoding used>, <unicode_string>)``

    Examples:

    >>> import w3lib.encoding
    >>> w3lib.encoding.html_to_unicode(None,
    ... b"""<!DOCTYPE html>
    ... <head>
    ... <meta charset="UTF-8" />
    ... <meta name="viewport" content="width=device-width" />
    ... <title>Creative Commons France</title>
    ... <link rel='canonical' href='http://creativecommons.fr/' />
    ... <body>
    ... <p>Creative Commons est une organisation \xc3\xa0 but non lucratif
    ... qui a pour dessein de faciliter la diffusion et le partage des oeuvres
    ... tout en accompagnant les nouvelles pratiques de cr\xc3\xa9ation \xc3\xa0 l\xe2\x80\x99\xc3\xa8re numerique.</p>
    ... </body>
    ... </html>""")
    ('utf-8', u'<!DOCTYPE html>\n<head>\n<meta charset="UTF-8" />\n<meta name="viewport" content="width=device-width" />\n<title>Creative Commons France</title>\n<link rel=\'canonical\' href=\'http://creativecommons.fr/\' />\n<body>\n<p>Creative Commons est une organisation \xe0 but non lucratif\nqui a pour dessein de faciliter la diffusion et le partage des oeuvres\ntout en accompagnant les nouvelles pratiques de cr\xe9ation \xe0 l\u2019\xe8re numerique.</p>\n</body>\n</html>')
    >>>

    '''

    enc = http_content_type_encoding(content_type_header)
    bom_enc, bom = read_bom(html_body_str)
    if enc is not None:
        # remove BOM if it agrees with the encoding
        if enc == bom_enc:
            html_body_str = html_body_str[len(bom):]
        elif enc == 'utf-16' or enc == 'utf-32':
            # read endianness from BOM, or default to big endian
            # tools.ietf.org/html/rfc2781 section 4.3
            if bom_enc is not None and bom_enc.startswith(enc):
                enc = bom_enc
                html_body_str = html_body_str[len(bom):]
            else:
                enc += '-be'
        return enc, to_unicode(html_body_str, enc)
    if bom_enc is not None:
        return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc)
    enc = html_body_declared_encoding(html_body_str)
    if enc is None and (auto_detect_fun is not None):
        enc = auto_detect_fun(html_body_str)
    if enc is None:
        enc = default_encoding
    return enc, to_unicode(html_body_str, enc)
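A quick usage sketch for the module above (an editorial illustration, not part of the commit; the header value and byte string are made-up examples). It shows the precedence order from html_to_unicode's docstring: the HTTP header beats a conflicting meta declaration, and the latin-1 alias is remapped to cp1252 via DEFAULT_ENCODING_TRANSLATION.

from w3lib.encoding import html_to_unicode

# header says latin-1, the meta tag says utf-8: the header wins
encoding, text = html_to_unicode(
    'Content-Type: text/html; charset=ISO-8859-1',
    b'<html><head><meta charset="utf-8"></head>caf\xe9</html>')
print(encoding)            # 'cp1252' (latin-1 remapped by resolve_encoding)
print(u'caf\xe9' in text)  # True: \xe9 decoded via cp1252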
venv/lib/python3.9/site-packages/w3lib/form.py (normal file, 68 additions)
@@ -0,0 +1,68 @@
import warnings

import six
if six.PY2:
    from cStringIO import StringIO as BytesIO
else:
    from io import BytesIO

from w3lib.util import unicode_to_str


def encode_multipart(data):
    r"""

    .. warning::

        This function is deprecated and will be removed in future.
        Please use ``urllib3.filepost.encode_multipart_formdata`` instead.

    Encode the given data to be used in a multipart HTTP POST.

    `data` is a dictionary where keys are the field name, and values are
    either strings or tuples as `(filename, content)` for file uploads.

    This code is based on :class:`distutils.command.upload`.

    Returns a `(body, boundary)` tuple where `body` is binary body value,
    and `boundary` is the boundary used (as native string).

    >>> import w3lib.form
    >>> w3lib.form.encode_multipart({'key': 'value'})
    ('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key"\r\n\r\nvalue\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
    >>> w3lib.form.encode_multipart({'key1': 'value1', 'key2': 'value2'})   # doctest: +SKIP
    ('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key2"\r\n\r\nvalue2\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key1"\r\n\r\nvalue1\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
    >>> w3lib.form.encode_multipart({'somekey': ('path/to/filename', b'\xa1\xa2\xa3\xa4\r\n\r')})
    ('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="somekey"; filename="path/to/filename"\r\n\r\n\xa1\xa2\xa3\xa4\r\n\r\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
    >>>

    """

    warnings.warn(
        "`w3lib.form.encode_multipart` function is deprecated and "
        "will be removed in future releases. Please use "
        "`urllib3.filepost.encode_multipart_formdata` instead.",
        DeprecationWarning
    )

    # Build up the MIME payload for the POST data
    boundary = '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254'
    sep_boundary = b'\r\n--' + boundary.encode('ascii')
    end_boundary = sep_boundary + b'--'
    body = BytesIO()
    for key, value in data.items():
        title = u'\r\nContent-Disposition: form-data; name="%s"' % key
        # handle multiple entries for the same name
        if type(value) != type([]):
            value = [value]
        for value in value:
            if type(value) is tuple:
                title += u'; filename="%s"' % value[0]
                value = value[1]
            else:
                value = unicode_to_str(value)  # in distutils: str(value).encode('utf-8')
            body.write(sep_boundary)
            body.write(title.encode('utf-8'))
            body.write(b"\r\n\r\n")
            body.write(value)
    body.write(end_boundary)
    body.write(b"\r\n")
    return body.getvalue(), boundary
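The deprecation warning above points at urllib3; here is a minimal sketch of the suggested replacement, assuming urllib3 is installed in the same venv. Note that, unlike encode_multipart, it returns the full Content-Type header value rather than the bare boundary.

from urllib3.filepost import encode_multipart_formdata

# fields may also map a name to a (filename, content) tuple for uploads
body, content_type = encode_multipart_formdata({'key': 'value'})
# content_type is e.g. 'multipart/form-data; boundary=...'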
venv/lib/python3.9/site-packages/w3lib/html.py (normal file, 336 additions)
@@ -0,0 +1,336 @@
# -*- coding: utf-8 -*-
"""
Functions for dealing with markup text
"""

import warnings
import re
import six
from six import moves

from w3lib.util import to_bytes, to_unicode
from w3lib.url import safe_url_string

_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)

HTML5_WHITESPACE = ' \t\n\r\x0c'


def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    r"""

    .. warning::

        This function is deprecated and will be removed in future.
        Please use :func:`replace_entities` instead.
    """

    warnings.warn(
        "`w3lib.html.remove_entities` function is deprecated and "
        "will be removed in future releases. Please use "
        "`w3lib.html.replace_entities` instead.",
        DeprecationWarning
    )

    return replace_entities(text, keep, remove_illegal, encoding)


def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        groups = m.groupdict()
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            else:
                number = (moves.html_entities.name2codepoint.get(entity_name) or
                          moves.html_entities.name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                else:
                    return six.unichr(number)
            except ValueError:
                pass

        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))


def has_entities(text, encoding=None):
    return bool(_ent_re.search(to_unicode(text, encoding)))


def replace_tags(text, token='', encoding=None):
    """Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas -- fran\\xe7ais -- -- '
    >>>

    """

    return _tag_re.sub(token, to_unicode(text, encoding))


_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
def remove_comments(text, encoding=None):
    """ Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    """

    text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub(u'', text)


def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  =============  ==========================================
    ``which_ones``  ``keep``       what it does
    ==============  =============  ==========================================
    **not empty**   empty          remove all tags in ``which_ones``
    empty           **not empty**  remove all tags except the ones in ``keep``
    empty           empty          remove all tags
    **not empty**   **not empty**  not allowed
    ==============  =============  ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    """
    if which_ones and keep:
        raise ValueError('Cannot use both which_ones and keep')

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag):
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))


def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If it is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    text = to_unicode(text, encoding)
    if which_ones:
        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text


def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
        encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    text = to_unicode(text, encoding)
    for ec in which_ones:
        text = text.replace(ec, to_unicode(replace_by, encoding))
    return text


def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
    """
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
       that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    """

    def _get_fragments(txt, pattern):
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    text = to_unicode(text, encoding)
    ret_text = u''
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, six.string_types):
            # it's not a CDATA (so we try to remove its entities)
            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
        else:
            # it's a CDATA (so we just extract its content)
            ret_text += fragment.group('cdata_d')
    return ret_text


def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)


def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
    """Parse the http-equiv refresh parameter of the HTML meta element from
    the given HTML text and return a tuple ``(interval, url)`` where interval
    is a number with the delay in seconds (or zero if not present) and url is
    a string with the absolute url to redirect to.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None


def strip_html5_whitespace(text):
    r"""
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    """
    return text.strip(HTML5_WHITESPACE)
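A short sketch chaining the helpers above (illustrative only, not part of the diff; the document string is a made-up example):

from w3lib.html import get_base_url, remove_tags, replace_entities

doc = (b'<html><head><base href="http://example.com/a/"></head>'
       b'<body><p>caf&eacute; &amp; bar</p></body></html>')
print(replace_entities(remove_tags(doc)))        # u'caf\xe9 & bar'
print(get_base_url(doc, 'http://example.com/'))  # 'http://example.com/a/'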
venv/lib/python3.9/site-packages/w3lib/http.py (normal file, 99 additions)
@@ -0,0 +1,99 @@
from base64 import urlsafe_b64encode


def headers_raw_to_dict(headers_raw):
    r"""
    Convert raw headers (single multi-line bytestring)
    to a dictionary.

    For example:

    >>> import w3lib.http
    >>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n")   # doctest: +SKIP
    {'Content-type': ['text/html'], 'Accept': ['gzip']}

    Incorrect input:

    >>> w3lib.http.headers_raw_to_dict(b"Content-typt gzip\n\n")
    {}
    >>>

    Argument is ``None`` (return ``None``):

    >>> w3lib.http.headers_raw_to_dict(None)
    >>>

    """

    if headers_raw is None:
        return None
    headers = headers_raw.splitlines()
    headers_tuples = [header.split(b':', 1) for header in headers]

    result_dict = {}
    for header_item in headers_tuples:
        if not len(header_item) == 2:
            continue

        item_key = header_item[0].strip()
        item_value = header_item[1].strip()

        if item_key in result_dict:
            result_dict[item_key].append(item_value)
        else:
            result_dict[item_key] = [item_value]

    return result_dict


def headers_dict_to_raw(headers_dict):
    r"""
    Returns a raw HTTP headers representation of headers

    For example:

    >>> import w3lib.http
    >>> w3lib.http.headers_dict_to_raw({b'Content-type': b'text/html', b'Accept': b'gzip'})   # doctest: +SKIP
    'Content-type: text/html\\r\\nAccept: gzip'
    >>>

    Note that keys and values must be bytes.

    Argument is ``None`` (returns ``None``):

    >>> w3lib.http.headers_dict_to_raw(None)
    >>>

    """

    if headers_dict is None:
        return None
    raw_lines = []
    for key, value in headers_dict.items():
        if isinstance(value, bytes):
            raw_lines.append(b": ".join([key, value]))
        elif isinstance(value, (list, tuple)):
            for v in value:
                raw_lines.append(b": ".join([key, v]))
    return b'\r\n'.join(raw_lines)


def basic_auth_header(username, password, encoding='ISO-8859-1'):
    """
    Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_

    >>> import w3lib.http
    >>> w3lib.http.basic_auth_header('someuser', 'somepass')
    'Basic c29tZXVzZXI6c29tZXBhc3M='

    .. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt

    """

    auth = "%s:%s" % (username, password)
    if not isinstance(auth, bytes):
        # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
        # seems to be the most widely used encoding here. See also:
        # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
        auth = auth.encode(encoding)
    return b'Basic ' + urlsafe_b64encode(auth)
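An illustrative combination of the two helpers above (not part of the diff); note how headers_dict_to_raw repeats the key once per entry for list values:

from w3lib.http import basic_auth_header, headers_dict_to_raw

raw = headers_dict_to_raw({
    b'Authorization': basic_auth_header('someuser', 'somepass'),
    b'Accept': [b'text/html', b'application/xml'],  # list: key emitted twice
})
# b'Authorization: Basic c29tZXVzZXI6c29tZXBhc3M=\r\nAccept: text/html\r\nAccept: application/xml'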
venv/lib/python3.9/site-packages/w3lib/url.py (normal file, 623 additions)
@@ -0,0 +1,623 @@
"""
This module contains general purpose URL functions not found in the standard
library.
"""
import base64
import codecs
import os
import re
import posixpath
import warnings
import string
from collections import namedtuple
import six
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
                                    urldefrag, urlencode, urlparse,
                                    quote, parse_qs, parse_qsl,
                                    ParseResult, unquote, urlunparse)
from six.moves.urllib.request import pathname2url, url2pathname
from w3lib.util import to_bytes, to_native_str, to_unicode


# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error):
    return (to_unicode(quote(error.object[error.start:error.end])), error.end)

codecs.register_error('percentencode', _quote_byte)

# constants from RFC 3986, Section 2.2 and 2.3
RFC3986_GEN_DELIMS = b':/?#[]@'
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode('ascii')
EXTRA_SAFE_CHARS = b'|'  # see https://github.com/scrapy/w3lib/pull/25

_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'

_ascii_tab_newline_re = re.compile(r'[\t\n\r]')  # see https://infra.spec.whatwg.org/#ascii-tab-or-newline


def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode URL path component
    which is then quoted. Otherwise, if quote_path is False, path component
    is not encoded or quoted. Given encoding is used for query string
    or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
    else:
        path = to_native_str(parts.path)

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),
        path,
        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))


_parent_dirs = re.compile(r'/?(\.\./)+')


def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
    """ Make a url for download. This will call safe_url_string
    and then strip the fragment, if one exists. The path will
    be normalised.

    If the path is outside the document root, it will be changed
    to be within the document root.
    """
    safe_url = safe_url_string(url, encoding, path_encoding)
    scheme, netloc, path, query, _ = urlsplit(safe_url)
    if path:
        path = _parent_dirs.sub('', posixpath.normpath(path))
        if safe_url.endswith('/') and not path.endswith('/'):
            path += '/'
    else:
        path = '/'
    return urlunsplit((scheme, netloc, path, query, ''))


def is_url(text):
    return text.partition("://")[0] in ('file', 'http', 'https')


def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
    """Return the value of a url parameter, given the url and parameter name

    General case:

    >>> import w3lib.url
    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
    '200'
    >>>

    Return a default value if the parameter is not found:

    >>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
    'mydefault'
    >>>

    Returns None if `keep_blank_values` not set or 0 (default):

    >>> w3lib.url.url_query_parameter("product.html?id=", "id")
    >>>

    Returns an empty string if `keep_blank_values` set to 1:

    >>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
    ''
    >>>

    """

    queryparams = parse_qs(
        urlsplit(str(url))[3],
        keep_blank_values=keep_blank_values
    )
    return queryparams.get(parameter, [default])[0]


def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
    """Clean URL arguments leaving only those passed in the parameterlist keeping order

    >>> import w3lib.url
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
    'product.html?id=200'
    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
    'product.html?id=200&name=wired'
    >>>

    If `unique` is ``False``, do not remove duplicated keys

    >>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
    'product.html?d=1&d=2&d=3'
    >>>

    If `remove` is ``True``, leave only those **not in parameterlist**.

    >>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
    'product.html?foo=bar&name=wired'
    >>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
    'product.html?name=wired'
    >>>

    By default, URL fragments are removed. If you need to preserve fragments,
    pass the ``keep_fragments`` argument as ``True``.

    >>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
    'http://domain.tld/#123123'

    """

    if isinstance(parameterlist, (six.text_type, bytes)):
        parameterlist = [parameterlist]
    url, fragment = urldefrag(url)
    base, _, query = url.partition('?')
    seen = set()
    querylist = []
    for ksv in query.split(sep):
        if not ksv:
            continue
        k, _, _ = ksv.partition(kvsep)
        if unique and k in seen:
            continue
        elif remove and k in parameterlist:
            continue
        elif not remove and k not in parameterlist:
            continue
        else:
            querylist.append(ksv)
            seen.add(k)
    url = '?'.join([base, sep.join(querylist)]) if querylist else base
    if keep_fragments:
        url += '#' + fragment
    return url


def _add_or_replace_parameters(url, params):
    parsed = urlsplit(url)
    current_args = parse_qsl(parsed.query, keep_blank_values=True)

    new_args = []
    seen_params = set()
    for name, value in current_args:
        if name not in params:
            new_args.append((name, value))
        elif name not in seen_params:
            new_args.append((name, params[name]))
            seen_params.add(name)

    not_modified_args = [(name, value) for name, value in params.items() if name not in seen_params]
    new_args += not_modified_args

    query = urlencode(new_args)
    return urlunsplit(parsed._replace(query=query))


def add_or_replace_parameter(url, name, new_value):
    """Add or replace a parameter in a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
    'http://www.example.com/index.php?arg=v'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
    >>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
    >>>

    """
    return _add_or_replace_parameters(url, {name: new_value})


def add_or_replace_parameters(url, new_parameters):
    """Add or replace parameters in a given url

    >>> import w3lib.url
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
    'http://www.example.com/index.php?arg=v'
    >>> args = {'arg4': 'v4', 'arg3': 'v3new'}
    >>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
    'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
    >>>

    """
    return _add_or_replace_parameters(url, new_parameters)


def path_to_file_uri(path):
    """Convert local filesystem path to legal File URIs as described in:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    x = pathname2url(os.path.abspath(path))
    if os.name == 'nt':
        x = x.replace('|', ':')  # http://bugs.python.org/issue5861
    return 'file:///%s' % x.lstrip('/')


def file_uri_to_path(uri):
    """Convert File URI to local filesystem path according to:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    uri_path = urlparse(uri).path
    return url2pathname(uri_path)


def any_to_uri(uri_or_path):
    """If given a path name, return its File URI, otherwise return it
    unmodified
    """
    if os.path.splitdrive(uri_or_path)[0]:
        return path_to_file_uri(uri_or_path)
    u = urlparse(uri_or_path)
    return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)


# ASCII characters.
_char = set(map(chr, range(127)))

# RFC 2045 token.
_token = r'[{}]+'.format(re.escape(''.join(_char -
                                           # Control characters.
                                           set(map(chr, range(0, 32))) -
                                           # tspecials and space.
                                           set('()<>@,;:\\"/[]?= '))))

# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
    re.escape(''.join(_char - {'"', '\\', '\r'})),
    re.escape(''.join(_char))
)

# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.

# RFC 2397 mediatype.
_mediatype_pattern = re.compile(
    r'{token}/{token}'.format(token=_token).encode()
)
_mediatype_parameter_pattern = re.compile(
    r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
                                                    quoted=_quoted_string
                                                    ).encode()
)

_ParseDataURIResult = namedtuple("ParseDataURIResult",
                                 "media_type media_type_parameters data")


def parse_data_uri(uri):
    """

    Parse a data: URI, returning a 3-tuple of media type, dictionary of media
    type parameters, and data.

    """

    if not isinstance(uri, bytes):
        uri = safe_url_string(uri).encode('ascii')

    try:
        scheme, uri = uri.split(b':', 1)
    except ValueError:
        raise ValueError("invalid URI")
    if scheme.lower() != b'data':
        raise ValueError("not a data URI")

    # RFC 3986 section 2.1 allows percent encoding to escape characters that
    # would be interpreted as delimiters, implying that actual delimiters
    # should not be percent-encoded.
    # Decoding before parsing will allow malformed URIs with percent-encoded
    # delimiters, but it makes parsing easier and should not affect
    # well-formed URIs, as the delimiters used in this URI scheme are not
    # allowed, percent-encoded or not, in tokens.
    if six.PY2:
        uri = unquote(uri)
    else:
        uri = unquote_to_bytes(uri)

    media_type = "text/plain"
    media_type_params = {}

    m = _mediatype_pattern.match(uri)
    if m:
        media_type = m.group().decode()
        uri = uri[m.end():]
    else:
        media_type_params['charset'] = "US-ASCII"

    while True:
        m = _mediatype_parameter_pattern.match(uri)
        if m:
            attribute, value, value_quoted = m.groups()
            if value_quoted:
                value = re.sub(br'\\(.)', br'\1', value_quoted)
            media_type_params[attribute.decode()] = value.decode()
            uri = uri[m.end():]
        else:
            break

    try:
        is_base64, data = uri.split(b',', 1)
    except ValueError:
        raise ValueError("invalid data URI")
    if is_base64:
        if is_base64 != b";base64":
            raise ValueError("invalid data URI")
        data = base64.b64decode(data)

    return _ParseDataURIResult(media_type, media_type_params, data)


__all__ = ["add_or_replace_parameter",
           "add_or_replace_parameters",
           "any_to_uri",
           "canonicalize_url",
           "file_uri_to_path",
           "is_url",
           "parse_data_uri",
           "path_to_file_uri",
           "safe_download_url",
           "safe_url_string",
           "url_query_cleaner",
           "url_query_parameter",

           # this last one is deprecated; include it to be on the safe side
           "urljoin_rfc"]


def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    return (
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),
        quote(to_bytes(parts.params, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars)
    )


def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
                     encoding=None):
    r"""Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths ; non-ASCII characters are percent-encoded
      using UTF-8 (RFC-3986)
    - percent encode query arguments ; non-ASCII characters are percent-encoded
      using passed `encoding` (UTF-8 by default)
    - normalize all spaces (in query arguments) to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless `keep_blank_values` is True)
    - remove fragments (unless `keep_fragments` is True)

    The url passed can be bytes or unicode, while the url returned is
    always a native str (bytes in Python 2, unicode in Python 3).

    >>> import w3lib.url
    >>>
    >>> # sorting query arguments
    >>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
    'http://www.example.com/do?a=50&b=2&b=5&c=3'
    >>>
    >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
    >>> w3lib.url.canonicalize_url(u'http://www.example.com/r\u00e9sum\u00e9')
    'http://www.example.com/r%C3%A9sum%C3%A9'
    >>>

    For more examples, see the tests in `tests/test_url.py`.
    """
    # If supplied `encoding` is not compatible with all characters in `url`,
    # fallback to UTF-8 as safety net.
    # UTF-8 can handle all Unicode characters,
    # so we should be covered regarding URL normalization,
    # if not for proper URL expected by remote website.
    try:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding=encoding)
    except UnicodeEncodeError as e:
        scheme, netloc, path, params, query, fragment = _safe_ParseResult(
            parse_url(url), encoding='utf8')

    # 1. decode query-string as UTF-8 (or keep raw bytes),
    #    sort values,
    #    and percent-encode them back
    if six.PY2:
        keyvals = parse_qsl(query, keep_blank_values)
    else:
        # Python3's urllib.parse.parse_qsl does not work as wanted
        # for percent-encoded characters that do not match passed encoding,
        # they get lost.
        #
        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
        # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
        # instead of \xa3 that you get with Python2's parse_qsl)
        #
        # what we want here is to keep raw bytes, and percent encode them
        # so as to preserve whatever encoding was originally used.
        #
        # See https://tools.ietf.org/html/rfc3987#section-6.4:
        #
        # For example, it is possible to have a URI reference of
        # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
        # document name is encoded in iso-8859-1 based on server settings, but
        # where the fragment identifier is encoded in UTF-8 according to
        # [XPointer]. The IRI corresponding to the above URI would be (in XML
        # notation)
        # "http://www.example.org/r%E9sum%E9.xml#résumé".
        # Similar considerations apply to query parts. The functionality of
        # IRIs (namely, to be able to include non-ASCII characters) can only be
        # used if the query part is encoded in UTF-8.
        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
    keyvals.sort()
    query = urlencode(keyvals)

    # 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
    #    and percent-encode path again (this normalizes to upper-case %XX)
    uqp = _unquotepath(path)
    path = quote(uqp, _safe_chars) or '/'

    fragment = '' if not keep_fragments else fragment

    # every part should be safe already
    return urlunparse((scheme,
                       netloc.lower().rstrip(':'),
                       path,
                       params,
                       query,
                       fragment))


def _unquotepath(path):
    for reserved in ('2f', '2F', '3f', '3F'):
        path = path.replace('%' + reserved, '%25' + reserved.upper())

    if six.PY2:
        # in Python 2, '%a3' becomes '\xa3', which is what we want
        return unquote(path)
    else:
        # in Python 3,
        # standard lib's unquote() does not work for non-UTF-8
        # percent-escaped characters, they get lost.
        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
        #
        # unquote_to_bytes() returns raw bytes instead
        return unquote_to_bytes(path)


def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))


if not six.PY2:
    from urllib.parse import _coerce_args, unquote_to_bytes

    def parse_qsl_to_bytes(qs, keep_blank_values=False):
        """Parse a query given as a string argument.

        Data are returned as a list of name, value pairs as bytes.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings. A
            true value indicates that blanks should be retained as blank
            strings. The default false value indicates that blank values
            are to be ignored and treated as if they were not included.

        """
        # This code is the same as Python3's parse_qsl()
        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
        # except for the unquote(s, encoding, errors) calls replaced
        # with unquote_to_bytes(s)
        qs, _coerce_result = _coerce_args(qs)
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = unquote_to_bytes(name)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = unquote_to_bytes(value)
                value = _coerce_result(value)
                r.append((name, value))
        return r


def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""
    .. warning::

        This function is deprecated and will be removed in future.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm')   # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>


    """

    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning)

    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
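A few illustrative calls into the module above (not part of the commit; the URLs are made up, and the expected values follow from the docstrings):

from w3lib.url import canonicalize_url, safe_url_string, url_query_cleaner

print(canonicalize_url('http://example.com/do?b=2&a=1#frag'))
# 'http://example.com/do?a=1&b=2'  (query sorted, fragment dropped)
print(url_query_cleaner('http://example.com/?id=1&utm_source=x', ['id']))
# 'http://example.com/?id=1'
print(safe_url_string(u'http://example.com/to do'))
# 'http://example.com/to%20do'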
venv/lib/python3.9/site-packages/w3lib/util.py (normal file, 47 additions)
@@ -0,0 +1,47 @@
import six


def str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def unicode_to_str(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, six.text_type):
        return text.encode(encoding, errors)
    return text


def to_unicode(text, encoding=None, errors='strict'):
    """Return the unicode representation of a bytes object `text`. If `text`
    is already a unicode object, return it as-is."""
    if isinstance(text, six.text_type):
        return text
    if not isinstance(text, (bytes, six.text_type)):
        raise TypeError('to_unicode must receive a bytes, str or unicode '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.decode(encoding, errors)


def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`. If `text`
    is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)


def to_native_str(text, encoding=None, errors='strict'):
    """ Return str representation of `text`
    (bytes in Python 2.x and unicode in Python 3.x). """
    if six.PY2:
        return to_bytes(text, encoding, errors)
    else:
        return to_unicode(text, encoding, errors)
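Finally, a tiny sketch of the conversion helpers above (illustrative only; utf-8 is the default encoding in all of them):

from w3lib.util import to_bytes, to_native_str, to_unicode

assert to_bytes(u'caf\xe9') == b'caf\xc3\xa9'   # unicode -> utf-8 bytes
assert to_unicode(b'caf\xc3\xa9') == u'caf\xe9'  # bytes -> unicode
assert to_native_str(b'abc') == 'abc'            # unicode str on Python 3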