Output of the new DB entries

hubobel 2022-01-02 21:50:48 +01:00
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions

View file

@@ -0,0 +1,3 @@
__version__ = "1.22.0"
version_info = tuple(int(v) if v.isdigit() else v
for v in __version__.split('.'))
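
A quick illustration of the parsing rule above (not part of the file): numeric components become ints, while anything non-numeric, such as a hypothetical release-candidate suffix, stays a string.

assert tuple(int(v) if v.isdigit() else v for v in "1.22.0".split('.')) == (1, 22, 0)
assert tuple(int(v) if v.isdigit() else v for v in "1.22.0rc1".split('.')) == (1, 22, '0rc1')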

View file

@@ -0,0 +1,274 @@
# -*- coding: utf-8 -*-
"""
Functions for handling encoding of web pages
"""
import re, codecs, encodings
from sys import version_info
_HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
def http_content_type_encoding(content_type):
"""Extract the encoding in the content-type header
>>> import w3lib.encoding
>>> w3lib.encoding.http_content_type_encoding("Content-Type: text/html; charset=ISO-8859-4")
'iso8859-4'
"""
if content_type:
match = _HEADER_ENCODING_RE.search(content_type)
if match:
return resolve_encoding(match.group(1))
# regexp for parsing HTTP meta tags
_TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
_SKIP_ATTRS = '''(?:\\s+
[^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
(?:\\s*=\\s*
(?: # ' and " are entity encoded (&apos;, &quot;), so no need for \', \"
'[^']*' # attr in '
|
"[^"]*" # attr in "
|
[^'"\\s]+ # attr having no ' nor "
))?
)*?''' # must be used with re.VERBOSE flag
_HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
_CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
_CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
_XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
# check for meta tags, or xml decl. and stop search if a body tag is encountered
_BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
_BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I | re.VERBOSE)
_BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'),
re.I | re.VERBOSE)
def html_body_declared_encoding(html_body_str):
'''Return the encoding specified in meta tags in the html body,
or ``None`` if no suitable encoding was found
>>> import w3lib.encoding
>>> w3lib.encoding.html_body_declared_encoding(
... """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
... "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
... <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
... <head>
... <title>Some title</title>
... <meta http-equiv="content-type" content="text/html;charset=utf-8" />
... </head>
... <body>
... ...
... </body>
... </html>""")
'utf-8'
>>>
'''
# html5 suggests the first 1024 bytes are sufficient; we allow for more
chunk = html_body_str[:4096]
if isinstance(chunk, bytes):
match = _BODY_ENCODING_BYTES_RE.search(chunk)
else:
match = _BODY_ENCODING_STR_RE.search(chunk)
if match:
encoding = match.group('charset') or match.group('charset2') \
or match.group('xmlcharset')
if encoding:
return resolve_encoding(encoding)
# Default encoding translation
# this maps canonicalized encodings to target encodings
# see http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
# in addition, gb18030 supersedes gb2312 & gbk
# the keys are converted using _c18n_encoding and in sorted order
DEFAULT_ENCODING_TRANSLATION = {
'ascii': 'cp1252',
'big5': 'big5hkscs',
'euc_kr': 'cp949',
'gb2312': 'gb18030',
'gb_2312_80': 'gb18030',
'gbk': 'gb18030',
'iso8859_11': 'cp874',
'iso8859_9': 'cp1254',
'latin_1': 'cp1252',
'macintosh': 'mac_roman',
'shift_jis': 'cp932',
'tis_620': 'cp874',
'win_1251': 'cp1251',
'windows_31j': 'cp932',
'win_31j': 'cp932',
'windows_874': 'cp874',
'win_874': 'cp874',
'x_sjis': 'cp932',
'zh_cn': 'gb18030'
}
def _c18n_encoding(encoding):
"""Canonicalize an encoding name
This performs normalization and translates aliases using python's
encoding aliases
"""
normed = encodings.normalize_encoding(encoding).lower()
return encodings.aliases.aliases.get(normed, normed)
def resolve_encoding(encoding_alias):
"""Return the encoding that `encoding_alias` maps to, or ``None``
if the encoding cannot be interpreted
>>> import w3lib.encoding
>>> w3lib.encoding.resolve_encoding('latin1')
'cp1252'
>>> w3lib.encoding.resolve_encoding('gb_2312-80')
'gb18030'
>>>
"""
c18n_encoding = _c18n_encoding(encoding_alias)
translated = DEFAULT_ENCODING_TRANSLATION.get(c18n_encoding, c18n_encoding)
try:
return codecs.lookup(translated).name
except LookupError:
return None
_BOM_TABLE = [
(codecs.BOM_UTF32_BE, 'utf-32-be'),
(codecs.BOM_UTF32_LE, 'utf-32-le'),
(codecs.BOM_UTF16_BE, 'utf-16-be'),
(codecs.BOM_UTF16_LE, 'utf-16-le'),
(codecs.BOM_UTF8, 'utf-8')
]
_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
def read_bom(data):
r"""Read the byte order mark in the text, if present, and
return the encoding represented by the BOM and the BOM.
If no BOM can be detected, ``(None, None)`` is returned.
>>> import w3lib.encoding
>>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
('utf-16-be', '\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
('utf-16-le', '\xff\xfe')
>>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
('utf-32-be', '\x00\x00\xfe\xff')
>>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
('utf-32-le', '\xff\xfe\x00\x00')
>>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
(None, None)
>>>
"""
# common case is no BOM, so this is fast
if data and data[0] in _FIRST_CHARS:
for bom, encoding in _BOM_TABLE:
if data.startswith(bom):
return encoding, bom
return None, None
# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
codecs.register_error('w3lib_replace', lambda exc: (u'\ufffd', exc.end))
def to_unicode(data_str, encoding):
"""Convert a str object to unicode using the encoding given
Characters that cannot be decoded are replaced with ``\\ufffd`` (the
unicode replacement character).
"""
return data_str.decode(encoding, 'replace' if version_info[0:2] >= (3, 3) else 'w3lib_replace')
def html_to_unicode(content_type_header, html_body_str,
default_encoding='utf8', auto_detect_fun=None):
r'''Convert raw html bytes to unicode
This attempts to make a reasonable guess at the content encoding of the
html body, following a similar process to a web browser.
It will try in order:
* http content type header
* BOM (byte-order mark)
* meta or xml tag declarations
* auto-detection, if the `auto_detect_fun` keyword argument is not ``None``
* default encoding in keyword arg (which defaults to utf8)
If an encoding other than the auto-detected or default encoding is used,
overrides will be applied, converting some character encodings to more
suitable alternatives.
If a BOM is found matching the encoding, it will be stripped.
The `auto_detect_fun` argument can be used to pass a function that will
sniff the encoding of the text. This function must take the raw text as an
argument and return the name of an encoding that python can process, or
None. To use chardet, for example, you can define the function as::
auto_detect_fun=lambda x: chardet.detect(x).get('encoding')
or to use UnicodeDammit (shipped with the BeautifulSoup library)::
auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding
If the locale of the website or user language preference is known, then a
better default encoding can be supplied.
If the content type header is not available, ``None`` can be passed for
`content_type_header`.
This method will not fail; if characters cannot be converted to unicode,
``\\ufffd`` (the unicode replacement character) will be inserted instead.
Returns a tuple of ``(<encoding used>, <unicode_string>)``
Examples:
>>> import w3lib.encoding
>>> w3lib.encoding.html_to_unicode(None,
... b"""<!DOCTYPE html>
... <head>
... <meta charset="UTF-8" />
... <meta name="viewport" content="width=device-width" />
... <title>Creative Commons France</title>
... <link rel='canonical' href='http://creativecommons.fr/' />
... <body>
... <p>Creative Commons est une organisation \xc3\xa0 but non lucratif
... qui a pour dessein de faciliter la diffusion et le partage des oeuvres
... tout en accompagnant les nouvelles pratiques de cr\xc3\xa9ation \xc3\xa0 l\xe2\x80\x99\xc3\xa8re numerique.</p>
... </body>
... </html>""")
('utf-8', u'<!DOCTYPE html>\n<head>\n<meta charset="UTF-8" />\n<meta name="viewport" content="width=device-width" />\n<title>Creative Commons France</title>\n<link rel=\'canonical\' href=\'http://creativecommons.fr/\' />\n<body>\n<p>Creative Commons est une organisation \xe0 but non lucratif\nqui a pour dessein de faciliter la diffusion et le partage des oeuvres\ntout en accompagnant les nouvelles pratiques de cr\xe9ation \xe0 l\u2019\xe8re numerique.</p>\n</body>\n</html>')
>>>
'''
enc = http_content_type_encoding(content_type_header)
bom_enc, bom = read_bom(html_body_str)
if enc is not None:
# remove BOM if it agrees with the encoding
if enc == bom_enc:
html_body_str = html_body_str[len(bom):]
elif enc == 'utf-16' or enc == 'utf-32':
# read endianness from BOM, or default to big endian
# tools.ietf.org/html/rfc2781 section 4.3
if bom_enc is not None and bom_enc.startswith(enc):
enc = bom_enc
html_body_str = html_body_str[len(bom):]
else:
enc += '-be'
return enc, to_unicode(html_body_str, enc)
if bom_enc is not None:
return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc)
enc = html_body_declared_encoding(html_body_str)
if enc is None and (auto_detect_fun is not None):
enc = auto_detect_fun(html_body_str)
if enc is None:
enc = default_encoding
return enc, to_unicode(html_body_str, enc)
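
A minimal usage sketch (not part of the vendored file) of the detection order described by html_to_unicode() above: an HTTP Content-Type header takes precedence over a meta declaration, and both names resolve through resolve_encoding(). The byte strings and expected values are illustrative.

from w3lib.encoding import html_to_unicode

body = (b'<html><head>'
        b'<meta http-equiv="content-type" content="text/html; charset=utf-8"/>'
        b'</head><body>caf\xc3\xa9</body></html>')

# Content-Type header present: it wins over the meta declaration.
enc, text = html_to_unicode("text/html; charset=UTF-8", body)
assert enc == 'utf-8' and u'caf\xe9' in text

# No header passed: the <meta> declaration is used instead.
enc, text = html_to_unicode(None, body)
assert enc == 'utf-8'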

View file

@@ -0,0 +1,68 @@
import warnings
import six
if six.PY2:
from cStringIO import StringIO as BytesIO
else:
from io import BytesIO
from w3lib.util import unicode_to_str
def encode_multipart(data):
r"""
.. warning::
This function is deprecated and will be removed in future.
Please use ``urllib3.filepost.encode_multipart_formdata`` instead.
Encode the given data to be used in a multipart HTTP POST.
`data` is a dictionary where keys are the field names, and values are
either strings or `(filename, content)` tuples for file uploads.
This code is based on :class:`distutils.command.upload`.
Returns a `(body, boundary)` tuple where `body` is the binary body value,
and `boundary` is the boundary used (as a native string).
>>> import w3lib.form
>>> w3lib.form.encode_multipart({'key': 'value'})
('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key"\r\n\r\nvalue\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
>>> w3lib.form.encode_multipart({'key1': 'value1', 'key2': 'value2'}) # doctest: +SKIP
('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key2"\r\n\r\nvalue2\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="key1"\r\n\r\nvalue1\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
>>> w3lib.form.encode_multipart({'somekey': ('path/to/filename', b'\xa1\xa2\xa3\xa4\r\n\r')})
('\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254\r\nContent-Disposition: form-data; name="somekey"; filename="path/to/filename"\r\n\r\n\xa1\xa2\xa3\xa4\r\n\r\r\n----------------GHSKFJDLGDS7543FJKLFHRE75642756743254--\r\n', '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254')
>>>
"""
warnings.warn(
"`w3lib.form.encode_multipart` function is deprecated and "
"will be removed in future releases. Please use "
"`urllib3.filepost.encode_multipart_formdata` instead.",
DeprecationWarning
)
# Build up the MIME payload for the POST data
boundary = '--------------GHSKFJDLGDS7543FJKLFHRE75642756743254'
sep_boundary = b'\r\n--' + boundary.encode('ascii')
end_boundary = sep_boundary + b'--'
body = BytesIO()
for key, value in data.items():
title = u'\r\nContent-Disposition: form-data; name="%s"' % key
# handle multiple entries for the same name
if type(value) != type([]):
value = [value]
for value in value:
if type(value) is tuple:
title += u'; filename="%s"' % value[0]
value = value[1]
else:
value = unicode_to_str(value) # in distutils: str(value).encode('utf-8')
body.write(sep_boundary)
body.write(title.encode('utf-8'))
body.write(b"\r\n\r\n")
body.write(value)
body.write(end_boundary)
body.write(b"\r\n")
return body.getvalue(), boundary
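
Since encode_multipart() is deprecated in favour of urllib3, here is a short sketch of the suggested replacement (not part of the vendored file, and assuming urllib3 is installed). Note that urllib3's helper returns (body, content_type) rather than (body, boundary).

from urllib3.filepost import encode_multipart_formdata

body, content_type = encode_multipart_formdata({
    'key': 'value',
    'somekey': ('path/to/filename', b'\xa1\xa2\xa3\xa4'),
})
headers = {'Content-Type': content_type}  # e.g. 'multipart/form-data; boundary=...'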

View file

@@ -0,0 +1,336 @@
# -*- coding: utf-8 -*-
"""
Functions for dealing with markup text
"""
import warnings
import re
import six
from six import moves
from w3lib.util import to_bytes, to_unicode
from w3lib.url import safe_url_string
_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
HTML5_WHITESPACE = ' \t\n\r\x0c'
def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
r"""
.. warning::
This function is deprecated and will be removed in future.
Please use :func:`replace_entities` instead.
"""
warnings.warn(
"`w3lib.html.remove_entities` function is deprecated and "
"will be removed in future releases. Please use "
"`w3lib.html.replace_entities` instead.",
DeprecationWarning
)
return replace_entities(text, keep, remove_illegal, encoding)
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
u"""Remove entities from the given `text` by converting them to their
corresponding unicode character.
`text` can be a unicode string or a byte string encoded in the given
`encoding` (which defaults to 'utf-8').
If `keep` is passed (with a list of entity names) those entities will
be kept (they won't be removed).
It supports both numeric entities (``&#nnnn;`` and ``&#xhhhh;``)
and named entities (such as ``&nbsp;`` or ``&gt;``).
If `remove_illegal` is ``True``, entities that can't be converted are removed.
If `remove_illegal` is ``False``, entities that can't be converted are kept "as
is". For more information see the tests.
Always returns a unicode string (with the entities removed).
>>> import w3lib.html
>>> w3lib.html.replace_entities(b'Price: &pound;100')
u'Price: \\xa3100'
>>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
Price: £100
>>>
"""
def convert_entity(m):
groups = m.groupdict()
if groups.get('dec'):
number = int(groups['dec'], 10)
elif groups.get('hex'):
number = int(groups['hex'], 16)
elif groups.get('named'):
entity_name = groups['named']
if entity_name.lower() in keep:
return m.group(0)
else:
number = (moves.html_entities.name2codepoint.get(entity_name) or
moves.html_entities.name2codepoint.get(entity_name.lower()))
if number is not None:
# Numeric character references in the 80-9F range are typically
# interpreted by browsers as representing the characters mapped
# to bytes 80-9F in the Windows-1252 encoding. For more info
# see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
try:
if 0x80 <= number <= 0x9f:
return six.int2byte(number).decode('cp1252')
else:
return six.unichr(number)
except ValueError:
pass
return u'' if remove_illegal and groups.get('semicolon') else m.group(0)
return _ent_re.sub(convert_entity, to_unicode(text, encoding))
def has_entities(text, encoding=None):
return bool(_ent_re.search(to_unicode(text, encoding)))
def replace_tags(text, token='', encoding=None):
"""Replace all markup tags found in the given `text` by the given token.
By default `token` is an empty string so it just removes all tags.
`text` can be a unicode string or a regular string encoded as `encoding`
(or ``'utf-8'`` if `encoding` is not given.)
Always returns a unicode string.
Examples:
>>> import w3lib.html
>>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
u'This text contains some tag'
>>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
u' -- Je ne parle pas -- fran\\xe7ais -- -- '
>>>
"""
return _tag_re.sub(token, to_unicode(text, encoding))
_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
def remove_comments(text, encoding=None):
""" Remove HTML Comments.
>>> import w3lib.html
>>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
u'test whatever'
>>>
"""
text = to_unicode(text, encoding)
return _REMOVECOMMENTS_RE.sub(u'', text)
def remove_tags(text, which_ones=(), keep=(), encoding=None):
""" Remove HTML Tags only.
`which_ones` and `keep` are both tuples; there are four cases:
============== ============= ==========================================
``which_ones`` ``keep`` what it does
============== ============= ==========================================
**not empty** empty remove all tags in ``which_ones``
empty **not empty** remove all tags except the ones in ``keep``
empty empty remove all tags
**not empty** **not empty** not allowed
============== ============= ==========================================
Remove all tags:
>>> import w3lib.html
>>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
>>> w3lib.html.remove_tags(doc)
u'This is a link: example'
>>>
Keep only some tags:
>>> w3lib.html.remove_tags(doc, keep=('div',))
u'<div>This is a link: example</div>'
>>>
Remove only specific tags:
>>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
u'<div><p>This is a link: example</p></div>'
>>>
You can't remove some and keep some:
>>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
Traceback (most recent call last):
...
ValueError: Cannot use both which_ones and keep
>>>
"""
if which_ones and keep:
raise ValueError('Cannot use both which_ones and keep')
which_ones = {tag.lower() for tag in which_ones}
keep = {tag.lower() for tag in keep}
def will_remove(tag):
tag = tag.lower()
if which_ones:
return tag in which_ones
else:
return tag not in keep
def remove_tag(m):
tag = m.group(1)
return u'' if will_remove(tag) else m.group(0)
regex = '</?([^ >/]+).*?>'
retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
return retags.sub(remove_tag, to_unicode(text, encoding))
def remove_tags_with_content(text, which_ones=(), encoding=None):
"""Remove tags and their content.
`which_ones` is a tuple of which tags to remove including their content.
If it is empty, the string is returned unmodified.
>>> import w3lib.html
>>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
>>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
u'<div><p> <a href="http://www.example.com">example</a></p></div>'
>>>
"""
text = to_unicode(text, encoding)
if which_ones:
tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
text = retags.sub(u'', text)
return text
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
encoding=None):
"""Remove escape characters.
`which_ones` is a tuple of which escape characters we want to remove.
By default removes ``\\n``, ``\\t``, ``\\r``.
`replace_by` is the string to replace the escape characters by.
It defaults to ``''``, meaning the escape characters are removed.
"""
text = to_unicode(text, encoding)
for ec in which_ones:
text = text.replace(ec, to_unicode(replace_by, encoding))
return text
def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
"""
This function receives markup text (either a unicode string or
a UTF-8 encoded byte string) and does the following:
1. removes entities (except the ones in `keep`) from any part of it
that is not inside a CDATA
2. searches for CDATAs and extracts their text (if any) without modifying it.
3. removes the found CDATAs
"""
def _get_fragments(txt, pattern):
offset = 0
for match in pattern.finditer(txt):
match_s, match_e = match.span(1)
yield txt[offset:match_s]
yield match
offset = match_e
yield txt[offset:]
text = to_unicode(text, encoding)
ret_text = u''
for fragment in _get_fragments(text, _cdata_re):
if isinstance(fragment, six.string_types):
# it's not a CDATA (so we try to remove its entities)
ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
else:
# it's a CDATA (so we just extract its content)
ret_text += fragment.group('cdata_d')
return ret_text
def get_base_url(text, baseurl='', encoding='utf-8'):
"""Return the base url if declared in the given HTML `text`,
relative to the given base url.
If no base url is found, the given `baseurl` is returned.
"""
text = to_unicode(text, encoding)
m = _baseurl_re.search(text)
if m:
return moves.urllib.parse.urljoin(
safe_url_string(baseurl),
safe_url_string(m.group(1), encoding=encoding)
)
else:
return safe_url_string(baseurl)
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
"""Return the http-equiv parameter of the HTML meta element from the given
HTML text and return a tuple ``(interval, url)`` where interval is an integer
containing the delay in seconds (or zero if not present) and url is a
string with the absolute url to redirect.
If no meta redirect is found, ``(None, None)`` is returned.
"""
if six.PY2:
baseurl = to_bytes(baseurl, encoding)
try:
text = to_unicode(text, encoding)
except UnicodeDecodeError:
print(text)
raise
text = remove_tags_with_content(text, ignore_tags)
text = remove_comments(replace_entities(text))
m = _meta_refresh_re.search(text)
if m:
interval = float(m.group('int'))
url = safe_url_string(m.group('url').strip(' "\''), encoding)
url = moves.urllib.parse.urljoin(baseurl, url)
return interval, url
else:
return None, None
def strip_html5_whitespace(text):
r"""
Strip all leading and trailing space characters (as defined in
https://www.w3.org/TR/html5/infrastructure.html#space-character).
Such stripping is useful e.g. for processing HTML element attributes which
contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
defines them as "valid URL potentially surrounded by spaces"
or "valid non-empty URL potentially surrounded by spaces".
>>> strip_html5_whitespace(' hello\n')
'hello'
"""
return text.strip(HTML5_WHITESPACE)
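
An end-to-end sketch (not part of the vendored file) chaining several of the helpers above; the input document and expected values are illustrative.

from w3lib.html import (replace_entities, remove_tags, remove_tags_with_content,
                        get_base_url, strip_html5_whitespace)

doc = u'<base href="http://example.com/"><p>5 &gt; 3 &amp; 2 &lt; 4</p><script>x()</script>'

# strip <script> including its content, drop the remaining tags, then decode entities
text = replace_entities(remove_tags(remove_tags_with_content(doc, ('script',))))
assert text == u'5 > 3 & 2 < 4'

# the declared <base href> wins over the fallback base url
assert get_base_url(doc, baseurl='http://fallback.example/') == 'http://example.com/'

# trim HTML5 whitespace around an attribute value such as href
assert strip_html5_whitespace('  /relative/path\n') == '/relative/path'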

View file

@@ -0,0 +1,99 @@
from base64 import urlsafe_b64encode
def headers_raw_to_dict(headers_raw):
r"""
Convert raw headers (single multi-line bytestring)
to a dictionary.
For example:
>>> import w3lib.http
>>> w3lib.http.headers_raw_to_dict(b"Content-type: text/html\n\rAccept: gzip\n\n") # doctest: +SKIP
{'Content-type': ['text/html'], 'Accept': ['gzip']}
Incorrect input:
>>> w3lib.http.headers_raw_to_dict(b"Content-typt gzip\n\n")
{}
>>>
Argument is ``None`` (return ``None``):
>>> w3lib.http.headers_raw_to_dict(None)
>>>
"""
if headers_raw is None:
return None
headers = headers_raw.splitlines()
headers_tuples = [header.split(b':', 1) for header in headers]
result_dict = {}
for header_item in headers_tuples:
if not len(header_item) == 2:
continue
item_key = header_item[0].strip()
item_value = header_item[1].strip()
if item_key in result_dict:
result_dict[item_key].append(item_value)
else:
result_dict[item_key] = [item_value]
return result_dict
def headers_dict_to_raw(headers_dict):
r"""
Return a raw HTTP headers representation of the given headers dictionary
For example:
>>> import w3lib.http
>>> w3lib.http.headers_dict_to_raw({b'Content-type': b'text/html', b'Accept': b'gzip'}) # doctest: +SKIP
'Content-type: text/html\\r\\nAccept: gzip'
>>>
Note that keys and values must be bytes.
Argument is ``None`` (returns ``None``):
>>> w3lib.http.headers_dict_to_raw(None)
>>>
"""
if headers_dict is None:
return None
raw_lines = []
for key, value in headers_dict.items():
if isinstance(value, bytes):
raw_lines.append(b": ".join([key, value]))
elif isinstance(value, (list, tuple)):
for v in value:
raw_lines.append(b": ".join([key, v]))
return b'\r\n'.join(raw_lines)
def basic_auth_header(username, password, encoding='ISO-8859-1'):
"""
Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_
>>> import w3lib.http
>>> w3lib.http.basic_auth_header('someuser', 'somepass')
'Basic c29tZXVzZXI6c29tZXBhc3M='
.. _HTTP Basic Access Authentication (RFC 2617): http://www.ietf.org/rfc/rfc2617.txt
"""
auth = "%s:%s" % (username, password)
if not isinstance(auth, bytes):
# XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
# seems to be the most widely used encoding here. See also:
# http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
auth = auth.encode(encoding)
return b'Basic ' + urlsafe_b64encode(auth)
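
A round-trip sketch (not part of the vendored file) for the helpers above, assuming Python 3: headers_raw_to_dict() collects repeated header names into lists, which headers_dict_to_raw() accepts alongside single bytes values.

from w3lib.http import headers_raw_to_dict, headers_dict_to_raw, basic_auth_header

parsed = headers_raw_to_dict(b"Content-Type: text/html\r\nAccept: gzip\r\nAccept: br\r\n")
assert parsed == {b'Content-Type': [b'text/html'], b'Accept': [b'gzip', b'br']}

raw = headers_dict_to_raw(parsed)
assert raw == b'Content-Type: text/html\r\nAccept: gzip\r\nAccept: br'

assert basic_auth_header('someuser', 'somepass') == b'Basic c29tZXVzZXI6c29tZXBhc3M='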

View file

@@ -0,0 +1,623 @@
"""
This module contains general purpose URL functions not found in the standard
library.
"""
import base64
import codecs
import os
import re
import posixpath
import warnings
import string
from collections import namedtuple
import six
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
urldefrag, urlencode, urlparse,
quote, parse_qs, parse_qsl,
ParseResult, unquote, urlunparse)
from six.moves.urllib.request import pathname2url, url2pathname
from w3lib.util import to_bytes, to_native_str, to_unicode
# error handling function for bytes-to-Unicode decoding errors with URLs
def _quote_byte(error):
return (to_unicode(quote(error.object[error.start:error.end])), error.end)
codecs.register_error('percentencode', _quote_byte)
# constants from RFC 3986, Section 2.2 and 2.3
RFC3986_GEN_DELIMS = b':/?#[]@'
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode('ascii')
EXTRA_SAFE_CHARS = b'|' # see https://github.com/scrapy/w3lib/pull/25
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
_ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
"""Convert the given URL into a legal URL by escaping unsafe characters
according to RFC-3986. Also, ASCII tabs and newlines are removed
as per https://url.spec.whatwg.org/#url-parsing.
If a bytes URL is given, it is first converted to `str` using the given
encoding (which defaults to 'utf-8'). If `quote_path` is True (the default),
`path_encoding` ('utf-8' by default) is used to encode the URL path component,
which is then quoted. Otherwise, if `quote_path` is False, the path component
is neither encoded nor quoted. The given encoding is used for the query string
and form data.
When passing an encoding, you should use the encoding of the
original page (the page the URL was extracted from).
Calling this function on an already "safe" URL will return the URL
unmodified.
Always returns a native `str` (bytes in Python2, unicode in Python3).
"""
# Python3's urlsplit() chokes on bytes input with non-ASCII chars,
# so let's decode (to Unicode) using page encoding:
# - it is assumed that a raw bytes input comes from a document
# encoded with the supplied encoding (or UTF8 by default)
# - if the supplied (or default) encoding chokes,
# percent-encode offending bytes
decoded = to_unicode(url, encoding=encoding, errors='percentencode')
parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))
# IDNA encoding can fail for too long labels (>63 characters)
# or missing labels (e.g. http://.example.com)
try:
netloc = parts.netloc.encode('idna')
except UnicodeError:
netloc = parts.netloc
# default encoding for path component SHOULD be UTF-8
if quote_path:
path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
else:
path = to_native_str(parts.path)
# quote() in Python2 return type follows input type;
# quote() in Python3 always returns Unicode (native str)
return urlunsplit((
to_native_str(parts.scheme),
to_native_str(netloc).rstrip(':'),
path,
# encoding of query and fragment follows page encoding
# or form-charset (if known and passed)
quote(to_bytes(parts.query, encoding), _safe_chars),
quote(to_bytes(parts.fragment, encoding), _safe_chars),
))
_parent_dirs = re.compile(r'/?(\.\./)+')
def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
""" Make a url for download. This will call safe_url_string
and then strip the fragment, if one exists. The path will
be normalised.
If the path is outside the document root, it will be changed
to be within the document root.
"""
safe_url = safe_url_string(url, encoding, path_encoding)
scheme, netloc, path, query, _ = urlsplit(safe_url)
if path:
path = _parent_dirs.sub('', posixpath.normpath(path))
if safe_url.endswith('/') and not path.endswith('/'):
path += '/'
else:
path = '/'
return urlunsplit((scheme, netloc, path, query, ''))
def is_url(text):
return text.partition("://")[0] in ('file', 'http', 'https')
def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
"""Return the value of a url parameter, given the url and parameter name
General case:
>>> import w3lib.url
>>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "id")
'200'
>>>
Return a default value if the parameter is not found:
>>> w3lib.url.url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
'mydefault'
>>>
Returns None if `keep_blank_values` not set or 0 (default):
>>> w3lib.url.url_query_parameter("product.html?id=", "id")
>>>
Returns an empty string if `keep_blank_values` set to 1:
>>> w3lib.url.url_query_parameter("product.html?id=", "id", keep_blank_values=1)
''
>>>
"""
queryparams = parse_qs(
urlsplit(str(url))[3],
keep_blank_values=keep_blank_values
)
return queryparams.get(parameter, [default])[0]
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
"""Clean URL arguments leaving only those passed in the parameterlist keeping order
>>> import w3lib.url
>>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ('id',))
'product.html?id=200'
>>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
'product.html?id=200&name=wired'
>>>
If `unique` is ``False``, do not remove duplicated keys
>>> w3lib.url.url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'], unique=False)
'product.html?d=1&d=2&d=3'
>>>
If `remove` is ``True``, leave only those **not in parameterlist**.
>>> w3lib.url.url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'], remove=True)
'product.html?foo=bar&name=wired'
>>> w3lib.url.url_query_cleaner("product.html?id=2&foo=bar&name=wired", ['id', 'foo'], remove=True)
'product.html?name=wired'
>>>
By default, URL fragments are removed. If you need to preserve fragments,
pass the ``keep_fragments`` argument as ``True``.
>>> w3lib.url.url_query_cleaner('http://domain.tld/?bla=123#123123', ['bla'], remove=True, keep_fragments=True)
'http://domain.tld/#123123'
"""
if isinstance(parameterlist, (six.text_type, bytes)):
parameterlist = [parameterlist]
url, fragment = urldefrag(url)
base, _, query = url.partition('?')
seen = set()
querylist = []
for ksv in query.split(sep):
if not ksv:
continue
k, _, _ = ksv.partition(kvsep)
if unique and k in seen:
continue
elif remove and k in parameterlist:
continue
elif not remove and k not in parameterlist:
continue
else:
querylist.append(ksv)
seen.add(k)
url = '?'.join([base, sep.join(querylist)]) if querylist else base
if keep_fragments:
url += '#' + fragment
return url
def _add_or_replace_parameters(url, params):
parsed = urlsplit(url)
current_args = parse_qsl(parsed.query, keep_blank_values=True)
new_args = []
seen_params = set()
for name, value in current_args:
if name not in params:
new_args.append((name, value))
elif name not in seen_params:
new_args.append((name, params[name]))
seen_params.add(name)
not_modified_args = [(name, value) for name, value in params.items() if name not in seen_params]
new_args += not_modified_args
query = urlencode(new_args)
return urlunsplit(parsed._replace(query=query))
def add_or_replace_parameter(url, name, new_value):
"""Add or remove a parameter to a given url
>>> import w3lib.url
>>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php', 'arg', 'v')
'http://www.example.com/index.php?arg=v'
>>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg4', 'v4')
'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3&arg4=v4'
>>> w3lib.url.add_or_replace_parameter('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', 'arg3', 'v3new')
'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new'
>>>
"""
return _add_or_replace_parameters(url, {name: new_value})
def add_or_replace_parameters(url, new_parameters):
"""Add or remove a parameters to a given url
>>> import w3lib.url
>>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php', {'arg': 'v'})
'http://www.example.com/index.php?arg=v'
>>> args = {'arg4': 'v4', 'arg3': 'v3new'}
>>> w3lib.url.add_or_replace_parameters('http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3', args)
'http://www.example.com/index.php?arg1=v1&arg2=v2&arg3=v3new&arg4=v4'
>>>
"""
return _add_or_replace_parameters(url, new_parameters)
def path_to_file_uri(path):
"""Convert local filesystem path to legal File URIs as described in:
http://en.wikipedia.org/wiki/File_URI_scheme
"""
x = pathname2url(os.path.abspath(path))
if os.name == 'nt':
x = x.replace('|', ':') # http://bugs.python.org/issue5861
return 'file:///%s' % x.lstrip('/')
def file_uri_to_path(uri):
"""Convert File URI to local filesystem path according to:
http://en.wikipedia.org/wiki/File_URI_scheme
"""
uri_path = urlparse(uri).path
return url2pathname(uri_path)
def any_to_uri(uri_or_path):
"""If given a path name, return its File URI, otherwise return it
unmodified
"""
if os.path.splitdrive(uri_or_path)[0]:
return path_to_file_uri(uri_or_path)
u = urlparse(uri_or_path)
return uri_or_path if u.scheme else path_to_file_uri(uri_or_path)
# ASCII characters.
_char = set(map(chr, range(127)))
# RFC 2045 token.
_token = r'[{}]+'.format(re.escape(''.join(_char -
# Control characters.
set(map(chr, range(0, 32))) -
# tspecials and space.
set('()<>@,;:\\"/[]?= '))))
# RFC 822 quoted-string, without surrounding quotation marks.
_quoted_string = r'(?:[{}]|(?:\\[{}]))*'.format(
re.escape(''.join(_char - {'"', '\\', '\r'})),
re.escape(''.join(_char))
)
# Encode the regular expression strings to make them into bytes, as Python 3
# bytes have no format() method, but bytes must be passed to re.compile() in
# order to make a pattern object that can be used to match on bytes.
# RFC 2397 mediatype.
_mediatype_pattern = re.compile(
r'{token}/{token}'.format(token=_token).encode()
)
_mediatype_parameter_pattern = re.compile(
r';({token})=(?:({token})|"({quoted})")'.format(token=_token,
quoted=_quoted_string
).encode()
)
_ParseDataURIResult = namedtuple("ParseDataURIResult",
"media_type media_type_parameters data")
def parse_data_uri(uri):
"""
Parse a data: URI, returning a 3-tuple of media type, dictionary of media
type parameters, and data.
"""
if not isinstance(uri, bytes):
uri = safe_url_string(uri).encode('ascii')
try:
scheme, uri = uri.split(b':', 1)
except ValueError:
raise ValueError("invalid URI")
if scheme.lower() != b'data':
raise ValueError("not a data URI")
# RFC 3986 section 2.1 allows percent encoding to escape characters that
# would be interpreted as delimiters, implying that actual delimiters
# should not be percent-encoded.
# Decoding before parsing will allow malformed URIs with percent-encoded
# delimiters, but it makes parsing easier and should not affect
# well-formed URIs, as the delimiters used in this URI scheme are not
# allowed, percent-encoded or not, in tokens.
if six.PY2:
uri = unquote(uri)
else:
uri = unquote_to_bytes(uri)
media_type = "text/plain"
media_type_params = {}
m = _mediatype_pattern.match(uri)
if m:
media_type = m.group().decode()
uri = uri[m.end():]
else:
media_type_params['charset'] = "US-ASCII"
while True:
m = _mediatype_parameter_pattern.match(uri)
if m:
attribute, value, value_quoted = m.groups()
if value_quoted:
value = re.sub(br'\\(.)', r'\1', value_quoted)
media_type_params[attribute.decode()] = value.decode()
uri = uri[m.end():]
else:
break
try:
is_base64, data = uri.split(b',', 1)
except ValueError:
raise ValueError("invalid data URI")
if is_base64:
if is_base64 != b";base64":
raise ValueError("invalid data URI")
data = base64.b64decode(data)
return _ParseDataURIResult(media_type, media_type_params, data)
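
Two illustrative calls (not part of the vendored file) showing the fields of the named tuple returned by parse_data_uri(); the payloads are made up for the example.

from w3lib.url import parse_data_uri

# base64-encoded payload with an explicit media type
result = parse_data_uri(u"data:text/plain;base64,SGVsbG8=")
# -> media_type 'text/plain', media_type_parameters {}, data b'Hello'

# plain percent-encoded payload; RFC 2397 defaults apply
result = parse_data_uri(u"data:,A%20brief%20note")
# -> media_type 'text/plain', media_type_parameters {'charset': 'US-ASCII'}, data b'A brief note'
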
__all__ = ["add_or_replace_parameter",
"add_or_replace_parameters",
"any_to_uri",
"canonicalize_url",
"file_uri_to_path",
"is_url",
"parse_data_uri",
"path_to_file_uri",
"safe_download_url",
"safe_url_string",
"url_query_cleaner",
"url_query_parameter",
# this last one is deprecated ; include it to be on the safe side
"urljoin_rfc"]
def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
# IDNA encoding can fail for too long labels (>63 characters)
# or missing labels (e.g. http://.example.com)
try:
netloc = parts.netloc.encode('idna')
except UnicodeError:
netloc = parts.netloc
return (
to_native_str(parts.scheme),
to_native_str(netloc),
# default encoding for path component SHOULD be UTF-8
quote(to_bytes(parts.path, path_encoding), _safe_chars),
quote(to_bytes(parts.params, path_encoding), _safe_chars),
# encoding of query and fragment follows page encoding
# or form-charset (if known and passed)
quote(to_bytes(parts.query, encoding), _safe_chars),
quote(to_bytes(parts.fragment, encoding), _safe_chars)
)
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
encoding=None):
r"""Canonicalize the given url by applying the following procedures:
- sort query arguments, first by key, then by value
- percent encode paths ; non-ASCII characters are percent-encoded
using UTF-8 (RFC-3986)
- percent encode query arguments ; non-ASCII characters are percent-encoded
using passed `encoding` (UTF-8 by default)
- normalize all spaces (in query arguments) to '+' (plus symbol)
- normalize percent encodings case (%2f -> %2F)
- remove query arguments with blank values (unless `keep_blank_values` is True)
- remove fragments (unless `keep_fragments` is True)
The url passed can be bytes or unicode, while the url returned is
always a native str (bytes in Python 2, unicode in Python 3).
>>> import w3lib.url
>>>
>>> # sorting query arguments
>>> w3lib.url.canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50')
'http://www.example.com/do?a=50&b=2&b=5&c=3'
>>>
>>> # UTF-8 conversion + percent-encoding of non-ASCII characters
>>> w3lib.url.canonicalize_url(u'http://www.example.com/r\u00e9sum\u00e9')
'http://www.example.com/r%C3%A9sum%C3%A9'
>>>
For more examples, see the tests in `tests/test_url.py`.
"""
# If supplied `encoding` is not compatible with all characters in `url`,
# fallback to UTF-8 as safety net.
# UTF-8 can handle all Unicode characters,
# so we should be covered regarding URL normalization,
# if not for proper URL expected by remote website.
try:
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
parse_url(url), encoding=encoding)
except UnicodeEncodeError as e:
scheme, netloc, path, params, query, fragment = _safe_ParseResult(
parse_url(url), encoding='utf8')
# 1. decode query-string as UTF-8 (or keep raw bytes),
# sort values,
# and percent-encode them back
if six.PY2:
keyvals = parse_qsl(query, keep_blank_values)
else:
# Python3's urllib.parse.parse_qsl does not work as wanted
# for percent-encoded characters that do not match passed encoding,
# they get lost.
#
# e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
# (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
# instead of \xa3 that you get with Python2's parse_qsl)
#
# what we want here is to keep raw bytes, and percent encode them
# so as to preserve whatever encoding was originally used.
#
# See https://tools.ietf.org/html/rfc3987#section-6.4:
#
# For example, it is possible to have a URI reference of
# "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
# document name is encoded in iso-8859-1 based on server settings, but
# where the fragment identifier is encoded in UTF-8 according to
# [XPointer]. The IRI corresponding to the above URI would be (in XML
# notation)
# "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
# Similar considerations apply to query parts. The functionality of
# IRIs (namely, to be able to include non-ASCII characters) can only be
# used if the query part is encoded in UTF-8.
keyvals = parse_qsl_to_bytes(query, keep_blank_values)
keyvals.sort()
query = urlencode(keyvals)
# 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
# and percent-encode path again (this normalizes to upper-case %XX)
uqp = _unquotepath(path)
path = quote(uqp, _safe_chars) or '/'
fragment = '' if not keep_fragments else fragment
# every part should be safe already
return urlunparse((scheme,
netloc.lower().rstrip(':'),
path,
params,
query,
fragment))
def _unquotepath(path):
for reserved in ('2f', '2F', '3f', '3F'):
path = path.replace('%' + reserved, '%25' + reserved.upper())
if six.PY2:
# in Python 2, '%a3' becomes '\xa3', which is what we want
return unquote(path)
else:
# in Python 3,
# standard lib's unquote() does not work for non-UTF-8
# percent-escaped characters, they get lost.
# e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
#
# unquote_to_bytes() returns raw bytes instead
return unquote_to_bytes(path)
def parse_url(url, encoding=None):
"""Return urlparsed url from the given argument (which could be an already
parsed url)
"""
if isinstance(url, ParseResult):
return url
return urlparse(to_unicode(url, encoding))
if not six.PY2:
from urllib.parse import _coerce_args, unquote_to_bytes
def parse_qsl_to_bytes(qs, keep_blank_values=False):
"""Parse a query given as a string argument.
Data are returned as a list of name, value pairs as bytes.
Arguments:
qs: percent-encoded query string to be parsed
keep_blank_values: flag indicating whether blank values in
percent-encoded queries should be treated as blank strings. A
true value indicates that blanks should be retained as blank
strings. The default false value indicates that blank values
are to be ignored and treated as if they were not included.
"""
# This code is the same as Python3's parse_qsl()
# (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
# except for the unquote(s, encoding, errors) calls replaced
# with unquote_to_bytes(s)
qs, _coerce_result = _coerce_args(qs)
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
r = []
for name_value in pairs:
if not name_value:
continue
nv = name_value.split('=', 1)
if len(nv) != 2:
# Handle case of a control-name with no equal sign
if keep_blank_values:
nv.append('')
else:
continue
if len(nv[1]) or keep_blank_values:
name = nv[0].replace('+', ' ')
name = unquote_to_bytes(name)
name = _coerce_result(name)
value = nv[1].replace('+', ' ')
value = unquote_to_bytes(value)
value = _coerce_result(value)
r.append((name, value))
return r
def urljoin_rfc(base, ref, encoding='utf-8'):
r"""
.. warning::
This function is deprecated and will be removed in future.
It is not supported with Python 3.
Please use ``urlparse.urljoin`` instead.
Same as urlparse.urljoin but supports unicode values in base and ref
parameters (in which case they will be converted to str using the given
encoding).
Always returns a str.
>>> import w3lib.url
>>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
'http://www.example.com/otherpath/index2.html'
>>>
>>> # Note: the following does not work in Python 3
>>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
>>>
"""
warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
DeprecationWarning)
str_base = to_bytes(base, encoding)
str_ref = to_bytes(ref, encoding)
return urljoin(str_base, str_ref)
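
A combined sketch (not part of the vendored file) of the most commonly used helpers in this module; the expected values follow the doctests above.

from w3lib.url import (safe_url_string, canonicalize_url,
                       url_query_cleaner, add_or_replace_parameter)

assert safe_url_string(u'http://www.example.com/r\u00e9sum\u00e9?q=caf\u00e9') == \
    'http://www.example.com/r%C3%A9sum%C3%A9?q=caf%C3%A9'

assert canonicalize_url('http://www.example.com/do?c=3&b=5&b=2&a=50') == \
    'http://www.example.com/do?a=50&b=2&b=5&c=3'

assert url_query_cleaner('product.html?id=200&foo=bar&name=wired', ('id', 'name')) == \
    'product.html?id=200&name=wired'

assert add_or_replace_parameter('http://www.example.com/index.php?arg=v1', 'arg', 'v2') == \
    'http://www.example.com/index.php?arg=v2'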

View file

@@ -0,0 +1,47 @@
import six
def str_to_unicode(text, encoding=None, errors='strict'):
if encoding is None:
encoding = 'utf-8'
if isinstance(text, bytes):
return text.decode(encoding, errors)
return text
def unicode_to_str(text, encoding=None, errors='strict'):
if encoding is None:
encoding = 'utf-8'
if isinstance(text, six.text_type):
return text.encode(encoding, errors)
return text
def to_unicode(text, encoding=None, errors='strict'):
"""Return the unicode representation of a bytes object `text`. If `text`
is already a unicode object, return it as-is."""
if isinstance(text, six.text_type):
return text
if not isinstance(text, (bytes, six.text_type)):
raise TypeError('to_unicode must receive a bytes, str or unicode '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.decode(encoding, errors)
def to_bytes(text, encoding=None, errors='strict'):
"""Return the binary representation of `text`. If `text`
is already a bytes object, return it as-is."""
if isinstance(text, bytes):
return text
if not isinstance(text, six.string_types):
raise TypeError('to_bytes must receive a unicode, str or bytes '
'object, got %s' % type(text).__name__)
if encoding is None:
encoding = 'utf-8'
return text.encode(encoding, errors)
def to_native_str(text, encoding=None, errors='strict'):
""" Return str representation of `text`
(bytes in Python 2.x and unicode in Python 3.x). """
if six.PY2:
return to_bytes(text, encoding, errors)
else:
return to_unicode(text, encoding, errors)
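
Finally, a small sketch (not part of the vendored file) of the conversion helpers above; under Python 3, to_native_str() behaves like to_unicode().

from w3lib.util import to_bytes, to_unicode, to_native_str

assert to_bytes(u'caf\xe9') == b'caf\xc3\xa9'          # UTF-8 is the default encoding
assert to_unicode(b'caf\xc3\xa9') == u'caf\xe9'
assert to_bytes(b'already-bytes') == b'already-bytes'   # bytes input is returned as-is
assert to_native_str(b'caf\xc3\xa9') == u'caf\xe9'      # on Python 3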