83 lines
2.5 KiB
Python
83 lines
2.5 KiB
Python
"""This module implements the DecompressionMiddleware, which tries to recognise
and extract the potentially compressed responses that may arrive.
"""
|
|
|
|
import bz2
import gzip
import logging
import tarfile
import zipfile
import zlib
from io import BytesIO
from tempfile import mktemp

from scrapy.responsetypes import responsetypes
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DecompressionMiddleware:
    """Middleware that tries to recognise and extract the possibly compressed
    responses that may arrive.

    The body of every non-empty response is probed, in order, as a tar
    archive, a zip archive, a gzip stream and a bzip2 stream.  The first probe
    that succeeds provides the replacement response; when none succeeds the
    response is passed through untouched.
    """

    def __init__(self):
        # Insertion order is the probing order used by process_response().
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2,
        }

    @staticmethod
    def _replace_body(response, body, filename=None):
        """Return a copy of *response* carrying the decompressed *body*.

        The response class is re-guessed from *body* and, when available, from
        the *filename* of the archive member the body was read from.
        """
        respcls = responsetypes.from_args(filename=filename, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_tar(self, response):
        """Probe *response* as a (possibly compressed) tar archive.

        Returns a new response built from the first regular file stored in
        the archive, or ``None`` when the body is not a readable tar archive
        or the archive holds no regular file.
        """
        try:
            # Reading happens entirely from the in-memory buffer, so no file
            # name has to be supplied to tarfile.
            with tarfile.open(fileobj=BytesIO(response.body)) as tar_file:
                # Iteration is lazy: it stops at the first regular file, so
                # directories or links stored ahead of it are skipped without
                # scanning the rest of the archive.
                member = next((m for m in tar_file if m.isfile()), None)
                if member is None:
                    return None
                body = tar_file.extractfile(member).read()
        except (tarfile.TarError, OSError, EOFError, zlib.error):
            # Not a tar archive, or a truncated/corrupt compressed one.
            return None

        return self._replace_body(response, body, filename=member.name)

    def _is_zip(self, response):
        """Probe *response* as a zip archive.

        Returns a new response built from the first non-directory entry of
        the archive, or ``None`` when the body is not a readable zip archive
        or the archive holds no file entry.
        """
        try:
            with zipfile.ZipFile(BytesIO(response.body)) as zip_file:
                entry = next(
                    (info for info in zip_file.infolist() if not info.is_dir()),
                    None,
                )
                if entry is None:
                    return None
                body = zip_file.read(entry)
        except (zipfile.BadZipFile, zlib.error):
            # Not a zip archive, or the selected entry is corrupt.
            return None

        return self._replace_body(response, body, filename=entry.filename)

    def _is_gzip(self, response):
        """Probe *response* as a gzip stream.

        Returns a new response with the decompressed body, or ``None`` when
        the body is not a complete, valid gzip stream.
        """
        try:
            with gzip.GzipFile(fileobj=BytesIO(response.body)) as gzip_file:
                body = gzip_file.read()
        except (OSError, EOFError, zlib.error):
            # OSError: not gzip data; EOFError: truncated stream;
            # zlib.error: corrupt deflate data.
            return None

        return self._replace_body(response, body)

    def _is_bzip2(self, response):
        """Probe *response* as a bzip2 stream.

        Returns a new response with the decompressed body, or ``None`` when
        the body is not a complete, valid bzip2 stream.
        """
        try:
            body = bz2.decompress(response.body)
        except (OSError, ValueError):
            # OSError: not bzip2 data; ValueError: truncated stream.
            return None

        return self._replace_body(response, body)

    def process_response(self, request, response, spider):
        """Return *response* decompressed by the first matching format.

        :param request: the request that produced the response (unused).
        :param response: the response whose body is probed.
        :param spider: the running spider; only attached to the log record.
        :returns: the decompressed replacement response, or *response* itself
            when its body is empty or matches none of the known formats.
        """
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            # Probes signal "not this format" with None.
            if new_response is not None:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt}, extra={'spider': spider})
                return new_response
        return response
|