Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d

2399 changed files with 843193 additions and 43 deletions
@@ -0,0 +1,26 @@
"""
Item pipeline

See documentation in docs/item-pipeline.rst
"""

from scrapy.middleware import MiddlewareManager
from scrapy.utils.conf import build_component_list
from scrapy.utils.defer import deferred_f_from_coro_f


class ItemPipelineManager(MiddlewareManager):

    component_name = 'item pipeline'

    @classmethod
    def _get_mwlist_from_settings(cls, settings):
        return build_component_list(settings.getwithbase('ITEM_PIPELINES'))

    def _add_middleware(self, pipe):
        super(ItemPipelineManager, self)._add_middleware(pipe)
        if hasattr(pipe, 'process_item'):
            self.methods['process_item'].append(deferred_f_from_coro_f(pipe.process_item))

    def process_item(self, item, spider):
        return self._process_chain('process_item', item, spider)
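ItemPipelineManager builds its component list from the ITEM_PIPELINES setting and chains every enabled component's process_item call through a deferred, so each pipeline sees the item returned by the previous one. A minimal sketch of a project pipeline it would pick up (the module, class, and attribute names below are illustrative, not part of this commit):

    # pipelines.py in a Scrapy project (illustrative sketch)
    from itemadapter import ItemAdapter

    class DbWriterPipeline:
        def open_spider(self, spider):
            self.rows = []  # stand-in for opening a real DB connection

        def process_item(self, item, spider):
            # invoked by ItemPipelineManager._process_chain
            self.rows.append(ItemAdapter(item).asdict())
            return item

    # settings.py (illustrative sketch)
    ITEM_PIPELINES = {
        'myproject.pipelines.DbWriterPipeline': 300,
    }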
514  venv/lib/python3.9/site-packages/scrapy/pipelines/files.py  Normal file
@@ -0,0 +1,514 @@
"""
Files Pipeline

See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
import logging
import mimetypes
import os
import time
from collections import defaultdict
from contextlib import suppress
from ftplib import FTP
from io import BytesIO
from urllib.parse import urlparse

from itemadapter import ItemAdapter
from twisted.internet import defer, threads

from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.boto import is_botocore_available
from scrapy.utils.datatypes import CaselessDict
from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str


logger = logging.getLogger(__name__)


class FileException(Exception):
    """General media error exception"""


class FSFilesStore:
    def __init__(self, basedir):
        if '://' in basedir:
            basedir = basedir.split('://', 1)[1]
        self.basedir = basedir
        self._mkdir(self.basedir)
        self.created_directories = defaultdict(set)

    def persist_file(self, path, buf, info, meta=None, headers=None):
        absolute_path = self._get_filesystem_path(path)
        self._mkdir(os.path.dirname(absolute_path), info)
        with open(absolute_path, 'wb') as f:
            f.write(buf.getvalue())

    def stat_file(self, path, info):
        absolute_path = self._get_filesystem_path(path)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except os.error:
            return {}

        with open(absolute_path, 'rb') as f:
            checksum = md5sum(f)

        return {'last_modified': last_modified, 'checksum': checksum}

    def _get_filesystem_path(self, path):
        path_comps = path.split('/')
        return os.path.join(self.basedir, *path_comps)

    def _mkdir(self, dirname, domain=None):
        seen = self.created_directories[domain] if domain else set()
        if dirname not in seen:
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            seen.add(dirname)


class S3FilesStore:
    AWS_ACCESS_KEY_ID = None
    AWS_SECRET_ACCESS_KEY = None
    AWS_ENDPOINT_URL = None
    AWS_REGION_NAME = None
    AWS_USE_SSL = None
    AWS_VERIFY = None

    POLICY = 'private'  # Overriden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
    HEADERS = {
        'Cache-Control': 'max-age=172800',
    }

    def __init__(self, uri):
        if not is_botocore_available():
            raise NotConfigured('missing botocore library')
        import botocore.session
        session = botocore.session.get_session()
        self.s3_client = session.create_client(
            's3',
            aws_access_key_id=self.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
            endpoint_url=self.AWS_ENDPOINT_URL,
            region_name=self.AWS_REGION_NAME,
            use_ssl=self.AWS_USE_SSL,
            verify=self.AWS_VERIFY
        )
        if not uri.startswith("s3://"):
            raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
        self.bucket, self.prefix = uri[5:].split('/', 1)

    def stat_file(self, path, info):
        def _onsuccess(boto_key):
            checksum = boto_key['ETag'].strip('"')
            last_modified = boto_key['LastModified']
            modified_stamp = time.mktime(last_modified.timetuple())
            return {'checksum': checksum, 'last_modified': modified_stamp}

        return self._get_boto_key(path).addCallback(_onsuccess)

    def _get_boto_key(self, path):
        key_name = f'{self.prefix}{path}'
        return threads.deferToThread(
            self.s3_client.head_object,
            Bucket=self.bucket,
            Key=key_name)

    def persist_file(self, path, buf, info, meta=None, headers=None):
        """Upload file to S3 storage"""
        key_name = f'{self.prefix}{path}'
        buf.seek(0)
        extra = self._headers_to_botocore_kwargs(self.HEADERS)
        if headers:
            extra.update(self._headers_to_botocore_kwargs(headers))
        return threads.deferToThread(
            self.s3_client.put_object,
            Bucket=self.bucket,
            Key=key_name,
            Body=buf,
            Metadata={k: str(v) for k, v in (meta or {}).items()},
            ACL=self.POLICY,
            **extra)

    def _headers_to_botocore_kwargs(self, headers):
        """ Convert headers to botocore keyword agruments.
        """
        # This is required while we need to support both boto and botocore.
        mapping = CaselessDict({
            'Content-Type': 'ContentType',
            'Cache-Control': 'CacheControl',
            'Content-Disposition': 'ContentDisposition',
            'Content-Encoding': 'ContentEncoding',
            'Content-Language': 'ContentLanguage',
            'Content-Length': 'ContentLength',
            'Content-MD5': 'ContentMD5',
            'Expires': 'Expires',
            'X-Amz-Grant-Full-Control': 'GrantFullControl',
            'X-Amz-Grant-Read': 'GrantRead',
            'X-Amz-Grant-Read-ACP': 'GrantReadACP',
            'X-Amz-Grant-Write-ACP': 'GrantWriteACP',
            'X-Amz-Object-Lock-Legal-Hold': 'ObjectLockLegalHoldStatus',
            'X-Amz-Object-Lock-Mode': 'ObjectLockMode',
            'X-Amz-Object-Lock-Retain-Until-Date': 'ObjectLockRetainUntilDate',
            'X-Amz-Request-Payer': 'RequestPayer',
            'X-Amz-Server-Side-Encryption': 'ServerSideEncryption',
            'X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id': 'SSEKMSKeyId',
            'X-Amz-Server-Side-Encryption-Context': 'SSEKMSEncryptionContext',
            'X-Amz-Server-Side-Encryption-Customer-Algorithm': 'SSECustomerAlgorithm',
            'X-Amz-Server-Side-Encryption-Customer-Key': 'SSECustomerKey',
            'X-Amz-Server-Side-Encryption-Customer-Key-Md5': 'SSECustomerKeyMD5',
            'X-Amz-Storage-Class': 'StorageClass',
            'X-Amz-Tagging': 'Tagging',
            'X-Amz-Website-Redirect-Location': 'WebsiteRedirectLocation',
        })
        extra = {}
        for key, value in headers.items():
            try:
                kwarg = mapping[key]
            except KeyError:
                raise TypeError(f'Header "{key}" is not supported by botocore')
            else:
                extra[kwarg] = value
        return extra


class GCSFilesStore:

    GCS_PROJECT_ID = None

    CACHE_CONTROL = 'max-age=172800'

    # The bucket's default object ACL will be applied to the object.
    # Overriden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
    POLICY = None

    def __init__(self, uri):
        from google.cloud import storage
        client = storage.Client(project=self.GCS_PROJECT_ID)
        bucket, prefix = uri[5:].split('/', 1)
        self.bucket = client.bucket(bucket)
        self.prefix = prefix
        permissions = self.bucket.test_iam_permissions(
            ['storage.objects.get', 'storage.objects.create']
        )
        if 'storage.objects.get' not in permissions:
            logger.warning(
                "No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
                "Checking if files are up to date will be impossible. Files will be downloaded every time.",
                {'bucket': bucket}
            )
        if 'storage.objects.create' not in permissions:
            logger.error(
                "No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
                {'bucket': bucket}
            )

    def stat_file(self, path, info):
        def _onsuccess(blob):
            if blob:
                checksum = blob.md5_hash
                last_modified = time.mktime(blob.updated.timetuple())
                return {'checksum': checksum, 'last_modified': last_modified}
            else:
                return {}

        return threads.deferToThread(self.bucket.get_blob, path).addCallback(_onsuccess)

    def _get_content_type(self, headers):
        if headers and 'Content-Type' in headers:
            return headers['Content-Type']
        else:
            return 'application/octet-stream'

    def persist_file(self, path, buf, info, meta=None, headers=None):
        blob = self.bucket.blob(self.prefix + path)
        blob.cache_control = self.CACHE_CONTROL
        blob.metadata = {k: str(v) for k, v in (meta or {}).items()}
        return threads.deferToThread(
            blob.upload_from_string,
            data=buf.getvalue(),
            content_type=self._get_content_type(headers),
            predefined_acl=self.POLICY
        )


class FTPFilesStore:

    FTP_USERNAME = None
    FTP_PASSWORD = None
    USE_ACTIVE_MODE = None

    def __init__(self, uri):
        if not uri.startswith("ftp://"):
            raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
        u = urlparse(uri)
        self.port = u.port
        self.host = u.hostname
        self.port = int(u.port or 21)
        self.username = u.username or self.FTP_USERNAME
        self.password = u.password or self.FTP_PASSWORD
        self.basedir = u.path.rstrip('/')

    def persist_file(self, path, buf, info, meta=None, headers=None):
        path = f'{self.basedir}/{path}'
        return threads.deferToThread(
            ftp_store_file, path=path, file=buf,
            host=self.host, port=self.port, username=self.username,
            password=self.password, use_active_mode=self.USE_ACTIVE_MODE
        )

    def stat_file(self, path, info):
        def _stat_file(path):
            try:
                ftp = FTP()
                ftp.connect(self.host, self.port)
                ftp.login(self.username, self.password)
                if self.USE_ACTIVE_MODE:
                    ftp.set_pasv(False)
                file_path = f"{self.basedir}/{path}"
                last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
                m = hashlib.md5()
                ftp.retrbinary(f'RETR {file_path}', m.update)
                return {'last_modified': last_modified, 'checksum': m.hexdigest()}
            # The file doesn't exist
            except Exception:
                return {}
        return threads.deferToThread(_stat_file, path)


class FilesPipeline(MediaPipeline):
    """Abstract pipeline that implement the file downloading

    This pipeline tries to minimize network transfers and file processing,
    doing stat of the files and determining if file is new, uptodate or
    expired.

    ``new`` files are those that pipeline never processed and needs to be
    downloaded from supplier site the first time.

    ``uptodate`` files are the ones that the pipeline processed and are still
    valid files.

    ``expired`` files are those that pipeline already processed but the last
    modification was made long time ago, so a reprocessing is recommended to
    refresh it in case of change.

    """

    MEDIA_NAME = "file"
    EXPIRES = 90
    STORE_SCHEMES = {
        '': FSFilesStore,
        'file': FSFilesStore,
        's3': S3FilesStore,
        'gs': GCSFilesStore,
        'ftp': FTPFilesStore
    }
    DEFAULT_FILES_URLS_FIELD = 'file_urls'
    DEFAULT_FILES_RESULT_FIELD = 'files'

    def __init__(self, store_uri, download_func=None, settings=None):
        if not store_uri:
            raise NotConfigured

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        cls_name = "FilesPipeline"
        self.store = self._get_store(store_uri)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name=cls_name,
                                    settings=settings)
        self.expires = settings.getint(
            resolve('FILES_EXPIRES'), self.EXPIRES
        )
        if not hasattr(self, "FILES_URLS_FIELD"):
            self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
        if not hasattr(self, "FILES_RESULT_FIELD"):
            self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
        self.files_urls_field = settings.get(
            resolve('FILES_URLS_FIELD'), self.FILES_URLS_FIELD
        )
        self.files_result_field = settings.get(
            resolve('FILES_RESULT_FIELD'), self.FILES_RESULT_FIELD
        )

        super().__init__(download_func=download_func, settings=settings)

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
        s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
        s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
        s3store.AWS_VERIFY = settings['AWS_VERIFY']
        s3store.POLICY = settings['FILES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
        gcs_store.POLICY = settings['FILES_STORE_GCS_ACL'] or None

        ftp_store = cls.STORE_SCHEMES['ftp']
        ftp_store.FTP_USERNAME = settings['FTP_USER']
        ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
        ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')

        store_uri = settings['FILES_STORE']
        return cls(store_uri, settings=settings)

    def _get_store(self, uri):
        if os.path.isabs(uri):  # to support win32 paths like: C:\\some\dir
            scheme = 'file'
        else:
            scheme = urlparse(uri).scheme
        store_cls = self.STORE_SCHEMES[scheme]
        return store_cls(uri)

    def media_to_download(self, request, info, *, item=None):
        def _onsuccess(result):
            if not result:
                return  # returning None force download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None force download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.expires:
                return  # returning None force download

            referer = referer_str(request)
            logger.debug(
                'File (uptodate): Downloaded %(medianame)s from %(request)s '
                'referred in <%(referer)s>',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer},
                extra={'spider': info.spider}
            )
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            return {'url': request.url, 'path': path, 'checksum': checksum, 'status': 'uptodate'}

        path = self.file_path(request, info=info, item=item)
        dfd = defer.maybeDeferred(self.store.stat_file, path, info)
        dfd.addCallbacks(_onsuccess, lambda _: None)
        dfd.addErrback(
            lambda f:
            logger.error(self.__class__.__name__ + '.store.stat_file',
                         exc_info=failure_to_exc_info(f),
                         extra={'spider': info.spider})
        )
        return dfd

    def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer, 'exception': failure.value},
                extra={'spider': info.spider}
            )

        raise FileException

    def media_downloaded(self, response, request, info, *, item=None):
        referer = referer_str(request)

        if response.status != 200:
            logger.warning(
                'File (code: %(status)s): Error downloading file from '
                '%(request)s referred in <%(referer)s>',
                {'status': response.status,
                 'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('download-error')

        if not response.body:
            logger.warning(
                'File (empty-content): Empty file from %(request)s referred '
                'in <%(referer)s>: no-content',
                {'request': request, 'referer': referer},
                extra={'spider': info.spider}
            )
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        logger.debug(
            'File (%(status)s): Downloaded file from %(request)s referred in '
            '<%(referer)s>',
            {'status': status, 'request': request, 'referer': referer},
            extra={'spider': info.spider}
        )
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info, item=item)
            checksum = self.file_downloaded(response, request, info, item=item)
        except FileException as exc:
            logger.warning(
                'File (error): Error processing file from %(request)s '
                'referred in <%(referer)s>: %(errormsg)s',
                {'request': request, 'referer': referer, 'errormsg': str(exc)},
                extra={'spider': info.spider}, exc_info=True
            )
            raise
        except Exception as exc:
            logger.error(
                'File (unknown-error): Error processing file from %(request)s '
                'referred in <%(referer)s>',
                {'request': request, 'referer': referer},
                exc_info=True, extra={'spider': info.spider}
            )
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum, 'status': status}

    def inc_stats(self, spider, status):
        spider.crawler.stats.inc_value('file_count', spider=spider)
        spider.crawler.stats.inc_value(f'file_status_count/{status}', spider=spider)

    # Overridable Interface
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        return [Request(u) for u in urls]

    def file_downloaded(self, response, request, info, *, item=None):
        path = self.file_path(request, response=response, info=info, item=item)
        buf = BytesIO(response.body)
        checksum = md5sum(buf)
        buf.seek(0)
        self.store.persist_file(path, buf, info)
        return checksum

    def item_completed(self, results, item, info):
        with suppress(KeyError):
            ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        media_ext = os.path.splitext(request.url)[1]
        # Handles empty and wild extensions by trying to guess the
        # mime type then extension or default to empty string otherwise
        if media_ext not in mimetypes.types_map:
            media_ext = ''
            media_type = mimetypes.guess_type(request.url)[0]
            if media_type:
                media_ext = mimetypes.guess_extension(media_type)
        return f'full/{media_guid}{media_ext}'
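FilesPipeline picks its storage backend from the FILES_STORE URI scheme ('' or file for FSFilesStore, s3, gs, ftp) and reads URLs from the item field named by files_urls_field. A minimal configuration sketch for enabling it (the bucket name is illustrative; the setting names are standard Scrapy settings):

    # settings.py (illustrative sketch)
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
    }
    FILES_STORE = 's3://example-bucket/downloads/'  # resolves to S3FilesStore above
    FILES_EXPIRES = 90  # days before a stored file counts as expired

Items then carry a file_urls list; item_completed writes the download results back into the files field.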
176  venv/lib/python3.9/site-packages/scrapy/pipelines/images.py  Normal file
@@ -0,0 +1,176 @@
"""
Images Pipeline

See documentation in topics/media-pipeline.rst
"""
import functools
import hashlib
from contextlib import suppress
from io import BytesIO

from itemadapter import ItemAdapter
from PIL import Image

from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.pipelines.files import FileException, FilesPipeline
# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes


class NoimagesDrop(DropItem):
    """Product with no images exception"""


class ImageException(FileException):
    """General image error exception"""


class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic

    """

    MEDIA_NAME = 'image'

    # Uppercase attributes kept for backward compatibility with code that subclasses
    # ImagesPipeline. They may be overridden by settings.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    EXPIRES = 90
    THUMBS = {}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, settings=settings, download_func=download_func)

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="ImagesPipeline",
                                    settings=settings)
        self.expires = settings.getint(
            resolve("IMAGES_EXPIRES"), self.EXPIRES
        )

        if not hasattr(self, "IMAGES_RESULT_FIELD"):
            self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
        if not hasattr(self, "IMAGES_URLS_FIELD"):
            self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

        self.images_urls_field = settings.get(
            resolve('IMAGES_URLS_FIELD'),
            self.IMAGES_URLS_FIELD
        )
        self.images_result_field = settings.get(
            resolve('IMAGES_RESULT_FIELD'),
            self.IMAGES_RESULT_FIELD
        )
        self.min_width = settings.getint(
            resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
        )
        self.min_height = settings.getint(
            resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
        )
        self.thumbs = settings.get(
            resolve('IMAGES_THUMBS'), self.THUMBS
        )

    @classmethod
    def from_settings(cls, settings):
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
        s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
        s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
        s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
        s3store.AWS_VERIFY = settings['AWS_VERIFY']
        s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

        gcs_store = cls.STORE_SCHEMES['gs']
        gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
        gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

        ftp_store = cls.STORE_SCHEMES['ftp']
        ftp_store.FTP_USERNAME = settings['FTP_USER']
        ftp_store.FTP_PASSWORD = settings['FTP_PASSWORD']
        ftp_store.USE_ACTIVE_MODE = settings.getbool('FEED_STORAGE_FTP_ACTIVE')

        store_uri = settings['IMAGES_STORE']
        return cls(store_uri, settings=settings)

    def file_downloaded(self, response, request, info, *, item=None):
        return self.image_downloaded(response, request, info, item=item)

    def image_downloaded(self, response, request, info, *, item=None):
        checksum = None
        for path, image, buf in self.get_images(response, request, info, item=item):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info, *, item=None):
        path = self.file_path(request, response=response, info=info, item=item)
        orig_image = Image.open(BytesIO(response.body))

        width, height = orig_image.size
        if width < self.min_width or height < self.min_height:
            raise ImageException("Image too small "
                                 f"({width}x{height} < "
                                 f"{self.min_width}x{self.min_height})")

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in self.thumbs.items():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        if image.format == 'PNG' and image.mode == 'RGBA':
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode == 'P':
            image = image.convert("RGBA")
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = BytesIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        return [Request(u) for u in urls]

    def item_completed(self, results, item, info):
        with suppress(KeyError):
            ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return f'full/{image_guid}.jpg'

    def thumb_path(self, request, thumb_id, response=None, info=None):
        thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return f'thumbs/{thumb_id}/{thumb_guid}.jpg'
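ImagesPipeline layers JPEG conversion, minimum-size filtering, and thumbnail generation on top of FilesPipeline; the IMAGES_* settings map onto the attributes resolved in its __init__ above, and Pillow is required for the image handling. A minimal configuration sketch (the path and sizes are illustrative):

    # settings.py (illustrative sketch)
    ITEM_PIPELINES = {
        'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    IMAGES_STORE = '/data/images'  # a plain path resolves to FSFilesStore
    IMAGES_MIN_WIDTH = 110   # smaller images raise ImageException and are skipped
    IMAGES_MIN_HEIGHT = 110
    IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}  # stored as thumbs/<id>/<sha1>.jpg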
251  venv/lib/python3.9/site-packages/scrapy/pipelines/media.py  Normal file
@@ -0,0 +1,251 @@
import functools
import logging
from collections import defaultdict
from inspect import signature
from warnings import warn

from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure

from scrapy.settings import Settings
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.defer import mustbe_deferred, defer_result
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.request import request_fingerprint
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.log import failure_to_exc_info

logger = logging.getLogger(__name__)


class MediaPipeline:

    LOG_FAILED_RESULTS = True

    class SpiderInfo:
        def __init__(self, spider):
            self.spider = spider
            self.downloading = set()
            self.downloaded = {}
            self.waiting = defaultdict(list)

    def __init__(self, download_func=None, settings=None):
        self.download_func = download_func
        self._expects_item = {}

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        resolve = functools.partial(self._key_for_pipe,
                                    base_class_name="MediaPipeline",
                                    settings=settings)
        self.allow_redirects = settings.getbool(
            resolve('MEDIA_ALLOW_REDIRECTS'), False
        )
        self._handle_statuses(self.allow_redirects)

        # Check if deprecated methods are being used and make them compatible
        self._make_compatible()

    def _handle_statuses(self, allow_redirects):
        self.handle_httpstatus_list = None
        if allow_redirects:
            self.handle_httpstatus_list = SequenceExclude(range(300, 400))

    def _key_for_pipe(self, key, base_class_name=None, settings=None):
        """
        >>> MediaPipeline()._key_for_pipe("IMAGES")
        'IMAGES'
        >>> class MyPipe(MediaPipeline):
        ...     pass
        >>> MyPipe()._key_for_pipe("IMAGES", base_class_name="MediaPipeline")
        'MYPIPE_IMAGES'
        """
        class_name = self.__class__.__name__
        formatted_key = f"{class_name.upper()}_{key}"
        if (
            not base_class_name
            or class_name == base_class_name
            or settings and not settings.get(formatted_key)
        ):
            return key
        return formatted_key

    @classmethod
    def from_crawler(cls, crawler):
        try:
            pipe = cls.from_settings(crawler.settings)
        except AttributeError:
            pipe = cls()
        pipe.crawler = crawler
        return pipe

    def open_spider(self, spider):
        self.spiderinfo = self.SpiderInfo(spider)

    def process_item(self, item, spider):
        info = self.spiderinfo
        requests = arg_to_iter(self.get_media_requests(item, info))
        dlist = [self._process_request(r, info, item) for r in requests]
        dfd = DeferredList(dlist, consumeErrors=1)
        return dfd.addCallback(self.item_completed, item, info)

    def _process_request(self, request, info, item):
        fp = request_fingerprint(request)
        cb = request.callback or (lambda _: _)
        eb = request.errback
        request.callback = None
        request.errback = None

        # Return cached result if request was already seen
        if fp in info.downloaded:
            return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)

        # Otherwise, wait for result
        wad = Deferred().addCallbacks(cb, eb)
        info.waiting[fp].append(wad)

        # Check if request is downloading right now to avoid doing it twice
        if fp in info.downloading:
            return wad

        # Download request checking media_to_download hook output first
        info.downloading.add(fp)
        dfd = mustbe_deferred(self.media_to_download, request, info, item=item)
        dfd.addCallback(self._check_media_to_download, request, info, item=item)
        dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
        dfd.addErrback(lambda f: logger.error(
            f.value, exc_info=failure_to_exc_info(f), extra={'spider': info.spider})
        )
        return dfd.addBoth(lambda _: wad)  # it must return wad at last

    def _make_compatible(self):
        """Make overridable methods of MediaPipeline and subclasses backwards compatible"""
        methods = [
            "file_path", "media_to_download", "media_downloaded",
            "file_downloaded", "image_downloaded", "get_images"
        ]

        for method_name in methods:
            method = getattr(self, method_name, None)
            if callable(method):
                setattr(self, method_name, self._compatible(method))

    def _compatible(self, func):
        """Wrapper for overridable methods to allow backwards compatibility"""
        self._check_signature(func)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if self._expects_item[func.__name__]:
                return func(*args, **kwargs)

            kwargs.pop('item', None)
            return func(*args, **kwargs)

        return wrapper

    def _check_signature(self, func):
        sig = signature(func)
        self._expects_item[func.__name__] = True

        if 'item' not in sig.parameters:
            old_params = str(sig)[1:-1]
            new_params = old_params + ", *, item=None"
            warn(f'{func.__name__}(self, {old_params}) is deprecated, '
                 f'please use {func.__name__}(self, {new_params})',
                 ScrapyDeprecationWarning, stacklevel=2)
            self._expects_item[func.__name__] = False

    def _modify_media_request(self, request):
        if self.handle_httpstatus_list:
            request.meta['handle_httpstatus_list'] = self.handle_httpstatus_list
        else:
            request.meta['handle_httpstatus_all'] = True

    def _check_media_to_download(self, result, request, info, item):
        if result is not None:
            return result
        if self.download_func:
            # this ugly code was left only to support tests. TODO: remove
            dfd = mustbe_deferred(self.download_func, request, info.spider)
            dfd.addCallbacks(
                callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
                errback=self.media_failed, errbackArgs=(request, info))
        else:
            self._modify_media_request(request)
            dfd = self.crawler.engine.download(request, info.spider)
            dfd.addCallbacks(
                callback=self.media_downloaded, callbackArgs=(request, info), callbackKeywords={'item': item},
                errback=self.media_failed, errbackArgs=(request, info))
        return dfd

    def _cache_result_and_execute_waiters(self, result, fp, info):
        if isinstance(result, Failure):
            # minimize cached information for failure
            result.cleanFailure()
            result.frames = []
            result.stack = None

            # This code fixes a memory leak by avoiding to keep references to
            # the Request and Response objects on the Media Pipeline cache.
            #
            # What happens when the media_downloaded callback raises an
            # exception, for example a FileException('download-error') when
            # the Response status code is not 200 OK, is that the original
            # StopIteration exception (which in turn contains the failed
            # Response and by extension, the original Request) gets encapsulated
            # within the FileException context.
            #
            # Originally, Scrapy was using twisted.internet.defer.returnValue
            # inside functions decorated with twisted.internet.defer.inlineCallbacks,
            # encapsulating the returned Response in a _DefGen_Return exception
            # instead of a StopIteration.
            #
            # To avoid keeping references to the Response and therefore Request
            # objects on the Media Pipeline cache, we should wipe the context of
            # the encapsulated exception when it is a StopIteration instance
            #
            # This problem does not occur in Python 2.7 since we don't have
            # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
            context = getattr(result.value, '__context__', None)
            if isinstance(context, StopIteration):
                setattr(result.value, '__context__', None)

        info.downloading.remove(fp)
        info.downloaded[fp] = result  # cache result
        for wad in info.waiting.pop(fp):
            defer_result(result).chainDeferred(wad)

    # Overridable Interface
    def media_to_download(self, request, info, *, item=None):
        """Check request before starting download"""
        pass

    def get_media_requests(self, item, info):
        """Returns the media requests to download"""
        pass

    def media_downloaded(self, response, request, info, *, item=None):
        """Handler for success downloads"""
        return response

    def media_failed(self, failure, request, info):
        """Handler for failed downloads"""
        return failure

    def item_completed(self, results, item, info):
        """Called per item when all media requests has been processed"""
        if self.LOG_FAILED_RESULTS:
            for ok, value in results:
                if not ok:
                    logger.error(
                        '%(class)s found errors processing %(item)s',
                        {'class': self.__class__.__name__, 'item': item},
                        exc_info=failure_to_exc_info(value),
                        extra={'spider': info.spider}
                    )
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Returns the path where downloaded media should be stored"""
        pass
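MediaPipeline is the base class both pipelines build on: process_item fans the item's media URLs out into requests, deduplicates them by request fingerprint, and caches results per spider, while _key_for_pipe gives every subclass its own settings prefix (see the MYPIPE_IMAGES doctest above). A minimal subclass sketch showing both points (the class name and path scheme are illustrative):

    # pipelines.py (illustrative sketch)
    from scrapy.pipelines.files import FilesPipeline

    class PdfFilesPipeline(FilesPipeline):
        # _key_for_pipe makes __init__ look up PDFFILESPIPELINE_FILES_EXPIRES
        # first and fall back to FILES_EXPIRES when it is not set.
        def file_path(self, request, response=None, info=None, *, item=None):
            # keep the original filename instead of the SHA1-based default
            return 'full/' + request.url.split('/')[-1]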