"""
|
|
HttpError Spider Middleware
|
|
|
|
See documentation in docs/topics/spider-middleware.rst
|
|
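
Non-2xx responses are dropped unless allowed by the HTTPERROR_ALLOW_ALL or
HTTPERROR_ALLOWED_CODES settings, a spider's handle_httpstatus_list
attribute, or the handle_httpstatus_all / handle_httpstatus_list request
meta keys. A minimal sketch of a spider opting in to 404 responses (the
spider below is hypothetical, for illustration only):

    import scrapy

    class TolerantSpider(scrapy.Spider):
        name = 'tolerant'
        start_urls = ['https://example.com/maybe-missing']
        handle_httpstatus_list = [404]  # let 404 responses reach parse()

        def parse(self, response):
            if response.status == 404:
                self.logger.info('Got a 404 for %s', response.url)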
"""
import logging

from scrapy.exceptions import IgnoreRequest

logger = logging.getLogger(__name__)


class HttpError(IgnoreRequest):
    """A non-200 response was filtered"""

    def __init__(self, response, *args, **kwargs):
        self.response = response
        super().__init__(*args, **kwargs)


class HttpErrorMiddleware:

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        if 200 <= response.status < 300:  # common case
            return
        meta = response.meta
        # Per-request meta keys take precedence over the spider attribute
        # and the project-wide settings.
        if 'handle_httpstatus_all' in meta:
            return
        if 'handle_httpstatus_list' in meta:
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, HttpError):
            spider.crawler.stats.inc_value('httperror/response_ignored_count')
            spider.crawler.stats.inc_value(
                f'httperror/response_ignored_status_count/{response.status}'
            )
            logger.info(
                "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
                {'response': response}, extra={'spider': spider},
            )
            # Returning an iterable (empty here) marks the exception as
            # handled, so nothing is produced for the filtered response.
            return []