import logging
import sys
from abc import ABCMeta, abstractmethod

from scrapy.utils.python import to_unicode

logger = logging.getLogger(__name__)


def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider},
        )
        robotstxt_body = ''
    return robotstxt_body
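

# Illustrative sketch (not part of the original module): how decode_robotstxt
# behaves. A valid UTF-8 body decodes to str; undecodable bytes are logged and
# treated as an empty, allow-all robots.txt. Both byte strings are made-up inputs.
def _decode_robotstxt_example():
    body = b"User-agent: *\nDisallow: /private"
    assert decode_robotstxt(body, spider=None) == "User-agent: *\nDisallow: /private"
    # 0xff can never appear in valid UTF-8, so this falls back to ''.
    assert decode_robotstxt(b"\xff\xfegarbage", spider=None) == ''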


class RobotParser(metaclass=ABCMeta):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
        It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.

        :param url: Absolute URL
        :type url: str

        :param user_agent: User agent
        :type user_agent: str
        """
        pass
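

# Illustrative sketch (not part of the original module): a minimal custom
# backend implementing the RobotParser interface above. The class name and its
# permissive behavior are hypothetical; a real backend would parse robotstxt_body.
class _AllowAllRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        self.spider = spider

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        # Mirror the concrete backends below: tolerate a missing crawler.
        spider = None if not crawler else crawler.spider
        return cls(robotstxt_body, spider)

    def allowed(self, url, user_agent):
        # Permit every request, whatever the robots.txt body said.
        return True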


class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)


class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)


class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)


class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
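

# Illustrative usage sketch (not part of the original module): feeding a
# robots.txt body directly to the Protego backend. The body and URLs are
# made-up; crawler=None mirrors how from_crawler tolerates a missing crawler.
def _protego_usage_example():
    body = b"User-agent: *\nDisallow: /private"
    parser = ProtegoRobotParser.from_crawler(crawler=None, robotstxt_body=body)
    assert parser.allowed("https://example.com/", "mybot")
    assert not parser.allowed("https://example.com/private", "mybot")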