Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
130  venv/lib/python3.9/site-packages/scrapy/robotstxt.py  Normal file
@@ -0,0 +1,130 @@
import sys
import logging
from abc import ABCMeta, abstractmethod

from scrapy.utils.python import to_unicode


logger = logging.getLogger(__name__)


def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider},
        )
        robotstxt_body = ''
    return robotstxt_body


class RobotParser(metaclass=ABCMeta):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
        It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.

        :param url: Absolute URL
        :type url: str

        :param user_agent: User agent
        :type user_agent: str
        """
        pass


class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)


class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)


class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)


class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
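For reference, a minimal sketch of how one of these parser backends can be exercised on its own, outside of Scrapy's robots.txt downloader middleware. The robots.txt body, URLs, and user agent below are made-up illustration data; PythonRobotParser is used because it only relies on the standard library's urllib.robotparser, so no optional backend package is required.

from scrapy.robotstxt import PythonRobotParser

# Hypothetical robots.txt content, used only for this example.
robotstxt_body = b"User-agent: *\nDisallow: /admin/\n"

# from_crawler() tolerates crawler=None; the spider reference then simply stays None.
parser = PythonRobotParser.from_crawler(None, robotstxt_body)

print(parser.allowed("https://example.com/index.html", "examplebot"))   # True
print(parser.allowed("https://example.com/admin/secret", "examplebot")) # False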