Output of the new DB entries
This commit is contained in:
parent bad48e1627
commit cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
130  venv/lib/python3.9/site-packages/scrapy/robotstxt.py  Normal file
@@ -0,0 +1,130 @@
import sys
import logging
from abc import ABCMeta, abstractmethod

from scrapy.utils.python import to_unicode


logger = logging.getLogger(__name__)


def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. File either contains garbage or "
            "is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider},
        )
        robotstxt_body = ''
    return robotstxt_body


class RobotParser(metaclass=ABCMeta):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
        It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.

        :param url: Absolute URL
        :type url: str

        :param user_agent: User agent
        :type user_agent: str
        """
        pass


class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from urllib.robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(user_agent, url)


class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)


class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)


class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
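For reference, a minimal sketch of how one of these parser backends can be exercised on its own, outside of Scrapy's robots.txt downloader middleware. The robots.txt body, URLs, and user agent below are made-up illustration data; PythonRobotParser is used because it only relies on the standard library's urllib.robotparser, so no optional backend package is required.

from scrapy.robotstxt import PythonRobotParser

# Hypothetical robots.txt content, used only for this example.
robotstxt_body = b"User-agent: *\nDisallow: /admin/\n"

# from_crawler() tolerates crawler=None; the spider reference then simply stays None.
parser = PythonRobotParser.from_crawler(None, robotstxt_body)

print(parser.allowed("https://example.com/index.html", "examplebot"))   # True
print(parser.allowed("https://example.com/admin/secret", "examplebot")) # False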