Ausgabe der neuen DB Einträge
This commit is contained in:
parent
bad48e1627
commit
cfbbb9ee3d
2399 changed files with 843193 additions and 43 deletions
47
venv/lib/python3.9/site-packages/scrapy/utils/sitemap.py
Normal file
47
venv/lib/python3.9/site-packages/scrapy/utils/sitemap.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
Module for processing Sitemaps.
|
||||
|
||||
Note: The main purpose of this module is to provide support for the
|
||||
SitemapSpider, its API is subject to change without notice.
|
||||
"""
|
||||
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import lxml.etree
|
||||
|
||||
|
||||
class Sitemap:
|
||||
"""Class to parse Sitemap (type=urlset) and Sitemap Index
|
||||
(type=sitemapindex) files"""
|
||||
|
||||
def __init__(self, xmltext):
|
||||
xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
|
||||
self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
|
||||
rt = self._root.tag
|
||||
self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt
|
||||
|
||||
def __iter__(self):
|
||||
for elem in self._root.getchildren():
|
||||
d = {}
|
||||
for el in elem.getchildren():
|
||||
tag = el.tag
|
||||
name = tag.split('}', 1)[1] if '}' in tag else tag
|
||||
|
||||
if name == 'link':
|
||||
if 'href' in el.attrib:
|
||||
d.setdefault('alternate', []).append(el.get('href'))
|
||||
else:
|
||||
d[name] = el.text.strip() if el.text else ''
|
||||
|
||||
if 'loc' in d:
|
||||
yield d
|
||||
|
||||
|
||||
def sitemap_urls_from_robots(robots_text, base_url=None):
|
||||
"""Return an iterator over all sitemap urls contained in the given
|
||||
robots.txt file
|
||||
"""
|
||||
for line in robots_text.splitlines():
|
||||
if line.lstrip().lower().startswith('sitemap:'):
|
||||
url = line.split(':', 1)[1].strip()
|
||||
yield urljoin(base_url, url)
|
||||
Loading…
Add table
Add a link
Reference in a new issue