40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
import os
|
|
import pickle
|
|
|
|
from scrapy import signals
|
|
from scrapy.exceptions import NotConfigured
|
|
from scrapy.utils.job import job_dir
|
|
|
|
|
|
class SpiderState:
|
|
"""Store and load spider state during a scraping job"""
|
|
|
|
def __init__(self, jobdir=None):
|
|
self.jobdir = jobdir
|
|
|
|
@classmethod
|
|
def from_crawler(cls, crawler):
|
|
jobdir = job_dir(crawler.settings)
|
|
if not jobdir:
|
|
raise NotConfigured
|
|
|
|
obj = cls(jobdir)
|
|
crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
|
|
crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
|
|
return obj
|
|
|
|
def spider_closed(self, spider):
|
|
if self.jobdir:
|
|
with open(self.statefn, 'wb') as f:
|
|
pickle.dump(spider.state, f, protocol=4)
|
|
|
|
def spider_opened(self, spider):
|
|
if self.jobdir and os.path.exists(self.statefn):
|
|
with open(self.statefn, 'rb') as f:
|
|
spider.state = pickle.load(f)
|
|
else:
|
|
spider.state = {}
|
|
|
|
@property
|
|
def statefn(self):
|
|
return os.path.join(self.jobdir, 'spider.state')
|