I'm trying to implement a caching mechanism in Scrapy using my own downloader middleware. My implementation is as follows:
import pickle

from scrapy import signals


class DevCacheMiddleware(object):

    def __init__(self, crawler):
        self.crawler = crawler
        self.dev_cache = {}
        try:
            # load any previously cached responses from disk
            with open('dev_cache.pkl', 'rb') as dcfile:
                self.dev_cache = pickle.load(dcfile)
        except (IOError, EOFError):
            pass  # no cache file yet; start with an empty cache

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(crawler)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        if response.status == 200:
            if (spider.name in self.dev_cache
                    and request.url in self.dev_cache[spider.name]):
                print("found cached response for <" + request.url + ">")
                return self.dev_cache[spider.name][request.url]
            else:
                self.cache_response(request, response, spider)
        return response

    def cache_response(self, request, response, spider):
        if spider.name not in self.dev_cache:
            self.dev_cache[spider.name] = {}
        self.dev_cache[spider.name][request.url] = response

    def spider_closed(self, spider):
        # persist the cache when the spider finishes
        with open('dev_cache.pkl', 'wb') as dcfile:
            pickle.dump(self.dev_cache, dcfile)
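For reference, the middleware is enabled in my settings.py like this (the module path and the priority value 543 are just what I happened to use in my project):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.DevCacheMiddleware': 543,
}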
When I run the spider, it raises the exception TypeError: can't pickle Selector objects.
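I suspect this happens because the cached Response objects hold a reference to a Selector, which pickle cannot handle. One idea I'm considering (I'm not sure it's the right approach) is to store only the plain, picklable parts of each response and rebuild an HtmlResponse when the cache is loaded, roughly like this:

from scrapy.http import HtmlResponse

def response_to_dict(response):
    # keep only plain data that pickle can handle
    return {
        'url': response.url,
        'status': response.status,
        'headers': dict(response.headers),
        'body': response.body,
    }

def dict_to_response(d):
    # rebuild a usable response object from the stored data
    return HtmlResponse(url=d['url'], status=d['status'],
                        headers=d['headers'], body=d['body'])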
Is something like that the right way to go, or is there a better option? Please help me achieve the serialization.