# Contents of Scrapy-Redis-Zhihu/scrapy_redis/pipelines.py, recovered from the
# "ADD file via upload" patch (commit 1705510150e582824e5be828810ea3e1ba37b58f).
from scrapy.utils.misc import load_object
from scrapy.utils.serialize import ScrapyJSONEncoder
from twisted.internet.threads import deferToThread

from . import connection, defaults


# Serializer used when REDIS_ITEMS_SERIALIZER is not configured.
default_serialize = ScrapyJSONEncoder().encode


class RedisPipeline(object):
    """Item pipeline that pushes each serialized item onto a redis list.

    Settings
    --------
    REDIS_ITEMS_KEY : str
        Redis key under which items are stored.
    REDIS_ITEMS_SERIALIZER : str
        Object path of the serializer function.

    """

    def __init__(self, server,
                 key=defaults.PIPELINE_KEY,
                 serialize_func=default_serialize):
        """Store the redis client, target key and serializer.

        Parameters
        ----------
        server : StrictRedis
            Redis client instance.
        key : str
            Redis key where items are stored. It is %-formatted with a
            mapping that provides ``spider`` (see ``item_key``).
        serialize_func : callable
            Function that turns an item into the value pushed to redis.

        """
        self.server = server
        self.key = key
        self.serialize = serialize_func

    @classmethod
    def from_settings(cls, settings):
        """Build a pipeline from a settings object.

        The redis client always comes from ``connection.from_settings``.
        The key and the serializer are overridden only when the matching
        setting holds a truthy value; otherwise the constructor defaults
        apply.
        """
        # The connection is created before any optional setting is read.
        kwargs = {'server': connection.from_settings(settings)}

        # (setting name, constructor argument, optional converter)
        overrides = (
            ('REDIS_ITEMS_KEY', 'key', None),
            ('REDIS_ITEMS_SERIALIZER', 'serialize_func', load_object),
        )
        for setting_name, argument, convert in overrides:
            if not settings.get(setting_name):
                continue
            value = settings[setting_name]
            kwargs[argument] = convert(value) if convert else value

        return cls(**kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a pipeline from the settings attached to ``crawler``."""
        crawler_settings = crawler.settings
        return cls.from_settings(crawler_settings)

    def process_item(self, item, spider):
        """Run ``_process_item`` in a thread and return the deferred."""
        deferred = deferToThread(self._process_item, item, spider)
        return deferred

    def _process_item(self, item, spider):
        """Serialize ``item``, push it onto the redis key, and return it."""
        target_key = self.item_key(item, spider)
        payload = self.serialize(item)
        self.server.rpush(target_key, payload)
        return item

    def item_key(self, item, spider):
        """Return the redis key to use for the given spider.

        Override this method to choose a different key depending on the
        item and/or the spider.

        """
        substitutions = dict(spider=spider.name)
        return self.key % substitutions