From f8f8567984ad4db542b8d3f58dc6ff6ed27ebf7f Mon Sep 17 00:00:00 2001
From: pnhekgfuf <1913997697@qq.com>
Date: Sat, 29 Apr 2023 12:17:45 +0800
Subject: [PATCH] ADD file via upload

---
 Scrapy-Redis-Zhihu/scrapy_redis/dupefilter.py | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 Scrapy-Redis-Zhihu/scrapy_redis/dupefilter.py

diff --git a/Scrapy-Redis-Zhihu/scrapy_redis/dupefilter.py b/Scrapy-Redis-Zhihu/scrapy_redis/dupefilter.py
new file mode 100644
index 0000000..5a48a50
--- /dev/null
+++ b/Scrapy-Redis-Zhihu/scrapy_redis/dupefilter.py
@@ -0,0 +1,164 @@
+import logging
+import time
+
+from scrapy.dupefilters import BaseDupeFilter
+from scrapy.utils.request import request_fingerprint
+
+from . import defaults
+from .connection import get_redis_from_settings
+from libs.bloomfilter import BloomFilter, conn
+
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: Rename class to RedisDupeFilter.
+class RFPDupeFilter(BaseDupeFilter):
+    """Redis-based request duplicates filter.
+
+    This class can also be used with Scrapy's default scheduler.
+
+    """
+
+    logger = logger
+
+    def __init__(self, server, key, debug=False):
+        """Initialize the duplicates filter.
+
+        Parameters
+        ----------
+        server : redis.StrictRedis
+            The redis server instance.
+        key : str
+            Redis key where to store fingerprints.
+        debug : bool, optional
+            Whether to log filtered requests.
+
+        """
+        self.server = server
+        self.key = key
+        self.debug = debug
+        self.logdupes = True
+        self.bf = BloomFilter(conn=conn, key=key)
+
+    @classmethod
+    def from_settings(cls, settings):
+        """Returns an instance from the given settings.
+
+        This uses by default the key ``dupefilter:<timestamp>``. When using the
+        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
+        it needs to pass the spider name in the key.
+
+        Parameters
+        ----------
+        settings : scrapy.settings.Settings
+
+        Returns
+        -------
+        RFPDupeFilter
+            A RFPDupeFilter instance.
+
+        """
+        server = get_redis_from_settings(settings)
+        # XXX: This creates a one-time key, needed to support using this
+        # class as a standalone dupefilter with Scrapy's default scheduler.
+        # If Scrapy passed the spider to open(), this wouldn't be needed.
+        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
+        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
+        debug = settings.getbool('DUPEFILTER_DEBUG')
+        return cls(server, key=key, debug=debug)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Returns an instance from a crawler.
+
+        Parameters
+        ----------
+        crawler : scrapy.crawler.Crawler
+
+        Returns
+        -------
+        RFPDupeFilter
+            Instance of RFPDupeFilter.
+
+        """
+        return cls.from_settings(crawler.settings)
+
+    def request_seen(self, request):
+        """Returns True if the request was already seen.
+
+        Parameters
+        ----------
+        request : scrapy.http.Request
+
+        Returns
+        -------
+        bool
+
+        """
+        fp = self.request_fingerprint(request)
+        if self.bf.is_exist(fp):
+            return True
+        else:
+            self.bf.add(fp)
+            return False
+        # # This returns the number of values added, zero if already exists.
+        # added = self.server.sadd(self.key, fp)
+        # return added == 0
+
+    def request_fingerprint(self, request):
+        """Returns a fingerprint for a given request.
+
+        Parameters
+        ----------
+        request : scrapy.http.Request
+
+        Returns
+        -------
+        str
+
+        """
+        return request_fingerprint(request)
+
+    @classmethod
+    def from_spider(cls, spider):
+        """Returns an instance keyed by the given spider's name."""
+        settings = spider.settings
+        server = get_redis_from_settings(settings)
+        dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
+        key = dupefilter_key % {'spider': spider.name}
+        debug = settings.getbool('DUPEFILTER_DEBUG')
+        return cls(server, key=key, debug=debug)
+
+    def close(self, reason=''):
+        """Delete data on close. Called by Scrapy's scheduler.
+
+        Parameters
+        ----------
+        reason : str, optional
+
+        """
+        self.clear()
+
+    def clear(self):
+        """Clears fingerprint data."""
+        self.server.delete(self.key)
+
+    def log(self, request, spider):
+        """Logs the given request.
+
+        Parameters
+        ----------
+        request : scrapy.http.Request
+        spider : scrapy.spiders.Spider
+
+        """
+        if self.debug:
+            msg = "Filtered duplicate request: %(request)s"
+            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+        elif self.logdupes:
+            msg = ("Filtered duplicate request %(request)s"
+                   " - no more duplicates will be shown"
+                   " (see DUPEFILTER_DEBUG to show all duplicates)")
+            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+            self.logdupes = False
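
Note: the patch swaps the upstream set-based request_seen (the commented-out
sadd lines) for a Bloom-filter membership test, but libs/bloomfilter.py itself
is not part of this diff. Below is a minimal sketch of the interface the code
above relies on -- BloomFilter(conn=conn, key=key), is_exist(), add(), and a
module-level conn -- written under stated assumptions; the bit size, hash
seeds, and MD5-based offset scheme are illustrative choices, not necessarily
what the repository ships.

    # libs/bloomfilter.py -- assumed sketch, not the repository's actual file.
    import hashlib

    import redis

    # Module-level connection; dupefilter.py imports this alongside the class.
    conn = redis.StrictRedis(host='localhost', port=6379)


    class BloomFilter(object):
        """Bloom filter backed by a single Redis string used as a bit array."""

        def __init__(self, conn, key, bit_size=1 << 30, seeds=(5, 7, 11, 13, 31)):
            # 2**30 bits == 128 MB of Redis memory, fixed regardless of item count.
            self.conn = conn
            self.key = key
            self.bit_size = bit_size
            self.seeds = seeds

        def _offsets(self, value):
            # One bit offset per seed, derived from an MD5 digest of seed+value.
            for seed in self.seeds:
                digest = hashlib.md5(('%d%s' % (seed, value)).encode('utf-8'))
                yield int(digest.hexdigest(), 16) % self.bit_size

        def is_exist(self, value):
            # Present only if every probed bit is set. Can report a false
            # positive (new value claimed seen), never a false negative.
            return all(self.conn.getbit(self.key, off) for off in self._offsets(value))

        def add(self, value):
            # SETBIT transparently grows the Redis string as needed.
            for off in self._offsets(value):
                self.conn.setbit(self.key, off, 1)

The trade-off against the SADD approach: memory stays constant instead of
growing with every fingerprint, and each lookup is a handful of GETBIT calls,
at the cost of occasionally skipping a genuinely new request due to a false
positive (no request is ever crawled twice, though). Wiring the filter into a
crawl would use the standard Scrapy / scrapy-redis settings; the Redis URL
below is an example:

    # settings.py -- standard scrapy-redis wiring.
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
    REDIS_URL = 'redis://localhost:6379'
    DUPEFILTER_DEBUG = True  # log every filtered duplicate, not just the first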