parent
c282289380
commit
f8f8567984
@ -0,0 +1,164 @@
|
|||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
from scrapy.dupefilters import BaseDupeFilter
|
||||||
|
from scrapy.utils.request import request_fingerprint
|
||||||
|
|
||||||
|
from . import defaults
|
||||||
|
from .connection import get_redis_from_settings
|
||||||
|
from libs.bloomfilter import BloomFilter, conn
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: Rename class to RedisDupeFilter.
class RFPDupeFilter(BaseDupeFilter):
    """Redis-based request duplicates filter backed by a Bloom filter.

    Fingerprints are tracked in a ``BloomFilter`` rather than a plain Redis
    set, trading a small false-positive rate (a tiny fraction of novel
    requests may be dropped as "seen") for much lower memory use.

    This class can also be used with default Scrapy's scheduler.
    """

    # Class-level logger so subclasses can override the destination.
    logger = logger

    def __init__(self, server, key, debug=False):
        """Initialize the duplicates filter.

        Parameters
        ----------
        server : redis.StrictRedis
            The redis server instance.
        key : str
            Redis key where to store fingerprints.
        debug : bool, optional
            Whether to log filtered requests.
        """
        self.server = server
        self.key = key
        self.debug = debug
        # First duplicate is logged; subsequent ones are suppressed unless
        # DUPEFILTER_DEBUG is enabled (see log()).
        self.logdupes = True
        # NOTE(review): the Bloom filter is wired to the module-level ``conn``
        # imported from libs.bloomfilter, NOT to ``server`` — fingerprints may
        # therefore live on a different Redis instance than the one derived
        # from settings. Confirm this split is intentional.
        self.bf = BloomFilter(conn=conn, key=key)

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    @classmethod
    def from_crawler(cls, crawler):
        """Returns instance from crawler.

        Parameters
        ----------
        crawler : scrapy.crawler.Crawler

        Returns
        -------
        RFPDupeFilter
            Instance of RFPDupeFilter.
        """
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        """Returns True if request was already seen.

        Delegates the membership test to the Bloom filter; a positive answer
        may (rarely) be a false positive, a negative answer is always exact.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        bool
        """
        fp = self.request_fingerprint(request)
        # NOTE(review): is_exist-then-add is not atomic, so two concurrent
        # checks of the same fingerprint can both return False. Acceptable
        # for dedup (at worst a request is fetched twice) — confirm.
        if self.bf.is_exist(fp):
            return True
        self.bf.add(fp)
        return False

    def request_fingerprint(self, request):
        """Returns a fingerprint for a given request.

        Parameters
        ----------
        request : scrapy.http.Request

        Returns
        -------
        str
        """
        return request_fingerprint(request)

    @classmethod
    def from_spider(cls, spider):
        """Returns an instance keyed by spider name (scheduler-style key).

        Parameters
        ----------
        spider : scrapy.spiders.Spider

        Returns
        -------
        RFPDupeFilter
        """
        settings = spider.settings
        server = get_redis_from_settings(settings)
        dupefilter_key = settings.get("SCHEDULER_DUPEFILTER_KEY", defaults.SCHEDULER_DUPEFILTER_KEY)
        key = dupefilter_key % {'spider': spider.name}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

    def close(self, reason=''):
        """Delete data on close. Called by Scrapy's scheduler.

        Parameters
        ----------
        reason : str, optional
        """
        self.clear()

    def clear(self):
        """Clears fingerprints data."""
        # NOTE(review): this deletes ``self.key`` on ``self.server``, but the
        # Bloom filter bits were written through the separate ``conn``
        # connection in __init__ — if the two connections point at different
        # Redis instances, the filter data is NOT actually cleared and
        # fingerprints survive across runs. Verify against libs.bloomfilter.
        self.server.delete(self.key)

    def log(self, request, spider):
        """Logs given request.

        In debug mode every duplicate is logged; otherwise only the first
        one is, with a pointer to DUPEFILTER_DEBUG.

        Parameters
        ----------
        request : scrapy.http.Request
        spider : scrapy.spiders.Spider
        """
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False
|
Loading…
Reference in new issue