# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


# class JdSpiderMiddleware:
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, or item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Request or item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn't have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)


class JdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # @classmethod
    # def from_crawler(cls, crawler):
    #     # This method is used by Scrapy to create your spiders.
    #     s = cls()
    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    #     return s

    # Pool of User-Agent strings used for random UA spoofing
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
    ]

    # Proxy IP pool
    PROXY_https = [
        "125.79.50.39",
        "182.111.85.77",
        "117.42.193.91",
        "27.150.161.116",
    ]

    # Intercept every outgoing request and spoof its User-Agent
    def process_request(self, request, spider):
        # UA spoofing: pick a User-Agent from the pool at random
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    # Intercept responses and pass them through unchanged
    def process_response(self, request, response, spider):
        return response

    # Intercept requests that raised an exception: if the local IP runs into
    # trouble, pick a random IP from the proxy pool and retry the request
    # through it
    def process_exception(self, request, exception, spider):
        # Attach a proxy and reschedule the request
        request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request

    # def spider_opened(self, spider):
    #     spider.logger.info('Spider opened: %s' % spider.name)
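

# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the Scrapy project package is named "jd", so the
# dotted path below may differ in your project; 543 is just the conventional
# placeholder priority from the Scrapy template). For the middleware above to
# run at all, it has to be registered in settings.py:
#
#   DOWNLOADER_MIDDLEWARES = {
#       "jd.middlewares.JdDownloaderMiddleware": 543,
#   }
#
# Scrapy's built-in HttpProxyMiddleware reads the request.meta['proxy'] value
# set in process_exception(). Proxy URLs are normally written as
# "scheme://host:port", so the bare IPs in PROXY_https may need ports
# appended before they work in practice.
# ---------------------------------------------------------------------------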