diff --git a/middlewares.py b/middlewares.py
new file mode 100644
index 0000000..a45e69b
--- /dev/null
+++ b/middlewares.py
@@ -0,0 +1,101 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+import random
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+# class JdSpiderMiddleware:
+#     # Not all methods need to be defined. If a method is not defined,
+#     # scrapy acts as if the spider middleware does not modify the
+#     # passed objects.
+#
+#     @classmethod
+#     def from_crawler(cls, crawler):
+#         # This method is used by Scrapy to create your spiders.
+#         s = cls()
+#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+#         return s
+#
+#     def process_spider_input(self, response, spider):
+#         # Called for each response that goes through the spider
+#         # middleware and into the spider.
+#
+#         # Should return None or raise an exception.
+#         return None
+#
+#     def process_spider_output(self, response, result, spider):
+#         # Called with the results returned from the Spider, after
+#         # it has processed the response.
+#
+#         # Must return an iterable of Request, or item objects.
+#         for i in result:
+#             yield i
+#
+#     def process_spider_exception(self, response, exception, spider):
+#         # Called when a spider or process_spider_input() method
+#         # (from other spider middleware) raises an exception.
+#
+#         # Should return either None or an iterable of Request or item objects.
+#         pass
+#
+#     def process_start_requests(self, start_requests, spider):
+#         # Called with the start requests of the spider, and works
+#         # similarly to the process_spider_output() method, except
+#         # that it doesn’t have a response associated.
+#
+#         # Must return only requests (not items).
+#         for r in start_requests:
+#             yield r
+#
+#     def spider_opened(self, spider):
+#         spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class JdDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    # @classmethod
+    # def from_crawler(cls, crawler):
+    #     # This method is used by Scrapy to create your spiders.
+    #     s = cls()
+    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+    #     return s
+
+    user_agent_list = [
+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Mobile Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
+    ]
+
+    PROXY_https = [
+        "125.79.50.39",
+        "182.111.85.77",
+        "117.42.193.91",
+        "27.150.161.116",
+    ]
+
+    # Intercept every outgoing request
+    def process_request(self, request, spider):
+        # UA spoofing: pick a random User-Agent for each request
+        request.headers['User-Agent'] = random.choice(self.user_agent_list)
+        return None
+
+    # Intercept every response
+    def process_response(self, request, response, spider):
+        return response
+
+    # Intercept requests whose download raised an exception
+    def process_exception(self, request, exception, spider):
+        # Retry the failed request through a randomly chosen proxy IP
+        request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
+        return request
+
+    # def spider_opened(self, spider):
+    #     spider.logger.info('Spider opened: %s' % spider.name)
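This downloader middleware only takes effect once it is registered in the project's settings.py, which this diff does not touch. A minimal sketch of that change, assuming the Scrapy project package is named jd (the dotted path and the priority value 543 are illustrative assumptions, not part of this diff):

# settings.py -- register the custom downloader middleware (sketch)
DOWNLOADER_MIDDLEWARES = {
    # Adjust 'jd' to the actual project package name
    'jd.middlewares.JdDownloaderMiddleware': 543,
}

The integer controls ordering among downloader middlewares: lower values run closer to the engine, higher values closer to the downloader; 543 is the slot Scrapy's default project template uses for the project middleware.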