# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import random

import requests
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


# class JdSpiderMiddleware:
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, or item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Request or item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn't have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)


class JdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # @classmethod
    # def from_crawler(cls, crawler):
    #     # This method is used by Scrapy to create your spiders.
    #     s = cls()
    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    #     return s

    # Pool of User-Agent strings, picked at random to spoof the UA per request
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
    ]

    # Pool of proxy IPs (real proxies usually also need a port, e.g. "host:8080")
    PROXY_https = [
        "125.79.50.39",
        "182.111.85.77",
        "117.42.193.91",
        "27.150.161.116",
    ]

    # Intercept every outgoing request and set a spoofed User-Agent
    def process_request(self, request, spider):
        # UA spoofing: pick a User-Agent at random from the pool
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    # Intercept responses; pass them through unchanged
    def process_response(self, request, response, spider):
        return response

    # Intercept requests whose download raised an exception; if the local IP
    # is blocked or fails, retry through a random IP from the proxy pool
    def process_exception(self, request, exception, spider):
        # Route the failed request through a proxy IP
        request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Returning the request re-schedules it for download with the new proxy
        return request

    # def spider_opened(self, spider):
    #     spider.logger.info('Spider opened: %s' % spider.name)
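

# Usage note: to activate this downloader middleware, enable it in the project's
# settings.py. A minimal sketch, assuming the Scrapy project module is named "jd"
# (inferred from the class name; adjust the dotted path to match your project):
#
# DOWNLOADER_MIDDLEWARES = {
#     'jd.middlewares.JdDownloaderMiddleware': 543,
# }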