Crawler middlewares

master
p6mtf24ic 4 years ago
parent 5740f162ae
commit 9f5f70207b

@@ -0,0 +1,101 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
# class JdSpiderMiddleware:
#     # Not all methods need to be defined. If a method is not defined,
#     # scrapy acts as if the spider middleware does not modify the
#     # passed objects.
#
#     @classmethod
#     def from_crawler(cls, crawler):
#         # This method is used by Scrapy to create your spiders.
#         s = cls()
#         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
#         return s
#
#     def process_spider_input(self, response, spider):
#         # Called for each response that goes through the spider
#         # middleware and into the spider.
#
#         # Should return None or raise an exception.
#         return None
#
#     def process_spider_output(self, response, result, spider):
#         # Called with the results returned from the Spider, after
#         # it has processed the response.
#
#         # Must return an iterable of Request, or item objects.
#         for i in result:
#             yield i
#
#     def process_spider_exception(self, response, exception, spider):
#         # Called when a spider or process_spider_input() method
#         # (from other spider middleware) raises an exception.
#
#         # Should return either None or an iterable of Request or item objects.
#         pass
#
#     def process_start_requests(self, start_requests, spider):
#         # Called with the start requests of the spider, and works
#         # similarly to the process_spider_output() method, except
#         # that it doesn't have a response associated.
#
#         # Must return only requests (not items).
#         for r in start_requests:
#             yield r
#
#     def spider_opened(self, spider):
#         spider.logger.info('Spider opened: %s' % spider.name)
class JdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    # @classmethod
    # def from_crawler(cls, crawler):
    #     # This method is used by Scrapy to create your spiders.
    #     s = cls()
    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    #     return s
    # Pool of User-Agent strings for random UA spoofing
    user_agent_list = [
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Mobile Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
    ]
    # Proxy IP pool (note: these are bare IPs; a usable proxy URL
    # normally also needs a port, e.g. "host:port")
    PROXY_https = [
        "125.79.50.39",
        "182.111.85.77",
        "117.42.193.91",
        "27.150.161.116",
    ]
    # Intercept every outgoing request and set its User-Agent
    def process_request(self, request, spider):
        # pick a UA at random for spoofing
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None
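
    # Return contract of process_request (per the Scrapy downloader
    # middleware API): returning None lets the request continue through
    # the remaining middlewares to the downloader, returning a Response
    # short-circuits the download, and returning a Request re-schedules
    # that request instead.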
    # Intercept responses (passed through unchanged here)
    def process_response(self, request, response, spider):
        return response

    # Intercept requests that raised an exception: if the local IP is
    # blocked or failing, pick a random IP from the proxy pool and retry
    def process_exception(self, request, exception, spider):
        # route the retried request through a random proxy
        request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # returning the request re-schedules it for download
        return request
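
    # A minimal sketch (not part of the original code) of how the retry
    # above could be capped so a request that keeps failing is not
    # re-queued forever; 'proxy_retries' is a hypothetical meta key:
    #
    # def process_exception(self, request, exception, spider):
    #     retries = request.meta.get('proxy_retries', 0)
    #     if retries >= 3:
    #         return None  # give up and let other middlewares handle it
    #     request.meta['proxy_retries'] = retries + 1
    #     request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
    #     return request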
    # def spider_opened(self, spider):
    #     spider.logger.info('Spider opened: %s' % spider.name)
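
# Usage sketch: the middleware only takes effect once it is registered in
# the project's settings.py. The module path below assumes the Scrapy
# project package is named 'jd' (matching the class names above); adjust
# it to the real package name.
#
# DOWNLOADER_MIDDLEWARES = {
#     'jd.middlewares.JdDownloaderMiddleware': 543,
# }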