import json
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


class Optimize:
    def __init__(self):
        self.proxy_cache_file = 'working_proxies.cache'  # cache file for proxies that passed testing

        # Static IP proxy pool; entries are requests-style proxy dicts.
        self.proxies = [
            # {'http': 'http://203.74.125.18:8888'},
            # {'http': 'http://39.165.0.137:9002'},
            # HTTPS proxies can be added the same way:
            # {'https': 'https://example.com:port'},
        ]

        # Load additional proxies from the JSON pool file into self.proxies.
        self.load_proxy_pool()

        # Proxies that passed the connectivity test.
        self.working_proxies = []

    # Load the IP proxy pool from the JSON file on disk.
    def load_proxy_pool(self):
        try:
            with open('ip代理池.json', 'r', encoding='utf-8') as f:
                content = json.load(f)
        except FileNotFoundError:
            print("Proxy pool file 'ip代理池.json' not found")
            return

        if content:  # make sure the file was not empty
            for proxy_dict in content:
                self.proxies.append(proxy_dict)
        else:
            print("IP proxy pool is empty")

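    # Illustrative helper (an assumption, not part of the original script): writes a sample
    # 'ip代理池.json' in the format load_proxy_pool expects -- a JSON array of requests-style
    # proxy dicts. The addresses reuse the commented-out examples above.
    @staticmethod
    def write_sample_proxy_pool(path='ip代理池.json'):
        sample = [
            {'http': 'http://203.74.125.18:8888'},
            {'http': 'http://39.165.0.137:9002'},
        ]
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(sample, f, ensure_ascii=False, indent=4)
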
    def load_working_proxies_from_cache(self):
        # Load previously verified proxies from the cache file (one JSON object per line),
        # so they come back as the same dicts that requests expects.
        try:
            with open(self.proxy_cache_file, 'r', encoding='utf-8') as f:
                return [json.loads(line) for line in f if line.strip()]
        except (FileNotFoundError, json.JSONDecodeError):
            return []

    def save_working_proxies_to_cache(self, proxies):
        # Persist the working proxies to the cache file, one JSON object per line,
        # so load_working_proxies_from_cache can rebuild the original dicts.
        with open(self.proxy_cache_file, 'w', encoding='utf-8') as f:
            for proxy in proxies:
                f.write(json.dumps(proxy) + "\n")

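    # Cache round-trip sketch (illustrative usage, not in the original): saving and then
    # loading returns equivalent proxy dicts. The proxy address is an assumption.
    #   opt = Optimize()
    #   opt.save_working_proxies_to_cache([{'http': 'http://203.74.125.18:8888'}])
    #   assert opt.load_working_proxies_from_cache() == [{'http': 'http://203.74.125.18:8888'}]
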
    def test_proxy(self, proxy):
        # Test whether a single proxy works by fetching a known URL through it.
        # Note: requests picks a proxy by URL scheme, so an entry with only an 'http'
        # key is not applied to an HTTPS test URL; match the test URL to the proxy scheme.
        # test_url = 'http://example.com'
        test_url = 'https://www.baidu.com/'
        try:
            response = requests.get(url=test_url, proxies=proxy, timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def refresh_working_proxies(self):
        # Rebuild the list of working proxies by testing every proxy in parallel.
        self.working_proxies = []  # start fresh so repeated calls do not accumulate duplicates
        with ThreadPoolExecutor(max_workers=20) as executor:  # test proxies concurrently
            futures = {executor.submit(self.test_proxy, proxy): proxy for proxy in self.proxies}
            for future in as_completed(futures):
                if future.result():
                    self.working_proxies.append(futures[future])

        # Save the verified proxies to the cache.
        self.save_working_proxies_to_cache(self.working_proxies)

    def get_random_working_proxy(self):
        # Return a random working proxy, falling back to the cache and then to a full refresh.
        if not self.working_proxies:
            # The working list is empty, so try the cache first.
            self.working_proxies = self.load_working_proxies_from_cache()
            if not self.working_proxies:
                # The cache is empty too, so re-test the whole proxy pool.
                self.refresh_working_proxies()

        if self.working_proxies:
            return random.choice(self.working_proxies)
        else:
            print("No working proxy found")
            return None

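
# A minimal usage sketch (illustrative, not part of the original script): fetch a page
# through a randomly chosen working proxy. The helper name and target URL are assumptions;
# if no proxy is available, proxies=None makes requests send the request directly.
def fetch_with_proxy(url='https://www.baidu.com/'):
    optimize = Optimize()
    proxy = optimize.get_random_working_proxy()  # a requests-style dict, or None
    try:
        response = requests.get(url, proxies=proxy, timeout=10)
        return response.text
    except requests.RequestException as e:
        print("Request through proxy failed:", e)
        return None
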

def startup():
    # Create the proxy optimizer instance.
    optimize = Optimize()

    # Refresh the list of working proxies.
    optimize.refresh_working_proxies()

    # Get a random working proxy.
    proxy = optimize.get_random_working_proxy()
    if proxy:
        print("Selected proxy:", proxy)
        return proxy  # use this proxy for your network requests
    else:
        print("No proxy available")
        return None


if __name__ == '__main__':
    # Create the proxy optimizer instance.
    optimize = Optimize()

    # Refresh the list of working proxies.
    optimize.refresh_working_proxies()

    # Get a random working proxy.
    proxy = optimize.get_random_working_proxy()
    if proxy:
        print("Selected proxy:", proxy)
        # Use the proxy for your network requests here.
    else:
        print("No proxy available")