import json
import time
from typing import Dict, Iterable, List, Tuple

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options


# URL of one listing page; ``{page}`` is replaced by the page number.
PAGE_URL_TEMPLATE = 'https://www.kuaidaili.com/free/inha/{page}/'
# Inclusive range of listing pages that are scraped (pages 1..9).
FIRST_PAGE = 1
LAST_PAGE = 9
# Pause between page requests, in seconds.
PAGE_DELAY_SECONDS = 0.4
# File the proxy pool is written to.
OUTPUT_JSON = 'ip代理池.json'


def create_driver():
    """Return an Edge WebDriver instance configured to run headless."""
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    return webdriver.Edge(options=options)


def extract_ip_ports(html: str) -> List[Tuple[str, str]]:
    """Return the ``(ip, port)`` pairs found in the table rows of *html*.

    Only ``tbody > tr`` rows that contain at least two ``td`` cells are
    considered; the first cell is read as the IP and the second as the port,
    both stripped of surrounding whitespace.
    """
    soup = BeautifulSoup(html, 'lxml')
    pairs = []
    for row in soup.select('tbody>tr'):
        cells = row.select('td')
        # Make sure the row really carries the IP and port cells.
        if len(cells) >= 2:
            pairs.append((cells[0].text.strip(), cells[1].text.strip()))
    return pairs


def build_proxies(pairs: Iterable[Tuple[str, str]]) -> List[Dict[str, str]]:
    """Turn ``(ip, port)`` pairs into a list of ``{'http': url}`` mappings.

    Each pair keeps its own port (an IP listed with several ports yields one
    entry per port), exact duplicates are dropped while preserving first-seen
    order, and pairs with an empty IP or port are skipped because they would
    produce an unusable URL.
    """
    seen = set()
    proxies = []
    for ip, port in pairs:
        if not ip or not port or (ip, port) in seen:
            continue
        seen.add((ip, port))
        proxies.append({'http': f'http://{ip}:{port}'})
    return proxies


def main() -> None:
    """Scrape the listing pages and write the proxy pool to ``OUTPUT_JSON``."""
    driver = create_driver()
    pairs = []
    try:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            # Open the page and parse the source the browser ended up with.
            driver.get(PAGE_URL_TEMPLATE.format(page=page))
            for pair in extract_ip_ports(driver.page_source):
                pairs.append(pair)
                print(f'IP代理池正在加载第:{len(pairs)}个')
            # NOTE(review): the original indentation was lost, so it is unclear
            # whether this pause ran per row or per page; it is applied once
            # per page here as a request throttle -- confirm.
            time.sleep(PAGE_DELAY_SECONDS)
    finally:
        # Always shut the browser down, even if a page fails to load or parse.
        driver.quit()

    print('IP代理池加载完成')

    # Build the proxies list and write it out as JSON.
    proxies = build_proxies(pairs)
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(proxies, f, ensure_ascii=False, indent=4)
    print("写入文件完成:ip代理池.json")


if __name__ == "__main__":
    main()