import json
import os
import random
import sys
import time

import requests
from bs4 import BeautifulSoup
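

# __init__ below expects a const.json file at the project root.  A minimal
# sketch of its shape, assuming only the keys read in this module
# ('check_url1' and 'type'); the real file may contain additional fields:
#
#   {
#       "check_url1": "https://example.com",
#       "type": "https"
#   }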


class IpManage(object):

    def __init__(self, max_ip_num):
        self.ip_pool = []  # list of verified proxy IPs

        # Locate the project root and load the shared config file
        const_path = sys.path[0].split("quant-on-volume")[0] + "quant-on-volume"
        with open(os.path.join(const_path, "const.json"), "r", encoding="utf8") as f:
            consts = json.load(f)

        self.check_url = consts['check_url1']  # URL used to verify that a proxy is usable
        self.type = consts['type']  # https / http
        self.pages = []  # pages that have already been crawled
        self.max_ip_num = max_ip_num  # upper limit on the number of IPs to collect

    # Crawl the proxy IPs listed on a single page
    def craw_ips_by_page(self, page):
        url = "https://www.xicidaili.com/nn/%s" % page
        html_content = self.requests_get(url, 'html')

        if html_content == "0":
            print("Failed to fetch the proxy list page; check your network settings and make sure this machine's IP has not been banned")
            return

        soup = BeautifulSoup(html_content, 'html.parser')
        all_trs = soup.find("table", id="ip_list").find_all('tr')
        for tr in all_trs[1:]:  # skip the table header row
            tds = tr.find_all('td')
            ip = {
                'ip': tds[1].get_text(),
                'port': tds[2].get_text(),
                'type': tds[5].get_text()
            }
            # Keep the proxy only if it actually works
            if self.check_ip(ip):
                self.ip_pool.append(ip)
                if len(self.ip_pool) >= self.max_ip_num:
                    break

    # Build the IP pool, crawling random pages until enough proxies are collected
    def craw_ips(self):
        page = self.get_random_page()
        self.pages.append(page)
        print("Currently crawling proxy list page:", page)
        self.craw_ips_by_page(page)

        # Keep crawling (recursively) until the pool is large enough
        if len(self.ip_pool) < self.max_ip_num:
            self.craw_ips()
        else:
            with open(os.path.join(sys.path[0], "ip_pool.json"), "w", encoding="utf-8") as f:
                json.dump(self.ip_pool, f)
            print("Collected %s proxy IPs in total" % len(self.ip_pool))

    # Check whether a proxy IP is usable
    def check_ip(self, ip):
        proxy_temp = {
            "http": "http://%s:%s" % (ip['ip'], ip['port']),
            "https": "http://%s:%s" % (ip['ip'], ip['port'])
        }
        show_info = self.check_url + "---" + "http://%s:%s [%s]" % (ip['ip'], ip['port'], time.perf_counter())
        try:
            # print(show_info)
            res = requests.get(self.check_url, timeout=1, proxies=proxy_temp)
            if res.status_code == 200:
                print(show_info, ":SUCCESS")
                return True
            else:
                print(show_info, ":FAIL")
                return False
        except requests.RequestException:
            print(show_info, ":FAIL")
            return False

    # Issue a GET request and return the response in the requested form
    def requests_get(self, url, type, data=None):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
        response = requests.get(url=url, headers=headers, data=data)
        if response.status_code == 200:
            if type == "img":
                # raw bytes, e.g. an image
                return response.content
            if type == "html":
                html = response.content
                html_content = html.decode("utf-8", "ignore")
                return html_content
            if type == "text":
                return response.text
        else:
            print("Request failed with status code: %s" % response.status_code)
            return "0"

    # Generate a random page number that has not been crawled yet
    def get_random_page(self):
        max_page = 3000
        while True:
            page = random.randint(0, max_page)
            if page not in self.pages:
                break
        return page


# if __name__ == "__main__":
#     ip_manage = IpManage(10)
#     ip_manage.craw_ips()
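

# A minimal sketch of how the generated pool could be consumed elsewhere,
# assuming ip_pool.json has already been written by craw_ips() above and
# contains the 'ip'/'port' keys used in this module; the target URL is
# only a placeholder:
#
# with open(os.path.join(sys.path[0], "ip_pool.json"), "r", encoding="utf-8") as f:
#     pool = json.load(f)
# proxy = random.choice(pool)
# proxies = {
#     "http": "http://%s:%s" % (proxy['ip'], proxy['port']),
#     "https": "http://%s:%s" % (proxy['ip'], proxy['port'])
# }
# requests.get("https://example.com", proxies=proxies, timeout=5)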