ADD file via upload

main
polxzs45r 10 months ago
parent 90a7776709
commit c893cb008a

@@ -0,0 +1,112 @@
import time
import threading
import random

import openpyxl
import requests
from lxml import etree
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

lock = threading.Lock()  # mutex protecting the shared IP list
IPS = []  # list that collects the scraped proxy addresses
urls = [
    'http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1'
    '&proxytype=2&api=66ip'
] + [f'http://www.ip3366.net/free/?stype=1&page={x}' for x in range(1, 8)]
class Proxy_pool(threading.Thread):
    def __init__(self, url):
        super().__init__()
        self.url = url
        opt = Options()
        opt.add_argument('--no-sandbox')  # work around the "DevToolsActivePort file doesn't exist" error
        opt.add_argument('window-size=1920x3000')  # set the browser resolution
        opt.add_argument('--disable-gpu')  # Google's docs recommend this flag to avoid a rendering bug
        opt.add_argument('--hide-scrollbars')  # hide scrollbars to cope with some special pages
        opt.add_argument('blink-settings=imagesEnabled=false')  # skip loading images to speed things up
        opt.add_argument('--disable-javascript')  # disable JavaScript
        opt.add_argument('--headless')  # no visible UI; on Linux without a display the browser fails to start without this
        self.browser = webdriver.Chrome(options=opt)
    def run(self):
        global IPS
        global lock
        if 'ip3366' in self.url:
            self.browser.get(self.url)  # load the page now so it can finish rendering while the work below runs
        self.get_ip(IPS, lock)
    def get_ip(self, IPS, lock):
        # case 1: scraping ip3366.net (table rendered in the browser via Selenium)
        if 'ip3366' in self.url:
            trs = self.browser.find_elements(By.XPATH, "//tbody//tr")
            for tr in trs:
                ip = tr.find_element(By.XPATH, "./td[1]").text
                port = tr.find_element(By.XPATH, "./td[2]").text
                with lock:  # guard the shared list
                    IPS.append('http://' + ip + ':' + port)
        # case 2: scraping 66ip.cn (plain HTTP request parsed with XPath)
        elif '66ip.cn' in self.url:
            res = requests.get(url=self.url).text
            tree = etree.HTML(res)
            ip = tree.xpath('//body//br/following-sibling::text()[preceding-sibling::br]')
            ip = [x.replace("\n", "").replace("\t", "").replace("\r", "") for x in ip if
                  len(x.replace("\n", "").replace("\t", "").replace("\r", ""))]
            with lock:  # guard the shared list
                IPS.extend(ip)
        self.browser.quit()  # quit (not just close) so the driver process is released
def Write_excel(IP, path):  # write the scraped IPs into an Excel file
    wb = Workbook()
    ws = wb.active
    for item in IP:
        ws.append((item,))
    wb.save(path)
    print(f"{len(IP)} IPs scraped successfully")
def Read_ip(path):  # read the IPs from the file and return them as a list
    wb = openpyxl.load_workbook(path)
    sheet = wb.active
    ips = [x.value for x in sheet['A']]
    return ips
def Threads_spider():  # scrape proxy IPs
    """
    Fetch proxy IPs with one thread per source URL.
    :return: IPS, the proxy IP pool
    """
    print('Scraping proxy IPs with threads...')
    timestamp1 = int(time.time())
    threads = [Proxy_pool(url) for url in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Write_excel(IPS, 'ip_pool.xlsx')
    timestamp2 = int(time.time())
    print(f'Elapsed: {timestamp2 - timestamp1}s')
    return IPS
def Get_UA():  # build a randomized Chrome-style User-Agent string
    Major_version = random.randrange(30, 88)
    Build_number = random.randrange(3000, 4500)
    Patch_number = random.randrange(1, 150)
    systems = ['Windows NT 10.0; Win64; x64', 'Linux; Android 10; Pixel 4', 'X11; Linux x86_64',
               'Macintosh; Intel Mac OS X 10_9_2', 'Linux; Android 4.4; Nexus 5 Build/KRT16M', 'Windows NT 6.1; WOW64']
    suffixes = [f'Edge/{Major_version}.0.664.{Patch_number}', 'Safari/537.36']
    UA = (f'Mozilla/5.0 ({random.choice(systems)}) AppleWebKit/537.36 (KHTML, like Gecko) '
          f'Chrome/{Major_version}.0.{Build_number}.{Patch_number} {random.choice(suffixes)}')
    return UA
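

# Minimal usage sketch (an assumption, not part of the original upload): scrape the pool once,
# then issue a request through a random proxy with a randomized User-Agent. The target URL
# 'http://httpbin.org/ip' is only an illustrative choice; any site that echoes the caller's IP works.
if __name__ == '__main__':
    pool = Threads_spider()
    if pool:
        proxy = random.choice(pool)
        headers = {'User-Agent': Get_UA()}
        try:
            resp = requests.get('http://httpbin.org/ip',
                                headers=headers,
                                proxies={'http': proxy},
                                timeout=10)
            print(proxy, resp.status_code, resp.text)
        except requests.RequestException as e:
            print(f'proxy {proxy} failed: {e}')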