parent
90a7776709
commit
c893cb008a
@ -0,0 +1,112 @@
|
|||||||
|
import time
|
||||||
|
import threading
|
||||||
|
import openpyxl
|
||||||
|
import requests
|
||||||
|
from lxml import etree
|
||||||
|
from selenium import webdriver
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
import random
|
||||||
|
from selenium.webdriver.edge.options import Options
|
||||||
|
|
||||||
|
|
||||||
|
# Mutex that serializes writes to the shared proxy list.
lock = threading.Lock()

# Shared list that collects every scraped proxy address.
IPS = []

# Pages to scrape: the 66ip.cn bulk-export endpoint first, then pages 1-7
# of the ip3366.net free proxy list.
urls = [
    'http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip',
]
urls.extend(
    f'http://www.ip3366.net/free/?stype=1&page={page}' for page in range(1, 8)
)
|
||||||
|
|
||||||
|
|
||||||
|
class Proxy_pool(threading.Thread):
    """Thread that scrapes one proxy-list page into a shared list.

    URLs containing ``ip3366`` are loaded in headless Chrome and read from
    the page's HTML table. URLs containing ``66ip.cn`` are fetched with
    ``requests`` and parsed with lxml. Every proxy is stored as
    ``http://<ip>:<port>``.

    Attributes:
        url: Page this thread scrapes.
        browser: Headless Chrome driver, or ``None`` when the page is not
            scraped through a browser (or after the driver has been quit).
    """

    # Seconds to wait for the 66ip.cn HTTP response before giving up, so a
    # stalled server cannot block the thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Store the target URL and start a browser only if the page needs one.

        Args:
            url: Address of the proxy-list page to scrape.
        """
        super().__init__()
        self.url = url
        # Only ip3366 pages are read through the browser, so no driver is
        # started for the other sources.
        self.browser = self._build_browser() if 'ip3366' in url else None

    @staticmethod
    def _build_browser():
        """Start and return a headless Chrome driver."""
        # The driver is Chrome, so the options object must be ChromeOptions.
        opt = webdriver.ChromeOptions()
        opt.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
        opt.add_argument('window-size=1920x3000')  # browser resolution
        opt.add_argument('--disable-gpu')  # recommended by Google's docs to work around a bug
        opt.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
        opt.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
        opt.add_argument('--disable-javascript')  # disable JavaScript
        opt.add_argument('--headless')  # no visible UI; required on Linux systems without a display
        return webdriver.Chrome(options=opt)

    @staticmethod
    def _to_proxy_url(address):
        """Normalize ``address`` to ``http://host:port``.

        All whitespace is removed from ``address`` first.

        Returns:
            The normalized URL, or ``None`` when nothing is left after
            removing whitespace.
        """
        compact = ''.join(address.split())
        if not compact:
            return None
        if compact.startswith('http://'):
            return compact
        return 'http://' + compact

    def _quit_browser(self):
        """Shut the driver down once; safe to call repeatedly."""
        browser, self.browser = self.browser, None
        if browser is not None:
            # quit() ends the whole driver session, not just the window.
            browser.quit()

    def _scrape_ip3366(self):
        """Return the proxies listed in the table of the loaded ip3366 page."""
        found = []
        for tr in self.browser.find_elements(By.XPATH, "//tbody//tr"):
            ip = tr.find_element(By.XPATH, "./td[1]").text.strip()
            port = tr.find_element(By.XPATH, "./td[2]").text.strip()
            # Skip rows that lack an address or a port instead of storing
            # a malformed entry.
            if ip and port:
                found.append(self._to_proxy_url(f'{ip}:{port}'))
        return found

    def _scrape_66ip(self):
        """Download the 66ip.cn page and return the proxies in its text."""
        res = requests.get(url=self.url, timeout=self.REQUEST_TIMEOUT).text
        tree = etree.HTML(res)
        if tree is None:
            return []
        # The addresses are the text nodes that follow <br> elements.
        texts = tree.xpath('//body//br/following-sibling::text()[preceding-sibling::br]')
        candidates = (self._to_proxy_url(text) for text in texts)
        return [proxy for proxy in candidates if proxy is not None]

    def run(self):
        """Thread entry point: scrape ``self.url`` into the module-level ``IPS``."""
        try:
            if self.browser is not None:
                # Navigate before parsing so the page is loaded by the time
                # the table is read.
                self.browser.get(self.url)
            self.get_ip(IPS, lock)
        finally:
            # Also covers a failure in browser.get(), before get_ip runs.
            self._quit_browser()

    def get_ip(self, IPS, lock):
        """Scrape ``self.url`` and append the proxies found to ``IPS``.

        The browser, if any, is shut down afterwards, even when scraping
        raises.

        Args:
            IPS: List that receives the ``http://ip:port`` strings.
            lock: Lock held while ``IPS`` is modified.
        """
        try:
            if 'ip3366' in self.url:
                found = self._scrape_ip3366()
            elif '66ip.cn' in self.url:
                found = self._scrape_66ip()
            else:
                found = []
            if found:
                # One locked extend instead of locking once per row.
                with lock:
                    IPS.extend(found)
        finally:
            self._quit_browser()
|
||||||
|
|
||||||
|
|
||||||
|
def Write_excel(IP, path):
    """Write the scraped proxies to an Excel file, one per row in column A.

    Args:
        IP: Sized iterable of proxy addresses.
        path: Destination path of the workbook.
    """
    workbook = Workbook()
    sheet = workbook.active
    for address in IP:
        # Each address becomes a single-cell row.
        sheet.append((address,))
    workbook.save(path)
    total = len(IP)
    print(f"{total}条ip爬取成功")
|
||||||
|
|
||||||
|
|
||||||
|
def Read_ip(path):
    """Read the proxy addresses stored in column A of an Excel workbook.

    Args:
        path: Path of the workbook to load.

    Returns:
        list: Values of the non-empty cells in column A of the active
        sheet, in row order. Empty when the column holds no values.
    """
    wb = openpyxl.load_workbook(path)
    sheet = wb.active
    # Column access on an empty sheet still yields a placeholder cell whose
    # value is None, so empty cells are skipped rather than returned.
    return [cell.value for cell in sheet['A'] if cell.value is not None]
|
||||||
|
|
||||||
|
|
||||||
|
def Threads_spider():
    """Scrape every page in ``urls`` concurrently and return the proxy pool.

    One ``Proxy_pool`` thread is created per URL. All threads are started,
    then joined, and the elapsed time in whole seconds is printed.

    :return: IPS, the shared list of scraped proxies
    """
    print('线程爬取代理IP中...')
    started_at = int(time.time())

    workers = []
    for url in urls:
        workers.append(Proxy_pool(url))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Saving to Excel is currently disabled:
    # Write_excel(IPS, 'ip_pool.xlsx')
    elapsed = int(time.time()) - started_at
    print(f'用时:{elapsed}秒')
    return IPS
|
||||||
|
|
||||||
|
def Get_UA():
    """Build a randomized Chrome-style User-Agent string.

    The Chrome major, build and patch numbers are drawn at random. The
    platform token and the trailing product token (an Edge token that reuses
    the major and patch numbers, or ``Safari/537.36``) are picked at random.

    Returns:
        str: The assembled User-Agent header value.
    """
    major = random.randrange(30, 88)
    build = random.randrange(3000, 4500)
    patch = random.randrange(1, 150)

    platforms = (
        'Windows NT 10.0; Win64; x64',
        'Linux; Android 10; Pixel 4',
        'X11; Linux x86_64',
        'Macintosh; Intel Mac OS X 10_9_2',
        'Linux; Android 4.4; Nexus 5 Build/KRT16M',
        'Windows NT 6.1; WOW64',
    )
    trailers = (f'Edge/{major}.0.664.{patch}', 'Safari/537.36')

    # Pick the platform before the trailer to keep the order of random draws.
    platform = random.choice(platforms)
    trailer = random.choice(trailers)
    chrome_version = f'{major}.0.{build}.{patch}'
    return (
        f'Mozilla/5.0 ({platform}) AppleWebKit/537.36 (KHTML, like Gecko) '
        f'Chrome/{chrome_version} {trailer}'
    )
|
||||||
|
|
Loading…
Reference in new issue