"""Proxypool: collect free HTTP proxies from 66ip.cn and ip3366.net.

One ``Proxy_pool`` thread is started per source URL.  Every thread appends
the proxies it finds to the module-level ``IPS`` list, guarded by ``lock``.
Helpers are provided to persist the list to an Excel file, read it back, and
to build a randomised Chrome-like User-Agent string.
"""
import random
import threading
import time

import openpyxl
import requests
from lxml import etree
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options


lock = threading.Lock()  # mutex guarding concurrent writes to IPS
IPS = []  # shared list that receives every scraped proxy
REQUEST_TIMEOUT = 10  # seconds; upper bound for the plain HTTP request to 66ip
urls = [
    'http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1'
    '&proxytype=2&api=66ip'] + [
    f'http://www.ip3366.net/free/?stype=1&page={x}' for x in range(1, 8)]

# Characters removed from each text node of the 66ip response.
_STRIP_TABLE = str.maketrans('', '', '\n\t\r')


class Proxy_pool(threading.Thread):
    """Worker thread that scrapes the proxies listed at a single URL.

    URLs containing ``ip3366`` are rendered in a headless browser and parsed
    from the HTML table; URLs containing ``66ip.cn`` are fetched with
    ``requests`` and parsed from the text nodes following ``<br>`` tags.
    """

    def __init__(self, url):
        """Store *url* and start a browser only when the source needs one.

        :param url: page to scrape.
        """
        super().__init__()
        self.url = url
        # Only the ip3366 branch drives a browser; launching Chrome for the
        # 66ip URL would start a process that is never used.
        self.browser = self._new_browser() if 'ip3366' in url else None

    @staticmethod
    def _new_browser():
        """Create the headless browser used for the ip3366 pages."""
        # NOTE(review): ``Options`` is imported from selenium's *edge* package
        # but handed to ``webdriver.Chrome`` -- confirm which browser is
        # intended and align the import with the driver.
        opt = Options()
        opt.add_argument('--no-sandbox')  # works around the "DevToolsActivePort file doesn't exist" error
        opt.add_argument('window-size=1920x3000')  # browser resolution
        opt.add_argument('--disable-gpu')  # recommended by Google's docs to avoid a bug
        opt.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages
        opt.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed things up
        opt.add_argument('--disable-javascript')  # disable JavaScript
        opt.add_argument('--headless')  # no visible UI; required on Linux hosts without a display
        return webdriver.Chrome(options=opt)

    def _quit_browser(self):
        """Shut the browser down once; later calls are no-ops."""
        browser, self.browser = self.browser, None
        if browser is not None:
            # quit() ends the whole driver session, whereas close() only
            # closes the current window.
            browser.quit()

    def run(self):
        """Thread entry point: load the page if needed, then collect proxies."""
        try:
            if 'ip3366' in self.url:
                # Navigate before parsing so the page is loaded by the time
                # get_ip() queries the table.
                self.browser.get(self.url)
            self.get_ip(IPS, lock)
        finally:
            # Release the browser even when navigation or scraping raised.
            self._quit_browser()

    def get_ip(self, IPS, lock):
        """Scrape ``self.url`` and add the proxies found to *IPS*.

        :param IPS: list that receives the scraped proxies (mutated in place).
        :param lock: lock held while *IPS* is modified.
        :raises requests.RequestException: if the 66ip request fails or
            exceeds ``REQUEST_TIMEOUT``.

        The browser (if any) is always shut down before returning.
        """
        try:
            if 'ip3366' in self.url:
                found = self._scrape_ip3366()
            elif '66ip.cn' in self.url:
                found = self._scrape_66ip()
            else:
                found = []
            if found:
                with lock:
                    IPS.extend(found)
        finally:
            self._quit_browser()

    def _scrape_ip3366(self):
        """Return ``'http://ip:port'`` strings from the already loaded table."""
        found = []
        for row in self.browser.find_elements(By.XPATH, "//tbody//tr"):
            cells = row.find_elements(By.XPATH, "./td")
            if len(cells) < 2:
                continue  # not a data row (needs both an ip and a port cell)
            ip, port = cells[0].text, cells[1].text
            if ip and port:
                found.append('http://' + ip + ':' + port)
        return found

    def _scrape_66ip(self):
        """Return the non-empty text lines of the 66ip response.

        Entries are the text nodes as served, with newlines, tabs and
        carriage returns removed; unlike the ip3366 branch, no ``'http://'``
        prefix is added.
        """
        res = requests.get(url=self.url, timeout=REQUEST_TIMEOUT).text
        if not res.strip():
            return []  # nothing to parse
        tree = etree.HTML(res)
        if tree is None:
            return []
        nodes = tree.xpath('//body//br/following-sibling::text()[preceding-sibling::br]')
        cleaned = (node.translate(_STRIP_TABLE) for node in nodes)
        return [text for text in cleaned if text]


def Write_excel(IP, path):
    """Write each entry of *IP* to column A of a new workbook saved at *path*.

    :param IP: sequence of proxy strings, one per row.
    :param path: destination ``.xlsx`` file.
    """
    wb = Workbook()
    ws = wb.active
    for entry in IP:
        ws.append((entry,))
    wb.save(path)
    print(f"{len(IP)}条ip爬取成功")


def Read_ip(path):
    """Return the values stored in column A of the workbook at *path*.

    :param path: ``.xlsx`` file to read.
    :return: list of cell values from the active sheet, in row order.
        Empty cells are skipped, so an empty sheet yields ``[]``.
    """
    wb = openpyxl.load_workbook(path)
    try:
        sheet = wb.active
        return [cell.value for cell in sheet['A'] if cell.value is not None]
    finally:
        wb.close()


def Threads_spider():
    """Scrape every URL in ``urls`` concurrently and return the proxy pool.

    One ``Proxy_pool`` thread is created and started per URL; the call
    returns once all of them have finished.  The elapsed time in whole
    seconds is printed.

    :return: the module-level ``IPS`` list.  It is never cleared here, so
        repeated calls keep appending to the results of earlier calls.
    """
    print('线程爬取代理IP中...')
    timestamp1 = int(time.time())
    threads = [Proxy_pool(url) for url in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Persisting the pool is currently disabled:
    # Write_excel(IPS, 'ip_pool.xlsx')
    timestamp2 = int(time.time())
    print(f'用时:{timestamp2 - timestamp1}秒')
    return IPS


def Get_UA():
    """Build a randomised Chrome-style User-Agent string.

    The major, build and patch numbers, the platform token and the trailing
    product token are all drawn from the ``random`` module.

    :return: the User-Agent string.
    """
    major = random.randrange(30, 88)
    build = random.randrange(3000, 4500)
    patch = random.randrange(1, 150)
    systems = ['Windows NT 10.0; Win64; x64', 'Linux; Android 10; Pixel 4', 'X11; Linux x86_64',
               'Macintosh; Intel Mac OS X 10_9_2', 'Linux; Android 4.4; Nexus 5 Build/KRT16M',
               'Windows NT 6.1; WOW64']
    chrome = [f'Edge/{major}.0.664.{patch}', 'Safari/537.36']
    platform_token = random.choice(systems)
    product_token = random.choice(chrome)
    return (f'Mozilla/5.0 ({platform_token}) AppleWebKit/537.36 (KHTML, like Gecko) '
            f'Chrome/{major}.0.{build}.{patch} {product_token}')