import random
import threading
import time

import openpyxl
import requests
from lxml import etree
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # originally imported from edge.options, but webdriver.Chrome is used below
from selenium.webdriver.common.by import By

lock = threading.Lock()  # mutex protecting the shared IP list
IPS = []  # list collecting the scraped proxy IPs
urls = [
    'http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=1'
    '&proxytype=2&api=66ip'] + [
    f'http://www.ip3366.net/free/?stype=1&page={x}' for x in range(1, 8)]


class Proxy_pool(threading.Thread):
    def __init__(self, url):
        super().__init__()
        self.url = url
        opt = Options()
        opt.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
        opt.add_argument('window-size=1920x3000')  # set the browser resolution
        opt.add_argument('--disable-gpu')  # recommended in the Chrome docs to work around a bug
        opt.add_argument('--hide-scrollbars')  # hide scrollbars; helps with some special pages
        opt.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed things up
        opt.add_argument('--disable-javascript')  # disable JavaScript
        opt.add_argument('--headless')  # no visible window; required on Linux systems without a display
        self.browser = webdriver.Chrome(options=opt)

    def run(self):
        global IPS
        global lock
        if 'ip3366' in self.url:
            self.browser.get(self.url)  # request here so the page loads while the work below runs
        self.get_ip(IPS, lock)

    def get_ip(self, IPS, lock):
        # scrape ip3366.net via Selenium
        if 'ip3366' in self.url:
            trs = self.browser.find_elements(By.XPATH, "//tbody//tr")
            for tr in trs:
                ip = tr.find_element(By.XPATH, "./td[1]").text
                port = tr.find_element(By.XPATH, "./td[2]").text
                lock.acquire()  # acquire the lock
                try:
                    IPS.append('http://' + ip + ':' + port)
                finally:
                    lock.release()  # release the lock
        # scrape 66ip.cn via requests
        elif '66ip.cn' in self.url:
            res = requests.get(url=self.url).text
            tree = etree.HTML(res)
            ip = tree.xpath('//body//br/following-sibling::text()[preceding-sibling::br]')
            ip = [x.replace("\n", "").replace("\t", "").replace("\r", "") for x in ip
                  if len(x.replace("\n", "").replace("\t", "").replace("\r", ""))]
            lock.acquire()  # acquire the lock
            try:
                IPS.extend(ip)
            finally:
                lock.release()  # release the lock
        self.browser.quit()  # quit() (not close()) so the driver process is terminated as well


def Write_excel(IP, path):
    # write the scraped IPs into an Excel file
    wb = Workbook()
    ws = wb.active
    for row in IP:
        ws.append((row,))
    wb.save(path)
    print(f"{len(IP)} proxies scraped successfully")


def Read_ip(path):
    # read the IPs from the file and return them as a list
    wb = openpyxl.load_workbook(path)
    sheet = wb.active
    ips = [x.value for x in sheet['A']]
    return ips


def Threads_spider():
    """
    Scrape proxy IPs, one thread per source URL.
    :return: IPS, the proxy pool
    """
    print('Scraping proxy IPs with threads...')
    timestamp1 = int(time.time())
    threads = [Proxy_pool(url) for url in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # Write_excel(IPS, 'ip_pool.xlsx')
    timestamp2 = int(time.time())
    print(f'Elapsed: {timestamp2 - timestamp1} s')
    return IPS


def Get_UA():
    # build a randomized Chrome/Edge User-Agent string
    major_version = random.randrange(30, 88)
    build_number = random.randrange(3000, 4500)
    patch_number = random.randrange(1, 150)
    systems = ['Windows NT 10.0; Win64; x64', 'Linux; Android 10; Pixel 4', 'X11; Linux x86_64',
               'Macintosh; Intel Mac OS X 10_9_2', 'Linux; Android 4.4; Nexus 5 Build/KRT16M',
               'Windows NT 6.1; WOW64']
    chrome = [f'Edge/{major_version}.0.664.{patch_number}', 'Safari/537.36']
    ua = (f'Mozilla/5.0 ({random.choice(systems)}) AppleWebKit/537.36 (KHTML, like Gecko) '
          f'Chrome/{major_version}.0.{build_number}.{patch_number} {random.choice(chrome)}')
    return ua
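

# A minimal usage sketch (not part of the original script): one way the helpers above
# could be wired together. The file name 'ip_pool.xlsx' and the httpbin test URL are
# illustrative assumptions, not values from the source.
if __name__ == '__main__':
    proxies = Threads_spider()               # scrape proxies from both sources
    Write_excel(proxies, 'ip_pool.xlsx')     # persist them to an Excel file
    for proxy in Read_ip('ip_pool.xlsx'):    # read them back and try each one
        try:
            resp = requests.get('http://httpbin.org/ip',
                                headers={'User-Agent': Get_UA()},
                                proxies={'http': proxy},
                                timeout=5)
            print(proxy, resp.status_code)
        except requests.RequestException:
            print(proxy, 'failed')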