From ea0b4d7d6ba262afefe73ea66980f73748905ef9 Mon Sep 17 00:00:00 2001
From: pkyftzsbu <369765584@qq.com>
Date: Fri, 26 Apr 2024 23:35:06 +0800
Subject: [PATCH] Delete 'final.py'

---
 final.py | 118 -------------------------------------------------------
 1 file changed, 118 deletions(-)
 delete mode 100644 final.py

diff --git a/final.py b/final.py
deleted file mode 100644
index 2b4f332..0000000
--- a/final.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.options import Options
-import json
-import csv
-import threading
-import random
-import time
-from time import sleep
-from queue import Queue
-
-def get_cookie():  # obtain cookies after a manual login
-    options = Options()
-    driver = webdriver.Chrome(options=options)
-    url = 'https://passport.jd.com/new/login.aspx?/'
-    driver.get(url)
-    print("Please log in to your JD account manually...")
-    input("Press Enter once you have finished logging in...")
-
-    cookie_list = driver.get_cookies()
-    cookie_str = json.dumps(cookie_list)
-    with open('Jdcookie.txt', 'w') as f:
-        f.write(cookie_str)
-    print('Cookies saved to Jdcookie.txt')
-    driver.quit()
-
-def crawl_page(start_page, end_page, current_url, queue):
-    options = Options()
-    driver = webdriver.Chrome(options=options)
-
-    # set an implicit wait with a randomized timeout
-    driver.implicitly_wait(random.uniform(3, 6))
-
-    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
-        cookie = f.read()
-    cookie = json.loads(cookie)
-    driver.get('https://www.jd.com/')
-    driver.delete_all_cookies()  # clearing cookies makes each visit look like a fresh session, improving the crawl success rate
-    for item in cookie:
-        driver.add_cookie(item)
-    driver.refresh()
-
-    for page in range(start_page, end_page + 1):
-        url = current_url + f'&isList=0&page={page*2-1}'  # pattern derived from URL analysis: each visible page maps to two internal pages
-        driver.get(url)
-        for x in range(1, 12, 2):  # scroll down the page step by step to trigger lazy loading
-            time.sleep(1)
-            j = x / 9
-            js = "document.documentElement.scrollTop=document.documentElement.scrollHeight * " + str(j)
-            driver.execute_script(js)
-        goods = driver.find_elements(By.CLASS_NAME, 'gl-i-wrap')
-        for good in goods:
-            try:
-                title = good.find_element(By.CSS_SELECTOR, '.p-name em').text.strip()  # locate the elements
-                price = good.find_element(By.CSS_SELECTOR, '.p-price strong').text.strip()
-                shop = good.find_element(By.CSS_SELECTOR, '.p-shop span a').text.strip()
-                comment = good.find_element(By.CSS_SELECTOR, '.p-commit strong a').text.strip()
-                print('title: ' + title)
-                print('price: ' + price)
-                print('shop: ' + shop)
-                print('comment: ' + comment)
-                with open('JD.csv', mode='a', encoding='UTF-8', newline='') as file:  # append the record to the csv file
-                    csv.writer(file).writerow([title, price, shop, comment])
-            except Exception as e:
-                print(f"Error: {e}")
-        print(f'Page {page} finished!')
-
-    driver.quit()
-    queue.put(1)  # signal via the queue that this thread has finished
-
-def main():
-    # get_cookie()  # uncomment this on the first run to log in
-    options = Options()
-    driver = webdriver.Chrome(options=options)
-    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
-        cookie = f.read()
-    cookie = json.loads(cookie)
-    driver.get('https://www.jd.com/')
-    driver.delete_all_cookies()
-    for item in cookie:
-        driver.add_cookie(item)
-    driver.refresh()
-    sleep(1)
-    driver.find_element(By.ID, 'key').send_keys("口红")  # search keyword ("lipstick")
-    sleep(2)
-    driver.find_element(By.CLASS_NAME, 'button').click()
-    sleep(15)
-    current_url = driver.current_url
-    driver.quit()
-
-    queue = Queue()  # queue used to collect completion signals
-
-    # create 4 threads, each with its own WebDriver instance
-    t1 = threading.Thread(target=crawl_page, args=(1, 5, current_url, queue))
-    t2 = threading.Thread(target=crawl_page, args=(6, 10, current_url, queue))
-    t3 = threading.Thread(target=crawl_page, args=(11, 15, current_url, queue))
-    t4 = threading.Thread(target=crawl_page, args=(16, 20, current_url, queue))
-
-    # start the threads
-    t1.start()
-    t2.start()
-    t3.start()
-    t4.start()
-
-    # block until every thread has signaled completion via the queue
-    queue.get()
-    queue.get()
-    queue.get()
-    queue.get()
-
-    # then reap the worker threads
-    t1.join()
-    t2.join()
-    t3.join()
-    t4.join()
-
-if __name__ == '__main__':
-    main()
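
Note: the deleted final.py coordinated its four crawler threads by having each worker put a token on a shared Queue when it finished, with main() collecting one token per worker before joining. A minimal, self-contained sketch of that pattern follows; it uses only the standard library, and the names worker and NUM_WORKERS are illustrative, not taken from the patch.

    import threading
    from queue import Queue

    NUM_WORKERS = 4  # final.py hard-coded four threads (t1..t4)

    def worker(worker_id, done):
        # ... the per-thread work (crawling a page range, in final.py) goes here ...
        done.put(1)  # mirrors queue.put(1) at the end of crawl_page

    def run():
        done = Queue()
        threads = [threading.Thread(target=worker, args=(i, done))
                   for i in range(NUM_WORKERS)]
        for t in threads:
            t.start()
        for _ in range(NUM_WORKERS):
            done.get()  # block until every worker has signaled completion
        for t in threads:
            t.join()    # then reap the threads

    if __name__ == '__main__':
        run()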