From c8ef4d545fe32b6e24b287ec102819488ba3993a Mon Sep 17 00:00:00 2001
From: pkyftzsbu <369765584@qq.com>
Date: Fri, 26 Apr 2024 23:35:21 +0800
Subject: [PATCH] Add final.py

---
 final.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 final.py

diff --git a/final.py b/final.py
new file mode 100644
index 0000000..d766920
--- /dev/null
+++ b/final.py
@@ -0,0 +1,121 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+import json
+import csv
+import threading
+import random
+import time
+from time import sleep
+from queue import Queue
+
+def get_cookie():  # log in manually, then save the session cookies
+    options = Options()
+    driver = webdriver.Chrome(options=options)
+    url = 'https://passport.jd.com/new/login.aspx?/'
+    driver.get(url)
+    print("Please log in to your JD account manually...")
+    input("After logging in, press Enter to continue...")
+
+    cookie_list = driver.get_cookies()
+    cookie_str = json.dumps(cookie_list)
+    with open('Jdcookie.txt', 'w', encoding='utf-8') as f:
+        f.write(cookie_str)
+    print('Cookies saved to Jdcookie.txt')
+    driver.quit()
+
+def crawl_page(start_page, end_page, current_url, queue):
+    options = Options()
+    driver = webdriver.Chrome(options=options)
+
+    # set a randomized implicit wait time
+    driver.implicitly_wait(random.uniform(3, 6))
+
+    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
+        cookie = f.read()
+    cookie = json.loads(cookie)
+    driver.get('https://www.jd.com/')
+    driver.delete_all_cookies()  # clearing cookies first simulates a brand-new session on every visit, which improves the crawl success rate
+    for item in cookie:
+        driver.add_cookie(item)
+    driver.refresh()
+
+    for page in range(start_page, end_page + 1):
+        url = current_url + f'&isList=0&page={page*2-1}'  # pattern found by analyzing the URL: UI page N is 2N-1 in the query string
+        driver.get(url)
+        for x in range(1, 12, 2):  # scroll down in steps so lazily loaded items render
+            time.sleep(1)
+            j = x / 9
+            js = "document.documentElement.scrollTop=document.documentElement.scrollHeight * " + str(j)
+            driver.execute_script(js)
+        goods = driver.find_elements(By.CLASS_NAME, 'gl-i-wrap')
+        for good in goods:
+            try:
+                title = good.find_element(By.CSS_SELECTOR, '.p-name em').text.strip()  # locate the fields of one product card
+                price = good.find_element(By.CSS_SELECTOR, '.p-price strong').text.strip()
+                shop = good.find_element(By.CSS_SELECTOR, '.p-shop span a').text.strip()
+                comment = good.find_element(By.CSS_SELECTOR, '.p-commit strong a').text.strip()
+                print('title: ' + title)
+                print('price: ' + price)
+                print('shop: ' + shop)
+                print('comment: ' + comment)
+                with open('JD.csv', mode='a', encoding='UTF-8', newline='') as file:  # append the record to the CSV file
+                    csv.writer(file).writerow([title, price, shop, comment])
+            except Exception as e:
+                print(f"Error: {e}")
+        print(f'Page {page} finished!')
+
+    driver.quit()
+    queue.put(1)  # after crawling finishes, put an item on the queue to signal that this thread is done
+
+def main():
+    # get_cookie()  # uncomment for the first run to log in and save cookies
+    options = Options()
+    driver = webdriver.Chrome(options=options)
+    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
+        cookie = f.read()
+    cookie = json.loads(cookie)
+    driver.get('https://www.jd.com/')
+    driver.delete_all_cookies()
+    for item in cookie:
+        driver.add_cookie(item)
+    driver.refresh()
+    sleep(1)
+    driver.find_element(By.ID, 'key').send_keys("口红")  # search keyword ("lipstick")
+    sleep(2)
+    driver.find_element(By.CLASS_NAME, 'button').click()
+    sleep(15)  # give the search results page time to load
+    current_url = driver.current_url
+    driver.quit()

+
+    queue = Queue()  # completion queue: each worker puts one item when it is done
+
+    with open('JD.csv', mode='w', encoding='UTF-8', newline='') as file:
+        csv.writer(file).writerow(['title', 'price', 'shop', 'comments'])
+
+    # create 4 threads, each with its own independent WebDriver instance
+    t1 = threading.Thread(target=crawl_page, args=(1, 5, current_url, queue))
+    t2 = threading.Thread(target=crawl_page, args=(6, 10, current_url, queue))
+    t3 = threading.Thread(target=crawl_page, args=(11, 15, current_url, queue))
+    t4 = threading.Thread(target=crawl_page, args=(16, 20, current_url, queue))
+
+    # start the threads
+    t1.start()
+    t2.start()
+    t3.start()
+    t4.start()
+
+    # wait until every worker has signalled completion (one queue item per thread)
+    queue.get()
+    queue.get()
+    queue.get()
+    queue.get()
+
+    # join the threads; all pages have been crawled once the queue has drained
+    t1.join()
+    t2.join()
+    t3.join()
+    t4.join()
+
+if __name__ == '__main__':
+    main()
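
Note on the hard-coded sleep(15) before current_url is read in main(): a fixed delay is fragile and always pays the full wait. A minimal sketch of an explicit wait that could replace it, assuming the results page renders product cards with the same gl-i-wrap class that crawl_page scrapes:

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # block for up to 15 s, but return as soon as one product card is present
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'gl-i-wrap'))
    )
    current_url = driver.current_url

WebDriverWait raises TimeoutException if nothing shows up, which also surfaces a failed load instead of silently capturing a half-loaded URL.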
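
The four crawl_page workers also append to JD.csv concurrently. Single-row appends usually land intact, but Python makes no guarantee that writes from multiple threads never interleave. A small sketch of one way to serialize them, assuming a shared module-level lock (csv_lock and write_row are names introduced here, not part of the patch):

    csv_lock = threading.Lock()  # shared by all worker threads

    def write_row(row):
        # hold the lock across open + write so rows cannot interleave
        with csv_lock:
            with open('JD.csv', mode='a', encoding='UTF-8', newline='') as file:
                csv.writer(file).writerow(row)

crawl_page would then call write_row([title, price, shop, comment]) instead of opening the file itself.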