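"""Multithreaded JD.com search-result scraper.

get_cookie() opens the JD login page so the user can sign in by hand, then saves
the session cookies to Jdcookie.txt. main() reuses those cookies, searches for a
keyword, and spawns four threads that each crawl a slice of result pages 1-20,
writing title, price, shop and comment count to JD.csv.
"""
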
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
import csv
import threading
import random
import time
from time import sleep
from queue import Queue


def get_cookie():  # Log in manually, then save the session cookies to a file
    options = Options()
    driver = webdriver.Chrome(options=options)
    url = 'https://passport.jd.com/new/login.aspx?/'
    driver.get(url)
    print("Please log in to your JD account manually...")
    input("Press Enter once you have finished logging in...")

    cookie_list = driver.get_cookies()
    cookie_str = json.dumps(cookie_list)
    with open('Jdcookie.txt', 'w', encoding='utf-8') as f:
        f.write(cookie_str)
    print('Cookies saved to Jdcookie.txt')
    driver.quit()


def crawl_page(start_page, end_page, current_url, queue):
    options = Options()
    driver = webdriver.Chrome(options=options)

    # Set an implicit wait so element look-ups retry for a few seconds
    driver.implicitly_wait(random.uniform(3, 6))

    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
        cookie = f.read()
    cookie = json.loads(cookie)
    driver.get('https://www.jd.com/')
    driver.delete_all_cookies()  # Clear existing cookies so each visit looks like a fresh session, which improves the crawl success rate
    for item in cookie:
        driver.add_cookie(item)
    driver.refresh()

    for page in range(start_page, end_page + 1):
        url = current_url + f'&isList=0&page={page * 2 - 1}'  # Pattern found by inspecting the result URLs: JD uses odd page numbers
        driver.get(url)
        for x in range(1, 12, 2):  # Scroll down the page in steps
            time.sleep(1)
            j = x / 9
            js = "document.documentElement.scrollTop=document.documentElement.scrollHeight * " + str(j)
            driver.execute_script(js)
        goods = driver.find_elements(By.CLASS_NAME, 'gl-i-wrap')
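        # Note: the stepped scroll above matters because the result grid appears to
        # lazy-load items as the page scrolls; without it some 'gl-i-wrap' cards
        # would be missing from the DOM when they are collected below.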
        for good in goods:
            try:
                title = good.find_element(By.CSS_SELECTOR, '.p-name em').text.strip()  # Locate each field within the item card
                price = good.find_element(By.CSS_SELECTOR, '.p-price strong').text.strip()
                shop = good.find_element(By.CSS_SELECTOR, '.p-shop span a').text.strip()
                comment = good.find_element(By.CSS_SELECTOR, '.p-commit strong a').text.strip()
                print('title: ' + title)
                print('price: ' + price)
                print('shop: ' + shop)
                print('comment: ' + comment)
                with open('JD.csv', mode='a', encoding='UTF-8', newline='') as file:  # Append the record to the CSV file
                    csv.writer(file).writerow([title, price, shop, comment])
            except Exception as e:
                print(f"Error: {e}")
        print(f'Page {page} done!')

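    # Note: all four worker threads append to JD.csv concurrently. Appends of short
    # rows usually interleave cleanly, but wrapping the write above in a shared
    # threading.Lock would be a safer variant of this script.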
    driver.quit()
    queue.put(1)  # Signal through the queue that this thread has finished its page range


def main():
    # get_cookie()  # Uncomment on the first run to log in and save cookies
    options = Options()
    driver = webdriver.Chrome(options=options)
    with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
        cookie = f.read()
    cookie = json.loads(cookie)
    driver.get('https://www.jd.com/')
    driver.delete_all_cookies()
    for item in cookie:
        driver.add_cookie(item)
    driver.refresh()
    sleep(1)
    driver.find_element(By.ID, 'key').send_keys("口红")  # Search keyword ("lipstick")
    sleep(2)
    driver.find_element(By.CLASS_NAME, 'button').click()
    sleep(15)
    current_url = driver.current_url
    driver.quit()

    queue = Queue()  # Queue the worker threads use to report completion

    # Create 4 threads; each runs crawl_page with its own WebDriver instance and page range
    t1 = threading.Thread(target=crawl_page, args=(1, 5, current_url, queue))
    t2 = threading.Thread(target=crawl_page, args=(6, 10, current_url, queue))
    t3 = threading.Thread(target=crawl_page, args=(11, 15, current_url, queue))
    t4 = threading.Thread(target=crawl_page, args=(16, 20, current_url, queue))

    # Start the threads
    t1.start()
    t2.start()
    t3.start()
    t4.start()

    # Block until each of the four threads has signalled completion via the queue
    queue.get()
    queue.get()
    queue.get()
    queue.get()

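    # By this point every worker has already put its token on the queue, so the
    # joins below are just a final clean-up before the program exits.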
    # Join the threads
    t1.join()
    t2.join()
    t3.join()
    t4.join()


if __name__ == '__main__':
    main()