from selenium import webdriver
|
|
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
import json
|
|
|
|
|
import csv
|
|
|
|
|
import threading
|
|
|
|
|
from time import sleep
|
|
|
|
|
|
|
|
|
|
def get_cookie():
    """Open a Chrome window for a manual JD login and persist the cookies.

    Navigates to the JD login page, waits for the user to finish logging in
    by hand, then dumps the session cookies as JSON to 'Jdcookie.txt' so a
    later crawl can restore the session.

    Side effects: launches a Chrome browser, blocks on stdin, writes
    'Jdcookie.txt' in the working directory.
    """
    options = Options()
    driver = webdriver.Chrome(options=options)
    try:
        url = 'https://passport.jd.com/new/login.aspx?/'
        driver.get(url)

        print("请手动登录京东账号...")
        input("登录完成后,请按回车键继续...")

        cookie_list = driver.get_cookies()
        cookie_str = json.dumps(cookie_list)

        # Explicit utf-8 to match the encoding used when the file is read back.
        with open('Jdcookie.txt', 'w', encoding='utf-8') as f:
            f.write(cookie_str)

        print('Cookie已保存到Jdcookie.txt')
    finally:
        # Always release the browser, even if the user aborts at the prompt
        # (e.g. Ctrl-C during input()) or a WebDriver call raises.
        driver.quit()
|
|
|
|
|
|
|
|
|
|
def crawl_page(start_page, end_page):
    """Scrape JD search result pages start_page..end_page (inclusive).

    Restores a previously saved login session from 'Jdcookie.txt', then for
    each result page collects title/price/shop/comment of every product card
    and appends the rows to 'JD.csv'.

    Parameters:
        start_page: first 1-based result page to crawl (inclusive).
        end_page:   last 1-based result page to crawl (inclusive).

    Side effects: launches a Chrome browser, reads 'Jdcookie.txt',
    appends rows to 'JD.csv', prints progress to stdout.
    """
    options = Options()
    driver = webdriver.Chrome(options=options)
    try:
        # Restore the saved login session before searching.
        with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
            cookie = f.read()
        cookie = json.loads(cookie)

        driver.get('https://www.jd.com/')
        driver.delete_all_cookies()
        for item in cookie:
            driver.add_cookie(item)
        driver.refresh()

        for page in range(start_page, end_page + 1):
            # JD's URL page parameter counts half-pages: visible page n is page=2n-1.
            url = f"https://search.jd.com/Search?keyword=%E5%8F%A3%E7%BA%A2&qrst=1&wq=%E5%8F%A3%E7%BA%A2&stock=1&pvid=a2121da231fd4f5e90a6541711da68a0&isList=0&page={page*2-1}"
            driver.get(url)
            sleep(5)  # crude fixed wait for lazy-loaded results; TODO: use an explicit WebDriverWait

            goods = driver.find_elements(By.CLASS_NAME, 'gl-i-wrap')
            rows = []
            for good in goods:
                try:
                    title = good.find_element(By.CSS_SELECTOR, '.p-name em').text.strip()
                    price = good.find_element(By.CSS_SELECTOR, '.p-price strong').text.strip()
                    shop = good.find_element(By.CSS_SELECTOR, '.p-shop span a').text.strip()
                    comment = good.find_element(By.CSS_SELECTOR, '.p-commit strong a').text.strip()

                    print('title: ' + title)
                    print('price: ' + price)
                    print('shop: ' + shop)
                    print('comment: ' + comment)

                    rows.append([title, price, shop, comment])
                except Exception as e:
                    # A card missing any field is skipped, not fatal to the page.
                    print(f"Error: {e}")

            # Open the CSV once per page instead of once per row.
            # NOTE(review): several threads append to this same file;
            # interleaved writes are possible — consider a shared lock or
            # per-thread output files. Preserved as-is to keep behavior.
            with open('JD.csv', mode='a+', encoding='UTF-8', newline='') as file:
                csv.writer(file).writerows(rows)

            print(f'第{page}页爬取完毕!')
    finally:
        # Always release the browser, even when a page raises mid-crawl.
        driver.quit()
|
|
|
|
|
|
|
|
|
|
def main():
    """Entry point: crawl result pages 1-100 with four worker threads.

    Each thread gets its own page range (and its own WebDriver instance,
    created inside crawl_page). All threads are started first, then joined.
    """
    # Run get_cookie() once to create Jdcookie.txt, then leave it disabled.
    # get_cookie()

    # Pages 1-100 split into four equal, non-overlapping inclusive ranges.
    page_ranges = [(1, 25), (26, 50), (51, 75), (76, 100)]
    workers = [
        threading.Thread(target=crawl_page, args=bounds)
        for bounds in page_ranges
    ]

    # Start every worker before joining any, so all four run concurrently.
    for worker in workers:
        worker.start()

    # Wait for all workers to finish.
    for worker in workers:
        worker.join()
|
|
|
|
|
|
|
|
|
|
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|