|
|
from selenium import webdriver
|
|
|
import time
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
import json
|
|
|
import csv
|
|
|
from selenium.webdriver.common.by import By
|
|
|
from time import sleep
|
|
|
from multiprocessing import Process
|
|
|
|
|
|
|
|
|
# Shared Chrome WebDriver session used by getcookie() and by the script
# entry point. NOTE: it is created at module level, so a Chrome session is
# started as soon as this file is executed or imported.
driver = webdriver.Chrome()
|
|
|
|
|
|
#先手动登录,让程序获取到cookie,保存下来
|
|
|
def getcookie():
    """Log in to JD by hand and save the session cookies to ``Jdcookie.txt``.

    Opens the JD login page in the shared module-level ``driver``, pauses so
    the user can log in manually (e.g. by scanning the QR code), waits until
    the browser URL is exactly ``https://www.jd.com/`` and then writes the
    driver's cookies to ``Jdcookie.txt`` as a JSON list.

    NOTE: the current browser window is closed at the end; the shared
    ``driver`` is presumably unusable for further page loads afterwards
    unless another window is open -- confirm before calling this in the same
    run as the scraping code.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by
            ``WebDriverWait.until`` if the home-page URL is not reached
            within the wait period.
    """
    # Go straight to the login page (passport.jd.com).
    url = 'https://passport.jd.com/new/login.aspx?/'
    driver.get(url)

    # Fixed pause that gives the user time to scan the QR code and log in.
    time.sleep(20)

    # After a successful login the browser lands on the home page; block
    # until the URL matches it exactly.
    url = 'https://www.jd.com/'
    WebDriverWait(driver, 20).until(EC.url_to_be(url))

    # Short pause after login before reading the cookies.
    time.sleep(2)

    # get_cookies() returns a list of cookie dicts; serialise it to JSON text.
    cookieList = driver.get_cookies()
    cookieStr = json.dumps(cookieList)

    # Write with an explicit encoding so the file matches the UTF-8 reader
    # used when the cookies are loaded back.
    with open('Jdcookie.txt', 'w', encoding='utf-8') as f:
        f.write(cookieStr)

    print('cookie已写入')
    print(driver.current_url)
    driver.close()
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: restore the saved JD login cookies, search for a
    # keyword and write title / price / shop / comment count of every listed
    # item on the first 50 result pages to JD.csv.

    # getcookie()  # uncomment for the first login to create Jdcookie.txt

    try:
        # Load the site first so the saved cookies are added for this domain.
        driver.get('https://www.jd.com/')

        # The file holds a JSON list of cookie dicts (written by getcookie()).
        with open('Jdcookie.txt', mode='r', encoding='utf-8') as f:
            cookie = json.load(f)

        # Remove the cookies of the fresh session before restoring the saved
        # ones.
        driver.delete_all_cookies()
        for item in cookie:
            print(type(item))
            print(item)
            driver.add_cookie(item)

        # Reload so the page is requested again with the restored cookies.
        driver.refresh()

        # Type the keyword into the search box (found by id) and submit it
        # via the search button (found by class name).
        driver.find_element(By.ID, 'key').send_keys("口红")
        driver.find_element(By.CLASS_NAME, 'button').click()
        driver.implicitly_wait(10)

        # Open the output file once. newline='' is what the csv module
        # requires; without it each row is followed by a blank line on
        # Windows.
        with open('JD.csv', mode='w', encoding='UTF-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['商品', '价格', '店铺', '评论数'])

            for page in range(0, 50):
                sleep(10)

                # Scroll down in 1000px steps (presumably so items further
                # down the page get rendered -- confirm against the site).
                # window.scrollTo takes (x, y), so x stays 0 and only the
                # vertical offset changes.
                for offset in range(1000, 7000, 1000):
                    driver.execute_script(f'window.scrollTo(0,{offset})')
                    sleep(1)
                driver.execute_script('window.scrollTo(0,7000)')

                sleep(10)

                goods = driver.find_elements(By.CLASS_NAME, 'gl-i-wrap')
                for good in goods:
                    title = good.find_element(By.CSS_SELECTOR, '.p-name em').text.strip()
                    price = good.find_element(By.CSS_SELECTOR, '.p-price strong').text.strip()
                    shop = good.find_element(By.CSS_SELECTOR, '.p-shop span a').text.strip()
                    comment = good.find_element(By.CSS_SELECTOR, '.p-commit strong a').text.strip()

                    print('title: ' + title)
                    print('price: ' + price)
                    print('shop: ' + shop)
                    print('comment: ' + comment)

                    writer.writerow([title, price, shop, comment])

                # Push the finished page to disk so its rows are not lost if
                # the process is interrupted later.
                file.flush()

                # Move on to the next result page.
                driver.find_element(By.CLASS_NAME, 'pn-next').click()
                print(f'第{page+1}页爬取完毕!')
                print('下一页加载中……')
    finally:
        # Always shut the browser session down, even when a step above
        # raised, so no Chrome/chromedriver process is left behind.
        driver.quit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|