"""
|
|
用selenium操控浏览器免去了请求对象的定制,再模拟浏览器向服务器发送请求的过程,而是直接操控浏览器,降低被反爬的概率
|
|
"""
|
|
from selenium import webdriver
|
|
import time
|
|
import csv
|
|
from selenium.webdriver.common.keys import Keys
|
|
import jieba
|
|
import wordcloud
|
|
from pyecharts.charts import Bar
|
|
from pyecharts import options
|
|
from pyecharts.options import *
|
|
|
|
|
|
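
# Note: this script uses the Selenium 3 locator API (find_element_by_xpath,
# find_elements_by_css_selector) and webdriver.Chrome(path); those calls were
# removed in Selenium 4. If you run it under Selenium 4+, the rough equivalents
# are (a sketch only, not part of the original script):
#
#   from selenium.webdriver.chrome.service import Service
#   from selenium.webdriver.common.by import By
#
#   browser = webdriver.Chrome(service=Service('chromedriver.exe'))
#   search_box = browser.find_element(By.XPATH, '//*[@id="key"]')
#   items = browser.find_elements(By.CSS_SELECTOR, '#J_goodsList ul li.gl-item')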


def work(start_page, end_page, trade_name):
    fp = open(f'{trade_name}采集数据.csv', 'a', encoding='utf-8', newline='')
    csv_writer = csv.DictWriter(fp, fieldnames=[
        '标题',
        '价格',
        '评论数',
        '店铺名',
        '详情页链接',
    ])
    # Write the CSV header row
    csv_writer.writeheader()

    # JD.com home page
    url = 'https://www.jd.com/'

    # Path to the Chrome driver executable
    path = 'chromedriver.exe'
    # Build the browser object
    browser = webdriver.Chrome(path)
    # Open the JD.com home page
    browser.get(url)
    # Optional pause to make it easier to watch
    # time.sleep(3)

    # Locate the search box via XPath
    search_box = browser.find_element_by_xpath('//*[@id="key"]')
    # time.sleep(3)
    # Type the product name into the search box
    search_box.send_keys(trade_name)
    # time.sleep(2)
    # Locate the search button via XPath and click it
    button = browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    button.click()
    # time.sleep(3)

    # Scroll the page down to the bottom so the full set of results loads;
    # without scrolling, the current page only renders part of its products.
    def drop_down():  # one of the usual incremental-scroll patterns
        for x in range(1, 12, 2):
            time.sleep(1)
            j = x / 9
            js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
            browser.execute_script(js)
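
    # drop_down() scrolls in six steps (x = 1, 3, 5, 7, 9, 11). The fraction
    # x / 9 exceeds 1.0 on the last step, which the browser simply clamps to
    # the bottom of the page; the 1 s sleep per step gives lazily loaded
    # product cards time to render.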

    def shop_info():
        browser.implicitly_wait(10)
        drop_down()
        # Grab every product card on the current results page via a CSS selector
        lis = browser.find_elements_by_css_selector('#J_goodsList ul li.gl-item')

        for li in lis:
            try:
                title = li.find_element_by_css_selector('.p-name em').text.replace('\n', '')  # product title
                price = li.find_element_by_css_selector('.p-price i').text  # price
                comment_count = li.find_element_by_css_selector('.p-commit a').text  # number of comments
                shop_name = li.find_element_by_css_selector('.p-shop a').text  # shop name
                href = li.find_element_by_css_selector('.p-img a').get_attribute('href')  # detail-page link

                # Collect the row in a dict keyed by the CSV field names
                my_dict = {
                    '标题': title,
                    '价格': price,
                    '评论数': comment_count,
                    '店铺名': shop_name,
                    '详情页链接': href
                }
                csv_writer.writerow(my_dict)

                with open(f'{trade_name}商家.txt', 'a', encoding='utf-8') as shop_file:
                    shop_file.write(my_dict['店铺名'] + '\n')

                with open(f'{trade_name}商品价格.txt', 'a', encoding='utf-8') as price_file:
                    price_file.write(my_dict['标题'] + ';' + my_dict['价格'] + '\n')

                print(title, price, comment_count, shop_name, href)
            except Exception:
                # Sponsored/ad items use a different structure and raise errors
                # when parsed; skip them so they do not interrupt the crawl.
                continue

    for page in range(start_page, end_page + 1):
        print(f"----------当前采集第{page}页的内容----------")
        shop_info()
        # Go to the next results page by sending a right-arrow keystroke to the
        # "next page" button
        browser.find_element_by_css_selector('.pn-next').send_keys(Keys.ARROW_RIGHT)

    # Close the CSV file and the browser once all pages have been collected
    # time.sleep(2)
    fp.close()
    browser.quit()
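
# work() writes three files per search keyword; the two functions below read
# the last two of them:
#   {trade_name}采集数据.csv  - one CSV row per product (title, price, comments, shop, link)
#   {trade_name}商家.txt      - one shop name per line, read by cloud()
#   {trade_name}商品价格.txt  - one "title;price" pair per line, read by chart()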


def cloud(trade_name):
    with open(f'{trade_name}商家.txt', 'r', encoding='utf-8') as fp:
        txt = fp.read()
    txt_list = jieba.lcut(txt)
    # Join the segmented words with spaces
    my_string = ' '.join(txt_list)
    wc = wordcloud.WordCloud(
        width=800,
        height=500,
        background_color='white',
        scale=15,
        font_path='msyh.ttc'
    )
    wc.generate(my_string)
    wc.to_file(f'{trade_name}商家词云.png')
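
# Note: font_path='msyh.ttc' assumes the Microsoft YaHei font file is available
# next to the script (typical on Windows). On other systems you would point
# font_path at any local font that covers Chinese; the path below is only a
# hypothetical example, not something this project ships:
#
#   wordcloud.WordCloud(font_path='/usr/share/fonts/truetype/wqy/wqy-microhei.ttc', ...)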


def chart(trade_name):
    with open(f'{trade_name}商品价格.txt', 'r', encoding='utf-8') as fp:
        data_lines = fp.readlines()

    # Map each product title to the list of prices recorded for it
    data_dict = {}
    for line in data_lines:
        goods = line.replace(" ", "").split(";")[0]
        prices = float(line.replace(" ", "").split(";")[1])

        try:
            data_dict[goods].append(prices)
        except KeyError:
            data_dict[goods] = []
            data_dict[goods].append(prices)

    # Sort by price (descending) and keep the top 20; each element is a
    # (title, [price, ...]) tuple
    sorted_data = sorted(data_dict.items(), key=lambda element: element[1], reverse=True)[0:20]

    x_list = []
    y_list = []
    for i in range(len(sorted_data)):
        x_list.append(sorted_data[i][0])     # product title
        y_list.append(sorted_data[i][1][0])  # first recorded price for that title

    bar = Bar(init_opts=options.InitOpts(width="1500px"))
    bar.add_xaxis(x_list)
    bar.add_yaxis("价格(元)", y_list, label_opts=LabelOpts(position="right"))
    bar.set_global_opts(
        title_opts=TitleOpts(title=f"{trade_name}价格前20商品"),
        xaxis_opts=options.AxisOpts(is_show=False),  # hide the x axis
        visualmap_opts=VisualMapOpts(
            is_show=True,
            is_piecewise=True,  # split the colour scale into segments
            pieces=[
                {"min": 1, "max": 99, "label": '1 - 99', "color": "#CCFFFF"},
                {"min": 100, "max": 999, "label": '100 - 999', "color": "#FFFF99"},
                {"min": 1000, "max": 4999, "label": '1000 - 4999', "color": "#FF9966"},
                {"min": 5000, "max": 9999, "label": '5000 - 9999', "color": "#FF6666"},
                {"min": 10000, "max": 99999, "label": '10000 - 99999', "color": "#CC3333"}
            ]
        )
    )
    bar.render(f"{trade_name}价格前20商品.html")


if __name__ == '__main__':
    trade_name = input("输入要搜索的商品名:")
    start_page = int(input("输入起始页码:"))
    end_page = int(input("输入结束页码:"))
    work(start_page, end_page, trade_name)
    cloud(trade_name)
    chart(trade_name)