"""
Scrape JD.com search results for a product with Selenium, then build a
word cloud of shop names and a bar chart of the top-priced products.

Driving a real browser avoids hand-crafting HTTP requests and request
headers, which lowers the chance of triggering anti-scraping defenses.
"""
from selenium import webdriver
import time
import csv
from selenium.webdriver.common.keys import Keys
import jieba
import wordcloud
from pyecharts.charts import Bar
from pyecharts import options
from pyecharts.options import *


def work(start_page, end_page, trade_name):
    """Search JD.com for `trade_name` and scrape pages start_page..end_page.

    Side effects (all appended, UTF-8):
      - '{trade_name}采集数据.csv'  : title/price/comments/shop/url rows
      - '{trade_name}商家.txt'      : one shop name per line (input to cloud())
      - '{trade_name}商品价格.txt'  : 'title;price' per line (input to chart())
    """
    fp = open(f'{trade_name}采集数据.csv', 'a', encoding='utf-8', newline='')
    csv_writer = csv.DictWriter(fp, fieldnames=[
        '标题',
        '价格',
        '评论数',
        '店铺名',
        '详情页链接',
    ])
    csv_writer.writeheader()

    # ChromeDriver is expected to sit next to this script.
    browser = webdriver.Chrome('chromedriver.exe')
    browser.get('https://www.jd.com/')

    # Type the product name into the search box and submit.
    # (Renamed from `input`, which shadowed the builtin.)
    search_box = browser.find_element_by_xpath('//*[@id="key"]')
    search_box.send_keys(trade_name)
    search_button = browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    search_button.click()

    def drop_down():
        """Scroll to the page bottom in steps so lazy-loaded items render."""
        for step in range(1, 12, 2):
            time.sleep(1)
            ratio = step / 9
            js = ('document.documentElement.scrollTop = '
                  'document.documentElement.scrollHeight * %f' % ratio)
            browser.execute_script(js)

    def shop_info():
        """Scrape every product card on the current result page."""
        browser.implicitly_wait(10)
        drop_down()
        cards = browser.find_elements_by_css_selector('#J_goodsList ul li.gl-item')
        for card in cards:
            # BUG FIX: the original wrapped the whole loop in a bare
            # `except: pass`, so one malformed ad card silently aborted
            # the rest of the page. Skip only the offending card instead.
            try:
                title = card.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                price = card.find_element_by_css_selector('.p-price i').text
                comment_count = card.find_element_by_css_selector('.p-commit a').text
                shop_name = card.find_element_by_css_selector('.p-shop a').text
                href = card.find_element_by_css_selector('.p-img a').get_attribute('href')
            except Exception:
                # Sponsored/ad cards use a different DOM layout; ignore them.
                continue

            row = {
                '标题': title,
                '价格': price,
                '评论数': comment_count,
                '店铺名': shop_name,
                '详情页链接': href,
            }
            csv_writer.writerow(row)
            with open(f'{trade_name}商家.txt', 'a', encoding='utf-8') as shop_file:
                shop_file.write(shop_name + '\n')
            with open(f'{trade_name}商品价格.txt', 'a', encoding='utf-8') as price_file:
                price_file.write(title + ';' + price + '\n')
            print(title, price, comment_count, shop_name, href)

    try:
        for page in range(start_page, end_page + 1):
            print(f"----------当前采集第{page}页的内容----------")
            shop_info()
            # Advance via the right-arrow key on the "next" button.
            # NOTE(review): the original deliberately avoided .click()
            # here (commented out) — presumably flaky on JD; kept as-is.
            browser.find_element_by_css_selector('.pn-next').send_keys(Keys.ARROW_RIGHT)
    finally:
        # BUG FIX: close the CSV and quit the browser even when a page
        # blows up mid-scrape (the original leaked both on error).
        fp.close()
        browser.quit()


def cloud(trade_name):
    """Render a word-cloud PNG from the shop names collected by work()."""
    with open(f'{trade_name}商家.txt', 'r', encoding='utf-8') as shop_file:
        words = jieba.lcut(shop_file.read())
    wc = wordcloud.WordCloud(
        width=800,
        height=500,
        background_color='white',
        scale=15,
        font_path='msyh.ttc',  # a CJK-capable font is required for Chinese text
    )
    wc.generate(' '.join(words))  # wordcloud expects space-separated tokens
    wc.to_file(f'{trade_name}商家词云.png')


def chart(trade_name):
    """Render an HTML bar chart of up to 20 top products by price list."""
    with open(f'{trade_name}商品价格.txt', 'r', encoding='utf-8') as price_file:
        data_lines = price_file.readlines()

    # Group prices by product title: {title: [price, price, ...]}.
    prices_by_title = {}
    for line in data_lines:
        cleaned = line.replace(' ', '')
        title = cleaned.split(';')[0]
        price = float(cleaned.split(';')[1])  # assumes a single numeric price — TODO confirm for range prices
        prices_by_title.setdefault(title, []).append(price)

    # Sort by the price list itself (descending), as the original did,
    # and keep at most 20 entries.
    top = sorted(prices_by_title.items(), key=lambda kv: kv[1], reverse=True)[0:20]

    # BUG FIX: the original iterated `range(20)` and raised IndexError
    # whenever fewer than 20 distinct products were scraped.
    x_list = [title for title, _ in top]
    y_list = [prices[0] for _, prices in top]  # first recorded price per title

    bar = Bar(init_opts=options.InitOpts(width="1500px"))
    bar.add_xaxis(x_list)
    bar.add_yaxis("价格(元)", y_list, label_opts=LabelOpts(position="right"))
    bar.set_global_opts(
        title_opts=TitleOpts(title=f"{trade_name}价格前20商品"),
        xaxis_opts=options.AxisOpts(is_show=False),  # hide the x axis labels
        visualmap_opts=VisualMapOpts(
            is_show=True,
            is_piecewise=True,  # segmented (banded) color legend
            pieces=[
                {"min": 1, "max": 99, "label": '1 - 99', "color": "#CCFFFF"},
                {"min": 100, "max": 999, "label": '100 - 999', "color": "#FFFF99"},
                # BUG FIX: label read '1000 - 999' in the original.
                {"min": 1000, "max": 4999, "label": '1000 - 4999', "color": "#FF9966"},
                {"min": 5000, "max": 9999, "label": '5000 - 9999', "color": "#FF6666"},
                {"min": 10000, "max": 99999, "label": '10000 - 99999', "color": "#CC3333"},
            ]
        )
    )
    bar.render(f"{trade_name}价格前20商品.html")


if __name__ == '__main__':
    trade_name = input("输入要搜索的商品名:")
    start_page = int(input("输入起始页码:"))
    end_page = int(input("输入结束页码:"))
    work(start_page, end_page, trade_name)
    cloud(trade_name)
    chart(trade_name)