parent
50e255d34b
commit
ba954d2d7c
@ -0,0 +1,196 @@
|
||||
"""
|
||||
用selenium操控浏览器免去了请求对象的定制,再模拟浏览器向服务器发送请求的过程,而是直接操控浏览器,降低被反爬的概率
|
||||
"""
|
||||
from selenium import webdriver
|
||||
import time
|
||||
import csv
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
import jieba
|
||||
import wordcloud
|
||||
from pyecharts.charts import Bar
|
||||
from pyecharts import options
|
||||
from pyecharts.options import *
|
||||
|
||||
|
||||
def work(start_page, end_page, trade_name):
    """Scrape JD.com search results for ``trade_name``.

    Visits result pages ``start_page``..``end_page`` (inclusive) with a
    selenium-driven Chrome browser.  For every product card it appends one
    row (title, price, comment count, shop name, detail-page URL) to
    ``{trade_name}采集数据.csv``, and appends the shop name / price lines to
    the two text files later consumed by ``cloud()`` and ``chart()``.

    :param start_page: first result page to scrape (1-based)
    :param end_page: last result page to scrape (inclusive)
    :param trade_name: product keyword typed into the JD search box
    """
    fp = open(f'{trade_name}采集数据.csv', 'a', encoding='utf-8', newline='')
    csv_writer = csv.DictWriter(fp, fieldnames=[
        '标题',
        '价格',
        '评论数',
        '店铺名',
        '详情页链接',
    ])
    # Write the header row.
    csv_writer.writeheader()

    # JD home page.
    url = 'https://www.jd.com/'
    # Chrome driver is expected to sit next to this script.
    path = 'chromedriver.exe'
    browser = webdriver.Chrome(path)

    try:
        browser.get(url)

        # Locate the search box via XPath and type the product keyword.
        # (Renamed from ``input`` so the builtin is not shadowed.)
        search_box = browser.find_element_by_xpath('//*[@id="key"]')
        search_box.send_keys(trade_name)

        # Locate and click the search button.
        button = browser.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
        button.click()

        def drop_down():
            """Scroll to the bottom in steps so lazy-loaded items render.

            Without scrolling, the page only renders part of the products.
            """
            for x in range(1, 12, 2):
                time.sleep(1)
                j = x / 9
                js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
                browser.execute_script(js)

        def shop_info():
            """Collect every product card on the current results page."""
            browser.implicitly_wait(10)
            drop_down()
            # All product cards of the current page.
            lis = browser.find_elements_by_css_selector('#J_goodsList ul li.gl-item')

            for li in lis:
                # The try sits INSIDE the loop (it used to wrap the whole
                # loop): a single malformed card — e.g. a sponsored ad with
                # a different DOM structure — now only skips that card
                # instead of silently aborting the rest of the page.
                try:
                    title = li.find_element_by_css_selector('.p-name em').text.replace('\n', '')  # product name
                    price = li.find_element_by_css_selector('.p-price i').text  # price
                    comment_count = li.find_element_by_css_selector('.p-commit a').text  # comment count
                    shop_name = li.find_element_by_css_selector('.p-shop a').text  # shop name
                    href = li.find_element_by_css_selector('.p-img a').get_attribute('href')  # detail URL
                except Exception:
                    # Promoted/ad slots use a different structure; ignore them.
                    continue

                my_dict = {
                    '标题': title,
                    '价格': price,
                    '评论数': comment_count,
                    '店铺名': shop_name,
                    '详情页链接': href
                }
                csv_writer.writerow(my_dict)
                with open(f'{trade_name}商家.txt', 'a', encoding='utf-8') as shop_fp:
                    shop_fp.write(my_dict['店铺名'] + '\n')
                with open(f'{trade_name}商品价格.txt', 'a', encoding='utf-8') as price_fp:
                    price_fp.write(my_dict['标题'] + ';' + my_dict['价格'] + '\n')
                print(title, price, comment_count, shop_name, href)

        for page in range(start_page, end_page + 1):
            print(f"----------当前采集第{page}页的内容----------")
            shop_info()
            # Advance to the next page by sending the right-arrow key to the
            # "next page" button.  (send_keys returns None, so the old
            # ``next_page_button = ...`` assignment was meaningless.)
            browser.find_element_by_css_selector('.pn-next').send_keys(Keys.ARROW_RIGHT)
    finally:
        # Always release the CSV handle and the browser, even on error —
        # previously both leaked when any selenium call raised.
        fp.close()
        browser.quit()
|
||||
|
||||
|
||||
def cloud(trade_name):
    """Render a word-cloud PNG from the shop names collected for ``trade_name``.

    Reads ``{trade_name}商家.txt`` (written by ``work()``) and saves the
    image as ``{trade_name}商家词云.png``.
    """
    # Read the shop-name dump produced by the scraping step.
    with open(f'{trade_name}商家.txt', 'r', encoding='utf-8') as source:
        raw_text = source.read()

    # jieba segments the Chinese text; WordCloud expects the tokens to be
    # separated by spaces.
    tokens = ' '.join(jieba.lcut(raw_text))

    cloud_image = wordcloud.WordCloud(
        width=800,
        height=500,
        background_color='white',
        scale=15,
        font_path='msyh.ttc'
    )
    cloud_image.generate(tokens)
    cloud_image.to_file(f'{trade_name}商家词云.png')
|
||||
|
||||
|
||||
def chart(trade_name):
    """Draw a pyecharts bar chart of the (up to) 20 most expensive products.

    Reads ``{trade_name}商品价格.txt`` — lines of ``title;price`` written by
    ``work()`` — and renders ``{trade_name}价格前20商品.html``.

    :param trade_name: product keyword; selects the input/output file names
    """
    with open(f'{trade_name}商品价格.txt', 'r', encoding='utf-8') as fp:
        data_lines = fp.readlines()

    # Group prices by product title (the same product may appear on
    # several pages).  setdefault replaces the old try/except-KeyError dance.
    data_dict = {}
    for line in data_lines:
        parts = line.replace(" ", "").split(";")
        goods = parts[0]
        price = float(parts[1])
        data_dict.setdefault(goods, []).append(price)

    # Rank products by their first recorded price, highest first.  The old
    # key ``element[1]`` compared whole price LISTS lexicographically, which
    # only incidentally sorted by the first price — make the intent explicit.
    # Slicing to 20 also fixes the IndexError the old ``range(20)`` loop
    # raised whenever fewer than 20 distinct products were collected.
    sorted_data = sorted(data_dict.items(), key=lambda item: item[1][0], reverse=True)[:20]

    x_list = [goods for goods, prices in sorted_data]
    y_list = [prices[0] for goods, prices in sorted_data]

    bar = Bar(init_opts=options.InitOpts(width="1500px"))
    bar.add_xaxis(x_list)
    bar.add_yaxis("价格(元)", y_list, label_opts=LabelOpts(position="right"))
    bar.set_global_opts(
        title_opts=TitleOpts(title=f"{trade_name}价格前20商品"),
        xaxis_opts=options.AxisOpts(is_show=False),  # hide the crowded x axis
        visualmap_opts=VisualMapOpts(
            is_show=True,
            is_piecewise=True,  # colour bars by price band
            pieces=[
                {"min": 1, "max": 99, "label": '1 - 99', "color": "#CCFFFF"},
                {"min": 100, "max": 999, "label": '100 - 999', "color": "#FFFF99"},
                # BUG FIX: this label previously read '1000 - 999'.
                {"min": 1000, "max": 4999, "label": '1000 - 4999', "color": "#FF9966"},
                {"min": 5000, "max": 9999, "label": '5000 - 9999', "color": "#FF6666"},
                {"min": 10000, "max": 99999, "label": '10000 - 99999', "color": "#CC3333"}
            ]
        )
    )
    bar.render(f"{trade_name}价格前20商品.html")
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Gather the scrape parameters interactively, then run the full
    # pipeline: scrape JD -> shop-name word cloud -> top-20 price chart.
    product = input("输入要搜索的商品名:")
    first_page = int(input("输入起始页码:"))
    last_page = int(input("输入结束页码:"))

    work(first_page, last_page, product)
    cloud(product)
    chart(product)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in new issue