# Source path: selenium/selenium操控浏览器获取京东某商品.py

"""
用selenium操控浏览器免去了请求对象的定制,再模拟浏览器向服务器发送请求的过程,而是直接操控浏览器,降低被反爬的概率
"""
import csv
import time

import jieba
import wordcloud
from pyecharts import options
from pyecharts.charts import Bar
from pyecharts.options import *
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def work(start_page, end_page, trade_name):
    """Scrape JD search results for ``trade_name`` and append them to local files.

    Opens https://www.jd.com/ in Chrome, submits ``trade_name`` through the
    search box and scrapes one result page per value in
    ``range(start_page, end_page + 1)``.  The browser always begins on the
    first result page; the two bounds only control how many pages are visited
    and the page number shown in the progress message.

    Every product that can be parsed is appended to three files:

    * ``{trade_name}采集数据.csv`` - title, price, comment count, shop, link
      (the header row is written only when the file is still empty)
    * ``{trade_name}商家.txt`` - one shop name per line
    * ``{trade_name}商品价格.txt`` - one ``title;price`` record per line

    Args:
        start_page: First page number of the range to scrape.
        end_page: Last page number of the range to scrape (inclusive).
        trade_name: Search keyword; also used as the prefix of the output files.

    Raises:
        Selenium exceptions raised while opening the site, locating the search
        controls or locating the next-page control propagate to the caller.
        The browser and all output files are closed in every case.
    """
    # JD home page
    url = 'https://www.jd.com/'
    # Path of the Chrome driver executable
    path = 'chromedriver.exe'

    def drop_down():
        """Scroll the page down in steps so that all products get loaded."""
        for x in range(1, 12, 2):
            time.sleep(1)
            j = x / 9
            js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
            browser.execute_script(js)

    def shop_info():
        """Scrape every product on the current result page into the output files."""
        drop_down()
        lis = browser.find_elements(By.CSS_SELECTOR, '#J_goodsList ul li.gl-item')

        skipped = 0
        for li in lis:
            try:
                title = li.find_element(By.CSS_SELECTOR, '.p-name em').text.replace('\n', '')
                price = li.find_element(By.CSS_SELECTOR, '.p-price i').text
                comment_count = li.find_element(By.CSS_SELECTOR, '.p-commit a').text
                shop_name = li.find_element(By.CSS_SELECTOR, '.p-shop a').text
                href = li.find_element(By.CSS_SELECTOR, '.p-img a').get_attribute('href')
            except (NoSuchElementException, StaleElementReferenceException):
                # Promoted/ad entries use a different structure; skip only that
                # entry instead of abandoning the rest of the page.
                skipped += 1
                continue

            csv_writer.writerow({
                '标题': title,
                '价格': price,
                '评论数': comment_count,
                '店铺名': shop_name,
                '详情页链接': href,
            })
            shop_file.write(shop_name + '\n')
            price_file.write(title + ';' + price + '\n')

            print(title, price, comment_count, shop_name, href)

        if skipped:
            print(f"跳过{skipped}个结构不同的条目")

    with open(f'{trade_name}采集数据.csv', 'a', encoding='utf-8', newline='') as csv_file, \
            open(f'{trade_name}商家.txt', 'a', encoding='utf-8') as shop_file, \
            open(f'{trade_name}商品价格.txt', 'a', encoding='utf-8') as price_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=[
            '标题',
            '价格',
            '评论数',
            '店铺名',
            '详情页链接',
        ])
        # The file is opened in append mode: only a brand-new (empty) file
        # needs the header, otherwise every run would insert another one.
        if csv_file.tell() == 0:
            csv_writer.writeheader()

        # NOTE(review): passing the driver path positionally is the calling
        # convention the original code used - confirm it matches the installed
        # Selenium version.
        browser = webdriver.Chrome(path)
        try:
            browser.implicitly_wait(10)
            browser.get(url)

            # Type the keyword into the search box and submit the search.
            search_box = browser.find_element(By.XPATH, '//*[@id="key"]')
            search_box.send_keys(trade_name)
            button = browser.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
            button.click()

            for page in range(start_page, end_page + 1):
                print(f"----------当前采集第{page}页的内容----------")
                shop_info()
                # Move on with the right-arrow key; nothing to load after the
                # last requested page.
                if page < end_page:
                    browser.find_element(By.CSS_SELECTOR, '.pn-next').send_keys(Keys.ARROW_RIGHT)
        finally:
            # Always release the browser, even when scraping failed midway.
            browser.quit()


def cloud(trade_name):
    """Render a word cloud of the shop names collected for ``trade_name``.

    Reads ``{trade_name}商家.txt``, segments its text with jieba and saves the
    resulting image as ``{trade_name}商家词云.png``.

    Args:
        trade_name: Search keyword used as the prefix of the input/output files.
    """
    cloud_settings = {
        'width': 800,
        'height': 500,
        'background_color': 'white',
        'scale': 15,
        'font_path': 'msyh.ttc',
    }
    with open(f'{trade_name}商家.txt', 'r', encoding='utf-8') as shop_file:
        raw_text = shop_file.read()
        # WordCloud expects whitespace-separated words, so join the segments
        # with single spaces.
        spaced_words = ' '.join(jieba.lcut(raw_text))
        painter = wordcloud.WordCloud(**cloud_settings)
        painter.generate(spaced_words)
        painter.to_file(f'{trade_name}商家词云.png')


def chart(trade_name):
    """Render a bar chart of the highest-priced products scraped for ``trade_name``.

    Reads ``{trade_name}商品价格.txt`` (one ``title;price`` record per line),
    groups the prices by title, ranks the titles by their recorded price list
    (in effect by the first recorded price) from high to low and renders at
    most the top 20 to ``{trade_name}价格前20商品.html``.

    Lines without a ``;`` separator or without a numeric price are ignored,
    and fewer than 20 distinct titles simply produce a shorter chart.

    Args:
        trade_name: Search keyword used as the prefix of the input/output files.
    """
    data_dict = {}
    with open(f'{trade_name}商品价格.txt', 'r', encoding='utf-8') as fp:
        for line in fp:
            # Split on the LAST ';' so a title that itself contains ';'
            # is not cut in the middle.
            goods, separator, raw_price = line.replace(" ", "").rpartition(";")
            if not separator:
                continue
            try:
                price = float(raw_price)
            except ValueError:
                # Empty or non-numeric price text: nothing to plot.
                continue
            data_dict.setdefault(goods, []).append(price)

    # Each element is a (title, [prices]) tuple.
    top_items = sorted(data_dict.items(), key=lambda element: element[1], reverse=True)[:20]
    x_list = [goods for goods, _ in top_items]
    y_list = [prices[0] for _, prices in top_items]

    bar = Bar(init_opts=options.InitOpts(width="1500px"))
    bar.add_xaxis(x_list)
    bar.add_yaxis("价格(元)", y_list, label_opts=LabelOpts(position="right"))
    bar.set_global_opts(
        title_opts=TitleOpts(title=f"{trade_name}价格前20商品"),
        xaxis_opts=options.AxisOpts(is_show=False),  # hide the x axis
        visualmap_opts=VisualMapOpts(
            is_show=True,
            is_piecewise=True,  # use discrete colour bands
            # Half-open [gte, lt) bands leave no gaps, so fractional prices
            # such as 99.9 or 999.5 still fall into a band.
            pieces=[
                {"gte": 1, "lt": 100, "label": '1 - 99', "color": "#CCFFFF"},
                {"gte": 100, "lt": 1000, "label": '100 - 999', "color": "#FFFF99"},
                {"gte": 1000, "lt": 5000, "label": '1000 - 4999', "color": "#FF9966"},
                {"gte": 5000, "lt": 10000, "label": '5000 - 9999', "color": "#FF6666"},
                {"gte": 10000, "lt": 100000, "label": '10000 - 99999', "color": "#CC3333"},
            ]
        )
    )
    bar.render(f"{trade_name}价格前20商品.html")


if __name__ == '__main__':
    # Ask for the keyword first, then the two page bounds (in that order).
    trade_name = input("输入要搜索的商品名:")
    start_page, end_page = (
        int(input(prompt)) for prompt in ("输入起始页码:", "输入结束页码:")
    )

    # Scrape first, then build both reports from the files the scrape wrote.
    work(start_page, end_page, trade_name)
    for make_report in (cloud, chart):
        make_report(trade_name)