做完了捏

3 years ago · 939095f29a
parent 0fea744a5b
commit 939095f29a
4 changed files with 170 additions and 28 deletions
--- a/README.md
+++ b/README.md
@ -12,7 +12,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
 -   [x] 数据可视化
    -   [ ] 预计两种模式（终端交互）：随机或取评价数为索引目标，给出取出的item的具体信息，例如价格趋势
        -   [ ] 选择目录，友好的选择交互体验
-        -   [ ] 选择抽取item模式（热评就列出前五条，随机就随机取一条）
+        -   [ ] 选择主要参考方式（价格，评论）
 -   [ ] python打包exe，需要图形化界面？
 ## project
@ -198,6 +198,22 @@ def getFont():  # 列出可用的字体
 plt.rcParams['font.family'] = ['Microsoft YaHei']
 ```
 ### Pandas
 ```python
 import pandas as pd
 df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
 df.columns	# 查看所有列头的名字
 df.xx	# 获得xx那一列的信息
 df['xx']	# 同上
 df.sort_values(by = 'xx', ascending = True)	# 按某一列排序
 df.loc[index]	# 取index行全部数据
 df.loc[index][index2]	# 取那行的某一数据
 ```
 ### Requests
 经典老碟
@ -372,4 +388,16 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
 19，[python matplotlib坐标轴设置的方法](https://www.csdn.net/tags/NtzaUgxsOTQ2NjgtYmxvZwO0O0OO0O0O.html)
-20，
+20，[史上最全！用Pandas读取CSV，看这篇就够了](https://cloud.tencent.com/developer/article/1856554)
 21，[pandas数据处理的常用操作](https://zhuanlan.zhihu.com/p/29535766)
 22，[★★pandas的数据输出显示设置](https://www.jianshu.com/p/5c0aa1fa19af)
 23，[解决pandas：ValueError: Cannot convert non-finite values (NA or inf) to integer](https://blog.csdn.net/zhongkeyuanchongqing/article/details/123599260)
 24，[pandas取dataframe特定行/列](https://www.cnblogs.com/nxf-rabbit75/p/10105271.html)
 25，[Pandas 获取DataFrame 的行索引和列索引](https://blog.csdn.net/YENTERTAINR/article/details/109254583)
 26，
--- a/pipelines.py
+++ b/pipelines.py
@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 from lxml import etree
 from concurrent.futures import ThreadPoolExecutor
 import os
 from concurrent.futures import ThreadPoolExecutor
 from lxml import etree
 import historyPrice
 def gethtml(response, gethtml_mode = "url"):  # 用etree格式化得到的对象
    try:
        if isElementTree(response):
--- a/settings.py
+++ b/settings.py
@ -2,8 +2,6 @@
 import os
 from hyperlink import URL
 # 修改要生成的文件名，下面的是默认，注意要用.csv结尾
 FILENAME_CSV = {
    "牛奶": "milk.csv",
@ -40,8 +38,6 @@ USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
 ]
 COOKIES_FILENAME = "cookies.json"
 # 历史价格查询网站 vveby.com
 HISTORY_PRICE_URL = r"https://www.vveby.com/search?keyword="
@ -52,13 +48,13 @@ FONT = ['Microsoft YaHei']
 BANNER = {
    "main": '''
 #================*main*=================#
-#   1.主界面 [x]
+#   1.主界面
-#   2.介绍 [x]
+#   2.介绍
 #   3.数据可视化
-#   4.向Redis中填充数据 [x]
+#   4.向Redis中填充数据
-#   5.清空 Redis 队列缓存 [x]
+#   5.清空 Redis 队列缓存
-#   6.调用 milkSpider [x]
+#   6.调用 milkSpider
-#   7.退出 [x]
+#   7.退出
 #========================================#    
 ''',
    "introduce": '''
@ -71,8 +67,10 @@ BANNER = {
 #=============================================#
 ''',
    "view": '''
- 
+#================*view*=================#
-    
+#   1.列出评论数最多的前几条商品信息
-    
+#   2.列出价格最低的前几条商品信息
 #   3.返回上一层目录
 #=============================================# 
 '''
 }
--- a/view.py
+++ b/view.py
@ -1,16 +1,23 @@
 # -*- coding: utf-8 -*-
-from matplotlib import pyplot as plt
+import os
 import re
 import matplotlib
 import pandas as pd
 from matplotlib import pyplot as plt
 import milkSpider
 import settings
 import re
 plt.rcParams['font.family'] = settings.FONT
 # pd.set_option('display.expand_frame_repr', True)
 # pd.set_option('display.max_colwidth', 10)
 class view:
    def __init__(self, itemList):
-        self.id = itemList[0]
+        self.name = itemList[1]
-        self.string = itemList[1]
+        self.string = itemList[10]
    def getFont():  # 列出可用的字体
        font = sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist])
@ -28,9 +35,10 @@ class view:
            x = []
            y = []
            itemList.pop()
            itemList.reverse()
-            while itemList:
+            for temp in itemList:
                temp = itemList.pop()
                date = temp[0] + "月" + temp[1] + "日"
                price = "￥" + temp[2] + "元"
@ -42,16 +50,21 @@ class view:
                x.append(date)
                y.append(price)
-            plt.title("价格趋势")
+            plt.title("商品 [{}] 价格趋势".format(self.name))
            plt.bar(x, y, color = 'g', align = 'center')
            plt.xticks(size = 10.0, rotation = 45)
            plt.xlabel("日期")
            plt.ylabel("价格")
            plt.plot(x, y, color = 'red', linewidth = 5.0, linestyle = '--')  
            print("等待可视化界面结束。。。")
            plt.show()
        if self.string == 0:
            print("该商品历史价格趋势数据尚未被收录!")
            return
        itemList = []
        print("以下是商品 [{}] 的历史价格趋势：".format(self.name))
        for astr in self.string.split(';'):
            strList = str2data(astr)
            try:
@ -61,5 +74,105 @@ class view:
                break
        show(itemList)
-def getData():
def listCatalogues():
    """Collect the CSV catalogue files stored under ``./Catalogues/``.

    Returns:
        A ``(count, paths)`` tuple. ``paths`` holds one ``"./Catalogues/<file>"``
        string per CSV file, sorted by file name so that menu numbering is
        stable between runs; ``count`` is ``len(paths)``. When the directory
        does not exist the result is ``(0, [])`` instead of an exception.
    """
    path = r"./Catalogues/"
    try:
        dirList = os.listdir(path)
    except (FileNotFoundError, NotADirectoryError):
        # The cache directory is absent: report "no catalogues" to the caller
        # rather than crashing the interactive menu.
        return 0, []
    # Callers strip a four-character extension and feed the path to
    # pandas.read_csv, so anything that is not a .csv file is left out.
    fileList = [
        path + filename
        for filename in sorted(dirList)
        if filename.lower().endswith(".csv")
    ]
    return len(fileList), fileList
 def getData(filename, catalogue):
    """Interactively browse one catalogue CSV and plot a selected item.

    The user first picks the sort column (comment count or price), then how
    many rows to list, then repeatedly picks a row id to inspect. The chosen
    row is printed and handed to ``view(...).main()``.

    Args:
        filename: Path of the CSV file to load.
        catalogue: Display name of the catalogue, used only in the prompt.

    Returns:
        None. The function returns when the user chooses to go back.
    """
    case = {'1': '评论数量(条)', '2': '价格(人民币)'}
    while True:
        print("# 当前选择的目录是 {} .".format(catalogue))
        milkSpider.showBanner(menu = "view")
        print("选择一项以查看or返回：", end = '')
        choice = str(input())
        if choice in case:
            mode = case[choice]
            break
        elif choice == '3': return
        else: print("无效选择!")
    # Prices are listed lowest first; comment counts highest first.
    ascending = (choice == '2')
    while True:
        print("当前选择的模式是以 [{}] 为基准的排序方式".format(mode))
        print("想要查看多少条数据(int)：", end = '')
        # int() replaces the former eval(): eval executed whatever the user
        # typed and also let non-integer values through to the slice below.
        try:
            count = int(input())
        except ValueError:
            print("无效输入!")
            continue
        # A zero or negative count would slice from the wrong end of the table.
        if count > 0:
            break
        print("无效输入!")
    df = _readCatalogue(filename)
    dfnew = df.sort_values(by = mode, ascending = ascending)
    try:
        dfnew = dfnew.fillna(0)
        dfnew[["评论数量(条)"]] = dfnew[["评论数量(条)"]].astype(int)
    except (KeyError, TypeError, ValueError, OverflowError):
        # Best effort only: the integer cast is cosmetic, so a missing or
        # non-numeric comment column must not stop the listing.
        pass
    dfnew = dfnew[:count]
    indexList = list(dfnew.index)
    while True:
        print(dfnew.iloc[:, [1, 2, 3]])
        while True:
            print("选择一项id以查看详细信息(或者 [r] 返回上层目录)：", end = '')
            answer = str(input())
            if answer == 'r': return
            try:
                index = int(answer)
            except ValueError:
                print("无效输入!")
                continue
            if index in indexList:
                break
            print("无效输入!")
        toShow = dfnew.loc[index]
        print(toShow)
        aitem = view(toShow)
        aitem.main()
        while True:
            action = str(input("输入 [r]返回上一级菜单 [c]继续查看: "))
            if action == 'r': return
            elif action == 'c': break
            else: print("无效选项!")


 def _readCatalogue(filename):
    """Load a catalogue CSV, skipping malformed rows, on old and new pandas."""
    try:
        return pd.read_csv(filename, encoding = 'utf-8', header = 0, on_bad_lines = 'skip')
    except TypeError:
        # pandas releases without on_bad_lines only know the older
        # error_bad_lines flag, which newer releases no longer accept.
        return pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
 def main():
    """Show the catalogue menu and open the catalogue the user selects.

    The list of catalogue files is read once via ``listCatalogues()``. The
    menu is then printed in a loop until the user enters ``r``; a valid
    number opens that catalogue with ``getData``.
    """
    length, fileList = listCatalogues()
    # Menu number -> (file path, display name). The display name is the file
    # name without directory and extension, derived with os.path so it does
    # not depend on the exact length of the directory prefix.
    case = {}
    for number, filepath in enumerate(fileList, start = 1):
        name = os.path.splitext(os.path.basename(filepath))[0]
        case[str(number)] = (filepath, name)
    while True:
        print("检测到当前缓存中共有{}个目录：".format(length))
        for number, (filepath, name) in case.items():
            print("#    {}.{}".format(number, name))
        # Numbered from the catalogue count, so an empty list still prints a
        # valid "return" entry instead of reading an unset loop variable.
        print("#    {}.输入 [r] 返回上一级菜单".format(length + 1))
        print("选择一项以查看or返回：", end = '')
        choice = str(input())
        if choice in case:
            filepath, name = case[choice]
            getData(filepath, name)
        elif choice == 'r': return
        else: print("无效选择!")
 if __name__ == "__main__":
    # Script entry point: start the interactive catalogue browser.
    # NOTE(review): the two commented-out calls below no longer match the
    # functions in this file (listCatalogues returns a (count, paths) tuple and
    # getData takes (filename, catalogue)); they look like leftover debugging.
    # fileList = listCatalogues()
    # length, dataList = getData(fileList)
    main()
 '''
 # 数据调试
 import pandas as pd
 filename = "./Catalogues/milk.csv"
 df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
 '''