做完了捏

3 years ago · 939095f29a
parent 0fea744a5b
commit 939095f29a
4 changed files with 170 additions and 28 deletions
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
 -   [x] 数据可视化
    -   [ ] 预计两种模式（终端交互）：随机或取评价数为索引目标，给出取出的item的具体信息，例如价格趋势
        -   [ ] 选择目录，友好的选择交互体验
-        -   [ ] 选择抽取item模式（热评就列出前五条，随机就随机取一条）
+        -   [ ] 选择主要参考方式（价格，评论）
 -   [ ] python打包exe，需要图形化界面？

 ## project
@@ -198,6 +198,22 @@ def getFont():  # 列出可用的字体
 plt.rcParams['font.family'] = ['Microsoft YaHei']
 ```

+### Pandas
+
+```python
+import pandas as pd
+df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
+
+df.columns	# 查看所有列头的名字
+df.xx	# 获得xx那一列的信息
+df['xx']	# 同上
+df.sort_values(by = 'xx', ascending = True)	# 按某一列排序
+df.loc[index]	# 取index行全部数据
+df.loc[index][index2]	# 取那行的某一数据
+```
+
+
+
 ### Requests

 经典老碟
@@ -372,4 +388,16 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =

 19，[python matplotlib坐标轴设置的方法](https://www.csdn.net/tags/NtzaUgxsOTQ2NjgtYmxvZwO0O0OO0O0O.html)

-20，
+20，[史上最全！用Pandas读取CSV，看这篇就够了](https://cloud.tencent.com/developer/article/1856554)
+
+21，[pandas数据处理的常用操作](https://zhuanlan.zhihu.com/p/29535766)
+
+22，[★★pandas的数据输出显示设置](https://www.jianshu.com/p/5c0aa1fa19af)
+
+23，[解决pandas：ValueError: Cannot convert non-finite values (NA or inf) to integer](https://blog.csdn.net/zhongkeyuanchongqing/article/details/123599260)
+
+24，[pandas取dataframe特定行/列](https://www.cnblogs.com/nxf-rabbit75/p/10105271.html)
+
+25，[Pandas 获取DataFrame 的行索引和列索引](https://blog.csdn.net/YENTERTAINR/article/details/109254583)
+
+26，
--- a/pipelines.py
+++ b/pipelines.py
@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
-from lxml import etree
-from concurrent.futures import ThreadPoolExecutor
 import os
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml import etree
+
 import historyPrice

+
 def gethtml(response, gethtml_mode = "url"):  # 用etree格式化得到的对象
    try:
        if isElementTree(response):
@@ -236,4 +239,4 @@ html = etree.HTML(item)
 name = html.xpath(reg)[0]


-'''
+'''
--- a/settings.py
+++ b/settings.py
@@ -2,8 +2,6 @@

 import os

-from hyperlink import URL
-
 # 修改要生成的文件名，下面的是默认，注意要用.csv结尾
 FILENAME_CSV = {
    "牛奶": "milk.csv",
@@ -40,8 +38,6 @@ USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
 ]

-COOKIES_FILENAME = "cookies.json"
-
 # 历史价格查询网站 vveby.com
 HISTORY_PRICE_URL = r"https://www.vveby.com/search?keyword="

@@ -52,13 +48,13 @@ FONT = ['Microsoft YaHei']
 BANNER = {
    "main": '''
 #================*main*=================#
-#   1.主界面 [x]
-#   2.介绍 [x]
+#   1.主界面
+#   2.介绍
 #   3.数据可视化
-#   4.向Redis中填充数据 [x]
-#   5.清空 Redis 队列缓存 [x]
-#   6.调用 milkSpider [x]
-#   7.退出 [x]
+#   4.向Redis中填充数据
+#   5.清空 Redis 队列缓存
+#   6.调用 milkSpider
+#   7.退出
 #========================================#    
 ''',
    "introduce": '''
@@ -71,8 +67,10 @@ BANNER = {
 #=============================================#
 ''',
    "view": '''
- 
-    
-    
+#================*view*=================#
+#   1.列出评论数最多的前几条商品信息
+#   2.列出价格最低的前几条商品信息
+#   3.返回上一层目录
+#=============================================# 
 '''
 }
--- a/view.py
+++ b/view.py
@@ -1,16 +1,23 @@
 # -*- coding: utf-8 -*-
-from matplotlib import pyplot as plt
+import os
+import re
+
 import matplotlib
+import pandas as pd
+from matplotlib import pyplot as plt
+
+import milkSpider
 import settings
-import re

 plt.rcParams['font.family'] = settings.FONT
+# pd.set_option('display.expand_frame_repr', True)
+# pd.set_option('display.max_colwidth', 10)

 class view:

    def __init__(self, itemList):
-        self.id = itemList[0]
-        self.string = itemList[1]
+        self.name = itemList[1]
+        self.string = itemList[10]

    def getFont():  # 列出可用的字体
        font = sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist])
@@ -28,9 +35,10 @@ class view:

            x = []
            y = []
+            itemList.pop()
+            itemList.reverse()

-            while itemList:
-                temp = itemList.pop()
+            for temp in itemList:

                date = temp[0] + "月" + temp[1] + "日"
                price = "￥" + temp[2] + "元"
@@ -42,16 +50,21 @@ class view:
                x.append(date)
                y.append(price)

-            plt.title("价格趋势")
+            plt.title("商品 [{}] 价格趋势".format(self.name))
            plt.bar(x, y, color = 'g', align = 'center')
            plt.xticks(size = 10.0, rotation = 45)
            plt.xlabel("日期")
            plt.ylabel("价格")
            plt.plot(x, y, color = 'red', linewidth = 5.0, linestyle = '--')  
-
+            
+            print("等待可视化界面结束。。。")
            plt.show()

+        if self.string == 0:
+            print("该商品历史价格趋势数据尚未被收录!")
+            return
        itemList = []
+        print("以下是商品 [{}] 的历史价格趋势：".format(self.name))
        for astr in self.string.split(';'):
            strList = str2data(astr)
            try:
@@ -61,5 +74,105 @@ class view:
                break
        show(itemList)

-def getData():
-    pass
+def listCatalogues():
+    path = r"./Catalogues/"
+    dirList = os.listdir(path)
+    fileList = []
+    for filename in dirList:
+        fileList.append(path + filename)
+    return len(fileList), fileList
+
+def getData(filename, catalogue):
+    while True:
+        print("# 当前选择的目录是 {} .".format(catalogue))
+        milkSpider.showBanner(menu = "view")
+        print("选择一项以查看or返回：", end = '')
+        choice = str(input())
+        case = {'1': '评论数量(条)', '2': '价格(人民币)'}
+        if choice in case.keys():
+            mode = case.get(choice)
+            sort = False
+            break
+        elif choice == '3': return
+        else: print("无效选择!")
+        
+    while True:
+        print("当前选择的模式是以 [{}] 为基准的排序方式".format(mode))
+        print("想要查看多少条数据(int)：", end = '')
+        try:
+            flag = eval(input())
+            break
+        except BaseException:
+            print("无效输入!")
+            continue
+
+    df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
+    if choice == '2': sort = True
+    dfnew = df.sort_values(by = mode, ascending = sort)
+
+    try:
+        dfnew = dfnew.fillna(0)
+        dfnew[["评论数量(条)"]] = dfnew[["评论数量(条)"]].astype(int)
+    except BaseException:
+        pass
+
+    dfnew = dfnew[:flag]
+    while True:
+        print(dfnew.iloc[:, [1, 2, 3]])
+        indexList = list(dfnew.index)
+        while True:
+            print("选择一项id以查看详细信息(或者 [r] 返回上层目录)：", end = '')
+            try:
+                index = str(input())
+                index = eval(index)
+                if index in indexList:
+                    break
+                else:
+                    print("无效输入!")
+                    continue
+            except BaseException:
+                if index == 'r': return
+                print("无效输入!")
+                continue
+        toShow = dfnew.loc[index]
+        print(toShow)
+        aitem = view(toShow)
+        aitem.main()
+        while True:
+            flag = str(input("输入 [r]返回上一级菜单 [c]继续查看: "))
+            if flag == 'r': return
+            elif flag == 'c': break
+            else: print("无效选项!")
+
+def main():
+    length, fileList = listCatalogues()
+    while True:
+        print("检测到当前缓存中共有{}个目录：".format(length))
+        case = {}
+        for i in range(length):
+            print("#    {}.{}".format(i + 1, fileList[i][13:-4]))
+            case[str(i+1)] = fileList[i]
+        print("#    {}.输入 [r] 返回上一级菜单".format(i + 2))
+        print("选择一项以查看or返回：", end = '')
+        choice = str(input())
+        if choice in case.keys():
+            getData(case.get(choice), case.get(choice)[13:-4])
+        elif choice == 'r': return
+        else: print("无效选择!")
+
+
+if __name__ == "__main__":
+    # fileList = listCatalogues()
+    # length, dataList = getData(fileList)
+    main()
+
+
+'''
+# 数据调试
+import pandas as pd
+
+filename = "./Catalogues/milk.csv"
+df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
+
+
+'''