做完了捏

master
wkyuu 3 years ago
parent 0fea744a5b
commit 939095f29a

@ -12,7 +12,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
- [x] 数据可视化 - [x] 数据可视化
- [ ] 预计两种模式终端交互随机或取评价数为索引目标给出取出的item的具体信息例如价格趋势 - [ ] 预计两种模式终端交互随机或取评价数为索引目标给出取出的item的具体信息例如价格趋势
- [ ] 选择目录,友好的选择交互体验 - [ ] 选择目录,友好的选择交互体验
- [ ] 选择抽取item模式热评就列出前五条随机就随机取一条 - [ ] 选择主要参考方式(价格,评论)
- [ ] python打包exe需要图形化界面 - [ ] python打包exe需要图形化界面
## project ## project
@ -198,6 +198,22 @@ def getFont(): # 列出可用的字体
plt.rcParams['font.family'] = ['Microsoft YaHei'] plt.rcParams['font.family'] = ['Microsoft YaHei']
``` ```
### Pandas
```python
import pandas as pd
df = pd.read_csv(filename, encoding = 'utf-8', header = 0, on_bad_lines = 'skip') # pandas >= 1.3;旧版本使用 error_bad_lines = False
df.columns # 查看所有列头的名字
df.xx # 获得xx那一列的信息
df['xx'] # 同上
df.sort_values(by = 'xx', ascending = True) # 按某一列排序
df.loc[index] # 取index行全部数据
df.loc[index][index2] # 取那行的某一数据
```
### Requests ### Requests
经典老碟 经典老碟
@ -372,4 +388,16 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
19[python matplotlib坐标轴设置的方法](https://www.csdn.net/tags/NtzaUgxsOTQ2NjgtYmxvZwO0O0OO0O0O.html) 19[python matplotlib坐标轴设置的方法](https://www.csdn.net/tags/NtzaUgxsOTQ2NjgtYmxvZwO0O0OO0O0O.html)
20 20[史上最全用Pandas读取CSV看这篇就够了](https://cloud.tencent.com/developer/article/1856554)
21[pandas数据处理的常用操作](https://zhuanlan.zhihu.com/p/29535766)
22[★★pandas的数据输出显示设置](https://www.jianshu.com/p/5c0aa1fa19af)
23[解决pandasValueError: Cannot convert non-finite values (NA or inf) to integer](https://blog.csdn.net/zhongkeyuanchongqing/article/details/123599260)
24[pandas取dataframe特定行/列](https://www.cnblogs.com/nxf-rabbit75/p/10105271.html)
25[Pandas 获取DataFrame 的行索引和列索引](https://blog.csdn.net/YENTERTAINR/article/details/109254583)
26

@ -1,9 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import os import os
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
import historyPrice import historyPrice
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象 def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
try: try:
if isElementTree(response): if isElementTree(response):

@ -2,8 +2,6 @@
import os import os
from hyperlink import URL
# 修改要生成的文件名,下面的是默认,注意要用.csv结尾 # 修改要生成的文件名,下面的是默认,注意要用.csv结尾
FILENAME_CSV = { FILENAME_CSV = {
"牛奶": "milk.csv", "牛奶": "milk.csv",
@ -40,8 +38,6 @@ USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
] ]
COOKIES_FILENAME = "cookies.json"
# 历史价格查询网站 vveby.com # 历史价格查询网站 vveby.com
HISTORY_PRICE_URL = r"https://www.vveby.com/search?keyword=" HISTORY_PRICE_URL = r"https://www.vveby.com/search?keyword="
@ -52,13 +48,13 @@ FONT = ['Microsoft YaHei']
BANNER = { BANNER = {
"main": ''' "main": '''
#================*main*=================# #================*main*=================#
# 1.主界面 [x] # 1.主界面
# 2.介绍 [x] # 2.介绍
# 3.数据可视化 # 3.数据可视化
# 4.向Redis中填充数据 [x] # 4.向Redis中填充数据
# 5.清空 Redis 队列缓存 [x] # 5.清空 Redis 队列缓存
# 6.调用 milkSpider [x] # 6.调用 milkSpider
# 7.退出 [x] # 7.退出
#========================================# #========================================#
''', ''',
"introduce": ''' "introduce": '''
@ -71,8 +67,10 @@ BANNER = {
#=============================================# #=============================================#
''', ''',
"view": ''' "view": '''
#================*view*=================#
# 1.列出评论数最多的前几条商品信息
# 2.列出价格最低的前几条商品信息
# 3.返回上一层目录
#=============================================#
''' '''
} }

@ -1,16 +1,23 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from matplotlib import pyplot as plt import os
import re
import matplotlib import matplotlib
import pandas as pd
from matplotlib import pyplot as plt
import milkSpider
import settings import settings
import re
plt.rcParams['font.family'] = settings.FONT plt.rcParams['font.family'] = settings.FONT
# pd.set_option('display.expand_frame_repr', True)
# pd.set_option('display.max_colwidth', 10)
class view: class view:
def __init__(self, itemList): def __init__(self, itemList):
self.id = itemList[0] self.name = itemList[1]
self.string = itemList[1] self.string = itemList[10]
def getFont(): # 列出可用的字体 def getFont(): # 列出可用的字体
font = sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist]) font = sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist])
@ -28,9 +35,10 @@ class view:
x = [] x = []
y = [] y = []
itemList.pop()
itemList.reverse()
while itemList: for temp in itemList:
temp = itemList.pop()
date = temp[0] + "" + temp[1] + "" date = temp[0] + "" + temp[1] + ""
price = "" + temp[2] + "" price = "" + temp[2] + ""
@ -42,16 +50,21 @@ class view:
x.append(date) x.append(date)
y.append(price) y.append(price)
plt.title("价格趋势") plt.title("商品 [{}] 价格趋势".format(self.name))
plt.bar(x, y, color = 'g', align = 'center') plt.bar(x, y, color = 'g', align = 'center')
plt.xticks(size = 10.0, rotation = 45) plt.xticks(size = 10.0, rotation = 45)
plt.xlabel("日期") plt.xlabel("日期")
plt.ylabel("价格") plt.ylabel("价格")
plt.plot(x, y, color = 'red', linewidth = 5.0, linestyle = '--') plt.plot(x, y, color = 'red', linewidth = 5.0, linestyle = '--')
print("等待可视化界面结束。。。")
plt.show() plt.show()
if self.string == 0:
print("该商品历史价格趋势数据尚未被收录!")
return
itemList = [] itemList = []
print("以下是商品 [{}] 的历史价格趋势:".format(self.name))
for astr in self.string.split(';'): for astr in self.string.split(';'):
strList = str2data(astr) strList = str2data(astr)
try: try:
@ -61,5 +74,105 @@ class view:
break break
show(itemList) show(itemList)
def listCatalogues():
    """Collect the catalogue CSV files stored under ``./Catalogues/``.

    Returns:
        tuple: ``(count, paths)`` where ``paths`` is a sorted list of
        ``"./Catalogues/<name>.csv"`` strings and ``count`` is its length.
        ``(0, [])`` is returned when the directory does not exist.
    """
    path = r"./Catalogues/"
    if not os.path.isdir(path):
        return 0, []
    # Sort so the numbered menu is stable between runs (os.listdir order is
    # arbitrary) and keep only .csv files: the paths are later handed to
    # pandas.read_csv and have their extension stripped for display.
    fileList = sorted(
        path + filename
        for filename in os.listdir(path)
        if filename.lower().endswith(".csv")
    )
    return len(fileList), fileList
def _readCatalogue(filename):
    """Load a catalogue CSV into a DataFrame, skipping malformed rows."""
    try:
        # ``on_bad_lines`` is the pandas >= 1.3 spelling; the older
        # ``error_bad_lines`` keyword was removed in pandas 2.0.
        return pd.read_csv(filename, encoding = 'utf-8', header = 0, on_bad_lines = 'skip')
    except TypeError:
        # Older pandas rejects the unknown keyword; fall back to the legacy one.
        return pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)


def getData(filename, catalogue):
    """Interactively browse one catalogue CSV and show per-item details.

    The user first picks the sort column (comment count, descending, or
    price, ascending), then how many rows to list, then repeatedly picks a
    row id whose details are printed and passed to ``view(...).main()``.

    Args:
        filename: Path of the CSV file to load.
        catalogue: Display name of the catalogue, used only in prompts.

    Returns:
        None. The function returns when the user chooses to go back.
    """
    case = {'1': '评论数量(条)', '2': '价格(人民币)'}

    # Step 1: choose the column used for sorting (or leave with '3').
    while True:
        print("# 当前选择的目录是 {} .".format(catalogue))
        milkSpider.showBanner(menu = "view")
        print("选择一项以查看or返回", end = '')
        choice = str(input())
        if choice in case:
            mode = case[choice]
            break
        elif choice == '3':
            return
        else:
            print("无效选择!")

    # Step 2: ask how many rows to show.
    # SECURITY: the input used to go through eval(), which executes whatever
    # the user types; int() accepts exactly what the prompt asks for.
    while True:
        print("当前选择的模式是以 [{}] 为基准的排序方式".format(mode))
        print("想要查看多少条数据(int)", end = '')
        try:
            count = int(input())
        except ValueError:
            print("无效输入!")
            continue
        if count > 0:
            break
        # Zero or negative counts would slice the frame in surprising ways.
        print("无效输入!")

    df = _readCatalogue(filename)
    # Prices are listed lowest first, comment counts highest first.
    ascending = (choice == '2')
    dfnew = df.sort_values(by = mode, ascending = ascending)
    try:
        dfnew = dfnew.fillna(0)
        dfnew[["评论数量(条)"]] = dfnew[["评论数量(条)"]].astype(int)
    except (KeyError, ValueError, TypeError):
        # Best effort only: if the column is missing or not numeric, the
        # listing is still shown with the values as read.
        pass
    dfnew = dfnew[:count]
    indexList = list(dfnew.index)

    # Step 3: list the rows and let the user inspect them one by one.
    while True:
        print(dfnew.iloc[:, [1, 2, 3]])
        while True:
            print("选择一项id以查看详细信息(或者 [r] 返回上层目录)", end = '')
            raw = str(input()).strip()
            if raw == 'r':
                return
            try:
                index = int(raw)
            except ValueError:
                print("无效输入!")
                continue
            if index in indexList:
                break
            print("无效输入!")

        toShow = dfnew.loc[index]
        print(toShow)
        aitem = view(toShow)
        aitem.main()

        while True:
            answer = str(input("输入 [r]返回上一级菜单 [c]继续查看: "))
            if answer == 'r':
                return
            elif answer == 'c':
                break
            else:
                print("无效选项!")
def main():
    """Show the catalogue menu and open the catalogue the user selects.

    The catalogue list is read once via ``listCatalogues()``; the menu is
    then shown repeatedly until the user enters ``r``.

    Returns:
        None.
    """
    length, fileList = listCatalogues()

    # Menu key as typed by the user -> (csv path, display name).
    case = {}
    for number, filepath in enumerate(fileList, start = 1):
        # Display name is the file name without directory and extension.
        name = os.path.splitext(os.path.basename(filepath))[0]
        case[str(number)] = (filepath, name)

    while True:
        print("检测到当前缓存中共有{}个目录:".format(length))
        for key, (_, name) in case.items():
            print("# {}.{}".format(key, name))
        # Numbered from ``length`` rather than the loop variable so that an
        # empty catalogue directory does not raise NameError.
        print("# {}.输入 [r] 返回上一级菜单".format(length + 1))
        print("选择一项以查看or返回", end = '')
        choice = str(input())
        if choice in case:
            filepath, name = case[choice]
            getData(filepath, name)
        elif choice == 'r':
            return
        else:
            print("无效选择!")
if __name__ == "__main__":
    # Script entry point: start the interactive catalogue browser.
    # Earlier manual debugging calls, left disabled:
    # fileList = listCatalogues()
    # length, dataList = getData(fileList)
    main()
'''
# 数据调试
import pandas as pd
filename = "./Catalogues/milk.csv"
df = pd.read_csv(filename, encoding = 'utf-8', header = 0, error_bad_lines = False)
'''

Loading…
Cancel
Save