做完了

3 years ago · 43423c6ab3
parent f014993c2b
commit 43423c6ab3
4 changed files with 7 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # milkSpider

-selenium + redis + 分布式 + xpath + etree + 可视化 
+selenium + redis + Matplotlib

 任务：爬取京东网站上在售的各类牛奶品类的商品名称，简介，价格相关。并给出相应的价格波动趋势，用python的可视化展示。计划任务自动爬取。

@@ -13,7 +13,6 @@ selenium + redis + 分布式 + xpath + etree + 可视化
    -   [x] 预计两种模式（终端交互）：随机或取评价数为索引目标，给出取出的item的具体信息，例如价格趋势
        -   [x] 选择目录，友好的选择交互体验
        -   [x] 选择主要参考方式（价格，评论）
-   [ ] python打包exe，需要图形化界面？

 ## project

@@ -235,7 +234,7 @@ print(res.text)

 ```python
 # 完全匹配浮点数
-reg = [-+]?[0-9]*\.?[0-9]*
+reg = "[-+]?[0-9]*\.?[0-9]*"
 ```

 ### 线程
--- a/historyPrice.py
+++ b/historyPrice.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-

-import string
 from lxml import etree

 import downloader
 import pipelines
 import settings

+
 class historyPriceItem:
    def __init__(self, id):
        self.url = settings.HISTORY_PRICE_URL + str(id)
--- a/middlewares.py
+++ b/middlewares.py
@@ -18,13 +18,13 @@ FILENAME_CSV = settings.FILENAME_CSV
 connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
 redisconn = redis.Redis(connection_pool = connection_pool)

-def getCategory(url) -> str:
+def getCategory(url) -> str:    # 用于判断获取的url是属于什么条目，便于后续打开对应的文件
    for urlstr in BASEURL.items():
        if urlstr[1] in url: return urlstr[0]
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    exit()

-def geturlList(baseurl) -> list:
+def geturlList(baseurl) -> list:    # 产生url条目对应页数序列，用于存入到redis数据库中
    urlList = []
    for i in range(1, 20, 2):   # 爬取10页
        url = baseurl + r"&page=" + str(i)
--- a/view.py
+++ b/view.py
@@ -74,7 +74,7 @@ class view:
                break
        show(itemList)

-def listCatalogues():
+def listCatalogues():   # 列出缓存目录下的条目
    path = r"./Catalogues/"
    if not os.path.exists(path):
        print("当前目录[{}]下没有数据!".format(path))
@@ -85,7 +85,7 @@ def listCatalogues():
        fileList.append(path + filename)
    return True, len(fileList), fileList

-def getData(filename, catalogue):
+def getData(filename, catalogue):   # 选择读取数据的方式
    while True:
        print("# 当前选择的目录是 {} .".format(catalogue))
        milkSpider.showBanner(menu = "view")