完善历史价格查询

master
wkyuu 3 years ago
parent c372046c7b
commit 3adaf63228

@ -9,11 +9,11 @@ selenium + redis + 分布式 + xpath + etree + 可视化
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
- [x] 从历史价格网页爬取历史价格
- [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势
- [x] 加入Redis分布式设计
- [ ] 数据可视化
- [ ] 使用python终端绘图需要解决如何选取想要展示的条例
- [ ] 预计两种模式:终端交互,随机或以评价数为索引目标,给出取出的 item 的具体信息,例如价格预测。
- [ ] 选择目录,友好的选择交互体验
- [ ] 选择抽取 item 模式:热评就列出前五条,随机就随机取一条
- [ ] python打包exe需要图形化界面
## project
@ -212,24 +212,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
1在爬取的视频评论很多的情况下效率太低且频次太高了容易被ban。应该设定一下比如说只爬几页或者只爬热门评论。而且。。哔哩哔哩的评论区是其他加载的本爬虫在requests.get的时候会多次爬取太危险了。为了一条数据就多次访问被ban风险更高了。可以改成对单个页面就取一次网页源代码但是这个需要真实浏览器来访问才能拿到selenium框架然后对这个源代码进行数据提取操作这样可以比较好地减少爬虫访问量降低被ban的风险。
2爬虫结构是先完全爬取数据存到临时内存中最后才存入文件。风险太大如果中途被ban了容易导致整个爬取的过程前功尽弃且之前数据全都丢失。
2.1,线程作用效果应为
结构:视频{页数{父评论{子评论}}}
采用多线程爬取父评论+子评论,也就是一次能对多个父子评论爬取数据;或者采用多线程爬取页数中的评论,一次能爬取多个页面。后者效率更高。
2.2,但是由于该爬虫设计结构是把全部数据都先爬取再存起来,这样子要求线程对同一个字符串结构进行数据追加,容易导致混乱(即 在线程优先级不同的情况下,多个线程同时操作一个全局字符串,可能上一条还是父评论,下一条就变成另一条父评论的子评论了。)。
2.3整改建议是采用多线程对多个页面爬取数据某个线程结束时即爬完一页就把该页对应的字符串存到csv文件中采用线程锁来控制每次存数据时都仅单个占用csv文件防止数据混乱。这一步需要修改数据的存储过程的流程。
~~3爬取效果不好爬取父评论+子评论的方式,实际上意义不大。意思就是父和子评论都混到一起了,没有主次效果,应该加个标示便于直观显示。~~
@ -268,4 +250,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
16[爬虫常见的HTTP错误代码及错误原因](https://blog.csdn.net/Smart_look/article/details/109967222)
17
17[Python字符串操作之字符串分割与组合](https://blog.csdn.net/seetheworld518/article/details/47346527)
18

@ -30,8 +30,7 @@ def getsource(url):
driver.get(url)
response = etree.HTML(driver.page_source)
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html")
response = response.decode('utf-8')
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html").decode('utf-8')
driver.close()
return response
@ -40,7 +39,6 @@ def useRequests(url):
try:
session = requests.Session()
res = session.get(url, headers = headers)
res.raise_for_status() # 判断是不是200
# print(res.request.headers)
res.encoding = res.apparent_encoding
res = etree.HTML(res.text)

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
from lxml import etree
import settings
import downloader
def myreplace(text):
    """Normalize scraped text: trim outer whitespace, then remove inner
    spaces and CRLF pairs (same cleanup the history-price parser expects)."""
    cleaned = text.strip()
    # Same replacement order as a chained .replace(' ', '').replace("\r\n", '')
    for junk in (' ', "\r\n"):
        cleaned = cleaned.replace(junk, '')
    return cleaned
import downloader
import pipelines
import settings
class historyPriceItem:
def __init__(self, id):
self.url = settings.HISTORY_PRICE_URL + str(id)
# self.response = downloader.useRequests(self.url)
self.response = etree.parse('historyPrice.html', etree.HTMLParser(encoding = 'utf-8'))
self.response = pipelines.gethtml(downloader.useRequests(self.url))
# self.response = etree.parse('historyPriceMore.html', etree.HTMLParser(encoding = 'utf-8'))
# self.response = etree.parse('historyPrice.html', etree.HTMLParser(encoding = 'utf-8'))
def gethistoryPrice(self) -> list:
@ -19,21 +19,31 @@ class historyPriceItem:
item = self.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
# Extract the product-category ("商品类别") label text from the cached page tree.
# NOTE(review): this is the removed side of the diff (legacy code); kept verbatim.
def getTag(self) -> str:
# XPath: the text node of the div whose data-content attribute is the category label
reg = r"//div[@data-content='商品类别:']/text()"
# assumes self.response is an lxml tree and the node exists — IndexError otherwise
tag = self.response.xpath(reg)[0]
# NOTE(review): etree.HTML() on a text node returns an lxml Element, but
# myreplace() calls str methods (.strip/.replace) on its argument — this looks
# like it would raise AttributeError at runtime; confirm intended behavior.
tag = etree.HTML(tag)
tag = myreplace(tag)
# drop the leading label prefix (first 5 chars) — presumably "商品类别:"; TODO confirm offset
return tag[5:]
def get
# tree = etree.tostring(response.xpath(reg)[0], encoding = 'utf-8', method = 'html').decode('utf-8')
def updateTime() -> str:
reg = r"//div[@class='p3']/p[@class='tips']/text()"
time = self.response.xpath(reg)[0]
time = pipelines.myreplace(time, mode = 'strip')
return str(time[5:])
def priceTrend() -> str:
check = 'timeline-text'
if not check in item: # 用于判断有无历史价格记录
return ''
reg = r"//div[@class='timeline-text']/p/text()"
regList = self.response.xpath(reg)
price = ''
for i in range(0, len(regList), 2):
price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
return price
priceHistoryList = [updateTime(), priceTrend()]
return priceHistoryList
if __name__ == '__main__':
id = "10036840192083"
# id = "10036840192083"
id = "11564571796" # More
aitem = historyPriceItem(id)
aitem.gethistoryPrice()
print(aitem.gethistoryPrice())

@ -2,6 +2,8 @@
from lxml import etree
import csv
import os
import time
import historyPrice
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
try:
@ -30,9 +32,11 @@ def getidlist(response) -> list: # 获取id
idlist = html.xpath(reg)
return idlist
def myreplace(name) -> str: # 简单的处理输出
name = name.strip()
return name
def myreplace(text, mode = '') -> str:
    """Simple output cleanup for scraped strings.

    mode 'all'  : strip, then remove every space and CRLF pair.
    mode 'strip': strip, then remove bare CR characters.
    default     : strip only.
    """
    stripped = text.strip()
    if mode == 'all':
        return stripped.replace(' ', '').replace("\r\n", '')
    if mode == 'strip':
        return stripped.replace('\r', '')
    return stripped
def isElementTree(response) -> bool: # 用于判断是否已经为etree对象
if str(type(response)) == "<class 'lxml.etree._ElementTree'>":
@ -91,7 +95,10 @@ class item:
url = r"https://item.jd.com/" + str(self.id) + r".html"
return url
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()]
historyPriceItem = historyPrice.historyPriceItem(self.id)
priceHistoryList = historyPriceItem.gethistoryPrice()
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
return itemlist
def print2console(response): # 输出到命令行
@ -147,7 +154,7 @@ def write2csv(response, filename_csv): # 写入到csv文件
except BaseException as e:
print(e)
headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url']
headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势']
writer.writerow(headers)
write(writer)
@ -160,4 +167,22 @@ def write2csv(response, filename_csv): # 写入到csv文件
print("sth wrong in pipelines.write2csv")
if __name__ == "__main__":
pass
pass
'''
# 调试数据
import pipelines
from lxml import etree
response = 'index.html' # 文件名 or url
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
id = '1127466'
aitem = pipelines.item(id, html)
a = aitem.getitem()
import historyPrice
bitem = historyPrice.historyPriceItem(id)
b = bitem.gethistoryPrice()
itemList = a + b
'''
Loading…
Cancel
Save