update

3 years ago · 22a274b5a8
parent 3adaf63228
commit 22a274b5a8
3 changed files with 75 additions and 37 deletions
--- a/README.md
+++ b/README.md
@ -181,6 +181,8 @@ ChromeDriver
 下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到当前目录就行(如果是放在 python 根目录可以不用在实例化 selenium 时指定chromedriver 路径)
 ### Matplotlib
 ### Requests
 经典老碟
@ -212,10 +214,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
 ## 参考链接
 1，[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html)
--- a/historyPrice.py
+++ b/historyPrice.py
@ -19,6 +19,18 @@ class historyPriceItem:
        item = self.response.xpath(reg)[0]
        item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
        def getCommit() -> str:
            """Return the cleaned '商品点评：' text from the history-price page.

            Takes the first text node under the div whose ``data-content`` is
            '商品点评：', normalises it with ``pipelines.myreplace`` in 'all' mode,
            and returns it without its first five characters and its last one.
            The ``[0]`` index fails if the XPath matches nothing.
            """
            matches = self.response.xpath("//div[@data-content='商品点评：']/text()")
            cleaned = pipelines.myreplace(matches[0], mode='all')
            # NOTE(review): the five leading characters are presumably the
            # '商品点评：' label and the trailing one a unit suffix -- confirm
            # against a live page before relying on the slice.
            return str(cleaned[5:-1])
        def getTags() -> str:
            """Return the cleaned '商品类别：' text from the history-price page.

            Takes the first text node under the div whose ``data-content`` is
            '商品类别：', normalises it with ``pipelines.myreplace`` in 'all' mode,
            and returns everything after the first five characters.
            The ``[0]`` index fails if the XPath matches nothing.
            """
            matches = self.response.xpath("//div[@data-content='商品类别：']/text()")
            cleaned = pipelines.myreplace(matches[0], mode='all')
            # NOTE(review): the five dropped characters are presumably the
            # '商品类别：' label itself -- confirm against a live page.
            return str(cleaned[5:])
        def updateTime() -> str:
            reg = r"//div[@class='p3']/p[@class='tips']/text()"
            time = self.response.xpath(reg)[0]
@ -36,7 +48,7 @@ class historyPriceItem:
                price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
            return price
-        priceHistoryList = [updateTime(), priceTrend()]
+        priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()]
        return priceHistoryList
 if __name__ == '__main__':
--- a/pipelines.py
+++ b/pipelines.py
@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 from lxml import etree
 import csv
 import os
 import time
 import historyPrice
 def gethtml(response, gethtml_mode = "url"):  # 用etree格式化得到的对象
@ -34,8 +32,9 @@ def getidlist(response) -> list:    # 获取id
 def myreplace(text, mode = '') -> str: # 简单的处理输出
    if mode == 'all':
-        return text.strip().replace(' ', '').replace("\r\n", '')
+        return text.strip().replace(' ', '').replace("\r\n", '').replace('\r', '').replace('\n', '')
    elif mode == 'strip': return text.strip().replace('\r', '')
    elif mode == 'n': return text.replace('\n', '')
    else: return text.strip()
 def isElementTree(response) -> bool:    # 用于判断是否已经为etree对象
@ -50,7 +49,7 @@ class item:
        self.id = id
        self.response = response
-    def getitem(self) -> list:
+    def getitem(self) -> str:
        reg = r"//li[@data-sku='" + str(self.id) + r"']"
        item = self.response.xpath(reg)[0]
        item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
@ -63,6 +62,14 @@ class item:
            # print(name)
            return name
        def shop() -> str:
            """Return the shop name text for this product entry.

            Re-parses the serialized ``item`` HTML and takes the first text node
            of the ``a`` inside a ``span`` directly under the div whose class is
            exactly 'p-shop'. The value is passed through ``myreplace`` with its
            default mode. The ``[0]`` index fails if nothing matches.
            """
            shop_nodes = etree.HTML(item).xpath("//div[@class='p-shop']/span/a/text()")
            return myreplace(shop_nodes[0])
        def price() -> str:
            reg = r"//i[@data-price]/text()"
            html = etree.HTML(item)
@ -97,9 +104,11 @@ class item:
        historyPriceItem = historyPrice.historyPriceItem(self.id)
        priceHistoryList = historyPriceItem.gethistoryPrice()
        # print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
-        itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
+        # itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
-        return itemlist
+        itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
        return itemString
 def print2console(response):    # 输出到命令行
@ -128,39 +137,33 @@ def print2console(response):    # 输出到命令行
 def write2csv(response, filename_csv):    # 写入到csv文件
-    def write(writer):
+    def writer(fd):
        for id in getidlist(response):
            print('flag1')
            if int(id) < 1000:
                continue
            aitem = item(id, gethtml(response))
-            itemlist = aitem.getitem()
+            itemString = aitem.getitem()
-            # print(itemlist)
+            # print(itemList)
            writer.writerow(itemlist)
    try:
        if os.path.exists(filename_csv):
            with open(filename_csv, 'a+', encoding = 'utf-8-sig', newline = '') as fd:  # 存在，文件尾追加
            try:
-                    writer = csv.writer(fd)
+                fd.write(itemString)
            except BaseException as e:
                print(e)
-                write(writer)
+                print("sth wrong in pipelines.write2csv.write.")
                fd.close()
        else:
            with open(filename_csv, 'w+', encoding = 'utf-8-sig', newline = '') as fd:  # 不存在，创建并从文件头开始
    try:
-                    writer = csv.writer(fd)
+        if os.path.exists(filename_csv):
-                except BaseException as e:
+            with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd:  # 存在，文件尾追加
-                    print(e)
+                print('flag2')
-
+                writer(fd)
                headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势']
                writer.writerow(headers)
                write(writer)
                fd.close()
-        # print("pipelines.write2csv is done")
+        else:
            with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd:  # 不存在，创建并从文件头开始
                # headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
                headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
                print('flag3')
                fd.write(headers)
                writer(fd)
    except BaseException as e:
        print(e)
@ -174,9 +177,9 @@ if __name__ == "__main__":
 # 调试数据
 import pipelines
 from lxml import etree
-response = 'index.html'   # 文件名 or url
+response = './1320,1585,9434/index.html'   # 文件名 or url
 html = pipelines.gethtml(response, gethtml_mode = 'cache')  # cache or url
-id = '1127466'
+id = '3742086'
 aitem = pipelines.item(id, html)
 a = aitem.getitem()
@ -184,5 +187,30 @@ import historyPrice
 bitem = historyPrice.historyPriceItem(id)
 b = bitem.gethistoryPrice()
 ################
 bitem.response  # 查看requests返回的sources
 item = bitem.response.xpath(reg)[0]
 item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
 itemList = a + b
 '''
 '''
 # Xpath 调试
 from pipelines import *
 response = './1320,1585,9434/index.html'
 response = gethtml(response, gethtml_mode = 'cache')
 id = '3742086'
 reg = r"//li[@data-sku='" + str(id) + r"']"
 item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
 reg = "//div[@class='p-shop']/span/a/text()"
 html = etree.HTML(item)
 name = html.xpath(reg)[0]
 '''