update

3 years ago · 22a274b5a8
parent 3adaf63228
commit 22a274b5a8
3 changed files with 75 additions and 37 deletions
--- a/README.md
+++ b/README.md
@ -181,6 +181,8 @@ ChromeDriver

 下载 [ChromeDriver](https://chromedriver.chromium.org/home) 并放到当前目录即可（如果放在 Python 根目录，实例化 selenium 时可以不指定 chromedriver 路径）。

+### Matplotlib
+
 ### Requests

 经典老碟
@ -212,10 +214,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =



-
-
-
-
 ## 参考链接

 1，[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html)
--- a/historyPrice.py
+++ b/historyPrice.py
@ -19,6 +19,18 @@ class historyPriceItem:
        item = self.response.xpath(reg)[0]
        item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')

+        def getCommit() -> str:
+            reg = "//div[@data-content='商品点评：']/text()"
+            commit = self.response.xpath(reg)[0]
+            commit = pipelines.myreplace(commit, mode = 'all')
+            return str(commit[5:-1])
+
+        def getTags() -> str:
+            reg = "//div[@data-content='商品类别：']/text()"
+            tags = self.response.xpath(reg)[0]
+            tags = pipelines.myreplace(tags, mode = 'all')
+            return str(tags[5:])
+
        def updateTime() -> str:
            reg = r"//div[@class='p3']/p[@class='tips']/text()"
            time = self.response.xpath(reg)[0]
@ -36,7 +48,7 @@ class historyPriceItem:
                price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
            return price

-        priceHistoryList = [updateTime(), priceTrend()]
+        priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()]
        return priceHistoryList
        
 if __name__ == '__main__':
--- a/pipelines.py
+++ b/pipelines.py
@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 from lxml import etree
-import csv
 import os
-import time
 import historyPrice

 def gethtml(response, gethtml_mode = "url"):  # 用etree格式化得到的对象
@ -34,8 +32,9 @@ def getidlist(response) -> list:    # 获取id

 def myreplace(text, mode = '') -> str: # 简单的处理输出
    if mode == 'all':
-        return text.strip().replace(' ', '').replace("\r\n", '')
+        return text.strip().replace(' ', '').replace("\r\n", '').replace('\r', '').replace('\n', '')
    elif mode == 'strip': return text.strip().replace('\r', '')
+    elif mode == 'n': return text.replace('\n', '')
    else: return text.strip()

 def isElementTree(response) -> bool:    # 用于判断是否已经为etree对象
@ -50,7 +49,7 @@ class item:
        self.id = id
        self.response = response

-    def getitem(self) -> list:
+    def getitem(self) -> str:
        reg = r"//li[@data-sku='" + str(self.id) + r"']"
        item = self.response.xpath(reg)[0]
        item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
@ -63,6 +62,14 @@ class item:
            # print(name)
            return name

+        def shop() -> str:
+            reg = "//div[@class='p-shop']/span/a/text()"
+            html = etree.HTML(item)
+            shop = html.xpath(reg)[0]
+            shop = myreplace(shop)
+            # print(shop)
+            return shop
+
        def price() -> str:
            reg = r"//i[@data-price]/text()"
            html = etree.HTML(item)
@ -97,9 +104,11 @@ class item:

        historyPriceItem = historyPrice.historyPriceItem(self.id)
        priceHistoryList = historyPriceItem.gethistoryPrice()
+        # print("id = {}, list = {}".format(self.id, priceHistoryList[3]))

-        itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
-        return itemlist
+        # itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
+        itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
+        return itemString

 def print2console(response):    # 输出到命令行

@ -128,40 +137,34 @@ def print2console(response):    # 输出到命令行

 def write2csv(response, filename_csv):    # 写入到csv文件

-    def write(writer):
+    def writer(fd):
        for id in getidlist(response):
+            print('flag1')
            if int(id) < 1000:
                continue
            aitem = item(id, gethtml(response))
-            itemlist = aitem.getitem()
-            # print(itemlist)
-            writer.writerow(itemlist)
+            itemString = aitem.getitem()
+            # print(itemList)
+            try:
+                fd.write(itemString)
+            except BaseException as e:
+                print(e)
+                print("sth wrong in pipelines.write2csv.write.")

    try:
        if os.path.exists(filename_csv):
-            with open(filename_csv, 'a+', encoding = 'utf-8-sig', newline = '') as fd:  # 存在，文件尾追加
-                try:
-                    writer = csv.writer(fd)
-                except BaseException as e:
-                    print(e)
-                write(writer)
-                fd.close()
+            with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd:  # 存在，文件尾追加
+                print('flag2')
+                writer(fd)

        else:
-            with open(filename_csv, 'w+', encoding = 'utf-8-sig', newline = '') as fd:  # 不存在，创建并从文件头开始
-                try:
-                    writer = csv.writer(fd)
-                except BaseException as e:
-                    print(e)
-
-                headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势']
-                writer.writerow(headers)
+            with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd:  # 不存在，创建并从文件头开始
+                # headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
+                headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
+                print('flag3')
+                fd.write(headers)
+                writer(fd)

-                write(writer)
-                fd.close()
-
-        # print("pipelines.write2csv is done")
-                
    except BaseException as e:
        print(e)
        print("sth wrong in pipelines.write2csv")
@ -174,9 +177,9 @@ if __name__ == "__main__":
 # 调试数据
 import pipelines
 from lxml import etree
-response = 'index.html'   # 文件名 or url
+response = './1320,1585,9434/index.html'   # 文件名 or url
 html = pipelines.gethtml(response, gethtml_mode = 'cache')  # cache or url
-id = '1127466'
+id = '3742086'
 aitem = pipelines.item(id, html)
 a = aitem.getitem()

@ -184,5 +187,30 @@ import historyPrice
 bitem = historyPrice.historyPriceItem(id)
 b = bitem.gethistoryPrice()

+################
+
+bitem.response  # 查看requests返回的sources
+item = bitem.response.xpath(reg)[0]
+item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
+
+
 itemList = a + b
+'''
+
+'''
+# Xpath 调试
+
+from pipelines import *
+response = './1320,1585,9434/index.html'
+response = gethtml(response, gethtml_mode = 'cache')
+
+id = '3742086'
+reg = r"//li[@data-sku='" + str(id) + r"']"
+item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
+
+reg = "//div[@class='p-shop']/span/a/text()"
+html = etree.HTML(item)
+name = html.xpath(reg)[0]
+
+
 '''