解决了xpath解析问题,爬虫部分还差一个requests那块的多线程,由于时间开销太大,会考虑缩减展示的数量

master
wkyuu 3 years ago
parent 5167ed6bc8
commit f391dc8362

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import string
from lxml import etree
import downloader
@ -20,26 +21,35 @@ class historyPriceItem:
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
def getCommit() -> str:
    """Return the item's review-count text scraped from the page, or '' when absent.

    Relies on enclosing-scope names: `check` (substring presence test on the
    cached item HTML), `self.response` (parsed lxml tree) and `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` — that name shadowed the stdlib `string`
    # module imported at the top of the file.
    marker = '商品点评:'
    if not check(marker):  # page has no review section at all
        return ''
    reg = "//div[@data-content='商品点评:']/text()"
    commit = self.response.xpath(reg)[0]
    commit = pipelines.myreplace(commit, mode = 'all')
    # [5:-1] strips the 5-char Chinese label prefix and the trailing character
    return str(commit[5:-1])
def getTags() -> str:
    """Return the item's category text scraped from the page, or '' when absent.

    Relies on enclosing-scope names: `check`, `self.response`, `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` — that name shadowed the stdlib `string`
    # module imported at the top of the file.
    marker = '商品类别:'
    if not check(marker):  # page has no category section
        return ''
    reg = "//div[@data-content='商品类别:']/text()"
    tags = self.response.xpath(reg)[0]
    tags = pipelines.myreplace(tags, mode = 'all')
    # [5:] strips the 5-char Chinese label prefix
    return str(tags[5:])
def updateTime() -> str:
    """Return the price-data update-time text from the page, or '' when absent.

    Relies on enclosing-scope names: `check`, `self.response`, `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` (shadowed the stdlib `string` module).
    marker = 'timeline-text'
    if not check(marker):  # no price-history timeline on the page
        return ''
    reg = r"//div[@class='p3']/p[@class='tips']/text()"
    # NOTE: renamed from `time` to avoid shadowing the stdlib `time` module.
    updated = self.response.xpath(reg)[0]
    updated = pipelines.myreplace(updated, mode = 'strip')
    # [5:] strips the 5-char label prefix
    return str(updated[5:])
def priceTrend() -> str:
check = 'timeline-text'
if not check in item: # 用于判断有无历史价格记录
string = 'timeline-text'
if check(string) == False: # 用于判断有无历史价格记录
return ''
reg = r"//div[@class='timeline-text']/p/text()"
regList = self.response.xpath(reg)
@ -48,12 +58,16 @@ class historyPriceItem:
price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
return price
def check(string, item = item) -> bool:
    """Return True if *string* occurs in the captured item HTML (*item*).

    *item* defaults to the enclosing scope's page snapshot, bound at def time.
    """
    # Membership test already yields a bool — no if/elif returning literals.
    return string in item
priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()]
return priceHistoryList
if __name__ == '__main__':
# id = "10036840192083"
id = "11564571796" # More
id = "100020511880" # More
aitem = historyPriceItem(id)
print(aitem.gethistoryPrice())

@ -133,5 +133,4 @@ def localtest(category): # 本地加载的源码测试
time.sleep(10)
print("page " + str(page) + " sleep over at " + time.ctime())
page += 1
aaa = 1

@ -55,6 +55,9 @@ class item:
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
def name() -> str:
string = 'p-name p-name-type-3'
if check(string) == False: # 用于判断有无名字
return ''
reg = r"//div[@class='p-name p-name-type-3']/a/em/text()"
html = etree.HTML(item)
name = html.xpath(reg)[0]
@ -63,7 +66,10 @@ class item:
return name
def shop() -> str:
reg = "//div[@class='p-shop']/span/a/text()"
string = 'curr-shop hd-shopname'
if check(string) == False: # 用于判断有无商店信息
return ''
reg = "//a[@class='curr-shop hd-shopname']/text()"
html = etree.HTML(item)
shop = html.xpath(reg)[0]
shop = myreplace(shop)
@ -71,6 +77,9 @@ class item:
return shop
def price() -> str:
string = 'data-price'
if check(string) == False: # 用于判断有无价格信息
return ''
reg = r"//i[@data-price]/text()"
html = etree.HTML(item)
price = html.xpath(reg)[0]
@ -79,6 +88,9 @@ class item:
return price
def attribute() -> str:
string = 'attr'
if check(string) == False: # 用于判断有无标签
return ''
reg = r"//span[@class='attr']/b/text()"
html = etree.HTML(item)
attribute = html.xpath(reg)
@ -89,6 +101,9 @@ class item:
return myreplace(attrStr)
def sales() -> str:
string = 'p-icons'
if check(string) == False: # 用于判断有无促销信息
return ''
reg = r"//div[@class='p-icons']/i/text()"
html = etree.HTML(item)
sales = html.xpath(reg)
@ -102,12 +117,18 @@ class item:
url = r"https://item.jd.com/" + str(self.id) + r".html"
return url
def check(string, item = item) -> bool:
    """Return True if *string* occurs in the captured item HTML (*item*).

    *item* defaults to the enclosing scope's page snapshot, bound at def time.
    """
    # Membership test already yields a bool — no if/elif returning literals.
    return string in item
historyPriceItem = historyPrice.historyPriceItem(self.id)
priceHistoryList = historyPriceItem.gethistoryPrice()
# print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
# itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
# print(itemString)
return itemString
def print2console(response): # 输出到命令行
@ -139,29 +160,20 @@ def write2csv(response, filename_csv): # 写入到csv文件
def writer(fd):
for id in getidlist(response):
print('flag1')
if int(id) < 1000:
continue
aitem = item(id, gethtml(response))
itemString = aitem.getitem()
# print(itemList)
try:
fd.write(itemString)
except BaseException as e:
print(e)
print("sth wrong in pipelines.write2csv.write.")
fd.write(itemString)
try:
if os.path.exists(filename_csv):
with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加
print('flag2')
writer(fd)
else:
with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始
# headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
print('flag3')
fd.write(headers)
writer(fd)
@ -171,6 +183,13 @@ def write2csv(response, filename_csv): # 写入到csv文件
if __name__ == "__main__":
pass
'''
# 调试
filename_csv = os.getcwd() + '\\' + "milk.csv"
response = './1320,1585,9434/1320,1585,9434&page=1.html'
res = gethtml(response, gethtml_mode = 'cache')
write2csv(res, filename_csv)
'''
'''

Loading…
Cancel
Save