diff --git a/historyPrice.py b/historyPrice.py index 419ff2a..da867b1 100644 --- a/historyPrice.py +++ b/historyPrice.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import string from lxml import etree import downloader @@ -20,26 +21,35 @@ class historyPriceItem: item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') def getCommit() -> str: + string = '商品点评:' + if check(string) == False: # 用于判断有无商品点评 + return '' reg = "//div[@data-content='商品点评:']/text()" commit = self.response.xpath(reg)[0] commit = pipelines.myreplace(commit, mode = 'all') return str(commit[5:-1]) def getTags() -> str: + string = '商品类别:' + if check(string) == False: # 用于判断有无商品类别 + return '' reg = "//div[@data-content='商品类别:']/text()" tags = self.response.xpath(reg)[0] tags = pipelines.myreplace(tags, mode = 'all') return str(tags[5:]) def updateTime() -> str: + string = 'timeline-text' + if check(string) == False: # 用于判断有无数据更新时间记录 + return '' reg = r"//div[@class='p3']/p[@class='tips']/text()" time = self.response.xpath(reg)[0] time = pipelines.myreplace(time, mode = 'strip') return str(time[5:]) def priceTrend() -> str: - check = 'timeline-text' - if not check in item: # 用于判断有无历史价格记录 + string = 'timeline-text' + if check(string) == False: # 用于判断有无历史价格记录 return '' reg = r"//div[@class='timeline-text']/p/text()" regList = self.response.xpath(reg) @@ -48,12 +58,16 @@ class historyPriceItem: price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';' return price + def check(string, item = item) -> bool: + if string in item: return True + elif not string in item: return False + priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()] return priceHistoryList if __name__ == '__main__': # id = "10036840192083" - id = "11564571796" # More + id = "100020511880" # More aitem = historyPriceItem(id) print(aitem.gethistoryPrice()) diff --git a/middlewares.py b/middlewares.py index b26755e..a0b5fa6 100644 --- a/middlewares.py +++ b/middlewares.py @@ -133,5 +133,4 @@ def localtest(category): # 本地加载的源码测试 time.sleep(10) print("page " + str(page) + " sleep over at " + time.ctime()) page += 1 - aaa = 1 diff --git a/pipelines.py b/pipelines.py index be2c0bf..16b1552 100644 --- a/pipelines.py +++ b/pipelines.py @@ -55,6 +55,9 @@ class item: item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') def name() -> str: + string = 'p-name p-name-type-3' + if check(string) == False: # 用于判断有无名字 + return '' reg = r"//div[@class='p-name p-name-type-3']/a/em/text()" html = etree.HTML(item) name = html.xpath(reg)[0] @@ -63,7 +66,10 @@ class item: return name def shop() -> str: - reg = "//div[@class='p-shop']/span/a/text()" + string = 'curr-shop hd-shopname' + if check(string) == False: # 用于判断有无商店信息 + return '' + reg = "//a[@class='curr-shop hd-shopname']/text()" html = etree.HTML(item) shop = html.xpath(reg)[0] shop = myreplace(shop) @@ -71,6 +77,9 @@ class item: return shop def price() -> str: + string = 'data-price' + if check(string) == False: # 用于判断有无价格信息 + return '' reg = r"//i[@data-price]/text()" html = etree.HTML(item) price = html.xpath(reg)[0] @@ -79,6 +88,9 @@ class item: return price def attribute() -> str: + string = 'attr' + if check(string) == False: # 用于判断有无标签 + return '' reg = r"//span[@class='attr']/b/text()" html = etree.HTML(item) attribute = html.xpath(reg) @@ -89,6 +101,9 @@ class item: return myreplace(attrStr) def sales() -> str: + string = 'p-icons' + if check(string) == False: # 用于判断有无促销信息 + return '' reg = r"//div[@class='p-icons']/i/text()" html = etree.HTML(item) sales = html.xpath(reg) @@ -102,12 +117,18 @@ class item: url = r"https://item.jd.com/" + str(self.id) + r".html" return url + def check(string, item = item) -> bool: + if string in item: return True + elif not string in item: return False + historyPriceItem = historyPrice.historyPriceItem(self.id) priceHistoryList = historyPriceItem.gethistoryPrice() # print("id = {}, list = {}".format(self.id, priceHistoryList[3])) # itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]] + itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3])) + # print(itemString) return itemString def print2console(response): # 输出到命令行 @@ -139,29 +160,20 @@ def write2csv(response, filename_csv): # 写入到csv文件 def writer(fd): for id in getidlist(response): - print('flag1') if int(id) < 1000: continue aitem = item(id, gethtml(response)) itemString = aitem.getitem() - # print(itemList) - try: - fd.write(itemString) - except BaseException as e: - print(e) - print("sth wrong in pipelines.write2csv.write.") + fd.write(itemString) try: if os.path.exists(filename_csv): with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加 - print('flag2') writer(fd) else: with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始 - # headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势'] headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n" - print('flag3') fd.write(headers) writer(fd) @@ -171,6 +183,13 @@ def write2csv(response, filename_csv): # 写入到csv文件 if __name__ == "__main__": pass +''' +# 调试 + filename_csv = os.getcwd() + '\\' + "milk.csv" + response = './1320,1585,9434/1320,1585,9434&page=1.html' + res = gethtml(response, gethtml_mode = 'cache') + write2csv(res, filename_csv) +''' '''