diff --git a/README.md b/README.md index d00e326..2dbb934 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,8 @@ ChromeDriver 下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到当前目录就行(如果是放在 python 根目录可以不用在实例化 selenium 时指定chromedriver 路径) +### Matplotlib + ### Requests 经典老碟 @@ -212,10 +214,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db = - - - - ## 参考链接 1,[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html) diff --git a/historyPrice.py b/historyPrice.py index 6f80d3f..419ff2a 100644 --- a/historyPrice.py +++ b/historyPrice.py @@ -19,6 +19,18 @@ class historyPriceItem: item = self.response.xpath(reg)[0] item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') + def getCommit() -> str: + reg = "//div[@data-content='商品点评:']/text()" + commit = self.response.xpath(reg)[0] + commit = pipelines.myreplace(commit, mode = 'all') + return str(commit[5:-1]) + + def getTags() -> str: + reg = "//div[@data-content='商品类别:']/text()" + tags = self.response.xpath(reg)[0] + tags = pipelines.myreplace(tags, mode = 'all') + return str(tags[5:]) + def updateTime() -> str: reg = r"//div[@class='p3']/p[@class='tips']/text()" time = self.response.xpath(reg)[0] @@ -36,7 +48,7 @@ class historyPriceItem: price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';' return price - priceHistoryList = [updateTime(), priceTrend()] + priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()] return priceHistoryList if __name__ == '__main__': diff --git a/pipelines.py b/pipelines.py index 22da452..be2c0bf 100644 --- a/pipelines.py +++ b/pipelines.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from lxml import etree -import csv import os -import time import historyPrice def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象 @@ -34,8 +32,9 @@ def getidlist(response) -> list: # 获取id def myreplace(text, mode = '') -> str: # 简单的处理输出 if mode == 'all': - return text.strip().replace(' ', '').replace("\r\n", '') + return text.strip().replace(' ', '').replace("\r\n", '').replace('\r', '').replace('\n', '') elif mode == 'strip': return text.strip().replace('\r', '') + elif mode == 'n': return text.replace('\n', '') else: return text.strip() def isElementTree(response) -> bool: # 用于判断是否已经为etree对象 @@ -50,7 +49,7 @@ class item: self.id = id self.response = response - def getitem(self) -> list: + def getitem(self) -> str: reg = r"//li[@data-sku='" + str(self.id) + r"']" item = self.response.xpath(reg)[0] item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') @@ -63,6 +62,14 @@ class item: # print(name) return name + def shop() -> str: + reg = "//div[@class='p-shop']/span/a/text()" + html = etree.HTML(item) + shop = html.xpath(reg)[0] + shop = myreplace(shop) + # print(shop) + return shop + def price() -> str: reg = r"//i[@data-price]/text()" html = etree.HTML(item) @@ -97,9 +104,11 @@ class item: historyPriceItem = historyPrice.historyPriceItem(self.id) priceHistoryList = historyPriceItem.gethistoryPrice() + # print("id = {}, list = {}".format(self.id, priceHistoryList[3])) - itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList - return itemlist + # itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]] + itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3])) + return itemString def print2console(response): # 输出到命令行 @@ -128,40 +137,34 @@ def print2console(response): # 输出到命令行 def write2csv(response, filename_csv): # 写入到csv文件 - def write(writer): + def writer(fd): for id in getidlist(response): + print('flag1') if int(id) < 1000: continue aitem = item(id, gethtml(response)) - itemlist = aitem.getitem() - # print(itemlist) - writer.writerow(itemlist) + itemString = aitem.getitem() + # print(itemList) + try: + fd.write(itemString) + except BaseException as e: + print(e) + print("sth wrong in pipelines.write2csv.write.") try: if os.path.exists(filename_csv): - with open(filename_csv, 'a+', encoding = 'utf-8-sig', newline = '') as fd: # 存在,文件尾追加 - try: - writer = csv.writer(fd) - except BaseException as e: - print(e) - write(writer) - fd.close() + with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加 + print('flag2') + writer(fd) else: - with open(filename_csv, 'w+', encoding = 'utf-8-sig', newline = '') as fd: # 不存在,创建并从文件头开始 - try: - writer = csv.writer(fd) - except BaseException as e: - print(e) - - headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势'] - writer.writerow(headers) + with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始 + # headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势'] + headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n" + print('flag3') + fd.write(headers) + writer(fd) - write(writer) - fd.close() - - # print("pipelines.write2csv is done") - except BaseException as e: print(e) print("sth wrong in pipelines.write2csv") @@ -174,9 +177,9 @@ if __name__ == "__main__": # 调试数据 import pipelines from lxml import etree -response = 'index.html' # 文件名 or url +response = './1320,1585,9434/index.html' # 文件名 or url html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url -id = '1127466' +id = '3742086' aitem = pipelines.item(id, html) a = aitem.getitem() @@ -184,5 +187,30 @@ import historyPrice bitem = historyPrice.historyPriceItem(id) b = bitem.gethistoryPrice() +################ + +bitem.response # 查看requests返回的sources +item = bitem.response.xpath(reg)[0] +item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') + + itemList = a + b +''' + +''' +# Xpath 调试 + +from pipelines import * +response = './1320,1585,9434/index.html' +response = gethtml(response, gethtml_mode = 'cache') + +id = '3742086' +reg = r"//li[@data-sku='" + str(id) + r"']" +item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8') + +reg = "//div[@class='p-shop']/span/a/text()" +html = etree.HTML(item) +name = html.xpath(reg)[0] + + ''' \ No newline at end of file