master
wkyuu 3 years ago
parent 3adaf63228
commit 22a274b5a8

@ -181,6 +181,8 @@ ChromeDriver
下载 [ChromeDriver](https://chromedriver.chromium.org/home) 放到当前目录就行(如果是放在 python 根目录可以不用在实例化 selenium 时指定chromedriver 路径)
### Matplotlib
### Requests
经典老碟
@ -212,10 +214,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
## 参考链接
1[selenium+python自动化100-centos上搭建selenium启动chrome浏览器headless无界面模式](https://www.cnblogs.com/yoyoketang/p/11582012.html)

@ -19,6 +19,18 @@ class historyPriceItem:
item = self.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
def getCommit() -> str:
    """Return the cleaned text of the '商品点评:' block of the page.

    Takes the first text node of the div whose ``data-content`` attribute is
    '商品点评:', normalises it with ``pipelines.myreplace(mode='all')`` and
    drops the first five characters and the last one.

    Raises:
        IndexError: if the XPath query matches nothing.
    """
    # NOTE(review): the value seems to feed the "评论数量(条)" CSV column;
    # the [5:-1] slice presumably removes a fixed label prefix and a trailing
    # unit character -- confirm against a live page.
    xpath_expr = "//div[@data-content='商品点评:']/text()"
    first_node = self.response.xpath(xpath_expr)[0]
    cleaned = pipelines.myreplace(first_node, mode = 'all')
    return str(cleaned[5:-1])
def getTags() -> str:
    """Return the cleaned text of the '商品类别:' block of the page.

    Takes the first text node of the div whose ``data-content`` attribute is
    '商品类别:', normalises it with ``pipelines.myreplace(mode='all')`` and
    drops the first five characters.

    Raises:
        IndexError: if the XPath query matches nothing.
    """
    # NOTE(review): the [5:] slice presumably strips a fixed label prefix --
    # confirm against a live page.
    xpath_expr = "//div[@data-content='商品类别:']/text()"
    first_node = self.response.xpath(xpath_expr)[0]
    cleaned = pipelines.myreplace(first_node, mode = 'all')
    return str(cleaned[5:])
def updateTime() -> str:
reg = r"//div[@class='p3']/p[@class='tips']/text()"
time = self.response.xpath(reg)[0]
@ -36,7 +48,7 @@ class historyPriceItem:
price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
return price
priceHistoryList = [updateTime(), priceTrend()]
priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()]
return priceHistoryList
if __name__ == '__main__':

@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
from lxml import etree
import csv
import os
import time
import historyPrice
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
@ -34,8 +32,9 @@ def getidlist(response) -> list: # 获取id
def myreplace(text, mode = '') -> str: # simple clean-up of scraped text
    """Strip and normalise a scraped text fragment.

    Args:
        text: the string to clean.
        mode: selects the clean-up applied:
            'all'   -- strip both ends, then remove every space, carriage
                       return and line feed inside the text;
            'strip' -- strip both ends, then remove carriage returns;
            'n'     -- remove line feeds only (no stripping);
            anything else -- strip both ends only.

    Returns:
        The cleaned string.
    """
    if mode == 'all':
        # Removing '\r' and '\n' individually also covers "\r\n" pairs, so a
        # lone line break can no longer survive inside the value.
        return text.strip().replace(' ', '').replace('\r', '').replace('\n', '')
    elif mode == 'strip':
        return text.strip().replace('\r', '')
    elif mode == 'n':
        return text.replace('\n', '')
    else:
        return text.strip()
def isElementTree(response) -> bool: # 用于判断是否已经为etree对象
@ -50,7 +49,7 @@ class item:
self.id = id
self.response = response
def getitem(self) -> list:
def getitem(self) -> str:
reg = r"//li[@data-sku='" + str(self.id) + r"']"
item = self.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
@ -63,6 +62,14 @@ class item:
# print(name)
return name
def shop() -> str:
    """Return the stripped text of the first shop link in the item HTML.

    Parses the enclosing ``item`` HTML string and reads the first text node
    matching ``//div[@class='p-shop']/span/a``.

    Raises:
        IndexError: if the XPath query matches nothing.
    """
    shop_xpath = "//div[@class='p-shop']/span/a/text()"
    matches = etree.HTML(item).xpath(shop_xpath)
    return myreplace(matches[0])
def price() -> str:
reg = r"//i[@data-price]/text()"
html = etree.HTML(item)
@ -97,9 +104,11 @@ class item:
historyPriceItem = historyPrice.historyPriceItem(self.id)
priceHistoryList = historyPriceItem.gethistoryPrice()
# print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
return itemlist
# itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
return itemString
def print2console(response): # 输出到命令行
@ -128,40 +137,34 @@ def print2console(response): # 输出到命令行
def write2csv(response, filename_csv): # write the scraped items to a csv file
    """Append one CSV line per product found in ``response`` to ``filename_csv``.

    If the file does not exist yet it is created and the header line is
    written first; otherwise the new lines are appended to the end.

    Args:
        response: page source passed on to ``getidlist`` and ``gethtml``.
        filename_csv: path of the CSV file to create or extend.

    Errors are reported on stdout and not re-raised (best-effort export).
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught.
    """
    headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"

    def writer(fd):
        """Write the line returned by ``item.getitem()`` for every id to ``fd``."""
        for sku_id in getidlist(response):
            # NOTE(review): ids below 1000 are skipped; presumably they are
            # not real product ids -- confirm.
            if int(sku_id) < 1000:
                continue
            aitem = item(sku_id, gethtml(response))
            itemString = aitem.getitem()
            try:
                fd.write(itemString)
            except Exception as e:
                # A failed write must not abort the remaining items.
                print(e)
                print("sth wrong in pipelines.write2csv.write.")

    try:
        is_new_file = not os.path.exists(filename_csv)
        # 'w+' creates the file and starts at its beginning; 'a+' appends to
        # the end of an existing file.  The ``with`` block closes the file.
        open_mode = 'w+' if is_new_file else 'a+'
        with open(filename_csv, open_mode, encoding = 'utf-8-sig') as fd:
            if is_new_file:
                fd.write(headers)
            writer(fd)
    except Exception as e:
        print(e)
        print("sth wrong in pipelines.write2csv")
@ -174,9 +177,9 @@ if __name__ == "__main__":
# 调试数据
import pipelines
from lxml import etree
response = 'index.html' # 文件名 or url
response = './1320,1585,9434/index.html' # 文件名 or url
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
id = '1127466'
id = '3742086'
aitem = pipelines.item(id, html)
a = aitem.getitem()
@ -184,5 +187,30 @@ import historyPrice
bitem = historyPrice.historyPriceItem(id)
b = bitem.gethistoryPrice()
################
bitem.response # 查看requests返回的sources
item = bitem.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
itemList = a + b
'''
'''
# Xpath 调试
from pipelines import *
response = './1320,1585,9434/index.html'
response = gethtml(response, gethtml_mode = 'cache')
id = '3742086'
reg = r"//li[@data-sku='" + str(id) + r"']"
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
reg = "//div[@class='p-shop']/span/a/text()"
html = etree.HTML(item)
name = html.xpath(reg)[0]
'''
Loading…
Cancel
Save