|
|
@ -1,8 +1,6 @@
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from lxml import etree
|
|
|
|
from lxml import etree
|
|
|
|
import csv
|
|
|
|
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
|
|
|
|
import historyPrice
|
|
|
|
import historyPrice
|
|
|
|
|
|
|
|
|
|
|
|
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
|
|
|
|
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
|
|
|
@ -34,8 +32,9 @@ def getidlist(response) -> list: # 获取id
|
|
|
|
|
|
|
|
|
|
|
|
def myreplace(text, mode = '') -> str: # 简单的处理输出
|
|
|
|
def myreplace(text, mode = '') -> str: # 简单的处理输出
|
|
|
|
if mode == 'all':
|
|
|
|
if mode == 'all':
|
|
|
|
return text.strip().replace(' ', '').replace("\r\n", '')
|
|
|
|
return text.strip().replace(' ', '').replace("\r\n", '').replace('\r', '').replace('\n', '')
|
|
|
|
elif mode == 'strip': return text.strip().replace('\r', '')
|
|
|
|
elif mode == 'strip': return text.strip().replace('\r', '')
|
|
|
|
|
|
|
|
elif mode == 'n': return text.replace('\n', '')
|
|
|
|
else: return text.strip()
|
|
|
|
else: return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
def isElementTree(response) -> bool: # 用于判断是否已经为etree对象
|
|
|
|
def isElementTree(response) -> bool: # 用于判断是否已经为etree对象
|
|
|
@ -50,7 +49,7 @@ class item:
|
|
|
|
self.id = id
|
|
|
|
self.id = id
|
|
|
|
self.response = response
|
|
|
|
self.response = response
|
|
|
|
|
|
|
|
|
|
|
|
def getitem(self) -> list:
|
|
|
|
def getitem(self) -> str:
|
|
|
|
reg = r"//li[@data-sku='" + str(self.id) + r"']"
|
|
|
|
reg = r"//li[@data-sku='" + str(self.id) + r"']"
|
|
|
|
item = self.response.xpath(reg)[0]
|
|
|
|
item = self.response.xpath(reg)[0]
|
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
@ -63,6 +62,14 @@ class item:
|
|
|
|
# print(name)
|
|
|
|
# print(name)
|
|
|
|
return name
|
|
|
|
return name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def shop() -> str:
|
|
|
|
|
|
|
|
reg = "//div[@class='p-shop']/span/a/text()"
|
|
|
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
|
|
|
shop = html.xpath(reg)[0]
|
|
|
|
|
|
|
|
shop = myreplace(shop)
|
|
|
|
|
|
|
|
# print(shop)
|
|
|
|
|
|
|
|
return shop
|
|
|
|
|
|
|
|
|
|
|
|
def price() -> str:
|
|
|
|
def price() -> str:
|
|
|
|
reg = r"//i[@data-price]/text()"
|
|
|
|
reg = r"//i[@data-price]/text()"
|
|
|
|
html = etree.HTML(item)
|
|
|
|
html = etree.HTML(item)
|
|
|
@ -97,9 +104,11 @@ class item:
|
|
|
|
|
|
|
|
|
|
|
|
historyPriceItem = historyPrice.historyPriceItem(self.id)
|
|
|
|
historyPriceItem = historyPrice.historyPriceItem(self.id)
|
|
|
|
priceHistoryList = historyPriceItem.gethistoryPrice()
|
|
|
|
priceHistoryList = historyPriceItem.gethistoryPrice()
|
|
|
|
|
|
|
|
# print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
|
|
|
|
|
|
|
|
|
|
|
|
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
|
|
|
|
# itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
|
|
|
|
return itemlist
|
|
|
|
itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
|
|
|
|
|
|
|
|
return itemString
|
|
|
|
|
|
|
|
|
|
|
|
def print2console(response): # 输出到命令行
|
|
|
|
def print2console(response): # 输出到命令行
|
|
|
|
|
|
|
|
|
|
|
@ -128,39 +137,33 @@ def print2console(response): # 输出到命令行
|
|
|
|
|
|
|
|
|
|
|
|
def write2csv(response, filename_csv): # 写入到csv文件
|
|
|
|
def write2csv(response, filename_csv): # 写入到csv文件
|
|
|
|
|
|
|
|
|
|
|
|
def write(writer):
|
|
|
|
def writer(fd):
|
|
|
|
for id in getidlist(response):
|
|
|
|
for id in getidlist(response):
|
|
|
|
|
|
|
|
print('flag1')
|
|
|
|
if int(id) < 1000:
|
|
|
|
if int(id) < 1000:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
aitem = item(id, gethtml(response))
|
|
|
|
aitem = item(id, gethtml(response))
|
|
|
|
itemlist = aitem.getitem()
|
|
|
|
itemString = aitem.getitem()
|
|
|
|
# print(itemlist)
|
|
|
|
# print(itemList)
|
|
|
|
writer.writerow(itemlist)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
|
|
if os.path.exists(filename_csv):
|
|
|
|
|
|
|
|
with open(filename_csv, 'a+', encoding = 'utf-8-sig', newline = '') as fd: # 存在,文件尾追加
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
writer = csv.writer(fd)
|
|
|
|
fd.write(itemString)
|
|
|
|
except BaseException as e:
|
|
|
|
except BaseException as e:
|
|
|
|
print(e)
|
|
|
|
print(e)
|
|
|
|
write(writer)
|
|
|
|
print("sth wrong in pipelines.write2csv.write.")
|
|
|
|
fd.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
with open(filename_csv, 'w+', encoding = 'utf-8-sig', newline = '') as fd: # 不存在,创建并从文件头开始
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
writer = csv.writer(fd)
|
|
|
|
if os.path.exists(filename_csv):
|
|
|
|
except BaseException as e:
|
|
|
|
with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加
|
|
|
|
print(e)
|
|
|
|
print('flag2')
|
|
|
|
|
|
|
|
writer(fd)
|
|
|
|
headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势']
|
|
|
|
|
|
|
|
writer.writerow(headers)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
write(writer)
|
|
|
|
|
|
|
|
fd.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# print("pipelines.write2csv is done")
|
|
|
|
else:
|
|
|
|
|
|
|
|
with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始
|
|
|
|
|
|
|
|
# headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
|
|
|
|
|
|
|
|
headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
|
|
|
|
|
|
|
|
print('flag3')
|
|
|
|
|
|
|
|
fd.write(headers)
|
|
|
|
|
|
|
|
writer(fd)
|
|
|
|
|
|
|
|
|
|
|
|
except BaseException as e:
|
|
|
|
except BaseException as e:
|
|
|
|
print(e)
|
|
|
|
print(e)
|
|
|
@ -174,9 +177,9 @@ if __name__ == "__main__":
|
|
|
|
# 调试数据
|
|
|
|
# 调试数据
|
|
|
|
import pipelines
|
|
|
|
import pipelines
|
|
|
|
from lxml import etree
|
|
|
|
from lxml import etree
|
|
|
|
response = 'index.html' # 文件名 or url
|
|
|
|
response = './1320,1585,9434/index.html' # 文件名 or url
|
|
|
|
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
|
|
|
|
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
|
|
|
|
id = '1127466'
|
|
|
|
id = '3742086'
|
|
|
|
aitem = pipelines.item(id, html)
|
|
|
|
aitem = pipelines.item(id, html)
|
|
|
|
a = aitem.getitem()
|
|
|
|
a = aitem.getitem()
|
|
|
|
|
|
|
|
|
|
|
@ -184,5 +187,30 @@ import historyPrice
|
|
|
|
bitem = historyPrice.historyPriceItem(id)
|
|
|
|
bitem = historyPrice.historyPriceItem(id)
|
|
|
|
b = bitem.gethistoryPrice()
|
|
|
|
b = bitem.gethistoryPrice()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bitem.response # 查看requests返回的sources
|
|
|
|
|
|
|
|
item = bitem.response.xpath(reg)[0]
|
|
|
|
|
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
itemList = a + b
|
|
|
|
itemList = a + b
|
|
|
|
'''
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
# Xpath 调试
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pipelines import *
|
|
|
|
|
|
|
|
response = './1320,1585,9434/index.html'
|
|
|
|
|
|
|
|
response = gethtml(response, gethtml_mode = 'cache')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
id = '3742086'
|
|
|
|
|
|
|
|
reg = r"//li[@data-sku='" + str(id) + r"']"
|
|
|
|
|
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reg = "//div[@class='p-shop']/span/a/text()"
|
|
|
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
|
|
|
name = html.xpath(reg)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|