|
|
# -*- coding: utf-8 -*-
|
|
|
from lxml import etree
|
|
|
import os
|
|
|
import historyPrice
|
|
|
|
|
|
def gethtml(response, gethtml_mode = "url"):
    """Turn *response* into an lxml object that can be queried with ``.xpath()``.

    Args:
        response: Either an object for which ``isElementTree`` is true
            (returned unchanged, whatever ``gethtml_mode`` is), the source
            handed to ``etree.parse`` in ``"cache"`` mode, or the markup
            handed to ``etree.HTML`` in ``"url"`` mode.
        gethtml_mode: ``"cache"`` parses with ``etree.parse`` and a UTF-8
            ``HTMLParser``; ``"url"`` (the default) parses with ``etree.HTML``.

    Returns:
        ``response`` itself when it is already an element tree, otherwise the
        result of the selected lxml call.

    Raises:
        SystemExit: With status 1, after printing a diagnostic, when
            ``gethtml_mode`` is not recognised or when lxml fails to parse.
    """
    if isElementTree(response):
        return response

    # Validate the mode outside the try block: the exit raised here must not
    # be re-caught below and reported a second time as a parsing problem.
    if gethtml_mode not in ("cache", "url"):
        print("sth wrong with parameters in pipelines.gethtml(gethtml_mode)")
        raise SystemExit(1)

    try:
        if gethtml_mode == "cache":
            return etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
        return etree.HTML(response)
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C still propagates as
        # KeyboardInterrupt instead of being turned into a parse error.
        print(e)
        print("sth wrong in pipelines.gethtml, see your settings in settings.py")
        raise SystemExit(1) from e
|
|
|
|
|
|
def getidlist(response) -> list:
    """Return the ``data-sku`` attribute of every ``<li>`` found in the page.

    ``response`` is queried directly when ``isElementTree`` accepts it;
    otherwise it is first passed through ``gethtml`` with its default mode.
    """
    sku_xpath = r"//li/@data-sku"
    tree = response if isElementTree(response) else gethtml(response)
    return tree.xpath(sku_xpath)
|
|
|
|
|
|
def myreplace(text, mode = '') -> str:
    """Apply a light whitespace clean-up to *text*.

    Modes:
        ``'all'``   - strip both ends, then delete every space, CR and LF.
        ``'strip'`` - strip both ends, then delete every CR.
        ``'n'``     - delete every LF; nothing is stripped.
        other       - strip both ends only (the default).
    """
    # 'n' is the only mode that must not strip, so handle it first.
    if mode == 'n':
        return text.replace('\n', '')

    cleaned = text.strip()
    if mode == 'all':
        for unwanted in (' ', "\r\n", '\r', '\n'):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned
    if mode == 'strip':
        return cleaned.replace('\r', '')
    return cleaned
|
|
|
|
|
|
def isElementTree(response) -> bool:
    """Tell whether *response* is exactly an ``lxml.etree._ElementTree``.

    The decision is made by comparing the textual form of ``type(response)``,
    so only that exact class matches; any other type yields ``False``.
    """
    return str(type(response)) == "<class 'lxml.etree._ElementTree'>"
|
|
|
|
|
|
class item:
    """One product entry (an ``<li data-sku=...>`` node) of a parsed listing page."""

    def __init__(self, id, response):
        """Store the product id and the parsed page it will be looked up in.

        Args:
            id: Value of the product's ``data-sku`` attribute.
            response: Parsed page object exposing ``.xpath()``.
        """
        self.id = id
        self.response = response

    def getitem(self) -> str:
        """Build the comma-separated row describing this product.

        The row holds eleven fields, in this order: id, name, price,
        ``priceHistoryList[0]``, shop, ``priceHistoryList[1]``, attributes,
        sales tags, product url, ``priceHistoryList[2]`` and
        ``priceHistoryList[3]``, terminated by a newline. ``priceHistoryList``
        is whatever ``historyPrice.historyPriceItem(id).gethistoryPrice()``
        returns.

        Returns:
            The formatted row as a single string.

        Raises:
            IndexError: If the page has no ``<li>`` with this ``data-sku``, or
                the name, shop or price node is missing inside it.
        """
        li_xpath = r"//li[@data-sku='" + str(self.id) + r"']"
        li_node = self.response.xpath(li_xpath)[0]
        li_html = etree.tostring(li_node, encoding = 'utf-8', method = 'html').decode('utf-8')

        historyPriceItem = historyPrice.historyPriceItem(self.id)
        priceHistoryList = historyPriceItem.gethistoryPrice()

        # Parse the serialized <li> once and reuse the tree for every field;
        # the tree holds only this product, so the "//" queries below cannot
        # match nodes belonging to other products on the page.
        fragment = etree.HTML(li_html)

        def first_text(xpath):
            # First text node matched by xpath; IndexError when none matches.
            return fragment.xpath(xpath)[0]

        def joined_text(xpath):
            # All matched text nodes joined by single spaces, ends stripped.
            return myreplace(' '.join(fragment.xpath(xpath)))

        name = myreplace(first_text(r"//div[@class='p-name p-name-type-3']/a/em/text()"))
        price = str(first_text(r"//i[@data-price]/text()"))
        shop = myreplace(first_text("//div[@class='p-shop']/span/a/text()"))
        attribute = joined_text(r"//span[@class='attr']/b/text()")
        sales = joined_text(r"//div[@class='p-icons']/i/text()")
        url = r"https://item.jd.com/" + str(self.id) + r".html"

        # NOTE(review): fields are joined with bare commas and are not
        # CSV-quoted, so a comma inside any field shifts the columns - confirm
        # whether the values are sanitized upstream before adding quoting.
        itemString = "{},{},{},{},{},{},{},{},{},{},{}\n".format(
            str(self.id), name, price, priceHistoryList[0], shop,
            priceHistoryList[1], attribute, sales, url,
            priceHistoryList[2], priceHistoryList[3])
        return itemString
|
|
|
|
|
|
def print2console(response):
    """Print a short summary of every product found in *response*.

    For each ``data-sku`` id returned by ``getidlist`` (ids whose integer value
    is below 1000 are skipped) the row built by ``item.getitem`` is split on
    commas and the id, name, price, attributes, sales tags and url fields are
    printed. Any error is reported on stdout and ends the listing early; the
    final "done" line is always printed.

    Args:
        response: Page source accepted by ``gethtml`` in its default mode, or
            an already-parsed element tree.
    """

    def output(fields):
        # Indices follow the field order of item.getitem():
        # 0 id, 1 name, 2 price, 6 attributes, 7 sales tags, 8 url.
        print("商品id:" + fields[0])
        print("商品名称:" + fields[1])
        print("价格:¥" + fields[2])
        print("关键词:" + fields[6])
        print("促销活动:" + fields[7])
        print("商品链接:" + fields[8])
        print("")

    try:
        skus = getidlist(response)
        # Parse the page once and share it between all products instead of
        # re-parsing the same response for every id.
        html = gethtml(response)
        for sku in skus:
            if int(sku) < 1000:
                continue
            row = item(sku, html).getitem()
            # getitem() returns one comma-separated line, not a list, so it
            # has to be split before fields can be addressed by position.
            output(row.rstrip('\n').split(','))
    except (Exception, SystemExit) as e:
        # gethtml() signals failure with SystemExit; keep reporting that here
        # as a non-fatal error, but let KeyboardInterrupt through.
        print(e)
        print("pipelines.py didn't work properly")

    print("pipelines.print2console is done")
|
|
|
|
|
|
def write2csv(response, filename_csv):
    """Append one CSV row per product in *response* to *filename_csv*.

    The file is opened in append mode with the ``utf-8-sig`` encoding and is
    created when missing. The header line is written first whenever the file
    is new or empty. Ids whose integer value is below 1000 are skipped. A
    product whose row cannot be built or written is reported on stdout and
    skipped; any other error is reported on stdout and ends the export.

    Args:
        response: Page source accepted by ``gethtml`` in its default mode, or
            an already-parsed element tree.
        filename_csv: Path of the CSV file to create or extend.
    """
    headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"

    def writer(fd):
        skus = getidlist(response)
        # Parse the page once and share it between all products instead of
        # re-parsing the same response for every id.
        html = gethtml(response)
        for sku in skus:
            if int(sku) < 1000:
                continue
            try:
                fd.write(item(sku, html).getitem())
            except Exception as e:
                # One broken product must not stop the remaining rows.
                print(e)
                print("sth wrong in pipelines.write2csv.write.")

    try:
        # An existing but empty file needs the header just like a new one.
        need_header = (not os.path.exists(filename_csv)
                       or os.path.getsize(filename_csv) == 0)
        with open(filename_csv, 'a', encoding = 'utf-8-sig') as fd:
            if need_header:
                fd.write(headers)
            writer(fd)
    except (Exception, SystemExit) as e:
        # gethtml() signals failure with SystemExit; keep reporting that here
        # as a non-fatal error, but let KeyboardInterrupt through.
        print(e)
        print("sth wrong in pipelines.write2csv")
|
|
|
|
|
|
# Script entry point: nothing is executed when this module is run directly.
if __name__ == "__main__":
    pass
|
|
|
|
|
|
|
|
|
'''
|
|
|
# 调试数据
|
|
|
import pipelines
|
|
|
from lxml import etree
|
|
|
response = './1320,1585,9434/index.html' # 文件名 or url
|
|
|
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
|
|
|
id = '3742086'
|
|
|
aitem = pipelines.item(id, html)
|
|
|
a = aitem.getitem()
|
|
|
|
|
|
import historyPrice
|
|
|
bitem = historyPrice.historyPriceItem(id)
|
|
|
b = bitem.gethistoryPrice()
|
|
|
|
|
|
################
|
|
|
|
|
|
bitem.response # 查看requests返回的sources
|
|
|
item = bitem.response.xpath(reg)[0]
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
|
|
|
itemList = a + b
|
|
|
'''
|
|
|
|
|
|
'''
|
|
|
# Xpath 调试
|
|
|
|
|
|
from pipelines import *
|
|
|
response = './1320,1585,9434/index.html'
|
|
|
response = gethtml(response, gethtml_mode = 'cache')
|
|
|
|
|
|
id = '3742086'
|
|
|
reg = r"//li[@data-sku='" + str(id) + r"']"
item = response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
reg = "//div[@class='p-shop']/span/a/text()"
|
|
|
html = etree.HTML(item)
|
|
|
name = html.xpath(reg)[0]
|
|
|
|
|
|
|
|
|
''' |