You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

216 lines
6.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
from lxml import etree
import os
import historyPrice
def gethtml(response, gethtml_mode = "url"):
    """Turn *response* into an lxml object that supports ``.xpath``.

    Args:
        response: One of
            * an object already accepted by ``isElementTree`` -- returned
              unchanged;
            * a source accepted by ``etree.parse`` (used when
              ``gethtml_mode == "cache"``);
            * HTML markup accepted by ``etree.HTML`` (used when
              ``gethtml_mode == "url"``).
        gethtml_mode: ``"cache"`` or ``"url"`` (default).

    Returns:
        The parsed lxml object (or *response* itself when already parsed).

    Raises:
        SystemExit: after printing a diagnostic, when ``gethtml_mode`` is
            unknown or parsing fails. The exit status is 1 so that a
            failure is visible to the calling shell.
    """
    if isElementTree(response):
        return response

    # Validate the mode outside the try block: previously the SystemExit
    # raised here was itself caught by ``except BaseException`` below and
    # reported a second, misleading time.
    if gethtml_mode not in ("cache", "url"):
        print("sth wrong with parameters in pipelines.gethtml(gethtml_mode)")
        raise SystemExit(1)

    try:
        if gethtml_mode == "cache":
            return etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
        return etree.HTML(response)
    except Exception as e:
        # ``Exception`` (not ``BaseException``) so Ctrl-C and interpreter
        # shutdown are never swallowed by this handler.
        print(e)
        print("sth wrong in pipelines.gethtml, see your settings in settings.py")
        # ``raise SystemExit`` does not depend on the ``site``-provided
        # ``exit()`` helper being installed.
        raise SystemExit(1)
def getidlist(response) -> list:
    """Collect the ``data-sku`` attribute of every ``<li>`` in the page.

    *response* is used directly when ``isElementTree`` accepts it; otherwise
    it is first parsed with ``gethtml`` (default mode).
    """
    sku_query = r"//li/@data-sku"
    tree = response if isElementTree(response) else gethtml(response)
    return tree.xpath(sku_query)
def myreplace(text, mode = '') -> str:
    """Light clean-up of scraped text.

    Modes:
        ``'all'``   -- trim both ends, then delete every space, CR and LF.
        ``'strip'`` -- trim both ends, then delete every CR.
        ``'n'``     -- delete every LF; the ends are NOT trimmed.
        otherwise   -- trim both ends only.
    """
    # 'n' is the only mode that leaves surrounding whitespace alone.
    if mode == 'n':
        return text.replace('\n', '')

    trimmed = text.strip()
    if mode == 'all':
        # One pass that drops the same characters the chained replaces did.
        return trimmed.translate({ord(ch): None for ch in ' \r\n'})
    if mode == 'strip':
        return trimmed.replace('\r', '')
    return trimmed
def isElementTree(response) -> bool:
    """Tell whether *response* is exactly an ``lxml.etree._ElementTree``.

    The check compares the textual form of the object's type, so the
    function itself needs no reference to lxml.
    """
    type_repr = str(type(response))
    return type_repr == "<class 'lxml.etree._ElementTree'>"
class item:
    """One product entry of a listing page, identified by its SKU id.

    ``response`` must be an lxml object exposing ``.xpath`` that contains an
    ``<li data-sku="<id>">`` element for this product.
    """

    def __init__(self, id, response):
        # ``id`` shadows the builtin, but the parameter name is part of the
        # public signature and is therefore kept.
        self.id = id
        self.response = response

    @staticmethod
    def _first_text(tree, query) -> str:
        """Return the first match of *query* on *tree* as ``str``, or ``''``."""
        matches = tree.xpath(query)
        return str(matches[0]) if matches else ''

    @staticmethod
    def _csv_row(fields) -> str:
        """Serialise *fields* as one CSV line terminated by ``'\\n'``.

        Fields containing a comma, a double quote or a line break are quoted
        so they cannot spill into neighbouring columns.
        """
        import csv
        import io

        buffer = io.StringIO()
        csv.writer(buffer, lineterminator='\n').writerow([str(f) for f in fields])
        return buffer.getvalue()

    def getitem(self) -> str:
        """Build the CSV line describing this product.

        Column order: id, name, price, history[0], shop, history[1],
        attributes, promotions, url, history[2], history[3] -- where
        ``history`` is the sequence returned by
        ``historyPrice.historyPriceItem(id).gethistoryPrice()``. This is the
        same order as the header line written by ``write2csv``.

        A name, shop or price node that is absent from the page yields an
        empty field instead of aborting the whole row.

        Raises:
            IndexError: if the page has no ``<li>`` with this ``data-sku``.
        """
        node_query = r"//li[@data-sku='" + str(self.id) + r"']"
        li_node = self.response.xpath(node_query)[0]

        # The queries below are absolute ('//'), so the <li> is serialised
        # and re-parsed as its own document to keep them from matching other
        # products. This is done once rather than once per field.
        li_markup = etree.tostring(li_node, encoding = 'utf-8', method = 'html').decode('utf-8')
        fragment = etree.HTML(li_markup)

        history = historyPrice.historyPriceItem(self.id).gethistoryPrice()

        name = myreplace(self._first_text(fragment, r"//div[@class='p-name p-name-type-3']/a/em/text()"))
        shop = myreplace(self._first_text(fragment, "//div[@class='p-shop']/span/a/text()"))
        price = self._first_text(fragment, r"//i[@data-price]/text()")
        attribute = myreplace(' '.join(fragment.xpath(r"//span[@class='attr']/b/text()")))
        sales = myreplace(' '.join(fragment.xpath(r"//div[@class='p-icons']/i/text()")))
        url = r"https://item.jd.com/" + str(self.id) + r".html"

        return self._csv_row([
            self.id, name, price, history[0], shop, history[1],
            attribute, sales, url, history[2], history[3],
        ])
def print2console(response):
    """Print a short summary of every product found in *response*.

    ``item.getitem()`` returns one CSV line, so the line is parsed back into
    columns before printing. Indexing the string directly, as was done
    before, printed single characters instead of fields.

    Products whose SKU is below 1000 are skipped (NOTE(review): presumably
    placeholder entries -- confirm).

    Ordinary errors are reported and end the run early. ``SystemExit``
    (raised by ``gethtml`` on bad input) and ``KeyboardInterrupt`` are
    deliberately not swallowed.
    """
    import csv

    def output(fields):
        # Column positions follow the row layout produced by item.getitem():
        # 0 id, 1 name, 2 price, 6 attributes, 7 promotions, 8 url.
        print("商品id" + fields[0])
        print("商品名称:" + fields[1])
        print("价格:¥" + fields[2])
        print("关键词:" + fields[6])
        print("促销活动:" + fields[7])
        print("商品链接:" + fields[8])
        print("")

    try:
        sku_ids = getidlist(response)
        # Parse the page once instead of once per product.
        html = gethtml(response)
        for sku in sku_ids:
            if int(sku) < 1000:
                continue
            row = item(sku, html).getitem()
            output(next(csv.reader([row])))
    except Exception as e:
        print(e)
        print("pipelines.py didn't work properly")
    print("pipelines.print2console is done")
def write2csv(response, filename_csv):
    """Append one CSV line per product in *response* to *filename_csv*.

    The header line is written first when the file does not exist yet or
    exists but is empty; otherwise rows are appended to the existing data.
    Products whose SKU is below 1000 are skipped (NOTE(review): presumably
    placeholder entries -- confirm).

    A failure on a single product is reported and the remaining products
    are still written. Failures opening the file are reported as well.
    ``SystemExit`` and ``KeyboardInterrupt`` are deliberately not swallowed.
    """
    headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"

    def writer(fd):
        sku_ids = getidlist(response)
        # Parse the page once instead of once per product.
        html = gethtml(response)
        for sku in sku_ids:
            print('flag1')  # debug trace kept from the original
            try:
                if int(sku) < 1000:
                    continue
                fd.write(item(sku, html).getitem())
            except Exception as e:
                print(e)
                print("sth wrong in pipelines.write2csv.write.")

    try:
        needs_header = (not os.path.exists(filename_csv)
                        or os.path.getsize(filename_csv) == 0)
        # Append mode creates the file when it is missing.
        with open(filename_csv, 'a', encoding = 'utf-8-sig') as fd:
            if needs_header:
                print('flag3')  # debug trace kept from the original
                fd.write(headers)
            else:
                print('flag2')  # debug trace kept from the original
            writer(fd)
    except Exception as e:
        print(e)
        print("sth wrong in pipelines.write2csv")
# Running this module as a script does nothing; it is meant to be imported.
if __name__ == "__main__":
    pass
# The two string literals below are never assigned or executed. They hold
# scratch snippets for manual debugging in an interactive session:
# the first exercises item / historyPrice against a cached page.
'''
# 调试数据
import pipelines
from lxml import etree
response = './1320,1585,9434/index.html' # 文件名 or url
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
id = '3742086'
aitem = pipelines.item(id, html)
a = aitem.getitem()
import historyPrice
bitem = historyPrice.historyPriceItem(id)
b = bitem.gethistoryPrice()
################
bitem.response # 查看requests返回的sources
item = bitem.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
itemList = a + b
'''
# The second snippet is for trying out XPath expressions on a cached page.
'''
# Xpath 调试
from pipelines import *
response = './1320,1585,9434/index.html'
response = gethtml(response, gethtml_mode = 'cache')
id = '3742086'
reg = r"//li[@data-sku='" + str(id) + r"']"
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
reg = "//div[@class='p-shop']/span/a/text()"
html = etree.HTML(item)
name = html.xpath(reg)[0]
'''