|
|
# -*- coding: utf-8 -*-
|
|
|
import os
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
import historyPrice
|
|
|
|
|
|
|
|
|
def gethtml(response, gethtml_mode="url"):
    """Turn ``response`` into an lxml object that can be queried with XPath.

    Parameters:
        response: an object accepted by ``isElementTree`` (returned as is),
            or the input expected by the selected mode.
        gethtml_mode: ``"cache"`` passes ``response`` to ``etree.parse`` with
            a UTF-8 ``HTMLParser``; ``"url"`` (the default) passes it to
            ``etree.HTML``.

    Returns:
        ``response`` itself when it is already an element tree, otherwise the
        result of the parse call.

    Raises:
        SystemExit: after printing a diagnostic, when ``gethtml_mode`` is not
            one of the two known values or when parsing fails.
    """
    if isElementTree(response):
        return response

    # Validate the mode outside the try block so that its SystemExit is not
    # intercepted by the parse-error handler below.
    if gethtml_mode not in ("cache", "url"):
        print("sth wrong with parameters in pipelines.gethtml(gethtml_mode)")
        raise SystemExit

    try:
        if gethtml_mode == "cache":
            return etree.parse(response, etree.HTMLParser(encoding='utf-8'))
        return etree.HTML(response)
    except Exception as e:
        # Exception (not BaseException) lets Ctrl-C and interpreter shutdown
        # propagate untouched.
        print(e)
        print("sth wrong in pipelines.gethtml, see your settings in settings.py")
        raise SystemExit
|
|
|
|
|
|
def getidlist(response) -> list:
    """Collect the ``data-sku`` attribute of every ``<li>`` in the page.

    ``response`` is queried directly when ``isElementTree`` accepts it;
    otherwise it is first passed through ``gethtml`` with the default mode.
    """
    tree = response if isElementTree(response) else gethtml(response)
    return tree.xpath(r"//li/@data-sku")
|
|
|
|
|
|
def myreplace(text, mode='') -> str:
    """Apply a light whitespace clean-up to ``text``.

    Modes:
        'all'   - strip both ends, then delete every space, CR and LF.
        'strip' - strip both ends, then delete every CR.
        'n'     - delete every LF; the ends are NOT stripped.
        other   - strip both ends only.
    """
    # 'n' is the only mode that leaves leading/trailing whitespace alone.
    if mode == 'n':
        return text.replace('\n', '')

    cleaned = text.strip()
    if mode == 'all':
        for junk in (' ', "\r\n", '\r', '\n'):
            cleaned = cleaned.replace(junk, '')
    elif mode == 'strip':
        cleaned = cleaned.replace('\r', '')
    return cleaned
|
|
|
|
|
|
def isElementTree(response) -> bool:
    """Report whether ``response`` is exactly an ``lxml.etree._ElementTree``.

    The decision is made by comparing the printed form of ``type(response)``
    with a fixed string, so instances of any other class (including
    subclasses) yield ``False``.
    """
    expected = "<class 'lxml.etree._ElementTree'>"
    return str(type(response)) == expected
|
|
|
|
|
|
class item:
    """One product of a listing page, addressed by its ``data-sku`` id.

    Attributes set by ``__init__``:
        id: the sku id; used in the XPath lookup and in the product URL.
        response: a parsed page offering ``.xpath()``.
    """

    def __init__(self, id, response):
        self.id = id
        self.response = response

    def getitem(self) -> str:
        """Build the CSV line describing this product.

        Column order: id, name, price, history[0], shop, history[1],
        attributes, promotions, url, history[2], history[3], where
        ``history`` is the value returned by
        ``historyPrice.historyPriceItem(self.id).gethistoryPrice()``.

        A field that cannot be found in the markup is emitted as an empty
        string.  Fields containing a comma, a double quote or a line break
        are quoted following the usual CSV rules; all other fields are
        written verbatim.

        Returns:
            A single CSV record terminated by a newline.

        Raises:
            IndexError: when the page holds no ``<li>`` with this sku id.
        """
        import csv
        import io

        reg = r"//li[@data-sku='" + str(self.id) + r"']"
        node = self.response.xpath(reg)[0]
        markup = etree.tostring(node, encoding='utf-8', method='html').decode('utf-8')
        # Parse the fragment once and share it between all field lookups.
        html = etree.HTML(markup)

        def first_text(path) -> str:
            """Return the first XPath match, or '' when nothing matches."""
            found = html.xpath(path)
            return found[0] if found else ''

        def joined_text(path) -> str:
            """Return all XPath matches joined by single spaces."""
            return ' '.join(html.xpath(path))

        name = myreplace(first_text(r"//div[@class='p-name p-name-type-3']/a/em/text()"))
        shop = myreplace(first_text("//a[@class='curr-shop hd-shopname']/text()"))
        price = str(first_text(r"//i[@data-price]/text()"))
        attribute = myreplace(joined_text(r"//span[@class='attr']/b/text()"))
        sales = myreplace(joined_text(r"//div[@class='p-icons']/i/text()"))
        url = r"https://item.jd.com/" + str(self.id) + r".html"

        priceHistoryList = historyPrice.historyPriceItem(self.id).gethistoryPrice()

        fields = [
            self.id, name, price, priceHistoryList[0], shop,
            priceHistoryList[1], attribute, sales, url,
            priceHistoryList[2], priceHistoryList[3],
        ]
        # Let the csv module do the escaping so that a comma or quote inside
        # a product name cannot shift the remaining columns.  str() first,
        # so None is written as 'None' rather than as an empty field.
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator='\n').writerow([str(f) for f in fields])
        return buffer.getvalue()
|
|
|
|
|
|
def print2console(response):
    """Print a short summary of every product on the page to the console.

    For each sku id returned by ``getidlist`` (ids below 1000 are skipped)
    the CSV line produced by ``item.getitem`` is split back into fields and
    six of them are printed, each behind its label, followed by an empty
    line.  Any failure is reported on the console and ends the loop; the
    closing "done" message is printed in either case.

    Parameters:
        response: anything accepted by ``getidlist`` and ``gethtml``.
    """
    import csv

    labels = ("商品id:", "商品名称:", "价格:¥", "关键词:", "促销活动:", "商品链接:")
    # Positions, inside the getitem() line, of the values shown above.
    # NOTE(review): the last three labels are assumed to correspond to the
    # attribute, promotion and url columns (6, 7, 8) — confirm.
    columns = (0, 1, 2, 6, 7, 8)

    def output(itemlist):
        """Print the six values of ``itemlist`` behind their labels."""
        for label, value in zip(labels, itemlist):
            print(label + value)
        print("")

    try:
        ids = getidlist(response)
        # Parse once; every item reads from the same tree.
        html = gethtml(response)
        for sku in ids:
            if int(sku) < 1000:
                continue
            line = item(sku, html).getitem()
            # getitem() returns one CSV record as a string, not a list.
            fields = next(csv.reader([line]))
            output([fields[i] for i in columns])
    except (Exception, SystemExit) as e:
        # SystemExit is kept here because gethtml() reports its failures
        # that way; KeyboardInterrupt is deliberately allowed to propagate.
        print(e)
        print("pipelines.py didn't work properly")

    print("pipelines.print2console is done")
|
|
|
|
|
|
def write2csv(response, filename_csv):
    """Append one CSV line per product on the page to ``filename_csv``.

    The header row is written first when the file does not exist yet or is
    empty.  Sku ids below 1000 are skipped.  Every line is also echoed to
    the console.  A ``Catalogues`` directory is created in the current
    working directory if missing; the CSV itself is written to
    ``filename_csv`` exactly as given.

    Any failure is reported on the console instead of being raised.

    Parameters:
        response: anything accepted by ``getidlist`` and ``gethtml``.
        filename_csv: path of the CSV file to create or extend.
    """
    headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"

    def writer(fd):
        """Fetch all product lines with a thread pool and write them in page order."""
        ids = getidlist(response)
        # Parse once; every item reads from the same tree.
        html = gethtml(response)
        with ThreadPoolExecutor(max_workers=8) as thread:
            # Submit everything before waiting on any result, otherwise the
            # pool never has more than one job in flight.
            # NOTE(review): getitem() calls now overlap — confirm that
            # historyPrice tolerates concurrent use.
            tasks = [
                thread.submit(item(sku, html).getitem)
                for sku in ids
                if int(sku) >= 1000
            ]
            # Collect in submission order so the file keeps the page order.
            for task in tasks:
                itemString = task.result()
                print(itemString)
                fd.write(itemString)

    try:
        catalogue_dir = "Catalogues"
        if not os.path.exists(catalogue_dir):
            os.mkdir(catalogue_dir)

        # A missing file and an empty one both still need the header row.
        needs_header = (
            not os.path.exists(filename_csv)
            or os.path.getsize(filename_csv) == 0
        )
        # Append mode creates the file when it is missing.
        with open(filename_csv, 'a+', encoding='utf-8-sig') as fd:
            if needs_header:
                fd.write(headers)
            writer(fd)
    except (Exception, SystemExit) as e:
        # SystemExit is kept here because gethtml() reports its failures
        # that way; KeyboardInterrupt is deliberately allowed to propagate.
        print(e)
        print("sth wrong in pipelines.write2csv")
|
|
|
|
|
|
# Nothing is executed when this file is run as a script.
if __name__ == "__main__":
    pass
|
|
|
'''
|
|
|
# 调试
|
|
|
filename_csv = os.getcwd() + '\\' + "milk.csv"
|
|
|
response = './1320,1585,9434/1320,1585,9434&page=1.html'
|
|
|
res = gethtml(response, gethtml_mode = 'cache')
|
|
|
write2csv(res, filename_csv)
|
|
|
'''
|
|
|
|
|
|
|
|
|
'''
|
|
|
# 调试数据
|
|
|
import pipelines
|
|
|
from lxml import etree
|
|
|
response = './1320,1585,9434/index.html' # 文件名 or url
|
|
|
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
|
|
|
id = '3742086'
|
|
|
aitem = pipelines.item(id, html)
|
|
|
a = aitem.getitem()
|
|
|
|
|
|
import historyPrice
|
|
|
bitem = historyPrice.historyPriceItem(id)
|
|
|
b = bitem.gethistoryPrice()
|
|
|
|
|
|
################
|
|
|
|
|
|
bitem.response # 查看requests返回的sources
|
|
|
item = bitem.response.xpath(reg)[0]
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
|
|
|
itemList = a + b
|
|
|
'''
|
|
|
|
|
|
'''
|
|
|
# Xpath 调试
|
|
|
|
|
|
from pipelines import *
|
|
|
response = './1320,1585,9434/index.html'
|
|
|
response = gethtml(response, gethtml_mode = 'cache')
|
|
|
|
|
|
id = '3742086'
|
|
|
reg = r"//li[@data-sku='" + str(id) + r"']"
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
reg = "//div[@class='p-shop']/span/a/text()"
|
|
|
html = etree.HTML(item)
|
|
|
name = html.xpath(reg)[0]
|
|
|
|
|
|
|
|
|
'''
|