|
|
|
@ -55,6 +55,9 @@ class item:
|
|
|
|
|
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
|
|
|
|
|
|
|
|
|
|
def name() -> str:
|
|
|
|
|
string = 'p-name p-name-type-3'
|
|
|
|
|
if check(string) == False: # 用于判断有无名字
|
|
|
|
|
return ''
|
|
|
|
|
reg = r"//div[@class='p-name p-name-type-3']/a/em/text()"
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
name = html.xpath(reg)[0]
|
|
|
|
@ -63,7 +66,10 @@ class item:
|
|
|
|
|
return name
|
|
|
|
|
|
|
|
|
|
def shop() -> str:
|
|
|
|
|
reg = "//div[@class='p-shop']/span/a/text()"
|
|
|
|
|
string = 'curr-shop hd-shopname'
|
|
|
|
|
if check(string) == False: # 用于判断有无商店信息
|
|
|
|
|
return ''
|
|
|
|
|
reg = "//a[@class='curr-shop hd-shopname']/text()"
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
shop = html.xpath(reg)[0]
|
|
|
|
|
shop = myreplace(shop)
|
|
|
|
@ -71,6 +77,9 @@ class item:
|
|
|
|
|
return shop
|
|
|
|
|
|
|
|
|
|
def price() -> str:
|
|
|
|
|
string = 'data-price'
|
|
|
|
|
if check(string) == False: # 用于判断有无价格信息
|
|
|
|
|
return ''
|
|
|
|
|
reg = r"//i[@data-price]/text()"
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
price = html.xpath(reg)[0]
|
|
|
|
@ -79,6 +88,9 @@ class item:
|
|
|
|
|
return price
|
|
|
|
|
|
|
|
|
|
def attribute() -> str:
|
|
|
|
|
string = 'attr'
|
|
|
|
|
if check(string) == False: # 用于判断有无标签
|
|
|
|
|
return ''
|
|
|
|
|
reg = r"//span[@class='attr']/b/text()"
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
attribute = html.xpath(reg)
|
|
|
|
@ -89,6 +101,9 @@ class item:
|
|
|
|
|
return myreplace(attrStr)
|
|
|
|
|
|
|
|
|
|
def sales() -> str:
|
|
|
|
|
string = 'p-icons'
|
|
|
|
|
if check(string) == False: # 用于判断有无促销信息
|
|
|
|
|
return ''
|
|
|
|
|
reg = r"//div[@class='p-icons']/i/text()"
|
|
|
|
|
html = etree.HTML(item)
|
|
|
|
|
sales = html.xpath(reg)
|
|
|
|
@ -102,12 +117,18 @@ class item:
|
|
|
|
|
url = r"https://item.jd.com/" + str(self.id) + r".html"
|
|
|
|
|
return url
|
|
|
|
|
|
|
|
|
|
def check(string, item = item) -> bool:
    """Return whether ``string`` occurs in ``item``.

    Callers in this file use it as a cheap presence test before running an
    XPath query, passing a class/attribute name such as ``'p-icons'`` or
    ``'data-price'``.

    Args:
        string: The text to look for.
        item: The container to search. The default is bound once, when this
            ``def`` statement executes, to whatever ``item`` refers to in the
            enclosing scope at that moment -- presumably the serialized HTML
            string built earlier in the enclosing function (TODO confirm).

    Returns:
        ``True`` if ``string`` is found in ``item``, otherwise ``False``.
    """
    # The membership test already yields a bool, so return it directly
    # instead of re-testing the negated condition in a second branch.
    return string in item
|
|
|
|
|
|
|
|
|
|
historyPriceItem = historyPrice.historyPriceItem(self.id)
|
|
|
|
|
priceHistoryList = historyPriceItem.gethistoryPrice()
|
|
|
|
|
# print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
|
|
|
|
|
|
|
|
|
|
# itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
|
|
|
|
|
|
|
|
|
|
itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
|
|
|
|
|
# print(itemString)
|
|
|
|
|
return itemString
|
|
|
|
|
|
|
|
|
|
def print2console(response): # 输出到命令行
|
|
|
|
@ -139,29 +160,20 @@ def write2csv(response, filename_csv): # 写入到csv文件
|
|
|
|
|
|
|
|
|
|
def writer(fd):
|
|
|
|
|
for id in getidlist(response):
|
|
|
|
|
print('flag1')
|
|
|
|
|
if int(id) < 1000:
|
|
|
|
|
continue
|
|
|
|
|
aitem = item(id, gethtml(response))
|
|
|
|
|
itemString = aitem.getitem()
|
|
|
|
|
# print(itemList)
|
|
|
|
|
try:
|
|
|
|
|
fd.write(itemString)
|
|
|
|
|
except BaseException as e:
|
|
|
|
|
print(e)
|
|
|
|
|
print("sth wrong in pipelines.write2csv.write.")
|
|
|
|
|
fd.write(itemString)
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
if os.path.exists(filename_csv):
|
|
|
|
|
with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加
|
|
|
|
|
print('flag2')
|
|
|
|
|
writer(fd)
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始
|
|
|
|
|
# headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
|
|
|
|
|
headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
|
|
|
|
|
print('flag3')
|
|
|
|
|
fd.write(headers)
|
|
|
|
|
writer(fd)
|
|
|
|
|
|
|
|
|
@ -171,6 +183,13 @@ def write2csv(response, filename_csv): # 写入到csv文件
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
pass
|
|
|
|
|
'''
|
|
|
|
|
# 调试
|
|
|
|
|
filename_csv = os.getcwd() + '\\' + "milk.csv"
|
|
|
|
|
response = './1320,1585,9434/1320,1585,9434&page=1.html'
|
|
|
|
|
res = gethtml(response, gethtml_mode = 'cache')
|
|
|
|
|
write2csv(res, filename_csv)
|
|
|
|
|
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
'''
|
|
|
|
|