You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

163 lines
4.9 KiB

3 years ago
# -*- coding: utf-8 -*-
from lxml import etree
import csv
import os
def gethtml(response, gethtml_mode = "url"):
    """Parse *response* with lxml so it can be queried with ``.xpath()``.

    Args:
        response: Either an object that ``isElementTree`` already recognises
            as parsed (returned unchanged), a source accepted by
            ``etree.parse`` when ``gethtml_mode`` is ``"cache"``, or HTML
            markup accepted by ``etree.HTML`` when it is ``"url"``.
        gethtml_mode: ``"cache"`` to parse from a file-like source,
            ``"url"`` (default) to parse markup held in memory.

    Returns:
        The parsed lxml object.

    Raises:
        SystemExit: after printing a diagnostic, when ``gethtml_mode`` is
            not one of the two supported values or when parsing fails.
    """
    if isElementTree(response):
        return response
    # Validate the mode outside the try block so that this exit is not
    # intercepted by the parse-failure handler below (which would print a
    # second, misleading message).
    if gethtml_mode not in ("cache", "url"):
        print("sth wrong with parameters in pipelines.gethtml(gethtml_mode)")
        raise SystemExit
    try:
        if gethtml_mode == "cache":
            return etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
        return etree.HTML(response)
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C and SystemExit propagate.
        print(e)
        print("sth wrong in pipelines.gethtml, see your settings in settings.py")
        # Raised directly instead of calling exit(), which is only injected
        # by the ``site`` module and is not guaranteed to exist.
        raise SystemExit
def getidlist(response) -> list:
    """Return the ``data-sku`` attribute of every ``<li>`` found in *response*.

    *response* is used as-is when ``isElementTree`` reports it as already
    parsed; otherwise it is first passed through ``gethtml``.
    """
    sku_xpath = r"//li/@data-sku"
    tree = response if isElementTree(response) else gethtml(response)
    return tree.xpath(sku_xpath)
def myreplace(name) -> str:
    """Return *name* with leading and trailing whitespace removed."""
    return name.strip()
def isElementTree(response) -> bool:
    """Report whether *response* is already a parsed lxml object.

    Both ``_ElementTree`` (what ``etree.parse`` returns) and ``_Element``
    (what ``etree.HTML`` returns) count as parsed: each supports ``.xpath()``
    and neither can be fed to ``etree.HTML`` again.

    Returns:
        True for a parsed lxml tree or element, False for anything else
        (for example raw markup).
    """
    # isinstance instead of comparing str(type(...)) to a literal, so the
    # check does not depend on the exact repr of the class.
    return isinstance(response, (etree._ElementTree, etree._Element))
class item:
    """One product entry (an ``<li data-sku=...>`` node) of a parsed listing page.

    Attributes are set directly from the constructor arguments:
    ``id`` is the sku used to locate the ``<li>``; ``response`` is the parsed
    page and must provide ``.xpath()``.
    """

    def __init__(self, id, response):
        self.id = id
        self.response = response

    def getitem(self) -> list:
        """Extract the fields of this product.

        Returns:
            ``[id, name, price, attributes, promotions, url]``, all strings.
            Attributes and promotions are space-joined and stripped; they are
            empty strings when the page has none.

        Raises:
            IndexError: when the ``<li>`` for this id, its name node or its
                price node is not present in the page.
        """
        node_xpath = r"//li[@data-sku='" + str(self.id) + r"']"
        node = self.response.xpath(node_xpath)[0]
        # Serialise the <li> and parse it back as its own document so the
        # absolute "//" queries below only see this product. This is done
        # once and shared by every field instead of once per field.
        markup = etree.tostring(node, encoding = 'utf-8', method = 'html').decode('utf-8')
        fragment = etree.HTML(markup)

        def first_text(xpath) -> str:
            # First match of *xpath*; IndexError if there is none.
            return fragment.xpath(xpath)[0]

        def joined_text(xpath) -> str:
            # All matches of *xpath*, space-separated and stripped.
            return myreplace(" ".join(fragment.xpath(xpath)))

        name = myreplace(first_text(r"//div[@class='p-name p-name-type-3']/a/em/text()"))
        price = str(first_text(r"//i[@data-price]/text()"))
        attribute = joined_text(r"//span[@class='attr']/b/text()")
        sales = joined_text(r"//div[@class='p-icons']/i/text()")
        url = r"https://item.jd.com/" + str(self.id) + r".html"
        return [str(self.id), name, price, attribute, sales, url]
def print2console(response):
    """Print every product found in *response* to standard output.

    Args:
        response: Page content accepted by ``gethtml`` / ``getidlist``.

    Any failure while parsing or extracting is reported on standard output
    and stops the remaining products from being printed; the closing
    "done" line is printed in every case.
    """
    def output(itemlist):
        # itemlist is the six-field list produced by item.getitem().
        print("商品id" + itemlist[0])
        print("商品名称:" + itemlist[1])
        print("价格:¥" + itemlist[2])
        print("关键词:" + itemlist[3])
        print("促销活动:" + itemlist[4])
        print("商品链接:" + itemlist[5])
        print("")

    try:
        # Parse the page once and share it between all items rather than
        # re-parsing the whole page for every id.
        html = gethtml(response)
        for sku in getidlist(response):
            # Ids below 1000 are skipped -- presumably non-product entries;
            # TODO confirm against real pages.
            if int(sku) < 1000:
                continue
            output(item(sku, html).getitem())
    except (Exception, SystemExit) as e:
        # SystemExit is still caught because gethtml exits on failure and
        # this function has always reported that instead of terminating.
        # KeyboardInterrupt is deliberately no longer swallowed.
        print(e)
        print("pipelines.py didn't work properly")
    print("pipelines.print2console is done")
def write2csv(response, filename_csv):
    """Append every product found in *response* to the CSV file *filename_csv*.

    The file is created with a header row when it does not exist yet;
    otherwise rows are appended after the existing content. It is written as
    UTF-8 with a BOM (``utf-8-sig``).

    Args:
        response: Page content accepted by ``gethtml`` / ``getidlist``.
        filename_csv: Path of the CSV file to create or extend.

    Any failure is reported on standard output; rows written before the
    failure stay in the file.
    """
    headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url']
    try:
        is_new_file = not os.path.exists(filename_csv)
        # 'a+' creates the file when it is missing and appends otherwise, so
        # one code path serves both cases. The with-block closes the file.
        with open(filename_csv, 'a+', encoding = 'utf-8-sig', newline = '') as fd:
            writer = csv.writer(fd)
            if is_new_file:
                writer.writerow(headers)
            # Parse the page once and share it between all items rather than
            # re-parsing the whole page for every id.
            html = gethtml(response)
            for sku in getidlist(response):
                # Ids below 1000 are skipped -- presumably non-product
                # entries; TODO confirm against real pages.
                if int(sku) < 1000:
                    continue
                writer.writerow(item(sku, html).getitem())
    except (Exception, SystemExit) as e:
        # SystemExit is still caught because gethtml exits on failure and
        # this function has always reported that instead of terminating.
        # KeyboardInterrupt is deliberately no longer swallowed.
        print(e)
        print("sth wrong in pipelines.write2csv")
if __name__ == "__main__":
    # This module only provides helpers; running it directly does nothing.
    pass