解决了xpath解析问题,爬虫部分还差一个requests那块的多线程,由于时间开销太大,会考虑缩减展示的数量

master
wkyuu 3 years ago
parent 5167ed6bc8
commit f391dc8362

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import string
from lxml import etree
import downloader
@ -20,26 +21,35 @@ class historyPriceItem:
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
def getCommit() -> str:
    """Return the item's review-count text scraped from the page, or '' when absent.

    Relies on enclosing-scope names: `check` (substring presence test on the
    cached item HTML), `self.response` (parsed lxml tree) and `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` — that name shadowed the stdlib `string`
    # module imported at the top of the file.
    marker = '商品点评:'
    if not check(marker):  # page has no review section at all
        return ''
    reg = "//div[@data-content='商品点评:']/text()"
    commit = self.response.xpath(reg)[0]
    commit = pipelines.myreplace(commit, mode = 'all')
    # [5:-1] strips the 5-char Chinese label prefix and the trailing character
    return str(commit[5:-1])
def getTags() -> str:
    """Return the item's category text scraped from the page, or '' when absent.

    Relies on enclosing-scope names: `check`, `self.response`, `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` — that name shadowed the stdlib `string`
    # module imported at the top of the file.
    marker = '商品类别:'
    if not check(marker):  # page has no category section
        return ''
    reg = "//div[@data-content='商品类别:']/text()"
    tags = self.response.xpath(reg)[0]
    tags = pipelines.myreplace(tags, mode = 'all')
    # [5:] strips the 5-char Chinese label prefix
    return str(tags[5:])
def updateTime() -> str:
    """Return the price-data update-time text from the page, or '' when absent.

    Relies on enclosing-scope names: `check`, `self.response`, `pipelines.myreplace`.
    """
    # NOTE: renamed from `string` (shadowed the stdlib `string` module).
    marker = 'timeline-text'
    if not check(marker):  # no price-history timeline on the page
        return ''
    reg = r"//div[@class='p3']/p[@class='tips']/text()"
    # NOTE: renamed from `time` to avoid shadowing the stdlib `time` module.
    updated = self.response.xpath(reg)[0]
    updated = pipelines.myreplace(updated, mode = 'strip')
    # [5:] strips the 5-char label prefix
    return str(updated[5:])
def priceTrend() -> str:
check = 'timeline-text'
if not check in item: # 用于判断有无历史价格记录
string = 'timeline-text'
if check(string) == False: # 用于判断有无历史价格记录
return ''
reg = r"//div[@class='timeline-text']/p/text()"
regList = self.response.xpath(reg)
@ -48,12 +58,16 @@ class historyPriceItem:
price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
return price
def check(string, item = item) -> bool:
    """Return True if *string* occurs in the captured item HTML (*item*).

    *item* defaults to the enclosing scope's page snapshot, bound at def time.
    """
    # Membership test already yields a bool — no if/elif returning literals.
    return string in item
priceHistoryList = [getCommit(), getTags(), updateTime(), priceTrend()]
return priceHistoryList
if __name__ == '__main__':
# id = "10036840192083"
id = "11564571796" # More
id = "100020511880" # More
aitem = historyPriceItem(id)
print(aitem.gethistoryPrice())

@ -133,5 +133,4 @@ def localtest(category): # 本地加载的源码测试
time.sleep(10)
print("page " + str(page) + " sleep over at " + time.ctime())
page += 1
aaa = 1

@ -55,6 +55,9 @@ class item:
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
def name() -> str:
string = 'p-name p-name-type-3'
if check(string) == False: # 用于判断有无名字
return ''
reg = r"//div[@class='p-name p-name-type-3']/a/em/text()"
html = etree.HTML(item)
name = html.xpath(reg)[0]
@ -63,7 +66,10 @@ class item:
return name
def shop() -> str:
reg = "//div[@class='p-shop']/span/a/text()"
string = 'curr-shop hd-shopname'
if check(string) == False: # 用于判断有无商店信息
return ''
reg = "//a[@class='curr-shop hd-shopname']/text()"
html = etree.HTML(item)
shop = html.xpath(reg)[0]
shop = myreplace(shop)
@ -71,6 +77,9 @@ class item:
return shop
def price() -> str:
string = 'data-price'
if check(string) == False: # 用于判断有无价格信息
return ''
reg = r"//i[@data-price]/text()"
html = etree.HTML(item)
price = html.xpath(reg)[0]
@ -79,6 +88,9 @@ class item:
return price
def attribute() -> str:
string = 'attr'
if check(string) == False: # 用于判断有无标签
return ''
reg = r"//span[@class='attr']/b/text()"
html = etree.HTML(item)
attribute = html.xpath(reg)
@ -89,6 +101,9 @@ class item:
return myreplace(attrStr)
def sales() -> str:
string = 'p-icons'
if check(string) == False: # 用于判断有无促销信息
return ''
reg = r"//div[@class='p-icons']/i/text()"
html = etree.HTML(item)
sales = html.xpath(reg)
@ -102,12 +117,18 @@ class item:
url = r"https://item.jd.com/" + str(self.id) + r".html"
return url
def check(string, item = item) -> bool:
    """Return True if *string* occurs in the captured item HTML (*item*).

    *item* defaults to the enclosing scope's page snapshot, bound at def time.
    """
    # Membership test already yields a bool — no if/elif returning literals.
    return string in item
historyPriceItem = historyPrice.historyPriceItem(self.id)
priceHistoryList = historyPriceItem.gethistoryPrice()
# print("id = {}, list = {}".format(self.id, priceHistoryList[3]))
# itemList = [str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]]
itemString = ("{},{},{},{},{},{},{},{},{},{},{}\n".format(str(self.id), name(), price(), priceHistoryList[0], shop(), priceHistoryList[1], attribute(), sales(), url(), priceHistoryList[2], priceHistoryList[3]))
# print(itemString)
return itemString
def print2console(response): # 输出到命令行
@ -139,29 +160,20 @@ def write2csv(response, filename_csv): # 写入到csv文件
def writer(fd):
for id in getidlist(response):
print('flag1')
if int(id) < 1000:
continue
aitem = item(id, gethtml(response))
itemString = aitem.getitem()
# print(itemList)
try:
fd.write(itemString)
except BaseException as e:
print(e)
print("sth wrong in pipelines.write2csv.write.")
fd.write(itemString)
try:
if os.path.exists(filename_csv):
with open(filename_csv, 'a+', encoding = 'utf-8-sig') as fd: # 存在,文件尾追加
print('flag2')
writer(fd)
else:
with open(filename_csv, 'w+', encoding = 'utf-8-sig') as fd: # 不存在,创建并从文件头开始
# headers = ['id', '商品名称', '价格(人民币)', '评论数量(条)', '商铺名称', '商品类别', '标签', '促销策略', 'url', '价格数据更新时间', '历史价格趋势']
headers = "id,商品名称,价格(人民币),评论数量(条),商铺名称,商品类别,标签,促销策略,url,价格数据更新时间,历史价格趋势\n"
print('flag3')
fd.write(headers)
writer(fd)
@ -171,6 +183,13 @@ def write2csv(response, filename_csv): # 写入到csv文件
if __name__ == "__main__":
pass
'''
# 调试
filename_csv = os.getcwd() + '\\' + "milk.csv"
response = './1320,1585,9434/1320,1585,9434&page=1.html'
res = gethtml(response, gethtml_mode = 'cache')
write2csv(res, filename_csv)
'''
'''

Loading…
Cancel
Save