完善历史价格查询

master
wkyuu 3 years ago
parent c372046c7b
commit 3adaf63228

@ -9,11 +9,11 @@ selenium + redis + 分布式 + xpath + etree + 可视化
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
- [x] 从历史价格网页爬取历史价格
- [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势
- [x] 加入Redis分布式设计
- [ ] 数据可视化
- [ ] 使用python终端绘图需要解决如何选取想要展示的条例
- [ ] 预计两种模式:终端交互,随机或以评价数为索引目标,给出取出的 item 的具体信息,例如价格预测。
- [ ] 选择目录,友好的选择交互体验
- [ ] 选择抽取 item 模式:热评就列出前五条,随机就随机取一条
- [ ] python打包exe需要图形化界面
## project
@ -212,24 +212,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
1在爬取的视频评论很多的情况下效率太低且频次太高了容易被ban。应该设定一下比如说只爬几页或者只爬热门评论。而且。。哔哩哔哩的评论区是其他加载的本爬虫在requests.get的时候会多次爬取太危险了。为了一条数据就多次访问被ban风险更高了。可以改成对单个页面就取一次网页源代码但是这个需要真实浏览器来访问才能拿到selenium框架然后对这个源代码进行数据提取操作这样可以比较好地减少爬虫访问量降低被ban的风险。
2爬虫结构是先完全爬取数据存到临时内存中最后才存入文件。风险太大如果中途被ban了容易导致整个爬取的过程前功尽弃且之前数据全都丢失。
2.1,线程作用效果应为
结构:视频{页数{父评论{子评论}}}
采用多线程爬取父评论+子评论,也就是一次能对多个父子评论爬取数据;或者采用多线程爬取页数中的评论,一次能爬取多个页面。后者效率更高。
2.2,但是由于该爬虫设计结构是把全部数据都先爬取再存起来,这样子要求线程对同一个字符串结构进行数据追加,容易导致混乱(即 在线程优先级不同的情况下,多个线程同时操作一个全局字符串,可能上一条还是父评论,下一条就变成另一条父评论的子评论了。)。
2.3整改建议是采用多线程对多个页面爬取数据某个线程结束时即爬完一页就把该页对应的字符串存到csv文件中采用线程锁来控制每次存数据时都仅单个占用csv文件防止数据混乱。这一步需要修改数据的存储过程的流程。
~~3爬取效果不好爬取父评论+子评论的方式,实际上意义不大。意思就是父和子评论都混到一起了,没有主次效果,应该加个标示便于直观显示。~~
@ -268,4 +250,6 @@ redisconn = redis.Redis(host = '127.0.0.1', port = '6379', password = 'x', db =
16[爬虫常见的HTTP错误代码及错误原因](https://blog.csdn.net/Smart_look/article/details/109967222)
17
17[Python字符串操作之字符串分割与组合](https://blog.csdn.net/seetheworld518/article/details/47346527)
18

@ -30,8 +30,7 @@ def getsource(url):
driver.get(url)
response = etree.HTML(driver.page_source)
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html")
response = response.decode('utf-8')
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html").decode('utf-8')
driver.close()
return response
@ -40,7 +39,6 @@ def useRequests(url):
try:
session = requests.Session()
res = session.get(url, headers = headers)
res.raise_for_status() # 判断是不是200
# print(res.request.headers)
res.encoding = res.apparent_encoding
res = etree.HTML(res.text)

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
from lxml import etree
import settings
import downloader
def myreplace(text):
    """Normalize scraped text: trim outer whitespace, then remove inner
    spaces and CRLF pairs (same cleanup the history-price parser expects)."""
    cleaned = text.strip()
    # Same replacement order as a chained .replace(' ', '').replace("\r\n", '')
    for junk in (' ', "\r\n"):
        cleaned = cleaned.replace(junk, '')
    return cleaned
import downloader
import pipelines
import settings
class historyPriceItem:
def __init__(self, id):
self.url = settings.HISTORY_PRICE_URL + str(id)
# self.response = downloader.useRequests(self.url)
self.response = etree.parse('historyPrice.html', etree.HTMLParser(encoding = 'utf-8'))
self.response = pipelines.gethtml(downloader.useRequests(self.url))
# self.response = etree.parse('historyPriceMore.html', etree.HTMLParser(encoding = 'utf-8'))
# self.response = etree.parse('historyPrice.html', etree.HTMLParser(encoding = 'utf-8'))
def gethistoryPrice(self) -> list:
@ -19,21 +19,31 @@ class historyPriceItem:
item = self.response.xpath(reg)[0]
item = etree.tostring(item, encoding = 'utf-8', method = 'html').decode('utf-8')
# Extract the product-category ("商品类别") label text from the cached page tree.
# NOTE(review): this is the removed side of the diff (legacy code); kept verbatim.
def getTag(self) -> str:
# XPath: the text node of the div whose data-content attribute is the category label
reg = r"//div[@data-content='商品类别:']/text()"
# assumes self.response is an lxml tree and the node exists — IndexError otherwise
tag = self.response.xpath(reg)[0]
# NOTE(review): etree.HTML() on a text node returns an lxml Element, but
# myreplace() calls str methods (.strip/.replace) on its argument — this looks
# like it would raise AttributeError at runtime; confirm intended behavior.
tag = etree.HTML(tag)
tag = myreplace(tag)
# drop the leading label prefix (first 5 chars) — presumably "商品类别:"; TODO confirm offset
return tag[5:]
def get
# tree = etree.tostring(response.xpath(reg)[0], encoding = 'utf-8', method = 'html').decode('utf-8')
def updateTime() -> str:
reg = r"//div[@class='p3']/p[@class='tips']/text()"
time = self.response.xpath(reg)[0]
time = pipelines.myreplace(time, mode = 'strip')
return str(time[5:])
def priceTrend() -> str:
check = 'timeline-text'
if not check in item: # 用于判断有无历史价格记录
return ''
reg = r"//div[@class='timeline-text']/p/text()"
regList = self.response.xpath(reg)
price = ''
for i in range(0, len(regList), 2):
price += pipelines.myreplace(regList[i]) + pipelines.myreplace(regList[i + 1]) + ';'
return price
priceHistoryList = [updateTime(), priceTrend()]
return priceHistoryList
if __name__ == '__main__':
id = "10036840192083"
# id = "10036840192083"
id = "11564571796" # More
aitem = historyPriceItem(id)
aitem.gethistoryPrice()
print(aitem.gethistoryPrice())

@ -2,6 +2,8 @@
from lxml import etree
import csv
import os
import time
import historyPrice
def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
try:
@ -30,9 +32,11 @@ def getidlist(response) -> list: # 获取id
idlist = html.xpath(reg)
return idlist
def myreplace(name) -> str: # 简单的处理输出
name = name.strip()
return name
def myreplace(text, mode = '') -> str:
    """Simple output cleanup for scraped strings.

    mode 'all'  : strip, then remove every space and CRLF pair.
    mode 'strip': strip, then remove bare CR characters.
    default     : strip only.
    """
    stripped = text.strip()
    if mode == 'all':
        return stripped.replace(' ', '').replace("\r\n", '')
    if mode == 'strip':
        return stripped.replace('\r', '')
    return stripped
def isElementTree(response) -> bool: # 用于判断是否已经为etree对象
if str(type(response)) == "<class 'lxml.etree._ElementTree'>":
@ -91,7 +95,10 @@ class item:
url = r"https://item.jd.com/" + str(self.id) + r".html"
return url
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()]
historyPriceItem = historyPrice.historyPriceItem(self.id)
priceHistoryList = historyPriceItem.gethistoryPrice()
itemlist = [str(self.id), name(), price(), attribute(), sales(), url()] + priceHistoryList
return itemlist
def print2console(response): # 输出到命令行
@ -147,7 +154,7 @@ def write2csv(response, filename_csv): # 写入到csv文件
except BaseException as e:
print(e)
headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url']
headers = ['id', '商品名称', '价格(人民币)', '标签', '促销策略', 'url', '数据更新时间', '历史价格趋势']
writer.writerow(headers)
write(writer)
@ -160,4 +167,22 @@ def write2csv(response, filename_csv): # 写入到csv文件
print("sth wrong in pipelines.write2csv")
if __name__ == "__main__":
pass
pass
'''
# 调试数据
import pipelines
from lxml import etree
response = 'index.html' # 文件名 or url
html = pipelines.gethtml(response, gethtml_mode = 'cache') # cache or url
id = '1127466'
aitem = pipelines.item(id, html)
a = aitem.getitem()
import historyPrice
bitem = historyPrice.historyPriceItem(id)
b = bitem.gethistoryPrice()
itemList = a + b
'''
Loading…
Cancel
Save