添加历史内容爬取依赖

master
wkyuu 3 years ago
parent 46dcfdb879
commit e1ba325f86

@@ -7,10 +7,14 @@ selenium + redis + 分布式 + xpath + etree + 可视化
## TODO ## TODO
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容 - [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
- [ ] 从历史价格网页爬取历史价格,比对,给出价格波动趋势 - [x] 从历史价格网页爬取历史价格
- [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势
- [x] 加入Redis分布式设计 - [x] 加入Redis分布式设计
- [ ] 数据可视化 - [ ] 数据可视化
- [ ] 定时,自动化爬取 - [ ] 使用python终端绘图需要解决如何选取想要展示的条目
- [ ] python打包exe需要图形化界面
## project ## project
@@ -26,6 +30,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
> > > >
> > milkSpider.py 主文件,配置爬取设置,自动化等 > > milkSpider.py 主文件,配置爬取设置,自动化等
> > > >
> > historyPrice.py 爬取历史价格
### selenium ### selenium
@@ -169,7 +174,6 @@ def saveCookies(driver):
jsonCookies = json.dumps(driver.get_cookies()) jsonCookies = json.dumps(driver.get_cookies())
with open('cookies.json', 'w', encoding='utf-8') as fd: with open('cookies.json', 'w', encoding='utf-8') as fd:
fd.write(jsonCookies) fd.write(jsonCookies)
fd.close()
``` ```

@@ -4,12 +4,15 @@ from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from lxml import etree from lxml import etree
import random import random
from zmq import proxy
import settings import settings
import requests import requests
import json import json
headers = { headers = {
'User-Agent': random.choice(settings.USER_AGENT) #'User-Agent': random.choice(settings.USER_AGENT)
'User-Agent': settings.USER_AGENT[1]
} }
def getsource(url): def getsource(url):
@@ -36,34 +39,30 @@ def getsource(url):
def useRequests(url): def useRequests(url):
def saveCookies(response): def write2html(res):
myCookies = {} filename = 'historyPrice.html'
for key, value in response.cookies.items(): with open(filename, mode = 'w+', encoding='utf-8') as fd:
myCookies[key] = value fd.write(res)
jsonCookies = json.dumps(myCookies)
with open(settings.COOKIES_FILENAME, mode = 'a', encoding = 'utf-8') as fd:
fd.write(jsonCookies)
print("Cookies saved!")
try: try:
res = requests.get(url, headers = headers) session = requests.Session()
res = session.get(url, headers = headers)
res.raise_for_status() # 判断是不是200 res.raise_for_status() # 判断是不是200
# print(res.request.headers)
res.encoding = res.apparent_encoding res.encoding = res.apparent_encoding
print(res.cookies) res = etree.HTML(res.text)
saveCookies(res) source = etree.tostring(res, encoding = 'utf-8', pretty_print = True, method = 'html').decode('utf-8')
return res # write2html(res)
return source
except BaseException as e: except BaseException as e:
print(e) print(e)
print("sth wrong in your downloader.useRequests. Exiting...") print("sth wrong in your downloader.useRequests. Exiting...")
exit() exit()
if __name__ == "__main__": if __name__ == "__main__":
# jdurl = r"https://item.jd.com/10036840192083.html" jdurl = r"https://item.jd.com/10036840192083.html"
jdurl = r"https://item.jd.com/59162092942.html"
url = r"https://www.vveby.com/search?keyword=" + jdurl url = r"https://www.vveby.com/search?keyword=" + jdurl
print(url) print(url)
with open('newhistoryPrice.html', 'w+', encoding = 'utf-8') as fd: useRequests(url)
fd.write(useRequests(url).text)
print('done') print('done')

@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from lxml import etree
import settings import settings
import downloader import downloader
'''
class historyPriceItem: class historyPriceItem:
def __init__(self, url, ): def __init__(self, id):
self.url =
if __name__ == '__main__': if __name__ == '__main__':
pass pass
'''

@@ -8,7 +8,7 @@ def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
if isElementTree(response): if isElementTree(response):
return response return response
if gethtml_mode == "cache": if gethtml_mode == "cache":
html = etree.parse(response, etree.HTMLParser(encoding='utf-8')) html = etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
elif gethtml_mode == "url": elif gethtml_mode == "url":
html = etree.HTML(response) html = etree.HTML(response)
else: else:

@@ -38,4 +38,6 @@ USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
] ]
COOKIES_FILENAME = "cookies.json" COOKIES_FILENAME = "cookies.json"
#
Loading…
Cancel
Save