添加历史内容爬取依赖

master
wkyuu 3 years ago
parent 46dcfdb879
commit e1ba325f86

@@ -7,10 +7,14 @@ selenium + redis + 分布式 + xpath + etree + 可视化
## TODO
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
- [ ] 从历史价格网页爬取历史价格,比对,给出价格波动趋势
- [x] 从历史价格网页爬取历史价格
- [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势
- [x] 加入Redis分布式设计
- [ ] 数据可视化
- [ ] 定时,自动化爬取
- [ ] 使用python终端绘图需要解决如何选取想要展示的条例
- [ ] python打包exe需要图形化界面
## project
@@ -26,6 +30,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
> >
> > milkSpider.py 主文件,配置爬取设置,自动化等
> >
> > historyPrice.py 爬取历史价格
### selenium
@@ -169,7 +174,6 @@ def saveCookies(driver):
jsonCookies = json.dumps(driver.get_cookies())
with open('cookies.json', 'w', encoding='utf-8') as fd:
fd.write(jsonCookies)
    # fd.close() is unnecessary here — the with-statement already closes the file
```

@@ -4,12 +4,15 @@ from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import random
from zmq import proxy
import settings
import requests
import json
# Default request headers for every fetch in this module.
# The commit switched from a randomly rotated User-Agent to a fixed one;
# the merged diff view left both entries (a duplicate key with a missing
# comma), which is invalid Python — this is the intended final form.
headers = {
    # 'User-Agent': random.choice(settings.USER_AGENT)
    'User-Agent': settings.USER_AGENT[1]
}
def getsource(url):
@@ -36,34 +39,30 @@ def getsource(url):
def useRequests(url):
    """Fetch *url* with requests, persist the response cookies, and return
    the pretty-printed HTML source as a string.

    The merged diff left two fetches (a plain ``requests.get`` and the new
    ``session.get``) and an early ``return res`` that made the etree
    formatting unreachable; this is the reconstructed post-commit version.
    On any failure the error is printed and the process exits.
    """
    def saveCookies(response):
        # Serialize the response cookies to JSON and append them to the
        # shared cookie file so later runs can reuse the session.
        myCookies = {key: value for key, value in response.cookies.items()}
        jsonCookies = json.dumps(myCookies)
        with open(settings.COOKIES_FILENAME, mode='a', encoding='utf-8') as fd:
            fd.write(jsonCookies)
        print("Cookies saved!")

    def write2html(res):
        # Debug helper: snapshot the fetched page to disk (call is commented
        # out below).
        filename = 'historyPrice.html'
        with open(filename, mode='w+', encoding='utf-8') as fd:
            fd.write(res)

    try:
        # A Session keeps cookies across requests within this fetch.
        session = requests.Session()
        res = session.get(url, headers=headers)
        res.raise_for_status()  # abort on non-2xx status
        # print(res.request.headers)
        res.encoding = res.apparent_encoding
        print(res.cookies)
        saveCookies(res)
        tree = etree.HTML(res.text)
        source = etree.tostring(tree, encoding='utf-8', pretty_print=True, method='html').decode('utf-8')
        # write2html(source)
        return source
    except BaseException as e:
        # NOTE(review): requests.RequestException would be narrower; kept
        # broad to preserve the original catch-everything-and-exit behavior.
        print(e)
        print("sth wrong in your downloader.useRequests. Exiting...")
        exit()
if __name__ == "__main__":
    # Manual smoke test: look up the price history of one JD product page.
    # The merged diff left two jdurl assignments and both the old
    # write-to-file call site and the new direct call; this keeps the
    # post-commit behavior (plain useRequests call, no local HTML dump).
    # jdurl = r"https://item.jd.com/59162092942.html"
    jdurl = r"https://item.jd.com/10036840192083.html"
    url = r"https://www.vveby.com/search?keyword=" + jdurl
    print(url)
    useRequests(url)
    print('done')

@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-
from lxml import etree
import settings
import downloader
# NOTE(review): commented-out draft of historyPriceItem, disabled behind a
# triple-quoted string. The diff left two conflicting __init__ signatures and
# an incomplete assignment (`self.url =`) — do not uncomment as-is.
'''
class historyPriceItem:
def __init__(self, url, ):
def __init__(self, id):
self.url =
if __name__ == '__main__':
pass
'''

@@ -8,7 +8,7 @@ def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象
if isElementTree(response):
return response
if gethtml_mode == "cache":
html = etree.parse(response, etree.HTMLParser(encoding='utf-8'))
html = etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
elif gethtml_mode == "url":
html = etree.HTML(response)
else:

@@ -38,4 +38,6 @@ USER_AGENT = [
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
]
# File where downloader.useRequests appends the JSON-serialized cookies.
# (The diff view showed this line twice — context plus re-added line — but
# the module needs exactly one assignment.)
COOKIES_FILENAME = "cookies.json"
#
Loading…
Cancel
Save