diff --git a/README.md b/README.md index 0030ed3..46c2a6f 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,14 @@ selenium + redis + 分布式 + xpath + etree + 可视化 ## TODO - [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容 -- [ ] 从历史价格网页爬取历史价格,比对,给出价格波动趋势 +- [x] 从历史价格网页爬取历史价格 + - [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势 + - [x] 加入Redis分布式设计 - [ ] 数据可视化 -- [ ] 定时,自动化爬取 + - [ ] 使用python终端绘图,需要解决如何选取想要展示的条例 + +- [ ] python打包exe,需要图形化界面? ## project @@ -26,6 +30,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化 > > > > milkSpider.py 主文件,配置爬取设置,自动化等 > > +> > historyPrice.py 爬取历史价格 ### selenium @@ -169,7 +174,6 @@ def saveCookies(driver): jsonCookies = json.dumps(driver.get_cookies()) with open('cookies.json', 'w', encoding='utf-8') as fd: fd.write(jsonCookies) - fd.close() ``` diff --git a/downloader.py b/downloader.py index 5af41fd..067b540 100644 --- a/downloader.py +++ b/downloader.py @@ -4,12 +4,15 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from lxml import etree import random + +from zmq import proxy import settings import requests import json headers = { - 'User-Agent': random.choice(settings.USER_AGENT) + #'User-Agent': random.choice(settings.USER_AGENT) + 'User-Agent': settings.USER_AGENT[1] } def getsource(url): @@ -36,34 +39,30 @@ def getsource(url): def useRequests(url): - def saveCookies(response): - myCookies = {} - for key, value in response.cookies.items(): - myCookies[key] = value - jsonCookies = json.dumps(myCookies) - with open(settings.COOKIES_FILENAME, mode = 'a', encoding = 'utf-8') as fd: - fd.write(jsonCookies) - print("Cookies saved!") + def write2html(res): + filename = 'historyPrice.html' + with open(filename, mode = 'w+', encoding='utf-8') as fd: + fd.write(res) try: - res = requests.get(url, headers = headers) + session = requests.Session() + res = session.get(url, headers = headers) res.raise_for_status() # 判断是不是200 + # print(res.request.headers) res.encoding = res.apparent_encoding - print(res.cookies) - saveCookies(res) - return res + res = etree.HTML(res.text) + source = etree.tostring(res, encoding = 'utf-8', pretty_print = True, method = 'html').decode('utf-8') + # write2html(res) + return source except BaseException as e: print(e) print("sth wrong in your downloader.useRequests. Exiting...") exit() - if __name__ == "__main__": - # jdurl = r"https://item.jd.com/10036840192083.html" - jdurl = r"https://item.jd.com/59162092942.html" + jdurl = r"https://item.jd.com/10036840192083.html" url = r"https://www.vveby.com/search?keyword=" + jdurl print(url) - with open('newhistoryPrice.html', 'w+', encoding = 'utf-8') as fd: - fd.write(useRequests(url).text) + useRequests(url) print('done') \ No newline at end of file diff --git a/historyPrice.py b/historyPrice.py index 9a78e92..ed43b00 100644 --- a/historyPrice.py +++ b/historyPrice.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- +from lxml import etree import settings import downloader -''' class historyPriceItem: - def __init__(self, url, ): + def __init__(self, id): + self.url = if __name__ == '__main__': pass -''' diff --git a/pipelines.py b/pipelines.py index 7f44047..10ddd5f 100644 --- a/pipelines.py +++ b/pipelines.py @@ -8,7 +8,7 @@ def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象 if isElementTree(response): return response if gethtml_mode == "cache": - html = etree.parse(response, etree.HTMLParser(encoding='utf-8')) + html = etree.parse(response, etree.HTMLParser(encoding = 'utf-8')) elif gethtml_mode == "url": html = etree.HTML(response) else: diff --git a/settings.py b/settings.py index 520f830..f482d73 100644 --- a/settings.py +++ b/settings.py @@ -38,4 +38,6 @@ USER_AGENT = [ 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' ] -COOKIES_FILENAME = "cookies.json" \ No newline at end of file +COOKIES_FILENAME = "cookies.json" + +# \ No newline at end of file