From e1ba325f86034f14844bd5fe34d87b1c69c0cce5 Mon Sep 17 00:00:00 2001 From: wkyuu Date: Tue, 19 Apr 2022 15:07:19 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8E=86=E5=8F=B2=E5=86=85?= =?UTF-8?q?=E5=AE=B9=E7=88=AC=E5=8F=96=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 +++++++--- downloader.py | 35 +++++++++++++++++------------------ historyPrice.py | 6 +++--- pipelines.py | 2 +- settings.py | 4 +++- 5 files changed, 31 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 0030ed3..46c2a6f 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,14 @@ selenium + redis + 分布式 + xpath + etree + 可视化 ## TODO - [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容 -- [ ] 从历史价格网页爬取历史价格,比对,给出价格波动趋势 +- [x] 从历史价格网页爬取历史价格 + - [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势 + - [x] 加入Redis分布式设计 - [ ] 数据可视化 -- [ ] 定时,自动化爬取 + - [ ] 使用python终端绘图,需要解决如何选取想要展示的条例 + +- [ ] python打包exe,需要图形化界面? ## project @@ -26,6 +30,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化 > > > > milkSpider.py 主文件,配置爬取设置,自动化等 > > +> > historyPrice.py 爬取历史价格 ### selenium @@ -169,7 +174,6 @@ def saveCookies(driver): jsonCookies = json.dumps(driver.get_cookies()) with open('cookies.json', 'w', encoding='utf-8') as fd: fd.write(jsonCookies) - fd.close() ``` diff --git a/downloader.py b/downloader.py index 5af41fd..067b540 100644 --- a/downloader.py +++ b/downloader.py @@ -4,12 +4,15 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from lxml import etree import random + +from zmq import proxy import settings import requests import json headers = { - 'User-Agent': random.choice(settings.USER_AGENT) + #'User-Agent': random.choice(settings.USER_AGENT) + 'User-Agent': settings.USER_AGENT[1] } def getsource(url): @@ -36,34 +39,30 @@ def getsource(url): def useRequests(url): - def saveCookies(response): - myCookies = {} - for key, value in response.cookies.items(): - myCookies[key] = value - jsonCookies = json.dumps(myCookies) - with open(settings.COOKIES_FILENAME, mode = 'a', encoding = 'utf-8') as fd: - fd.write(jsonCookies) - print("Cookies saved!") + def write2html(res): + filename = 'historyPrice.html' + with open(filename, mode = 'w+', encoding='utf-8') as fd: + fd.write(res) try: - res = requests.get(url, headers = headers) + session = requests.Session() + res = session.get(url, headers = headers) res.raise_for_status() # 判断是不是200 + # print(res.request.headers) res.encoding = res.apparent_encoding - print(res.cookies) - saveCookies(res) - return res + res = etree.HTML(res.text) + source = etree.tostring(res, encoding = 'utf-8', pretty_print = True, method = 'html').decode('utf-8') + # write2html(res) + return source except BaseException as e: print(e) print("sth wrong in your downloader.useRequests. Exiting...") exit() - if __name__ == "__main__": - # jdurl = r"https://item.jd.com/10036840192083.html" - jdurl = r"https://item.jd.com/59162092942.html" + jdurl = r"https://item.jd.com/10036840192083.html" url = r"https://www.vveby.com/search?keyword=" + jdurl print(url) - with open('newhistoryPrice.html', 'w+', encoding = 'utf-8') as fd: - fd.write(useRequests(url).text) + useRequests(url) print('done') \ No newline at end of file diff --git a/historyPrice.py b/historyPrice.py index 9a78e92..ed43b00 100644 --- a/historyPrice.py +++ b/historyPrice.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- +from lxml import etree import settings import downloader -''' class historyPriceItem: - def __init__(self, url, ): + def __init__(self, id): + self.url = if __name__ == '__main__': pass -''' diff --git a/pipelines.py b/pipelines.py index 7f44047..10ddd5f 100644 --- a/pipelines.py +++ b/pipelines.py @@ -8,7 +8,7 @@ def gethtml(response, gethtml_mode = "url"): # 用etree格式化得到的对象 if isElementTree(response): return response if gethtml_mode == "cache": - html = etree.parse(response, etree.HTMLParser(encoding='utf-8')) + html = etree.parse(response, etree.HTMLParser(encoding = 'utf-8')) elif gethtml_mode == "url": html = etree.HTML(response) else: diff --git a/settings.py b/settings.py index 520f830..f482d73 100644 --- a/settings.py +++ b/settings.py @@ -38,4 +38,6 @@ USER_AGENT = [ 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' ] -COOKIES_FILENAME = "cookies.json" \ No newline at end of file +COOKIES_FILENAME = "cookies.json" + +# \ No newline at end of file