添加历史内容爬取依赖

3 years ago · e1ba325f86
parent 46dcfdb879
commit e1ba325f86
5 changed files with 31 additions and 26 deletions
--- a/README.md
+++ b/README.md
@@ -7,10 +7,14 @@ selenium + redis + 分布式 + xpath + etree + 可视化
 ## TODO
 -   [x] 初始化 selenium 框架，编写好相应的爬取规则，初步实现小规模爬取内容
-   [ ] 从历史价格网页爬取历史价格，比对，给出价格波动趋势
+-   [x] 从历史价格网页爬取历史价格
    -   [ ] 同时/后期追加 存入csv 价格趋势，涨跌幅。比对，给出价格波动趋势
 -   [x] 加入Redis分布式设计
 -   [ ] 数据可视化
-   [ ] 定时，自动化爬取
+    -   [ ] 使用python终端绘图，需要解决如何选取想要展示的条例
 -   [ ] python打包exe，需要图形化界面？
 ## project
@@ -26,6 +30,7 @@ selenium + redis + 分布式 + xpath + etree + 可视化
 >   >
 >   >   milkSpider.py	主文件，配置爬取设置，自动化等
 >   >
 >   >   historyPrice.py	爬取历史价格
 ### selenium
@@ -169,7 +174,6 @@ def saveCookies(driver):
    jsonCookies = json.dumps(driver.get_cookies())
    with open('cookies.json', 'w', encoding='utf-8') as fd:
        fd.write(jsonCookies)
    fd.close()
 ```
--- a/downloader.py
+++ b/downloader.py
@@ -4,12 +4,15 @@ from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from lxml import etree
 import random
 from zmq import proxy
 import settings
 import requests
 import json
 headers = {
-    'User-Agent': random.choice(settings.USER_AGENT)
+    #'User-Agent': random.choice(settings.USER_AGENT)
    'User-Agent': settings.USER_AGENT[1]
 }
 def getsource(url):
@@ -36,34 +39,30 @@ def getsource(url):
 def useRequests(url):
-    def saveCookies(response):
+    def write2html(res):
-        myCookies = {}
+        filename = 'historyPrice.html'
-        for key, value in response.cookies.items():
+        with open(filename, mode = 'w+', encoding='utf-8') as fd:
-            myCookies[key] = value
+            fd.write(res)
        jsonCookies = json.dumps(myCookies)
        with open(settings.COOKIES_FILENAME, mode = 'a', encoding = 'utf-8') as fd:
            fd.write(jsonCookies)
        print("Cookies saved!")
    try:
-        res = requests.get(url, headers = headers)
+        session = requests.Session()
        res = session.get(url, headers = headers)
        res.raise_for_status()  # 判断是不是200
        # print(res.request.headers)
        res.encoding = res.apparent_encoding
-        print(res.cookies)
+        res = etree.HTML(res.text)
-        saveCookies(res)
+        source = etree.tostring(res, encoding = 'utf-8', pretty_print = True, method = 'html').decode('utf-8')
-        return res
+        # write2html(res)
        return source
    except BaseException as e:
        print(e)
        print("sth wrong in your downloader.useRequests. Exiting...")
        exit()
 if __name__ == "__main__":
-    # jdurl = r"https://item.jd.com/10036840192083.html"
+    jdurl = r"https://item.jd.com/10036840192083.html"
    jdurl = r"https://item.jd.com/59162092942.html"
    url = r"https://www.vveby.com/search?keyword=" + jdurl
    print(url)
-    with open('newhistoryPrice.html', 'w+', encoding = 'utf-8') as fd:
+    useRequests(url)
        fd.write(useRequests(url).text)
    print('done')
--- a/historyPrice.py
+++ b/historyPrice.py
@@ -1,16 +1,16 @@
 # -*- coding: utf-8 -*-
 from lxml import etree
 import settings
 import downloader
 '''
 class historyPriceItem:
-    def __init__(self, url, ):
+    def __init__(self, id):
        self.url = 
 if __name__ == '__main__':
    pass
 '''
--- a/pipelines.py
+++ b/pipelines.py
@@ -8,7 +8,7 @@ def gethtml(response, gethtml_mode = "url"):  # 用etree格式化得到的对象
        if isElementTree(response):
            return response
        if gethtml_mode == "cache":
-            html = etree.parse(response, etree.HTMLParser(encoding='utf-8'))
+            html = etree.parse(response, etree.HTMLParser(encoding = 'utf-8'))
        elif gethtml_mode == "url":
            html = etree.HTML(response)
        else:
--- a/settings.py
+++ b/settings.py
@@ -38,4 +38,6 @@ USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
 ]
-COOKIES_FILENAME = "cookies.json"
+COOKIES_FILENAME = "cookies.json"
 #