diff --git a/README.md b/README.md index 68968fe..938265b 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,9 @@ selenium + redis + 分布式 + xpath + etree + 可视化 - [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容 - [x] 从历史价格网页爬取历史价格 - - [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势 - [x] 加入Redis分布式设计 -- [ ] 数据可视化 - - [ ] 预计两种模式(终端交互):随机或取评价数为索引目标,给出取出的item的具体信息,例如价格趋势。 +- [x] 数据可视化 + - [ ] 预计两种模式(终端交互):随机或取评价数为索引目标,给出取出的item的具体信息,例如价格趋势 - [ ] 选择目录,友好的选择交互体验 - [ ] 选择抽取item模式(热评就列出前五条,随机就随机取一条) - [ ] python打包exe,需要图形化界面? @@ -31,39 +30,10 @@ selenium + redis + 分布式 + xpath + etree + 可视化 > > milkSpider.py 主文件,配置爬取设置,自动化等 > > > > historyPrice.py 爬取历史价格 - -### selenium - -配置下载器,利用selenium模拟浏览器正常浏览行为 - -```python -# -*- coding: utf-8 -*- -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from lxml import etree - -def getsource(url): - init = Options() - - init.add_argument('--no-sandbox') - init.add_argument('--headless') - init.add_argument('--disable-gpu') - init.add_argument("disable-cache") - init.add_argument('disable-infobars') - init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 - init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) - - driver = webdriver.Chrome(chrome_options = init) - driver.implicitly_wait(10) - driver.get(url) - - response = etree.HTML(driver.page_source) - response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html") - response = response.decode('utf-8') - - driver.close() - return response -``` +> > +> > view.py 读取并解析数据,配置可视化内容 +> > +> > settings.py 主要配置文件 ## 安装,初始化 @@ -92,6 +62,8 @@ git push -u origin master # push, 出错就 -f(注意会造成不可回避的损 ### selenium +配置下载器,利用selenium模拟浏览器正常浏览行为 + 安装 ```powershell @@ -102,24 +74,35 @@ pip3 install selenium pip show selenium ``` -调用时导入的内容 +调用 ```python +# -*- coding: utf-8 -*- from selenium import webdriver from selenium.webdriver.chrome.options import Options -chrome_options = 
Options() -# chrome_options.add_argument('lang=zh_CN.UTF-8') # 设置中文 -chrome_options.add_argument('--headless') # 无界面 -chrome_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在报错问题 -chrome_options.add_argument('--disable-gpu') # 禁用GPU硬件加速。如果软件渲染器没有就位,则GPU进程将不会启动。 -chrome_options.add_argument('--disable-dev-shm-usage') -chrome_options.add_argument('--window-size=1920,1080') # 设置当前窗口的宽度和高度 -driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options) -#driver = webdriver.Chrome() -url = "" -driver.get(url) -print(driver.page_source) -driver.quit() +from lxml import etree + +def getsource(url): + init = Options() + + init.add_argument('--no-sandbox') + init.add_argument('--headless') + init.add_argument('--disable-gpu') + init.add_argument("disable-cache") + init.add_argument('disable-infobars') + init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0 + init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging']) + + driver = webdriver.Chrome(chrome_options = init) + driver.implicitly_wait(10) + driver.get(url) + + response = etree.HTML(driver.page_source) + response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html") + response = response.decode('utf-8') + + driver.close() + return response ``` 一些备忘录 @@ -215,8 +198,6 @@ def getFont(): # 列出可用的字体 plt.rcParams['font.family'] = ['Microsoft YaHei'] ``` - - ### Requests 经典老碟 @@ -224,6 +205,14 @@ plt.rcParams['font.family'] = ['Microsoft YaHei'] ```python import requests +headers = { "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"} +url = "" + +session = requests.Session() +res = session.get(url, headers = headers) +# print(res.request.headers) +res.encoding = res.apparent_encoding # 'utf-8' +print(res.text) ``` ### 正则表达式 @@ -233,8 +222,6 @@ import requests reg = [-+]?[0-9]*\.?[0-9]* ``` - - ### 线程 多线程,手动版 
diff --git a/milkSpider.py b/milkSpider.py index ac58299..e29bd7b 100644 --- a/milkSpider.py +++ b/milkSpider.py @@ -44,11 +44,19 @@ def view(): return def milkSpider(): + + print("注意:调用milkSpider将启动selenium以及requests进程,因为爬取数据量较大,\n往往会占用较多时间,确定吗?[c]continue or [q]quit:", end = '') + flag = str(input()) + if flag == "q": + print("取消") + return + if middlewares.precheck(): start_time = time.time() middlewares.mainThread() print("Totally spend " + str(round(time.time() - start_time, 2)) + " seconds") print("milkSpider done.") + return def aexit(): print("bye!") diff --git a/settings.py b/settings.py index 8f38a2b..d23b4b7 100644 --- a/settings.py +++ b/settings.py @@ -52,13 +52,13 @@ FONT = ['Microsoft YaHei'] BANNER = { "main": ''' #================*main*=================# -# 1.主界面 -# 2.介绍 +# 1.主界面 [x] +# 2.介绍 [x] # 3.数据可视化 -# 4.向Redis中填充数据 -# 5.清空Redis队列缓存 -# 6.调用 milkSpider -# 7.退出 +# 4.向Redis中填充数据 [x] +# 5.清空 Redis 队列缓存 [x] +# 6.调用 milkSpider [x] +# 7.退出 [x] #========================================# ''', "introduce": ''' diff --git a/view.py b/view.py index 5857d6a..566cdb5 100644 --- a/view.py +++ b/view.py @@ -61,4 +61,5 @@ class view: break show(itemList) -def getData() \ No newline at end of file +def getData(): + pass \ No newline at end of file