|
|
|
@ -8,10 +8,9 @@ selenium + redis + 分布式 + xpath + etree + 可视化
|
|
|
|
|
|
|
|
|
|
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
|
|
|
|
|
- [x] 从历史价格网页爬取历史价格
|
|
|
|
|
- [ ] 同时/后期追加:将价格趋势,涨跌幅存入csv,并进行比对,给出价格波动趋势
|
|
|
|
|
- [x] 加入Redis分布式设计
|
|
|
|
|
- [ ] 数据可视化
|
|
|
|
|
- [ ] 预计两种模式(终端交互):随机或取评价数为索引目标,给出取出的item的具体信息,例如价格趋势。
|
|
|
|
|
- [x] 数据可视化
|
|
|
|
|
- [ ] 预计两种模式(终端交互):随机或取评价数为索引目标,给出取出的item的具体信息,例如价格趋势
|
|
|
|
|
- [ ] 选择目录,友好的选择交互体验
|
|
|
|
|
- [ ] 选择抽取item模式(热评就列出前五条,随机就随机取一条)
|
|
|
|
|
- [ ] python打包exe,需要图形化界面?
|
|
|
|
@ -31,39 +30,10 @@ selenium + redis + 分布式 + xpath + etree + 可视化
|
|
|
|
|
> > milkSpider.py 主文件,配置爬取设置,自动化等
|
|
|
|
|
> >
|
|
|
|
|
> > historyPrice.py 爬取历史价格
|
|
|
|
|
|
|
|
|
|
### selenium
|
|
|
|
|
|
|
|
|
|
配置下载器,利用selenium模拟浏览器正常浏览行为
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
def getsource(url):
|
|
|
|
|
init = Options()
|
|
|
|
|
|
|
|
|
|
init.add_argument('--no-sandbox')
|
|
|
|
|
init.add_argument('--headless')
|
|
|
|
|
init.add_argument('--disable-gpu')
|
|
|
|
|
init.add_argument("disable-cache")
|
|
|
|
|
init.add_argument('disable-infobars')
|
|
|
|
|
init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
|
|
|
|
|
init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging'])
|
|
|
|
|
|
|
|
|
|
driver = webdriver.Chrome(chrome_options = init)
|
|
|
|
|
driver.implicitly_wait(10)
|
|
|
|
|
driver.get(url)
|
|
|
|
|
|
|
|
|
|
response = etree.HTML(driver.page_source)
|
|
|
|
|
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html")
|
|
|
|
|
response = response.decode('utf-8')
|
|
|
|
|
|
|
|
|
|
driver.close()
|
|
|
|
|
return response
|
|
|
|
|
```
|
|
|
|
|
> >
|
|
|
|
|
> > view.py 读取并解析数据,配置可视化内容
|
|
|
|
|
> >
|
|
|
|
|
> > settings.py 主要配置文件
|
|
|
|
|
|
|
|
|
|
## 安装,初始化
|
|
|
|
|
|
|
|
|
@ -92,6 +62,8 @@ git push -u origin master # push, 出错就 -f(注意会造成不可回避的损
|
|
|
|
|
|
|
|
|
|
### selenium
|
|
|
|
|
|
|
|
|
|
配置下载器,利用selenium模拟浏览器正常浏览行为
|
|
|
|
|
|
|
|
|
|
安装
|
|
|
|
|
|
|
|
|
|
```powershell
|
|
|
|
@ -102,24 +74,35 @@ pip3 install selenium
|
|
|
|
|
pip show selenium
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
调用时导入的内容
|
|
|
|
|
调用
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
from selenium import webdriver
|
|
|
|
|
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
chrome_options = Options()
|
|
|
|
|
# chrome_options.add_argument('lang=zh_CN.UTF-8') # 设置中文
|
|
|
|
|
chrome_options.add_argument('--headless') # 无界面
|
|
|
|
|
chrome_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在报错问题
|
|
|
|
|
chrome_options.add_argument('--disable-gpu') # 禁用GPU硬件加速。如果软件渲染器没有就位,则GPU进程将不会启动。
|
|
|
|
|
chrome_options.add_argument('--disable-dev-shm-usage')
|
|
|
|
|
chrome_options.add_argument('--window-size=1920,1080') # 设置当前窗口的宽度和高度
|
|
|
|
|
driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
|
|
|
|
|
#driver = webdriver.Chrome()
|
|
|
|
|
url = ""
|
|
|
|
|
from lxml import etree
|
|
|
|
|
|
|
|
|
|
def getsource(url):
|
|
|
|
|
init = Options()
|
|
|
|
|
|
|
|
|
|
init.add_argument('--no-sandbox')
|
|
|
|
|
init.add_argument('--headless')
|
|
|
|
|
init.add_argument('--disable-gpu')
|
|
|
|
|
init.add_argument("disable-cache")
|
|
|
|
|
init.add_argument('disable-infobars')
|
|
|
|
|
init.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
|
|
|
|
|
init.add_experimental_option("excludeSwitches",['enable-automation','enable-logging'])
|
|
|
|
|
|
|
|
|
|
driver = webdriver.Chrome(chrome_options = init)
|
|
|
|
|
driver.implicitly_wait(10)
|
|
|
|
|
driver.get(url)
|
|
|
|
|
print(driver.page_source)
|
|
|
|
|
driver.quit()
|
|
|
|
|
|
|
|
|
|
response = etree.HTML(driver.page_source)
|
|
|
|
|
response = etree.tostring(response, encoding = "utf-8", pretty_print = True, method = "html")
|
|
|
|
|
response = response.decode('utf-8')
|
|
|
|
|
|
|
|
|
|
driver.close()
|
|
|
|
|
return response
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
一些备忘录
|
|
|
|
@ -215,8 +198,6 @@ def getFont(): # 列出可用的字体
|
|
|
|
|
plt.rcParams['font.family'] = ['Microsoft YaHei']
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Requests
|
|
|
|
|
|
|
|
|
|
经典老碟
|
|
|
|
@ -224,6 +205,14 @@ plt.rcParams['font.family'] = ['Microsoft YaHei']
|
|
|
|
|
```python
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
headers = { "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}
|
|
|
|
|
url = ""
|
|
|
|
|
|
|
|
|
|
session = requests.Session()
|
|
|
|
|
res = session.get(url, headers = headers)
|
|
|
|
|
# print(res.request.headers)
|
|
|
|
|
res.encoding = res.apparent_encoding # 'utf-8'
|
|
|
|
|
print(res.text)
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
### 正则表达式
|
|
|
|
@ -233,8 +222,6 @@ import requests
|
|
|
|
|
reg = [-+]?[0-9]*\.?[0-9]*
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### 线程
|
|
|
|
|
|
|
|
|
|
多线程,手动版
|
|
|
|
|