update via git

master
wkyuu 3 years ago
parent 030474ebd1
commit 4d593a3d23

@ -8,10 +8,9 @@ selenium + redis + 分布式 + xpath + etree + 可视化
- [x] 初始化 selenium 框架,编写好相应的爬取规则,初步实现小规模爬取内容
- [x] 从历史价格网页爬取历史价格
- [ ] 同时/后期追加 存入csv 价格趋势,涨跌幅。比对,给出价格波动趋势
- [x] 加入Redis分布式设计
- [ ] 数据可视化
- [ ] 预计两种模式,终端交互,随机或取评价数为索引,目标给出取出的 item 的具体信息,例如价格趋势
- [x] 数据可视化
- [ ] 预计两种模式,终端交互,随机或取评价数为索引,目标给出取出的 item 的具体信息,例如价格趋势
- [ ] 选择目录,友好的选择交互体验
- [ ] 选择抽取 item 模式:热评就列出前五条,随机就随机取一条
- [ ] python打包exe需要图形化界面
@ -31,39 +30,10 @@ selenium + redis + 分布式 + xpath + etree + 可视化
> > milkSpider.py 主文件,配置爬取设置,自动化等
> >
> > historyPrice.py 爬取历史价格
### selenium
配置下载器利用selenium模拟浏览器正常浏览行为
```python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
def getsource(url):
    """Load ``url`` in headless Chrome and return the rendered page as HTML text.

    The page source reported by Selenium is parsed with ``lxml.etree`` and
    serialised back to a pretty-printed UTF-8 HTML string.

    Args:
        url: Address of the page to load.

    Returns:
        str: The page markup after the lxml parse/serialise round trip.
    """
    init = Options()
    init.add_argument('--no-sandbox')
    init.add_argument('--headless')
    init.add_argument('--disable-gpu')
    init.add_argument("disable-cache")
    init.add_argument('disable-infobars')
    init.add_argument('log-level=3')  # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
    init.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])

    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(options=init)
    try:
        driver.implicitly_wait(10)
        driver.get(url)
        # Copy the markup out while the session is still alive.
        page_source = driver.page_source
    finally:
        # quit() ends the whole browser session, and running it in `finally`
        # means a failed page load does not leave a browser behind.
        driver.quit()

    response = etree.HTML(page_source)
    response = etree.tostring(response, encoding="utf-8", pretty_print=True, method="html")
    return response.decode('utf-8')
```
> >
> > view.py 读取并解析数据,配置可视化内容
> >
> > settings.py 主要配置文件
## 安装,初始化
@ -92,6 +62,8 @@ git push -u origin master # push, 出错就 -f(注意会造成不可回避的损
### selenium
配置下载器利用selenium模拟浏览器正常浏览行为
安装
```powershell
@ -102,24 +74,35 @@ pip3 install selenium
pip show selenium
```
调用时导入的内容
调用
```python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Build the Chrome options that are passed to the driver created below.
chrome_options = Options()
# chrome_options.add_argument('lang=zh_CN.UTF-8') # set the browser language to Chinese
chrome_options.add_argument('--headless') # run without a visible window
chrome_options.add_argument('--no-sandbox') # works around the "DevToolsActivePort file doesn't exist" error
chrome_options.add_argument('--disable-gpu') # disable GPU hardware acceleration; if no software renderer is in place the GPU process will not start
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--window-size=1920,1080') # set the width and height of the window
# Start Chrome through the 'chromedriver' executable with the options above.
driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
#driver = webdriver.Chrome()
# Address to load; left empty in this snippet.
url = ""
from lxml import etree
def getsource(url):
    """Load ``url`` in headless Chrome and return the rendered page as HTML text.

    The page source reported by Selenium is parsed with ``lxml.etree`` and
    serialised back to a pretty-printed UTF-8 HTML string.

    Args:
        url: Address of the page to load.

    Returns:
        str: The page markup after the lxml parse/serialise round trip.
    """
    init = Options()
    init.add_argument('--no-sandbox')
    init.add_argument('--headless')
    init.add_argument('--disable-gpu')
    init.add_argument("disable-cache")
    init.add_argument('disable-infobars')
    init.add_argument('log-level=3')  # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
    init.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])

    # `options=` is the supported keyword; `chrome_options=` is deprecated.
    driver = webdriver.Chrome(options=init)
    try:
        driver.implicitly_wait(10)
        driver.get(url)
        # The markup must be read before the session is shut down; reading
        # driver.page_source after quit() fails because the session is gone.
        page_source = driver.page_source
    finally:
        # Shut the browser down exactly once, even when loading fails.
        driver.quit()

    response = etree.HTML(page_source)
    response = etree.tostring(response, encoding="utf-8", pretty_print=True, method="html")
    return response.decode('utf-8')
```
一些备忘录
@ -215,8 +198,6 @@ def getFont(): # 列出可用的字体
plt.rcParams['font.family'] = ['Microsoft YaHei']
```
### Requests
经典老碟
@ -224,6 +205,14 @@ plt.rcParams['font.family'] = ['Microsoft YaHei']
```python
import requests
# Send a desktop browser User-Agent string with the request.
headers = { "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}
# Address to fetch; left empty in this snippet.
url = ""
# Issue the GET through a Session object rather than the module-level helper.
session = requests.Session()
res = session.get(url, headers = headers)
# print(res.request.headers)
# Decode the body with the encoding detected from its content.
res.encoding = res.apparent_encoding # 'utf-8'
print(res.text)
```
### 正则表达式
@ -233,8 +222,6 @@ import requests
reg = [-+]?[0-9]*\.?[0-9]*
```
### 线程
多线程,手动版

@ -44,11 +44,19 @@ def view():
return
# Interactive entry point for the crawler: ask for confirmation, then call
# middlewares.mainThread() and print how long it took.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# statements below is inferred from their order only — confirm against the file.
def milkSpider():
# The prompt ends without a newline (end = ''), so the answer is typed on the same line.
print("注意调用milkSpider将启动selenium以及requests进程因为爬取数据量较大\n往往会占用较多时间,确定吗?[c]continue or [q]quit", end = '')
flag = str(input())
# Only the exact answer "q" cancels; any other input (not just "c") continues.
if flag == "q":
print("取消")
return
# The crawl is started only when middlewares.precheck() returns a truthy value.
if middlewares.precheck():
start_time = time.time()
middlewares.mainThread()
# Elapsed time since start_time, rounded to two decimal places.
print("Totally spend " + str(round(time.time() - start_time, 2)) + " seconds")
# NOTE(review): cannot tell from here whether this line is inside the `if` above — confirm.
print("milkSpider done.")
return
def aexit():
print("bye!")

@ -52,13 +52,13 @@ FONT = ['Microsoft YaHei']
BANNER = {
"main": '''
#================*main*=================#
# 1.主界面
# 2.介绍
# 1.主界面 [x]
# 2.介绍 [x]
# 3.数据可视化
# 4.向Redis中填充数据
# 5.清空Redis队列缓存
# 6.调用 milkSpider
# 7.退出
# 4.向Redis中填充数据 [x]
# 5.清空 Redis 队列缓存 [x]
# 6.调用 milkSpider [x]
# 7.退出 [x]
#========================================#
''',
"introduce": '''

@ -61,4 +61,5 @@ class view:
break
show(itemList)
def getData()
def getData():
pass
Loading…
Cancel
Save