branch: master
author: wkyuu, 3 years ago
parent f014993c2b
commit 43423c6ab3

@@ -1,6 +1,6 @@
# milkSpider
-selenium + redis + distributed + xpath + etree + visualization
+selenium + redis + Matplotlib
Task: crawl the names, descriptions, and prices of the various milk products on sale on JD.com, and show the corresponding price fluctuation trend with Python visualization. A scheduled task crawls automatically.
@@ -13,7 +13,6 @@ selenium + redis + distributed + xpath + etree + visualization
- [x] Two planned modes via terminal interaction: pick an item at random or use the review count as the index, then show the selected item's details, such as its price trend
- [x] Catalogue selection with a friendly interactive experience
- [x] Choose the primary ranking criterion (price or reviews)
-- [ ] Package the Python program as an exe; a GUI would be needed
## project
@@ -235,7 +234,7 @@ print(res.text)
```python
# exactly match a floating-point number
-reg = [-+]?[0-9]*\.?[0-9]*
+reg = "[-+]?[0-9]*\.?[0-9]*"
```
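For context on the pattern above: every part of it is optional, so it also matches the empty string. A small illustrative snippet (the sample text is made up, not from the repository) shows how the empty matches can be filtered out when extracting a number:

```python
import re

reg = r"[-+]?[0-9]*\.?[0-9]*"

# findall also yields zero-length matches because every part of the pattern is
# optional, so drop the empty strings when pulling a number out of raw text
text = "price: 52.80 yuan"
numbers = [m for m in re.findall(reg, text) if m]
print(numbers)                              # ['52.80']

# fullmatch checks that an entire string is a float
print(bool(re.fullmatch(reg, "-3.14")))     # True
print(bool(re.fullmatch(reg, "abc")))       # False
```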
### Threading

@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-import string
from lxml import etree
import downloader
import pipelines
import settings
class historyPriceItem:
    def __init__(self, id):
        self.url = settings.HISTORY_PRICE_URL + str(id)

@@ -18,13 +18,13 @@ FILENAME_CSV = settings.FILENAME_CSV
connection_pool = redis.ConnectionPool(host = REDIS_HOST, port = REDIS_PORT, password = REDIS_PASSWORD, decode_responses = True)
redisconn = redis.Redis(connection_pool = connection_pool)
-def getCategory(url) -> str:
+def getCategory(url) -> str: # determine which catalogue entry the given url belongs to, so the matching file can be opened later
    for urlstr in BASEURL.items():
        if urlstr[1] in url: return urlstr[0]
    print("can't get a valid baseurl! Check your settings.BASEURL.")
    exit()
-def geturlList(baseurl) -> list:
+def geturlList(baseurl) -> list: # build the sequence of page urls for a catalogue entry, to be stored in the redis database
    urlList = []
    for i in range(1, 20, 2): # crawl 10 pages
        url = baseurl + r"&page=" + str(i)
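As a hedged sketch of how the page urls built by geturlList might be handed to redis for workers to consume: the key name `milkSpider:urls`, the placeholder `baseurl`, and the connection settings below are assumptions for illustration, not values taken from settings.py.

```python
# Sketch only: queue generated page urls in a redis list so crawler workers
# can pop them one by one. Key name and connection details are assumed.
import redis

r = redis.Redis(host="127.0.0.1", port=6379, decode_responses=True)

baseurl = "https://example.com/list?cat=milk"                    # placeholder catalogue url
urls = [baseurl + "&page=" + str(i) for i in range(1, 20, 2)]    # same pagination as geturlList

r.rpush("milkSpider:urls", *urls)       # producer side: push all page urls
next_url = r.lpop("milkSpider:urls")    # worker side: take the next url to crawl
print(next_url)
```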

@@ -74,7 +74,7 @@ class view:
        break
    show(itemList)
-def listCatalogues():
+def listCatalogues(): # list the entries under the cache directory
    path = r"./Catalogues/"
    if not os.path.exists(path):
        print("No data under the current directory [{}]!".format(path))
@@ -85,7 +85,7 @@ def listCatalogues():
        fileList.append(path + filename)
    return True, len(fileList), fileList
-def getData(filename, catalogue):
+def getData(filename, catalogue): # choose how the data should be read
    while True:
        print("# The currently selected catalogue is {} .".format(catalogue))
        milkSpider.showBanner(menu = "view")
