@ -0,0 +1 @@
|
||||
*.js linguist-vendored
|
@ -0,0 +1,9 @@
|
||||
.git
|
||||
.venv
|
||||
__pycache__
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.Python
|
||||
env
|
||||
data
|
@ -0,0 +1,53 @@
|
||||
*.py[cod]
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Packages
|
||||
*.egg
|
||||
*.egg-info
|
||||
dist
|
||||
build
|
||||
eggs
|
||||
parts
|
||||
bin
|
||||
var
|
||||
sdist
|
||||
develop-eggs
|
||||
.installed.cfg
|
||||
lib
|
||||
lib64
|
||||
__pycache__
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
.coverage
|
||||
.tox
|
||||
nosetests.xml
|
||||
.pytest_cache
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
# Mr Developer
|
||||
.mr.developer.cfg
|
||||
.project
|
||||
.pydevproject
|
||||
|
||||
# temp file
|
||||
.DS_Store
|
||||
*.pkl
|
||||
|
||||
# venv
|
||||
.venv/
|
||||
|
||||
# Cookiecutter
|
||||
output/
|
||||
|
||||
# vscode
|
||||
.vscode
|
||||
|
||||
# notebooks
|
||||
notebooks/
|
@ -0,0 +1,55 @@
|
||||
FROM python:3.7.4-slim as base
|
||||
|
||||
# Create app directory
|
||||
WORKDIR /app
|
||||
|
||||
# Install app dependencies
|
||||
COPY ./docker/sources.list .
|
||||
|
||||
RUN mv /etc/apt/sources.list /etc/apt/sources.list.bak && mv ./sources.list /etc/apt/
|
||||
|
||||
RUN apt-get -o Acquire::Check-Valid-Until=false update \
|
||||
&& apt-get install \
|
||||
--no-install-recommends --yes \
|
||||
build-essential libpq-dev cron git \
|
||||
python3-dev --yes
|
||||
|
||||
FROM base as build
|
||||
|
||||
COPY requirements.txt .
|
||||
|
||||
RUN mkdir /install
|
||||
|
||||
RUN pip download --destination-directory /install -r /app/requirements.txt -i https://pypi.douban.com/simple
|
||||
|
||||
FROM python:3.7.4-slim as release
|
||||
|
||||
COPY ./docker/sources.list .
|
||||
|
||||
RUN mv /etc/apt/sources.list /etc/apt/sources.list.bak && mv ./sources.list /etc/apt/
|
||||
|
||||
RUN apt-get update && apt-get -y install cron git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY --from=build /install /install
|
||||
|
||||
COPY requirements.txt .
|
||||
|
||||
RUN pip install --no-index --find-links=/install -r requirements.txt
|
||||
|
||||
RUN mkdir /app/docker
|
||||
|
||||
COPY docker/entry.sh /app/docker/
|
||||
|
||||
RUN touch /var/log/bustag.log
|
||||
|
||||
RUN rm -rf /install && rm -rf /root/.cache/pip
|
||||
|
||||
RUN chmod 755 /app/docker/*.sh
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
LABEL maintainer="gxtrobot <gxtrobot@gmail.com>"
|
||||
|
||||
CMD ["/app/docker/entry.sh"]
|
@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2019 gxtrobot
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -0,0 +1,20 @@
|
||||
PYTHON3=python3
|
||||
|
||||
javbus:
|
||||
$(PYTHON3) -m bustag.main download
|
||||
|
||||
recommend:
|
||||
$(PYTHON3) -m bustag.main recommend
|
||||
|
||||
build:
|
||||
docker build -t bustag-app-dev .
|
||||
|
||||
run:
|
||||
docker run --rm -d -v `pwd`/data:/app/data -p 8080:8080 bustag-app-dev
|
||||
|
||||
server:
|
||||
$(PYTHON3) bustag/app/index.py
|
||||
|
||||
publish:
|
||||
docker tag bustag-app-dev gxtrobot/bustag-app:latest
|
||||
docker push gxtrobot/bustag-app:latest
|
@ -1,2 +1,209 @@
|
||||
# bustag-master
|
||||
# 基于机器学习的老司机车牌自动推荐系统
|
||||
<img src="./bustag/app/static/images/logo.png" width="300">
|
||||
|
||||
**Bustag** 是一个基于我开发的 python 异步爬虫框架开发[aspider](https://github.com/gxtrobot/aspider)的自动车牌推荐系统, 系统原理为定时爬取最新车牌信息, 然后可以对车牌进行打标(标示是否喜欢), 打标车牌到一定数量可以进行训练并生成模型, 以后就可以基于此模型自动对下载的车牌进行预测是否喜欢, 可以过滤掉大量不喜欢的车牌, 节约时间
|
||||
|
||||
### Python in Action 学习视频发布
|
||||
[https://github.com/gxtrobot/pyinaction](https://github.com/gxtrobot/pyinaction)
|
||||
|
||||
为提高解决问题效率 ,建了个qq群
|
||||
|
||||
**QQ群: 941894005**
|
||||
|
||||
注意, 该群仅讨论**python学习, 爬虫开发, aspider 框架学习开发, Bustag系统bug, 运行问题**等, 请勿讨论无关主题
|
||||
|
||||
**免责声明:
|
||||
本软件仅用于技术学习使用,禁止用于商业用途,使用本软件所造成的的后果由使用者承担!
|
||||
如果你觉得这个软件不错, 可以请我喝杯冰阔落 ^_^.**
|
||||
|
||||
<p align="center">
|
||||
<img src="./bustag/app/static/images/alipay.jpg" width="200">
|
||||
<img src="./bustag/app/static/images/wechat_pay.jpg" width="200">
|
||||
</p>
|
||||
|
||||
# 紧急提示
|
||||
|
||||
|
||||
# windows, mac 绿色版下载地址
|
||||
链接: https://pan.baidu.com/s/1pqarq7fOXjsbad0WN4Uaaw 提取码: budu
|
||||
|
||||
压缩包密码: gxtrobot
|
||||
|
||||
# docker 版本同步更新
|
||||
运行 ```docker pull gxtrobot/bustag-app:latest``` 后重新启动项目即可
|
||||
|
||||
# 使用须知
|
||||
只需在data目录下创建[config.ini](https://raw.githubusercontent.com/gxtrobot/bustag/master/data/config.ini), 然后启动系统, 访问localhost:8000
|
||||
|
||||
### 使用视频
|
||||
|
||||
链接: https://pan.baidu.com/s/1pqarq7fOXjsbad0WN4Uaaw 提取码: budu
|
||||
|
||||
在视频目录下
|
||||
|
||||
- 群晖docker安装bustag 视频发布(2019-10-16)
|
||||
- bustag最新使用视频发布(2019-10-15)
|
||||
- linux/mac docker版本安装
|
||||
- win10 docker版本安装
|
||||
|
||||
## 更新
|
||||
|
||||
### release 0.2.1(2019-10-12)
|
||||
- 修复几个bug
|
||||
- 增加系统启动错误信息打印, 方便排查
|
||||
|
||||
该版本主要是为了更好发现错误, 如果能正常运行的可以不更新
|
||||
|
||||
### release 0.2.0(2019-10-7)
|
||||
- 一些bug修复, 如数据解析不全等
|
||||
- 多处页面优化(封面图点击放大, 页面跳转等等)
|
||||
- 启动检查config.ini, 没有自动退出
|
||||
- 手动本地文件管理, 增加手动添加番号及本地路径功能
|
||||
- 增加今日更新, 推荐数量
|
||||
- 手动上传番号, 直接打标为喜欢
|
||||
- 添加logo
|
||||
- 增加打标数据库导入功能
|
||||
|
||||
#### 注意事项
|
||||
- **v0.2.0 版本有数据库结构变化, 所以和老版本数据库不兼容, 建议新建一个目录运行新版, 如需要原数据库打标数据, 可以通过数据页面, 导入数据库完成打标数据导入**
|
||||
|
||||
### 2019-9-6 0.1.1版发布
|
||||
- 修复部分bug
|
||||
- 增加windows(只在win10下测试过), mac 绿色版, 解压直接使用,
|
||||
|
||||
|
||||
## 系统功能
|
||||
|
||||
- 自动抓取最新车牌信息, 抓取频率可以自定义
|
||||
- 系统启动后自动开启一次下载, 然后按照设置抓取频率下载
|
||||
- 车牌打标功能
|
||||
- 模型训练, 基于当前所有打标数据训练模型
|
||||
- 有了模型后, 自动预测判断是否喜欢
|
||||
- 手动上传番号, 本地文件管理
|
||||
- 数据库打标数据导入
|
||||
- Docker 镜像一键运行, 省去新手配置项目的麻烦
|
||||
- 项目访问地址: localhost:8000
|
||||
|
||||
## 系统截图(隐藏了左边封面图片)
|
||||
|
||||
- 推荐页面
|
||||
![](./docs/recommend.png)
|
||||
|
||||
- 打标页面
|
||||
![](./docs/tagit.png)
|
||||
|
||||
- 本地文件页面
|
||||
![](./docs/local.png)
|
||||
|
||||
- 本地番号, 链接上传页面
|
||||
![](./docs/local_upload.png)
|
||||
|
||||
- 模型页面
|
||||
![](./docs/model.png)
|
||||
|
||||
- 数据页面
|
||||
![](./docs/data.png)
|
||||
|
||||
## 如何运行项目
|
||||
|
||||
### windows , mac绿色版如何使用
|
||||
下载zip包后解压缩到任意目录, 然后在目录下的data目录里, 创建文件[config.ini](https://raw.githubusercontent.com/gxtrobot/bustag/master/data/config.ini)
|
||||
- windows 版: 执行(双击)bustag.exe
|
||||
- mac 版: 执行(双击)bustag
|
||||
- 浏览器访问: localhost:8000, 访问成功说明运行正常, 如果访问不成功, 可以看bustag程序窗口有无报错
|
||||
|
||||
### 本地源代码安装
|
||||
懂 python 开发的可以 clone 本项目, 建立一个虚拟环境并按照 requirements.txt 的 python 包后, 在项目根目录下
|
||||
直接运行
|
||||
|
||||
```
|
||||
python bustag/app/index.py
|
||||
|
||||
或者安装了gunicorn
|
||||
gunicorn bustag.app.index:app --bind='0.0.0.0:8000'
|
||||
```
|
||||
|
||||
### 使用 docker 运行(推荐)
|
||||
|
||||
1. 建立一个目录, 如 bustag, 然后在该目录下建一个子目录 data, data 目录用于保存配置文件以及下载数据的数据库
|
||||
2. 在 data 下需要建立一个文件, [config.ini](https://raw.githubusercontent.com/gxtrobot/bustag/master/data/config.ini), 该文件用于设置爬取的初始地址, 以及每次下载的最大数量
|
||||
3. 运行命令
|
||||
|
||||
```
|
||||
linux, mac
|
||||
docker run --rm -d -e TZ=Asia/Shanghai -e PYTHONUNBUFFERED=1 -v $(pwd)/data:/app/data -p 8000:8000 gxtrobot/bustag-app
|
||||
|
||||
windows powershell
|
||||
docker run --rm -d -e TZ=Asia/Shanghai -e PYTHONUNBUFFERED=1 -v ${PWD}/data:/app/data -p 8000:8000 gxtrobot/bustag-app
|
||||
|
||||
|
||||
注: -e TZ=Asia/Shanghai , 指的是docker container的时区设置, 如果需要其他时区可自行设置, 如果不设置默认为UTC时区
|
||||
-e PYTHONUNBUFFERED=1 , 指的是显示所有log输出, 如果不设置, 那只能看到debug 的错误log日志
|
||||
```
|
||||
|
||||
## 如何使用项目
|
||||
|
||||
### 请按照以下顺序
|
||||
|
||||
1. 到打标页面进行打标, 达到一定数量(喜欢+不喜欢), 比如 300
|
||||
2. 到其他页面训练模型
|
||||
3. 坐等系统自动推荐
|
||||
4. 在推荐页面进行确认(确认过的数据转为打标数据)
|
||||
5. 积累更多打标数据, 再次训练模型, 打标数据越多模型效果越好
|
||||
|
||||
### data 目录文件说明
|
||||
|
||||
```
|
||||
|____bus.db
|
||||
|____config.ini
|
||||
|____crontab.txt
|
||||
|____model
|
||||
| |____ label_binarizer.pkl
|
||||
| |____model.pkl
|
||||
```
|
||||
|
||||
- config.ini, (系统配置文件, 必须, 系统启动时候需要此文件, [参考文件](./data/config.ini))
|
||||
- root_path: 制定bus网站主页地址, 爬虫起始地址, 由于地址可能变化, 确保本机能够访问该地址, 如果需要代理才能访问, 必须开启全局代理, 系统本身无代理设置
|
||||
- count: 每次下载总数, 建议不要太多, 500以下比较好
|
||||
- interval: 每次下载间隔时间, 单位为秒, 建议不要低于1800秒
|
||||
|
||||
- bus.db (数据库文件, 可选, 但是可以放一个[现成的库, 有 2000 条数据, 方便直接开始打标, 不需要等下载](./data/bus.db))
|
||||
- crontab.txt (定时下载配置文件, 可选, [参考例子](./docker/crontab.txt))
|
||||
- model 目录(系统训练生成的模型)
|
||||
|
||||
## 其他问题
|
||||
|
||||
1. 改变自动下载的频率
|
||||
修改config.ini的interval 参数即可, 单位是秒, 比如修改为一小时更新一次为 `interval=3600`
|
||||
|
||||
2. 改变下载初始 url
|
||||
因为该 url 会经常改变, 所以系统的 config.ini -> download -> root_path 定义了初始 url, 可以根据需要改变
|
||||
|
||||
3. 是否可以使用代理
|
||||
目前系统还没加入代理功能, 不过可以在 docker 设置代理访问
|
||||
|
||||
4. 下载数量多少合适
|
||||
鉴于爬虫的稳定性, 不建议每次下载太多, 也可能会给 bus 服务器带来压力, 如果需要, 初次使用可以加大到 1000, 这样可以下载多点初始数据用于打标, 后面可以改为 300
|
||||
|
||||
5. 模型效果如何
|
||||
经过一些测试, 最终使用了 KNN 模型, 效果的话谈不上非常好, 在准确率上还可以, 不过召回率相对低一些, 也就是说推荐的准确率相对高点, 但是会漏掉一些喜欢的数据.
|
||||
所以, 鉴于定期对推荐数据进行确认, 经过确认后, 推荐数据转为打标数据, 然后重新训练,打标数据越多效果越好
|
||||
|
||||
6. 要多少打标数据才能训练模型
|
||||
建议至少达到 300 打标数据(包括喜欢, 不喜欢), 然后尝试训练模型, 并查看模型效果值, 如不满意可以增加训练数据并重新训练
|
||||
|
||||
7. 模型用了什么数据训练
|
||||
模型目前主要使用了各种标签数据, 比如影片分类, 女优名等等, 目前没有使用到标题
|
||||
|
||||
8. 如何改变服务器运行端口
|
||||
服务器默认为 8000 端口, 如果需要改变, 可以修改启动 docker 容器命令, 比如 8000
|
||||
|
||||
```
|
||||
修改为8000端口, 注意:后面的8000不要变, 然后可以通过localhost:8000访问
|
||||
|
||||
docker run --rm -d -v $(pwd)/data:/app/data -p 8000:8000 gxtrobot/bustag-app
|
||||
```
|
||||
|
||||
9. 如何备份数据库
|
||||
系统使用的数据库保存在 data 目录下的 bus.db, 如果有需要可以将此文件拷贝一份作为备份, 比如在打标测试模型时, 如果不想使用当前打标数据, 可以将数据库恢复到原来的版本
|
||||
该数据库为 sqlite 格式, 可以直接使用软件打开, 比如 [DB Browser for Sqlite](https://sqlitebrowser.org/), 该软件支持多平台
|
||||
|
@ -0,0 +1,5 @@
|
||||
"""bustag - a tag and recommend system for old bus driver"""
|
||||
|
||||
__version__ = '0.2.1'
|
||||
__author__ = 'gxtrobot <gxtrobot@gmail.com>'
|
||||
__all__ = []
|
@ -0,0 +1,235 @@
|
||||
from collections import defaultdict
|
||||
import threading
|
||||
import traceback
|
||||
import sys
|
||||
import os
|
||||
import bottle
|
||||
from multiprocessing import freeze_support
|
||||
from bottle import route, run, template, static_file, request, response, redirect, hook
|
||||
|
||||
# Base directory used to resolve templates and static assets.
dirname = os.path.dirname(os.path.realpath(__file__))
if getattr(sys, 'frozen', False):
    # Running as a PyInstaller one-file bundle: resources are unpacked
    # into the temp dir recorded in sys._MEIPASS.
    dirname = sys._MEIPASS
print('dirname:' + dirname)
# Make this app's view templates visible to bottle's template() lookup.
bottle.TEMPLATE_PATH.insert(0, dirname + '/views/')
|
||||
|
||||
|
||||
@hook('before_request')
def _connect_db():
    # Open the shared peewee connection for this request;
    # reuse_if_open avoids "connection already open" errors.
    # NOTE(review): dbconn is imported inside the __main__ guard below —
    # confirm it is bound before requests arrive when served via WSGI.
    dbconn.connect(reuse_if_open=True)
|
||||
|
||||
|
||||
@hook('after_request')
def _close_db():
    # Release the per-request DB connection once the response is built.
    if not dbconn.is_closed():
        dbconn.close()
|
||||
|
||||
|
||||
@route('/static/<filepath:path>')
def send_static(filepath):
    # Serve CSS/JS/image assets from the app's static directory.
    return static_file(filepath, root=dirname+'/static/')
|
||||
|
||||
|
||||
def _remove_extra_tags(item):
|
||||
limit = 10
|
||||
tags_dict = item.tags_dict
|
||||
tags = ['genre', 'star']
|
||||
for t in tags:
|
||||
tags_dict[t] = tags_dict[t][:limit]
|
||||
|
||||
|
||||
@route('/')
def index():
    """Recommendation page: list system-rated items.

    Query params:
        like: rate-value filter (defaults to RATE_VALUE.LIKE)
        page: 1-based page number
    """
    rate_type = RATE_TYPE.SYSTEM_RATE.value
    rate_value = int(request.query.get('like', RATE_VALUE.LIKE.value))
    page = int(request.query.get('page', 1))
    items, page_info = get_items(
        rate_type=rate_type, rate_value=rate_value, page=page)
    # Keep the page compact: cap each item's tag lists.
    for item in items:
        _remove_extra_tags(item)
    today_update_count = db.get_today_update_count()
    today_recommend_count = db.get_today_recommend_count()
    msg = f'今日更新 {today_update_count} , 今日推荐 {today_recommend_count}'
    return template('index', items=items, page_info=page_info, like=rate_value, path=request.path, msg=msg)
|
||||
|
||||
|
||||
@route('/tagit')
def tagit():
    """Tagging page: list items for manual like/dislike rating.

    Query params:
        like: absent or the literal text 'None' -> no rating filter;
              otherwise filter user-rated items by this rate value
        page: 1-based page number
    """
    rate_value = request.query.get('like', None)
    # The query string may carry the literal text 'None'; treat it as
    # "no filter".
    rate_value = None if rate_value == 'None' else rate_value
    rate_type = None
    if rate_value:
        rate_value = int(rate_value)
        rate_type = RATE_TYPE.USER_RATE
    page = int(request.query.get('page', 1))
    items, page_info = get_items(
        rate_type=rate_type, rate_value=rate_value, page=page)
    for item in items:
        _remove_extra_tags(item)
    return template('tagit', items=items, page_info=page_info, like=rate_value, path=request.path)
|
||||
|
||||
|
||||
@route('/tag/<fanhao>', method='POST')
def tag(fanhao):
    """Record a user like/dislike rating for *fanhao*, then redirect
    back to the tagging page anchored at the submitted form."""
    if request.POST.submit:
        formid = request.POST.formid
        item_rate = ItemRate.get_by_fanhao(fanhao)
        # The submit button's value carries the chosen rate value.
        rate_value = request.POST.submit
        if not item_rate:
            # First rating for this fanhao: create a user rating.
            rate_type = RATE_TYPE.USER_RATE
            ItemRate.saveit(rate_type, rate_value, fanhao)
            logger.debug(f'add new item_rate for fanhao:{fanhao}')
        else:
            # Re-rating: overwrite the stored value.
            item_rate.rate_value = rate_value
            item_rate.save()
            logger.debug(f'updated item_rate for fanhao:{fanhao}')
        # Preserve the caller's pagination / filter state in the redirect.
        page = int(request.query.get('page', 1))
        like = request.query.get('like')
        url = f'/tagit?page={page}&like={like}'
        if formid:
            url += f'#{formid}'
        redirect(url)
|
||||
|
||||
|
||||
@route('/correct/<fanhao>', method='POST')
def correct(fanhao):
    """Confirm or correct a system recommendation for *fanhao*.

    submit=1 confirms the prediction; submit=0 flips the stored rate
    value. Either way the rating is promoted to a user rating, so
    confirmed recommendations become training data.
    """
    if request.POST.submit:
        formid = request.POST.formid
        is_correct = int(request.POST.submit)
        item_rate = ItemRate.get_by_fanhao(fanhao)
        if item_rate:
            item_rate.rate_type = RATE_TYPE.USER_RATE
            if not is_correct:
                # Wrong prediction: invert like(1) <-> dislike(0).
                rate_value = item_rate.rate_value
                rate_value = 1 if rate_value == 0 else 0
                item_rate.rate_value = rate_value
            item_rate.save()
            logger.debug(
                f'updated item fanhao: {fanhao}, {"and correct the rate_value" if not is_correct else ""}')
        # Preserve pagination / filter state in the redirect.
        page = int(request.query.get('page', 1))
        like = int(request.query.get('like', 1))
        url = f'/?page={page}&like={like}'
        if formid:
            url += f'#{formid}'
        redirect(url)
|
||||
|
||||
|
||||
@route('/model')
def other_settings():
    """Model page: show scores of the trained classifier, if any."""
    try:
        _, model_scores = clf.load()
    except FileNotFoundError:
        # No model has been trained yet.
        model_scores = None
    return template('model', path=request.path, model_scores=model_scores)
|
||||
|
||||
|
||||
@route('/do-training')
def do_training():
    """Train the classifier on current tagged data and show its scores."""
    error_msg = None
    model_scores = None
    try:
        _, model_scores = clf.train()
    except ValueError as ex:
        # e.g. not enough tagged data to train; surface the message to
        # the page instead of failing the request.
        logger.exception(ex)
        error_msg = ' '.join(ex.args)
    return template('model', path=request.path, model_scores=model_scores, error_msg=error_msg)
|
||||
|
||||
|
||||
@route('/local_fanhao', method=['GET', 'POST'])
def update_local_fanhao():
    """Upload page for fanhao lists (optionally with local file paths).

    On POST: stores the entries, queues spider downloads for fanhaos
    not yet in the DB, and optionally tags every uploaded fanhao as
    'like'.
    """
    msg = ''
    if request.POST.submit:
        fanhao_list = request.POST.fanhao
        tag_like = request.POST.tag_like == '1'
        missed_fanhao, local_file_count, tag_file_count = add_local_fanhao(
            fanhao_list, tag_like)
        if len(missed_fanhao) > 0:
            # Fetch details for fanhaos the spider has not seen yet.
            urls = [bus_spider.get_url_by_fanhao(
                fanhao) for fanhao in missed_fanhao]
            add_download_job(urls)
        msg = f'上传 {len(missed_fanhao)} 个番号, {local_file_count} 个本地文件'
        if tag_like:
            msg += f', {tag_file_count} 个打标为喜欢'
    return template('local_fanhao', path=request.path, msg=msg)
|
||||
|
||||
|
||||
@route('/local')
def local():
    """Local-files page: list items that have a local file path."""
    page = int(request.query.get('page', 1))
    items, page_info = get_local_items(page=page)
    for local_item in items:
        # Attach the related Item record, then trim its tag lists.
        LocalItem.loadit(local_item)
        _remove_extra_tags(local_item.item)
    return template('local', items=items, page_info=page_info, path=request.path)
|
||||
|
||||
|
||||
@route('/local_play/<id:int>')
def local_play(id):
    """Bump the play counter for local item *id* and redirect the
    browser to its stored file path."""
    local_item = LocalItem.update_play(id)
    file_path = local_item.path
    logger.debug(file_path)
    redirect(file_path)
|
||||
|
||||
|
||||
@route('/load_db', method=['GET', 'POST'])
def load_db():
    """Import user tag data from an uploaded sqlite database file.

    Saves the upload into the data dir as 'uploaded.db', merges its
    user ratings via load_tags_db(), and queues downloads for fanhaos
    missing from the current database.
    """
    msg = ''
    errmsg = ''
    if request.POST.submit:
        upload = request.files.get('dbfile')
        if upload:
            logger.debug(upload.filename)
            # load_tags_db() reads from this fixed path.
            name = get_data_path('uploaded.db')
            upload.save(name, overwrite=True)
            logger.debug(f'uploaded file saved to {name}')
            try:
                tag_file_added, missed_fanhaos = load_tags_db()
            except DBError:
                errmsg = '数据库文件错误, 请检查文件是否正确上传'
            else:
                urls = [bus_spider.get_url_by_fanhao(
                    fanhao) for fanhao in missed_fanhaos]
                add_download_job(urls)
                msg = f'上传 {tag_file_added} 条用户打标数据, {len(missed_fanhaos)} 个番号, '
                msg += ' 注意: 需要下载其他数据才能开始建模, 请等候一定时间'
        else:
            errmsg = '请上传数据库文件'
    return template('load_db', path=request.path, msg=msg, errmsg=errmsg)
|
||||
|
||||
|
||||
@route('/about')
def about():
    # Static "about" page.
    return template('about', path=request.path)
|
||||
|
||||
|
||||
# WSGI entry point (e.g. `gunicorn bustag.app.index:app`).
app = bottle.default_app()


def start_app():
    """Start the download scheduler in a background thread, then run
    the web server (blocking) on port 8000."""
    t = threading.Thread(target=start_scheduler)
    t.start()
    run(host='0.0.0.0', server='paste', port=8000, debug=True)
    # run(host='0.0.0.0', port=8000, debug=True, reloader=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
freeze_support()
|
||||
from bustag import __version__
|
||||
print(f"Bustag server starting: version: {__version__}\n\n")
|
||||
import bustag.model.classifier as clf
|
||||
from bustag.util import logger, get_cwd, get_now_time, get_data_path
|
||||
from bustag.spider.db import (get_items, get_local_items, RATE_TYPE, RATE_VALUE, ItemRate,
|
||||
Item, LocalItem, DBError, db as dbconn)
|
||||
from bustag.spider import db
|
||||
from bustag.app.schedule import start_scheduler, add_download_job
|
||||
from bustag.spider import bus_spider
|
||||
from bustag.app.local import add_local_fanhao, load_tags_db
|
||||
start_app()
|
||||
except Exception as e:
|
||||
print('system error')
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
print("Press Enter to continue ...")
|
||||
input()
|
||||
os._exit(1)
|
@ -0,0 +1,109 @@
|
||||
'''
|
||||
handle local file related functions
|
||||
'''
|
||||
import re
|
||||
from peewee import SqliteDatabase, DatabaseError
|
||||
|
||||
from bustag.spider.db import Item, LocalItem, ItemRate, RATE_TYPE, RATE_VALUE, db, DBError
|
||||
from bustag.util import logger, get_data_path
|
||||
|
||||
|
||||
def add_local_fanhao(fanhao, tag_like):
    '''
    Parse user-submitted fanhao lines and store them.

    Args:
        fanhao:str - one entry per line, each either "FANHAO" or
            "FANHAO,local_path"
        tag_like:bool - when True, also tag every uploaded fanhao as LIKE

    Returns:
        tuple - (missed_fanhaos, local_file_added, tag_file_added):
            fanhaos not yet in the item table (to be fetched by the
            spider), count of local files stored, count of like-tags added.
    '''
    rows = fanhao.splitlines()
    items = []
    missed_fanhaos = []
    local_file_added = 0
    tag_file_added = 0
    # Normalizes e.g. "abc123" / "ABC123" to "ABC-123"; rows that do not
    # look like a fanhao are silently skipped.
    pattern = r'([A-Z]+)-?([0-9]+)'
    for row in rows:
        if ',' in row:
            # maxsplit=1: a path containing commas must not raise
            # ValueError (original used an unbounded split).
            fanhao, path = row.split(',', 1)
        else:
            fanhao = row
            path = None

        fanhao = fanhao.strip().upper()
        match = re.search(pattern, fanhao)
        if match and len(match.groups()) == 2:
            series, num = match.groups()
            matched_fanhao = f'{series}-{num}'
            path = path.strip() if path else None
            logger.debug(f'matched fanhao {matched_fanhao}')
            items.append((matched_fanhao, path))
    with db.atomic():
        for item in items:
            fanhao, path = item
            # if path is not None, add to local item
            if path:
                local_item = LocalItem.saveit(fanhao, path)
                if local_item:
                    local_file_added += 1
            # if tag_like is True, add it to item_rate table
            if tag_like:
                item_rate = ItemRate.saveit(
                    RATE_TYPE.USER_RATE, RATE_VALUE.LIKE, fanhao)
                if item_rate:
                    tag_file_added += 1
            if not Item.get_by_fanhao(fanhao):
                # add to get from spider
                missed_fanhaos.append(fanhao)
    logger.debug(f'missed_fanhaos:{missed_fanhaos}')
    logger.debug(f'tag_file_added:{tag_file_added}')
    logger.debug(f'local_file_added:{local_file_added}')
    return missed_fanhaos, local_file_added, tag_file_added
|
||||
|
||||
|
||||
def load_tags_db():
    '''
    Load user tag data from the uploaded db file ("uploaded.db" in the
    data dir) into the current database.

    Two schemas are supported: the old one joins item_rate.item_id to
    item.id, the new one to item.fanhao. The old-schema query is tried
    first; non-empty results mean an old-format db.

    Returns:
        tuple - (tag_file_added, missed_fanhaos): number of ratings
            imported and fanhaos that still need to be downloaded.

    Raises:
        DBError: when the uploaded file is not a readable sqlite db.
    '''
    db_name = get_data_path('uploaded.db')
    try:
        db_upload = SqliteDatabase(db_name)
        # Forces the file to be opened/validated as sqlite.
        db_upload.get_tables()
    except DatabaseError:
        raise DBError()
    missed_fanhaos = []
    tag_file_added = 0
    sql_old = '''select item_rate.rate_value, item.fanhao
            from item_rate inner
            join item on item_rate.item_id = item.id
            where item_rate.rate_type=1 '''

    sql_new = '''select item_rate.rate_value, item.fanhao
            from item_rate inner
            join item on item_rate.item_id = item.fanhao
            where item_rate.rate_type=1 '''
    # Run each query at most once (the original executed sql_old twice
    # when the db was old-format: once to probe, once to fetch).
    tag_data = db_upload.execute_sql(sql_old).fetchall()
    if not tag_data:
        tag_data = db_upload.execute_sql(sql_new).fetchall()
    with db_upload.atomic():
        for rate_value, fanhao in tag_data:
            item_rate = ItemRate.saveit(
                RATE_TYPE.USER_RATE, rate_value, fanhao)
            if item_rate:
                tag_file_added += 1
            if not Item.get_by_fanhao(fanhao):
                # add to get from spider
                missed_fanhaos.append(fanhao)
    logger.debug(tag_data)
    logger.info(f'added user tag rate: {tag_file_added}')
    logger.info(f'added fanhao to download: {len(missed_fanhaos)}')
    return tag_file_added, missed_fanhaos
|
@ -0,0 +1,76 @@
|
||||
import sys
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from apscheduler.triggers.date import DateTrigger
|
||||
from aspider import aspider
|
||||
from bustag.spider import bus_spider
|
||||
from bustag.util import logger, APP_CONFIG
|
||||
|
||||
scheduler = None
|
||||
loop = None
|
||||
|
||||
|
||||
def download(loop, no_parse_links=False, urls=None):
    """
    Download new item data with the aspider crawler, then run the
    recommender over the fresh data.

    Args:
        loop - asyncio event loop the crawler runs on
        no_parse_links:bool - when True fetch only the given urls
            (don't follow links); the count becomes len(urls)
        urls:tuple - tuple of root urls to crawl
    """
    print('start download')
    # reset sys.argv — aspider parses command-line arguments itself.
    sys.argv = sys.argv[:1]
    if not urls:
        logger.warning('no links to download')
        return
    count = APP_CONFIG['download.count']
    if no_parse_links:
        count = len(urls)
    extra_options = APP_CONFIG.get('options', {})
    options = {'no_parse_links': no_parse_links,
               'roots': urls, 'count': count}
    extra_options.update(options)

    aspider.download(loop, extra_options)
    try:
        # Imported lazily so downloads work before a model exists.
        import bustag.model.classifier as clf

        clf.recommend()
    except FileNotFoundError:
        # No trained model yet — skip the recommendation step.
        print('还没有训练好的模型, 无法推荐')
|
||||
|
||||
|
||||
def start_scheduler():
    """Create the asyncio scheduler, queue an immediate download plus a
    recurring one every `download.interval` seconds (default 1800),
    then block running the event loop forever — call from a dedicated
    thread."""
    global scheduler, loop

    interval = int(APP_CONFIG.get('download.interval', 1800))
    # Fresh loop: this runs on a non-main thread without a default loop.
    loop = asyncio.new_event_loop()
    scheduler = AsyncIOScheduler(event_loop=loop)
    t1 = datetime.now() + timedelta(seconds=1)
    int_trigger = IntervalTrigger(seconds=interval)
    date_trigger = DateTrigger(run_date=t1)
    urls = (APP_CONFIG['download.root_path'],)
    # add for down at server start
    scheduler.add_job(download, trigger=date_trigger, args=(loop, False, urls))
    scheduler.add_job(download, trigger=int_trigger, args=(loop, False, urls))
    scheduler.start()
    asyncio.set_event_loop(loop)
    loop.run_forever()
|
||||
|
||||
|
||||
def add_download_job(urls):
    # Queue a one-off download of exactly these urls (no link parsing —
    # add_job prepends no_parse_links=True).
    add_job(download, (urls,))
|
||||
|
||||
|
||||
def add_job(job_func, args):
    '''
    add a job to scheduler

    Schedules job_func to run once, ~10 seconds from now, on the
    module's event loop, with (loop, True) prepended to *args* — i.e.
    no_parse_links=True for download jobs.
    '''
    default_args = (loop, True)
    default_args = default_args + args
    logger.debug(default_args)
    t1 = datetime.now() + timedelta(seconds=10)
    date_trigger = DateTrigger(run_date=t1)
    scheduler.add_job(job_func, trigger=date_trigger, args=default_args)
|
@ -0,0 +1,331 @@
|
||||
/*!
|
||||
* Bootstrap Reboot v4.3.1 (https://getbootstrap.com/)
|
||||
* Copyright 2011-2019 The Bootstrap Authors
|
||||
* Copyright 2011-2019 Twitter, Inc.
|
||||
* Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
|
||||
* Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md)
|
||||
*/
|
||||
*,
|
||||
*::before,
|
||||
*::after {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
html {
|
||||
font-family: sans-serif;
|
||||
line-height: 1.15;
|
||||
-webkit-text-size-adjust: 100%;
|
||||
-webkit-tap-highlight-color: rgba(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
article, aside, figcaption, figure, footer, header, hgroup, main, nav, section {
|
||||
display: block;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
|
||||
font-size: 1rem;
|
||||
font-weight: 400;
|
||||
line-height: 1.5;
|
||||
color: #212529;
|
||||
text-align: left;
|
||||
background-color: #fff;
|
||||
}
|
||||
|
||||
[tabindex="-1"]:focus {
|
||||
outline: 0 !important;
|
||||
}
|
||||
|
||||
hr {
|
||||
box-sizing: content-box;
|
||||
height: 0;
|
||||
overflow: visible;
|
||||
}
|
||||
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
margin-top: 0;
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
p {
|
||||
margin-top: 0;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
abbr[title],
|
||||
abbr[data-original-title] {
|
||||
text-decoration: underline;
|
||||
-webkit-text-decoration: underline dotted;
|
||||
text-decoration: underline dotted;
|
||||
cursor: help;
|
||||
border-bottom: 0;
|
||||
-webkit-text-decoration-skip-ink: none;
|
||||
text-decoration-skip-ink: none;
|
||||
}
|
||||
|
||||
address {
|
||||
margin-bottom: 1rem;
|
||||
font-style: normal;
|
||||
line-height: inherit;
|
||||
}
|
||||
|
||||
ol,
|
||||
ul,
|
||||
dl {
|
||||
margin-top: 0;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
ol ol,
|
||||
ul ul,
|
||||
ol ul,
|
||||
ul ol {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
dt {
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
dd {
|
||||
margin-bottom: .5rem;
|
||||
margin-left: 0;
|
||||
}
|
||||
|
||||
blockquote {
|
||||
margin: 0 0 1rem;
|
||||
}
|
||||
|
||||
b,
|
||||
strong {
|
||||
font-weight: bolder;
|
||||
}
|
||||
|
||||
small {
|
||||
font-size: 80%;
|
||||
}
|
||||
|
||||
sub,
|
||||
sup {
|
||||
position: relative;
|
||||
font-size: 75%;
|
||||
line-height: 0;
|
||||
vertical-align: baseline;
|
||||
}
|
||||
|
||||
sub {
|
||||
bottom: -.25em;
|
||||
}
|
||||
|
||||
sup {
|
||||
top: -.5em;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #007bff;
|
||||
text-decoration: none;
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: #0056b3;
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
a:not([href]):not([tabindex]) {
|
||||
color: inherit;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:not([href]):not([tabindex]):hover, a:not([href]):not([tabindex]):focus {
|
||||
color: inherit;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:not([href]):not([tabindex]):focus {
|
||||
outline: 0;
|
||||
}
|
||||
|
||||
pre,
|
||||
code,
|
||||
kbd,
|
||||
samp {
|
||||
font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
||||
font-size: 1em;
|
||||
}
|
||||
|
||||
pre {
|
||||
margin-top: 0;
|
||||
margin-bottom: 1rem;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
figure {
|
||||
margin: 0 0 1rem;
|
||||
}
|
||||
|
||||
img {
|
||||
vertical-align: middle;
|
||||
border-style: none;
|
||||
}
|
||||
|
||||
svg {
|
||||
overflow: hidden;
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
caption {
|
||||
padding-top: 0.75rem;
|
||||
padding-bottom: 0.75rem;
|
||||
color: #6c757d;
|
||||
text-align: left;
|
||||
caption-side: bottom;
|
||||
}
|
||||
|
||||
th {
|
||||
text-align: inherit;
|
||||
}
|
||||
|
||||
label {
|
||||
display: inline-block;
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
button {
|
||||
border-radius: 0;
|
||||
}
|
||||
|
||||
button:focus {
|
||||
outline: 1px dotted;
|
||||
outline: 5px auto -webkit-focus-ring-color;
|
||||
}
|
||||
|
||||
input,
|
||||
button,
|
||||
select,
|
||||
optgroup,
|
||||
textarea {
|
||||
margin: 0;
|
||||
font-family: inherit;
|
||||
font-size: inherit;
|
||||
line-height: inherit;
|
||||
}
|
||||
|
||||
button,
|
||||
input {
|
||||
overflow: visible;
|
||||
}
|
||||
|
||||
button,
|
||||
select {
|
||||
text-transform: none;
|
||||
}
|
||||
|
||||
select {
|
||||
word-wrap: normal;
|
||||
}
|
||||
|
||||
button,
|
||||
[type="button"],
|
||||
[type="reset"],
|
||||
[type="submit"] {
|
||||
-webkit-appearance: button;
|
||||
}
|
||||
|
||||
button:not(:disabled),
|
||||
[type="button"]:not(:disabled),
|
||||
[type="reset"]:not(:disabled),
|
||||
[type="submit"]:not(:disabled) {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
button::-moz-focus-inner,
|
||||
[type="button"]::-moz-focus-inner,
|
||||
[type="reset"]::-moz-focus-inner,
|
||||
[type="submit"]::-moz-focus-inner {
|
||||
padding: 0;
|
||||
border-style: none;
|
||||
}
|
||||
|
||||
input[type="radio"],
|
||||
input[type="checkbox"] {
|
||||
box-sizing: border-box;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
input[type="date"],
|
||||
input[type="time"],
|
||||
input[type="datetime-local"],
|
||||
input[type="month"] {
|
||||
-webkit-appearance: listbox;
|
||||
}
|
||||
|
||||
textarea {
|
||||
overflow: auto;
|
||||
resize: vertical;
|
||||
}
|
||||
|
||||
fieldset {
|
||||
min-width: 0;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
border: 0;
|
||||
}
|
||||
|
||||
legend {
|
||||
display: block;
|
||||
width: 100%;
|
||||
max-width: 100%;
|
||||
padding: 0;
|
||||
margin-bottom: .5rem;
|
||||
font-size: 1.5rem;
|
||||
line-height: inherit;
|
||||
color: inherit;
|
||||
white-space: normal;
|
||||
}
|
||||
|
||||
progress {
|
||||
vertical-align: baseline;
|
||||
}
|
||||
|
||||
[type="number"]::-webkit-inner-spin-button,
|
||||
[type="number"]::-webkit-outer-spin-button {
|
||||
height: auto;
|
||||
}
|
||||
|
||||
[type="search"] {
|
||||
outline-offset: -2px;
|
||||
-webkit-appearance: none;
|
||||
}
|
||||
|
||||
[type="search"]::-webkit-search-decoration {
|
||||
-webkit-appearance: none;
|
||||
}
|
||||
|
||||
::-webkit-file-upload-button {
|
||||
font: inherit;
|
||||
-webkit-appearance: button;
|
||||
}
|
||||
|
||||
output {
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
summary {
|
||||
display: list-item;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
template {
|
||||
display: none;
|
||||
}
|
||||
|
||||
[hidden] {
|
||||
display: none !important;
|
||||
}
|
||||
/*# sourceMappingURL=bootstrap-reboot.css.map */
|
@ -0,0 +1,8 @@
|
||||
/*!
|
||||
* Bootstrap Reboot v4.3.1 (https://getbootstrap.com/)
|
||||
* Copyright 2011-2019 The Bootstrap Authors
|
||||
* Copyright 2011-2019 Twitter, Inc.
|
||||
* Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
|
||||
* Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md)
|
||||
*/*,::after,::before{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus{outline:0!important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus,a:not([href]):not([tabindex]):hover{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 
1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto -webkit-focus-ring-color}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{padding:0;border-style:none}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=date],input[type=datetime-local],input[type=month],input[type=time]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none!important}
|
||||
/*# sourceMappingURL=bootstrap-reboot.min.css.map */
|
@ -0,0 +1 @@
|
||||
.coverimg { cursor: pointer; }
|
After Width: | Height: | Size: 87 KiB |
After Width: | Height: | Size: 31 KiB |
After Width: | Height: | Size: 24 KiB |
After Width: | Height: | Size: 142 KiB |
@ -0,0 +1,10 @@
|
||||
// Page-level UI wiring; runs once the DOM is ready.
$(function () {
    // Clicking any cover thumbnail copies its src into the modal's
    // large image and opens the preview modal.
    $('.coverimg').on('click', function () {
        $('#imglarge').attr('src', $(this).attr('src'));
        $('#imagemodal').modal('show');
    });

    // The page-jump <select> stores a target URL in each option value;
    // navigating is just assigning it to window.location.
    $('#pagenav').on('change', function () {
        window.location = $(this).val();
    });
});
|
@ -0,0 +1,32 @@
|
||||
% rebase('base.tpl', title='关于', path=path)
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-10 offset-1 ">
|
||||
<div class="text-center">
|
||||
<h2>免责声明</h2>
|
||||
本软件仅用于技术学习使用,禁止用于商业用途,使用本软件所造成的后果由使用者承担!
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row py-3">
|
||||
<div class="col-10 offset-1 ">
|
||||
<div class="text-center">
|
||||
如果你觉得这个软件不错, 可以请我喝杯冰阔落 ^_^.
|
||||
</div>
|
||||
<div class="row py-3">
|
||||
<div class="col-6">
|
||||
<img class="rounded mx-auto d-block" src="/static/images/alipay.jpg" width="200px">
|
||||
</div>
|
||||
<div class="col-6">
|
||||
<img class="rounded mx-auto d-block" src="/static/images/wechat_pay.jpg" width="200px">
|
||||
</div>
|
||||
</div>
|
||||
<div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
@ -0,0 +1,108 @@
|
||||
<%print(path)%>
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<!-- Required meta tags -->
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
|
||||
|
||||
<link rel="shortcut icon" type="image/ico" href="/static/images/favicon.ico"/>
|
||||
|
||||
<!-- Bootstrap CSS -->
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/bootstrap.min.css">
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/static/css/bustag.css">
|
||||
|
||||
<title>{{title or ''}}</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="container">
|
||||
<div class="row">
|
||||
<div class="col-12">
|
||||
<nav class="navbar navbar-expand-lg navbar-light bg-light">
|
||||
<a class="navbar-brand" href="/"><img src="/static/images/logo.png" width="140"></a>
|
||||
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav">
|
||||
<li class="nav-item {{ 'active' if path=='/' else ''}}">
|
||||
<a class="nav-link" href="/">推荐 <span class="sr-only">(current)</span></a>
|
||||
</li>
|
||||
<li class="nav-item {{ 'active' if path=='/tagit' else ''}}">
|
||||
<a class="nav-link" href="/tagit">打标</a>
|
||||
</li>
|
||||
<li class="nav-item {{ 'active' if path=='/local' else ''}}">
|
||||
<a class="nav-link" href="/local">本地</a>
|
||||
</li>
|
||||
<li class="nav-item {{ 'active' if path=='/model' else ''}}">
|
||||
<a class="nav-link" href="/model">模型</a>
|
||||
</li>
|
||||
<li class="nav-item {{ 'active' if path=='/load_db' else ''}}">
|
||||
<a class="nav-link" href="/load_db">数据</a>
|
||||
</li>
|
||||
<li class="nav-item {{ 'active' if path=='/about' else ''}}">
|
||||
<a class="nav-link" href="/about">关于</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-12">
|
||||
% if defined('msg') and msg != '':
|
||||
<div class="alert alert-success" role="alert">
|
||||
{{msg}}
|
||||
</div>
|
||||
% end
|
||||
|
||||
% if defined('errmsg') and errmsg != '':
|
||||
<div class="alert alert-danger" role="alert">
|
||||
{{errmsg}}
|
||||
</div>
|
||||
% end
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{{!base}}
|
||||
<% from bustag import __version__ %>
|
||||
<footer class="my-3">
|
||||
<div class="container">
|
||||
<div class="col">
|
||||
<p class="text-center">
|
||||
<span class="badge badge-pill badge-info">version : {{__version__}}</span>
|
||||
</p>
|
||||
<p class="text-center">
|
||||
Developed by 凤凰山@2019 <a href="https://github.com/gxtrobot/bustag" target="_blank">github</a>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- The Modal -->
|
||||
<div class="modal fade" id="imagemodal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel" aria-hidden="true">
|
||||
<div class="modal-dialog modal-lg">
|
||||
<div class="modal-content">
|
||||
<div class="modal-body">
|
||||
<button type="button" class="close" data-dismiss="modal"><span aria-hidden="true">×</span><span class="sr-only">Close</span></button>
|
||||
<img id="imglarge" src="" class="imagepreview" style="width: 100%;" >
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</footer>
|
||||
<!-- Optional JavaScript -->
|
||||
<!-- jQuery first, then Popper.js, then Bootstrap JS -->
|
||||
<script type="text/javascript" src="/static/js/jquery.min.js"></script>
|
||||
<script type="text/javascript" src="/static/js/popper.min.js"></script>
|
||||
<script type="text/javascript" src="/static/js/bootstrap.min.js"></script>
|
||||
<script type="text/javascript" src="/static/js/bustag.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
@ -0,0 +1,57 @@
|
||||
% rebase('base.tpl', title='推荐', path=path, msg=msg)
|
||||
% curr_page = page_info[2]
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-12">
|
||||
<ul class="nav nav-tabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if like==1 else ''}}" href="?like=1">喜欢</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'' if like==1 else 'active'}}" href="?like=0">不喜欢</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%#generate list of rows of items
|
||||
% i = 1
|
||||
%for item in items:
|
||||
<form id="form-{{i}}" action="/correct/{{item.fanhao}}?page={{curr_page}}&like={{like}}" method="post">
|
||||
<div class="row py-3">
|
||||
<div class="col-12 col-md-4">
|
||||
<img class="img-fluid img-thumbnail coverimg" src={{item.cover_img_url}}>
|
||||
</div>
|
||||
|
||||
<div class="col-7 col-md-5">
|
||||
<div class="small text-muted">id: {{item.id}}</div>
|
||||
<div class="small text-muted">发行日期: {{item.release_date}}</div>
|
||||
<div class="small text-muted">添加日期: {{item.add_date}}</div>
|
||||
<h6>{{item.fanhao}} </h6>
|
||||
<a href="{{item.url}}" target="_blank"> {{item.title[:30]}} </a>
|
||||
<div>
|
||||
% for t in item.tags_dict['genre']:
|
||||
<span class="badge badge-primary">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
<div>
|
||||
% for t in item.tags_dict['star']:
|
||||
<span class="badge badge-warning">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="col-5 col-md-3 align-self-center">
|
||||
<input type=hidden name="formid" value="form-{{i}}">
|
||||
<button type="submit" name="submit" class="btn btn-primary mx-2" value="1">正确</button>
|
||||
<button type="submit" name="submit" class="btn btn-danger" value="0">错误</button>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
% i = i + 1
|
||||
%end
|
||||
% include('pagination.tpl', page_info=page_info)
|
||||
|
||||
</div>
|
@ -0,0 +1,31 @@
|
||||
% rebase('base.tpl', title='数据', path=path, msg=msg)
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-10 offset-1 ">
|
||||
<ul class="nav nav-tabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if path=='/load_db' else ''}}" href="/load_db">导入打标数据</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<form action="" method="post" enctype="multipart/form-data">
|
||||
<div class="row py-3">
|
||||
<div class="col-10 offset-1 ">
|
||||
<div class="form-group">
|
||||
<label for="dbfile">选择要导入的数据库文件(*.db)</label>
|
||||
<input type="file" class="form-control-file" id="dbfile" name="dbfile">
|
||||
</div>
|
||||
|
||||
<div class="text-center">
|
||||
<button type="submit" name="submit" class="btn btn-primary mx-2 my-3" value="1">提交</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
</div>
|
@ -0,0 +1,51 @@
|
||||
% rebase('base.tpl', title='本地文件', path=path)
|
||||
% curr_page = page_info[2]
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-12">
|
||||
<ul class="nav nav-tabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if path=='/local' else ''}}" href="/local">本地文件</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if path=='/local_fanhao' else ''}}" href="/local_fanhao">上传番号</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%#generate list of rows of items
|
||||
%for local_item in items:
|
||||
<div class="row py-3">
|
||||
<div class="col-12 col-md-4">
|
||||
<img class="img-fluid img-thumbnail coverimg" alt="点击放大" src={{local_item.item.cover_img_url}}>
|
||||
</div>
|
||||
|
||||
<div class="col-7 col-md-5">
|
||||
<div class="small text-muted">发行日期: {{local_item.item.release_date}}</div>
|
||||
<div class="small text-muted">上次观看: {{local_item.last_view_date}}</div>
|
||||
<div class="small text-muted">观看次数: {{local_item.view_times}}</div>
|
||||
<h6>{{local_item.item.fanhao}} </h6>
|
||||
<a href="{{local_item.item.url}}" target="_blank"> {{local_item.item.title[:30]}} </a>
|
||||
<div>
|
||||
% for t in local_item.item.tags_dict['genre']:
|
||||
<span class="badge badge-primary">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
<div>
|
||||
% for t in local_item.item.tags_dict['star']:
|
||||
<span class="badge badge-warning">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="col-5 col-md-3 align-self-center">
|
||||
<a class="btn btn-primary" target="_blank" href="/local_play/{{local_item.id}}" role="button">播放</a>
|
||||
</div>
|
||||
</div>
|
||||
%end
|
||||
% include('pagination.tpl', page_info=page_info)
|
||||
|
||||
</div>
|
@ -0,0 +1,40 @@
|
||||
% rebase('base.tpl', title='本地', path=path, msg=msg)
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-12">
|
||||
<ul class="nav nav-tabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if path=='/local' else ''}}" href="/local">本地文件</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if path=='/local_fanhao' else ''}}" href="/local_fanhao">上传番号</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<form action="" method="post">
|
||||
<div class="row py-3">
|
||||
<div class="col-12 ">
|
||||
<div class="form-check">
|
||||
<input class="form-check-input" type="checkbox" value="1" id="tag_like" name="tag_like">
|
||||
<label class="form-check-label" for="tag_like">
|
||||
全部打标为喜欢
|
||||
</label>
|
||||
</div>
|
||||
<div class="form-group py-3">
|
||||
<label for="fanhao">每行格式: 番号(XXX-123),URL(可省略, 本地文件无效, 须为Plex等服务器视频URL)</label>
|
||||
<textarea class="form-control" id="fanhao" name="fanhao" rows="20"></textarea>
|
||||
</div>
|
||||
<div class="text-center">
|
||||
<button type="submit" name="submit" class="btn btn-primary mx-2" value="1">提交</button>
|
||||
<button type="submit" name="submit" class="btn btn-danger" value="0">重置</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
</div>
|
@ -0,0 +1,59 @@
|
||||
% rebase('base.tpl', title='其他', path=path)
|
||||
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-10 offset-1 ">
|
||||
<div class="accordion" id="accordionExample">
|
||||
<div class="card">
|
||||
<div class="card-header" id="headingOne">
|
||||
<h2 class="mb-0">
|
||||
<button class="btn btn-link" type="button" data-toggle="collapse" data-target="#collapseOne" aria-expanded="true" aria-controls="collapseOne">
|
||||
训练模型
|
||||
</button>
|
||||
</h2>
|
||||
</div>
|
||||
|
||||
<div id="collapseOne" class="collapse show" aria-labelledby="headingOne" data-parent="#accordionExample">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">重新训练模型</h5>
|
||||
<p class="card-text">重新使用系统所有用户打标数据训练模型, 当打标数据增多后, 可以重新训练模型, 提高模型预测效果</p>
|
||||
<a href="/do-training" class="btn btn-primary">开始训练</a>
|
||||
</div>
|
||||
<div class="card-header">
|
||||
<h6> 当前模型数据 </h6>
|
||||
</div>
|
||||
% if defined('error_msg') and error_msg is not None:
|
||||
<p class="card-text text-danger">{{error_msg}} </p>
|
||||
% end
|
||||
% if model_scores is not None:
|
||||
<ul class="list-group list-group-flush">
|
||||
<li class="list-group-item">准确率: {{model_scores['precision']}}</li>
|
||||
<li class="list-group-item">覆盖率: {{model_scores['recall']}}</li>
|
||||
<li class="list-group-item">综合评分(越高越好): {{model_scores['f1']}}</li>
|
||||
</ul>
|
||||
% else:
|
||||
<div class="card-body">
|
||||
还没有训练过模型.
|
||||
</div>
|
||||
% end
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="card-header" id="headingTwo">
|
||||
<h2 class="mb-0">
|
||||
<button class="btn btn-link collapsed" type="button" data-toggle="collapse" data-target="#collapseTwo" aria-expanded="false" aria-controls="collapseTwo">
|
||||
|
||||
</button>
|
||||
</h2>
|
||||
</div>
|
||||
<div id="collapseTwo" class="collapse" aria-labelledby="headingTwo" data-parent="#accordionExample">
|
||||
<div class="card-body">
|
||||
Anim pariatur cliche reprehenderit, enim eiusmod high life accusamus terry richardson ad squid. 3 wolf moon officia aute, non cupidatat skateboard dolor brunch. Food truck quinoa nesciunt laborum eiusmod. Brunch 3 wolf moon tempor, sunt aliqua put a bird on it squid single-origin coffee nulla assumenda shoreditch et. Nihil anim keffiyeh helvetica, craft beer labore wes anderson cred nesciunt sapiente ea proident. Ad vegan excepteur butcher vice lomo. Leggings occaecat craft beer farm-to-table, raw denim aesthetic synth nesciunt you probably haven't heard of them accusamus labore sustainable VHS.
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,37 @@
|
||||
% curr_page = page_info[2]
|
||||
% max_page = page_info[1]
|
||||
% total_items = page_info[0]
|
||||
% setdefault('like', '')
|
||||
<div class="row">
|
||||
<div class="col-12 text-center">
|
||||
<h6>
|
||||
% if curr_page != 1:
|
||||
<a href="?page=1&like={{like}}"> 第一页</a>
|
||||
% end
|
||||
% if curr_page > 1:
|
||||
<a href="?page={{curr_page - 1}}&like={{like}}"> 上一页</a>
|
||||
% end
|
||||
第{{curr_page}}页
|
||||
% if curr_page < max_page:
|
||||
<a href="?page={{curr_page + 1}}&like={{like}}">下一页</a>
|
||||
% end
|
||||
% if curr_page != max_page:
|
||||
<a href="?page={{max_page}}&like={{like}}">最后页</a>
|
||||
% end
|
||||
</h6>
|
||||
<div>
|
||||
<form>
|
||||
<span>共 {{max_page}}页,{{total_items}}条</span>
|
||||
跳转
|
||||
<select id="pagenav">
|
||||
% for i in range(1, max_page+1):
|
||||
% url = '?page={}&like={}'.format(i, like)
|
||||
% selected = "selected" if i == curr_page else ""
|
||||
<option {{selected}} value="{{url}}">{{i}}</option>
|
||||
% end
|
||||
</select>
|
||||
页
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
@ -0,0 +1,61 @@
|
||||
% rebase('base.tpl', title='打标', path=path)
|
||||
% curr_page = page_info[2]
|
||||
<div class="container">
|
||||
<div class="row py-3">
|
||||
<div class="col-12">
|
||||
<ul class="nav nav-tabs">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if like is None else ''}}" href="?">未打标的</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if like==1 else ''}}" href="?like=1">喜欢</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link {{'active' if like==0 else ''}}" href="?like=0">不喜欢</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
%#generate list of rows of items
|
||||
% i = 1
|
||||
%for item in items:
|
||||
<form id="form-{{i}}" action="/tag/{{item.fanhao}}?page={{curr_page}}&like={{like}}" method="post">
|
||||
<div class="row py-3">
|
||||
<div class="col-12 col-md-4">
|
||||
<img class="img-fluid img-thumbnail coverimg" src={{item.cover_img_url}}>
|
||||
</div>
|
||||
|
||||
<div class="col-7 col-md-5">
|
||||
<div class="small text-muted">id: {{item.id}}</div>
|
||||
<div class="small text-muted">发行日期: {{item.release_date}}</div>
|
||||
<div class="small text-muted">添加日期: {{item.add_date}}</div>
|
||||
<h6>{{item.fanhao}} </h6>
|
||||
<a href="{{item.url}}" target="_blank"> {{item.title[:30]}} </a>
|
||||
<div>
|
||||
% for t in item.tags_dict['genre']:
|
||||
<span class="badge badge-primary">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
<div>
|
||||
% for t in item.tags_dict['star']:
|
||||
<span class="badge badge-warning">{{t}}</span>
|
||||
% end
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="col-5 col-md-3 align-self-center">
|
||||
<input type=hidden name="formid" value="form-{{i}}">
|
||||
% if like is None or like == 0:
|
||||
<button type="submit" name="submit" class="btn btn-primary btn-sm" value="1">喜欢</button>
|
||||
% end
|
||||
% if like is None or like == 1:
|
||||
<button type="submit" name="submit" class="btn btn-danger btn-sm" value="0">不喜欢</button>
|
||||
% end
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
% i = i + 1
|
||||
%end
|
||||
% include('pagination.tpl', page_info=page_info)
|
||||
|
||||
</div>
|
@ -0,0 +1,49 @@
|
||||
'''
|
||||
entry point for command line
|
||||
'''
|
||||
|
||||
import click
|
||||
import sys
|
||||
from aspider import aspider
|
||||
from bustag.model.prepare import prepare_predict_data
|
||||
from bustag.spider.db import Item, ItemRate, RATE_TYPE
|
||||
import bustag.model.classifier as clf
|
||||
from bustag.spider import bus_spider
|
||||
from bustag.util import logger, APP_CONFIG
|
||||
|
||||
|
||||
@click.command()
def recommend():
    '''
    根据现有模型预测推荐数据
    '''
    # Run prediction over unrated items using the persisted model.
    # (The docstring above is user-facing click help text, kept as-is.)
    try:
        clf.recommend()
    except FileNotFoundError:
        # model.pkl has not been trained/saved yet — tell the user
        # instead of crashing with a traceback.
        click.echo('还没有训练好的模型, 无法推荐')
|
||||
|
||||
|
||||
@click.command()
@click.option("--count", help="下载数量", type=int)
def download(count):
    """
    下载更新数据
    """
    # FIX: the --count help text said "打印次数" (print count) but the
    # option actually sets how many items to download.
    print('start download')
    # aspider reads sys.argv directly; keep only the program name so
    # click's own arguments don't leak through.
    sys.argv = sys.argv[:1]
    if count is not None:
        APP_CONFIG['download.count'] = count
    # aspider expects the crawl root path as its sole CLI argument.
    sys.argv.append(APP_CONFIG['download.root_path'])
    aspider.main()
|
||||
|
||||
|
||||
@click.group()
def main():
    # Root command group; subcommands (download, recommend) are
    # attached below via main.add_command.
    pass
|
||||
|
||||
|
||||
# Register the subcommands on the click group.
main.add_command(download)
main.add_command(recommend)

if __name__ == "__main__":
    main()
|
@ -0,0 +1,88 @@
|
||||
'''
|
||||
create classifier model and predict
|
||||
'''
|
||||
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score, confusion_matrix
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from bustag.model.prepare import prepare_data, prepare_predict_data
|
||||
from bustag.model.persist import load_model, dump_model
|
||||
from bustag.spider.db import RATE_TYPE, ItemRate
|
||||
from bustag.util import logger, get_data_path, MODEL_PATH
|
||||
|
||||
# Pickled (model, scores) tuple, relative to the app data directory.
MODEL_FILE = MODEL_PATH + 'model.pkl'
# Refuse to train with fewer labeled samples than this.
MIN_TRAIN_NUM = 200
|
||||
|
||||
|
||||
def load():
    """Load and return the persisted (model, scores) tuple from MODEL_FILE."""
    return load_model(get_data_path(MODEL_FILE))
|
||||
|
||||
|
||||
def create_model():
    """Build a fresh, untrained k-nearest-neighbors classifier (k=11)."""
    return KNeighborsClassifier(n_neighbors=11)
|
||||
|
||||
|
||||
def predict(X_test):
    """Predict labels for X_test using the persisted model.

    Args:
        X_test: binarized feature matrix (see prepare module).

    Returns:
        array of predicted labels.
    """
    model, _scores = load()
    return model.predict(X_test)
|
||||
|
||||
|
||||
def train():
    """Train a fresh classifier on all user-labeled data and persist it.

    Returns:
        (model, scores) tuple; also pickled to MODEL_FILE.

    Raises:
        ValueError: when fewer than MIN_TRAIN_NUM labeled samples exist.
    """
    model = create_model()
    X_train, X_test, y_train, y_test = prepare_data()
    total = len(X_test) + len(X_train)
    if total < MIN_TRAIN_NUM:
        raise ValueError(f'训练数据不足, 无法训练模型. 需要{MIN_TRAIN_NUM}, 当前{total}')
    model.fit(X_train, y_train)
    # Evaluate on the held-out split; evaluate() also logs the metrics.
    y_pred = model.predict(X_test)
    confusion_mtx = confusion_matrix(y_test, y_pred)
    scores = evaluate(confusion_mtx, y_test, y_pred)
    models_data = (model, scores)
    dump_model(get_data_path(MODEL_FILE), models_data)
    logger.info('new model trained')
    return models_data
|
||||
|
||||
|
||||
def evaluate(confusion_mtx, y_test, y_pred):
    """Compute precision/recall/f1 on the test split and log them.

    Args:
        confusion_mtx: 2x2 matrix from sklearn confusion_matrix (used
            only for logging the tn/fp/fn/tp breakdown).
        y_test: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        dict with keys 'precision', 'recall', 'f1', each a float
        rounded to 2 decimal places.
    """
    tn, fp, fn, tp = confusion_mtx.ravel()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    logger.info(f'tp: {tp}, fp: {fp}')
    logger.info(f'fn: {fn}, tn: {tn}')
    logger.info(f'precision_score: {precision}')
    logger.info(f'recall_score: {recall}')
    logger.info(f'f1_score: {f1}')
    # Build the result once, rounding in place, instead of the original
    # build-then-rebuild pass through float('{:.2f}'.format(...)).
    return {name: round(float(value), 2)
            for name, value in (('precision', precision),
                                ('recall', recall),
                                ('f1', f1))}
|
||||
|
||||
|
||||
def recommend():
    '''
    use trained model to recommend items

    Predicts a rating for every not-yet-rated item and stores a
    system-generated ItemRate row for each item predicted as liked.

    Returns:
        (total, count): number of items predicted and number
        recommended, or None when there was nothing to predict.
    '''
    ids, X = prepare_predict_data()
    if len(X) == 0:
        # was an f-string with no placeholders — plain literal suffices
        logger.warning('no data for recommend')
        return
    count = 0
    total = len(ids)
    y_pred = predict(X)
    # item_id instead of the original `id`, which shadowed the builtin
    for item_id, y in zip(ids, y_pred):
        if y == 1:
            count += 1
            item_rate = ItemRate(rate_type=RATE_TYPE.SYSTEM_RATE,
                                 rate_value=y, item_id=item_id)
            item_rate.save()
    logger.warning(f'predicted {total} items, recommended {count}')
    return total, count
|
@ -0,0 +1,19 @@
|
||||
'''
|
||||
persist model required files
|
||||
'''
|
||||
import pickle
|
||||
|
||||
|
||||
def dump_model(path, models):
    '''
    Pickle *models* to *path*, overwriting any existing file.

    Args:
        path: destination file path.
        models: tuple of models to save.
    '''
    with open(path, mode='wb') as fh:
        pickle.dump(models, fh)
|
||||
|
||||
|
||||
def load_model(path):
    """Unpickle and return whatever object(s) were stored at *path*."""
    with open(path, mode='rb') as fh:
        return pickle.load(fh)
|
@ -0,0 +1,88 @@
|
||||
'''
|
||||
prepare data for model training
|
||||
'''
|
||||
import json
|
||||
import operator
|
||||
import pickle
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from bustag.spider.db import get_items, RATE_TYPE, ItemRate, Item, get_tags_for_items
|
||||
from bustag.model.persist import dump_model, load_model
|
||||
from bustag.util import logger, get_data_path, MODEL_PATH
|
||||
|
||||
BINARIZER_PATH = MODEL_PATH + 'label_binarizer.pkl'
|
||||
|
||||
|
||||
def load_data():
    '''
    Fetch every user-rated item from the database: no paging and no
    rate-value filter, so both liked and disliked items are returned.
    '''
    items, _total = get_items(rate_type=RATE_TYPE.USER_RATE.value,
                              rate_value=None,
                              page=None)
    return items
|
||||
|
||||
|
||||
def as_dict(item):
    """Flatten an item (with tags_dict attached) into a plain dict.

    All tags, regardless of their type bucket, are merged into a single
    set under 'tags'; the rate_value becomes the training 'target'.
    Note: 'id' deliberately carries the fanhao, mirroring the original.
    """
    all_tags = {tag for tags in item.tags_dict.values() for tag in tags}
    return {
        'id': item.fanhao,
        'title': item.title,
        'fanhao': item.fanhao,
        'url': item.url,
        'add_date': item.add_date,
        'tags': all_tags,
        'cover_img_url': item.cover_img_url,
        'target': item.rate_value,
    }
|
||||
|
||||
|
||||
def process_data(df):
    '''
    One-hot encode the per-item tag sets and persist the fitted
    binarizer so prediction time can reuse the exact same encoding.
    '''
    y = df[['target']]
    mlb = MultiLabelBinarizer()
    X = mlb.fit_transform(df[['tags']].tags.values)
    dump_model(get_data_path(BINARIZER_PATH), mlb)
    return X, y
|
||||
|
||||
|
||||
def split_data(X, y):
    """75/25 train/test split with a fixed seed for reproducibility.

    Returns:
        (X_train, X_test, y_train, y_test) tuple.
    """
    splits = train_test_split(X, y, test_size=0.25, random_state=42)
    return tuple(splits)
|
||||
|
||||
|
||||
def prepare_data():
    """Load labeled items, encode them, and return train/test splits.

    Returns:
        (X_train, X_test, y_train, y_test).
    """
    records = (as_dict(item) for item in load_data())
    columns = ['id', 'title', 'fanhao', 'url', 'add_date', 'tags',
               'cover_img_url', 'target']
    df = pd.DataFrame(records, columns=columns)
    X, y = process_data(df)
    return split_data(X, y)
|
||||
|
||||
|
||||
def prepare_predict_data():
    """Encode every not-yet-rated item for model prediction.

    Returns:
        (ids, X): the item ids (fanhao values) and the binarized tag
        matrix produced by the previously-fitted MultiLabelBinarizer.
    """
    # rate_type=None / rate_value=None selects the unrated items
    unrated_items, _total = get_items(rate_type=None, rate_value=None,
                                      page=None)
    mlb = load_model(get_data_path(BINARIZER_PATH))
    records = (as_dict(item) for item in unrated_items)
    df = pd.DataFrame(records, columns=['id', 'tags'])
    df.set_index('id', inplace=True)
    X = mlb.transform(df.tags.values)
    return df.index.values, X
|
@ -0,0 +1,61 @@
|
||||
'''
|
||||
define url routing process logic
|
||||
'''
|
||||
import sys
|
||||
import os
|
||||
import signal
|
||||
from aspider.routeing import get_router
|
||||
from .parser import parse_item
|
||||
from .db import save, Item
|
||||
from bustag.util import APP_CONFIG, get_full_url, logger
|
||||
router = get_router()
|
||||
MAXPAGE = 30
|
||||
|
||||
|
||||
def get_url_by_fanhao(fanhao):
    """Expand a bare fanhao into the site's full detail-page URL."""
    return get_full_url(fanhao)
|
||||
|
||||
|
||||
def verify_page_path(path, no):
    """Router guard: accept list-page URLs only up to MAXPAGE.

    Args:
        path: the matched URL path (logged only).
        no: page number captured from the URL, as a string.

    Returns:
        True when the page number is within the crawl limit.
    """
    logger.debug(f'verify page {path} , args {no}')
    # Collapse the original if/else returning True/False into the
    # boolean expression it computed.
    return int(no) <= MAXPAGE
|
||||
|
||||
|
||||
@router.route('/page/<no>', verify_page_path)
def process_page(text, path, no):
    '''
    process list page

    List pages only exist so the router extracts detail-page links from
    them; nothing needs to be persisted here, so we just log/print.
    '''
    logger.debug(f'page {no} has length {len(text)}')
    print(f'process page {no}')
|
||||
|
||||
|
||||
def verify_fanhao(path, fanhao):
    '''
    Router guard: only queue a detail page when its fanhao is not
    already stored in the database.
    '''
    existing = Item.get_by_fanhao(fanhao)
    logger.debug(
        f'verify {fanhao}: , exists:{existing is not None}, skip {path}')
    return existing is None
|
||||
|
||||
|
||||
# FIX: the route pattern must be a raw string — '\w' and '\d' in a
# plain string are invalid escape sequences (SyntaxWarning in newer
# Python) even though they happen to pass through unchanged today.
@router.route(r'/<fanhao:[\w]+-[\d]+>', verify_fanhao, no_parse_links=True)
def process_item(text, path, fanhao):
    '''
    process item page

    Parses the detail-page HTML into (meta, tags), records the source
    URL in the meta dict, and persists both via the db layer.
    '''
    logger.debug(f'process item {fanhao}')
    url = path
    meta, tags = parse_item(text)
    meta.update(url=url)
    save(meta, tags)
    print(f'item {fanhao} is processed')
|
@ -0,0 +1,370 @@
|
||||
'''
|
||||
persist data to db
|
||||
'''
|
||||
from datetime import date
|
||||
import datetime
|
||||
import operator
|
||||
from functools import reduce
|
||||
import json
|
||||
from peewee import *
|
||||
from enum import IntEnum
|
||||
from collections import defaultdict
|
||||
from bustag.util import logger, get_data_path, format_datetime, get_now_time, get_full_url
|
||||
|
||||
DB_FILE = 'bus.db'
|
||||
db = SqliteDatabase(get_data_path(DB_FILE), pragmas={
|
||||
'journal_mode': 'wal'})
|
||||
|
||||
|
||||
class BaseModel(Model):
    # Shared peewee base: binds every model to the SQLite database and
    # opts into modern (non-legacy) table naming.

    class Meta:
        database = db
        legacy_table_names = False
|
||||
|
||||
|
||||
class ExistError(Exception):
    """Raised when inserting a row that already exists (unique-key clash)."""
    pass
|
||||
|
||||
|
||||
class DBError(Exception):
    """Generic database-layer failure not covered by ExistError."""
    pass
|
||||
|
||||
|
||||
class Item(BaseModel):
    '''
    item table

    One row per scraped item; structured columns plus a JSON blob
    (meta_info) holding the remaining scraped fields.
    '''
    title = CharField()
    fanhao = CharField(unique=True)   # product code, e.g. 'ABC-123'
    url = CharField(unique=True)
    release_date = DateField()
    add_date = DateTimeField(default=datetime.datetime.now)
    meta_info = TextField()           # JSON-encoded leftover metadata

    def __repr__(self):
        return f'<Item:{self.fanhao} {self.title}>'

    @staticmethod
    def saveit(meta_info):
        """Create an Item row from a scraped meta dict.

        Pops the structured fields out of *meta_info* (mutating it) and
        stores the remainder as JSON.

        Raises:
            ExistError: when the fanhao/url already exists.
        """
        item_release_date = date.fromisoformat(meta_info.pop('release_date'))
        item_fanhao = meta_info.pop('fanhao')
        item_title = meta_info.pop('title')
        item_url = meta_info.pop('url')
        item_meta = json.dumps(meta_info)
        try:
            item = Item.create(fanhao=item_fanhao, title=item_title, url=item_url,
                               release_date=item_release_date, meta_info=item_meta)
            logger.debug(f'save item: {item}')
        except IntegrityError:
            # BUG FIX: the original lacked the f-prefix, so it logged
            # the literal text '{item_fanhao}' instead of the value.
            logger.debug(f'Item exists: {item_fanhao}')
            raise ExistError()
        else:
            return item

    @staticmethod
    def loadit(item):
        """Denormalize a fetched row for display: absolute URL, cover
        image pulled out of meta_info, and a formatted add_date."""
        item.url = get_full_url(item.url)
        meta = json.loads(item.meta_info)
        item.cover_img_url = meta['cover_img_url']
        # dropped the original's unused local `series` (fanhao prefix)
        item.add_date = format_datetime(item.add_date)

    @staticmethod
    def getit(id):
        # Primary-key lookup; raises DoesNotExist when missing.
        item = Item.get_by_id(id)
        return item

    @staticmethod
    def get_by_fanhao(fanhao):
        # Returns None (rather than raising) when no row matches.
        item = Item.get_or_none(Item.fanhao == fanhao)
        return item

    @staticmethod
    def get_tags_dict(item):
        # Group the item's tags by tag type and attach the mapping to
        # the instance as item.tags_dict (mutates, returns nothing).
        tags_dict = defaultdict(list)
        for t in item.tags_list:
            tags_dict[t.tag.type_].append(t.tag.value)
        item.tags_dict = tags_dict
|
||||
|
||||
|
||||
class Tag(BaseModel):
    '''
    tag table
    '''
    # DB column is 'type'; the trailing underscore avoids shadowing the builtin
    type_ = CharField(column_name='type')
    value = CharField()
    url = CharField()

    class Meta:
        indexes = (
            # Specify a unique multi-column index
            (('type_', 'value'), True),
        )

    def __repr__(self):
        return f'<Tag {self.value}>'

    @staticmethod
    def saveit(tag_info):
        '''Get or create the tag for a parser (type, value, link) tuple.'''
        tag, created = Tag.get_or_create(type_=tag_info.type, value=tag_info.value,
                                         defaults={'url': tag_info.link})
        if created:
            logger.debug(f'save tag: {tag}')
        return tag
|
||||
|
||||
|
||||
class ItemTag(BaseModel):
    '''Many-to-many link between Item (referenced by fanhao) and Tag.'''
    item = ForeignKeyField(Item, field='fanhao', backref='tags_list')
    tag = ForeignKeyField(Tag, backref='items')

    class Meta:
        indexes = (
            # Specify a unique multi-column index
            (('item', 'tag'), True),
        )

    @staticmethod
    def saveit(item, tag):
        '''Link *item* and *tag*; best-effort — failures are logged, not raised.'''
        try:
            item_tag = ItemTag.create(item=item, tag=tag)
            logger.debug(f'save tag_item: {item_tag}')
        except Exception as ex:
            # duplicates (unique index) and any other failure land here
            logger.exception(ex)
        else:
            return item_tag

    def __repr__(self):
        return f'<ItemTag {self.item.fanhao} - {self.tag.value}>'
|
||||
|
||||
|
||||
class RATE_TYPE(IntEnum):
    # who produced the rating stored in ItemRate.rate_type
    NOT_RATE = 0      # not rated yet
    USER_RATE = 1     # rated manually by the user
    SYSTEM_RATE = 2   # rated by the recommender model
|
||||
|
||||
|
||||
class RATE_VALUE(IntEnum):
    # binary verdict stored in ItemRate.rate_value
    LIKE = 1
    DISLIKE = 0
|
||||
|
||||
|
||||
class ItemRate(BaseModel):
    '''One rating (like/dislike) per Item, by the user or the recommender.'''
    rate_type = IntegerField()   # see RATE_TYPE
    rate_value = IntegerField()  # see RATE_VALUE
    item = ForeignKeyField(Item, field='fanhao',
                           backref='rated_items', unique=True)
    # NOTE(review): 'rete_time' looks like a typo for 'rate_time', but it is
    # referenced elsewhere (e.g. get_today_recommend_count) and matches the
    # existing column, so renaming requires a coordinated migration.
    rete_time = DateTimeField(default=datetime.datetime.now)

    @staticmethod
    def saveit(rate_type, rate_value, fanhao):
        '''Create a rating row; returns None when the fanhao is already rated.'''
        item_rate = None
        try:
            item_rate = ItemRate.create(
                item=fanhao, rate_type=rate_type, rate_value=rate_value)
            logger.debug(f'save ItemRate: {item_rate}')
        except IntegrityError:
            logger.debug(f'ItemRate exists: {fanhao}')
        else:
            return item_rate

    @staticmethod
    def getit(id):
        '''Return the rating with primary key *id*, or None.'''
        item_rate = ItemRate.get_or_none(ItemRate.id == id)
        return item_rate

    @staticmethod
    def get_by_fanhao(fanhao):
        '''Return the rating attached to *fanhao*, or None.'''
        item_rate = ItemRate.get_or_none(ItemRate.item_id == fanhao)
        return item_rate
|
||||
|
||||
|
||||
class LocalItem(BaseModel):
    '''
    local item table

    Tracks a file on the local disk belonging to a spidered Item.
    '''
    item = ForeignKeyField(Item, field='fanhao',
                           backref='local_item', unique=True)
    path = CharField(null=True)
    size = IntegerField(null=True)
    add_date = DateTimeField(default=datetime.datetime.now)
    last_view_date = DateTimeField(null=True)
    view_times = IntegerField(default=0)

    @staticmethod
    def saveit(fanhao, path):
        '''Insert a LocalItem row; returns None when *fanhao* already has one.'''
        local_item = None
        try:
            local_item = LocalItem.create(
                item=fanhao, path=path)
            logger.debug(f'save LocalItem: {fanhao}')
        except IntegrityError:
            logger.debug(f'LocalItem exists: {fanhao}')
        else:
            return local_item

    def __repr__(self):
        # BUG FIX: the model has no 'fanhao' attribute, so the old
        # f'<LocalItem {self.fanhao}...' raised AttributeError whenever repr
        # was taken. The raw FK value (the fanhao string) is item_id.
        return f'<LocalItem {self.item_id}({self.path})>'

    @staticmethod
    def update_play(id):
        '''Record one playback: bump view_times and stamp last_view_date.'''
        nrows = (LocalItem
                 .update({LocalItem.last_view_date: get_now_time(),
                          LocalItem.view_times: LocalItem.view_times+1})
                 .where(LocalItem.id == id)
                 .execute())
        logger.debug(f'update LocalItem {id} : rows:{nrows}')
        return LocalItem.get_by_id(id)

    @staticmethod
    def loadit(local_item):
        '''Format last_view_date for display ('' when never viewed).'''
        local_item.last_view_date = format_datetime(
            local_item.last_view_date) if local_item.last_view_date else ''
|
||||
|
||||
|
||||
def save(meta_info, tags):
    # Persist one parsed item plus its tags; silently skips when an item
    # with the same fanhao is already stored (ExistError from Item.saveit).
    item_title = meta_info['title']
    tag_objs = []
    try:
        item = Item.saveit(meta_info)
    except ExistError:
        logger.debug(f'item exists: {item_title}')
    else:
        # first save all tags, then the item<->tag links, each batch in
        # its own transaction
        with db.atomic():
            for tag_info in tags:
                tag = Tag.saveit(tag_info)
                if tag:
                    tag_objs.append(tag)
        with db.atomic():
            for tag_obj in tag_objs:
                ItemTag.saveit(item, tag_obj)
|
||||
|
||||
|
||||
def test_save():
    '''Smoke test: insert one item with tags, a rating and a local file.'''
    item_url = 'https://www.cdnbus.bid/MADM-116'
    item_title = 'test item'
    item_fanhao = 'MADM-116'
    item_release_date = date(2019, 7, 19)
    item_meta_info = ''
    item = Item(title=item_title, url=item_url, fanhao=item_fanhao,
                release_date=item_release_date, meta_info=item_meta_info)
    item.save()

    tag1 = Tag.create(type_='genre', value='素人',
                      url='https://www.cdnbus.bid/genre/s1')
    tag2 = Tag.create(type_='star', value='樱田',
                      url='https://www.cdnbus.bid/star/dbd')
    tag3 = Tag.create(type_='genre', value='高清',
                      url='https://www.cdnbus.bid/genre/x1')
    ItemTag.create(item=item, tag=tag1)
    ItemTag.create(item=item, tag=tag2)

    ItemRate.saveit(RATE_TYPE.USER_RATE, RATE_VALUE.LIKE, item.fanhao)
    LocalItem.saveit('MADM-116', '/Download/MADM-116.avi')
|
||||
|
||||
|
||||
def get_items(rate_type=None, rate_value=None, page=1, page_size=10):
    '''
    get required items based on some conditions

    Args:
        rate_type: filter by ItemRate.rate_type; None selects UNRATED items.
        rate_value: optional extra filter on ItemRate.rate_value.
        page: 1-based page number, or None to return all matching rows.
        page_size: rows per page; also used to compute total_pages.

    Returns:
        tuple: (items_list, (total_items, total_pages, page, page_size))
    '''
    items_list = []
    clauses = []
    if rate_type is not None:
        clauses.append(ItemRate.rate_type == rate_type)
    else:
        # unrated items only: the LEFT OUTER JOIN leaves rate_type NULL
        clauses.append(ItemRate.rate_type.is_null())
    if rate_value is not None:
        clauses.append(ItemRate.rate_value == rate_value)
    q = (Item.select(Item, ItemRate)
         .join(ItemRate, JOIN.LEFT_OUTER, attr='item_rate')
         .where(reduce(operator.and_, clauses))
         .order_by(Item.id.desc())
         )
    total_items = q.count()
    # idiom fix: 'page is not None' instead of 'not page is None'
    if page is not None:
        q = q.paginate(page, page_size)
    items = get_tags_for_items(q)
    for item in items:
        Item.loadit(item)
        # rows without a rating have no joined item_rate attribute
        if hasattr(item, 'item_rate'):
            item.rate_value = item.item_rate.rate_value
        else:
            item.rate_value = None
        items_list.append(item)

    total_pages = (total_items + page_size - 1) // page_size
    page_info = (total_items, total_pages, page, page_size)
    return items_list, page_info
|
||||
|
||||
|
||||
def get_local_items(page=1, page_size=10):
    '''
    get local items

    Args:
        page: 1-based page number, or None for all rows.
        page_size: rows per page; also used to compute total_pages.

    Returns:
        tuple: (items, (total_items, total_pages, page, page_size))
    '''
    items = []
    q = (LocalItem.select(LocalItem)
         .where(LocalItem.path.is_null(False))
         .order_by(LocalItem.id.desc())
         )
    total_items = q.count()
    # idiom fix: 'page is not None' instead of 'not page is None'
    if page is not None:
        q = q.paginate(page, page_size)

    item_query = Item.select()
    item_tag_query = ItemTag.select()
    tag_query = Tag.select()
    items_with_tags = prefetch(q, item_query, item_tag_query, tag_query)

    for local_item in items_with_tags:
        try:
            Item.loadit(local_item.item)
            Item.get_tags_dict(local_item.item)
            items.append(local_item)
        except Exception:
            # best-effort: still skip rows that fail to load, but leave a
            # trace instead of swallowing the error silently
            logger.exception(f'failed to load local item {local_item.id}')

    total_pages = (total_items + page_size - 1) // page_size
    page_info = (total_items, total_pages, page, page_size)
    return items, page_info
|
||||
|
||||
|
||||
def get_today_update_count():
    """Count items whose add_date falls on today (local time)."""
    today = get_now_time()
    q = Item.select().where((Item.add_date.year == today.year)
                            & (Item.add_date.month == today.month)
                            & (Item.add_date.day == today.day)
                            )
    return q.count()
|
||||
|
||||
|
||||
def get_today_recommend_count():
    """Count system 'like' recommendations made today (local time)."""
    today = get_now_time()
    q = ItemRate.select().where((ItemRate.rete_time.year == today.year)
                                & (ItemRate.rete_time.month == today.month)
                                & (ItemRate.rete_time.day == today.day)
                                & (ItemRate.rate_type == RATE_TYPE.SYSTEM_RATE)
                                & (ItemRate.rate_value == RATE_VALUE.LIKE)
                                )
    return q.count()
|
||||
|
||||
|
||||
def get_tags_for_items(items_query):
    """Prefetch tag rows for *items_query* and attach tags_dict to each item."""
    tagged_rows = prefetch(items_query, ItemTag.select(), Tag.select())
    loaded = []
    for row in tagged_rows:
        Item.get_tags_dict(row)
        loaded.append(row)
    return loaded
|
||||
|
||||
|
||||
def init():
    # Connect (idempotently) and create all tables if missing.
    db.connect(reuse_if_open=True)
    db.create_tables([Item, Tag, ItemTag, ItemRate, LocalItem])


# module bootstrap: the schema is ensured at import time
init()
|
@ -0,0 +1,70 @@
|
||||
'''
|
||||
html parser to extract data
|
||||
'''
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from requests_html import HTML
|
||||
from aspider.routeing import get_router
|
||||
# shared aspider router; routes are registered via @router.route decorators
router = get_router()


# (type, value, link) triple produced by the parser, consumed by db.Tag.saveit
Tag = namedtuple('Tag', ['type', 'value', 'link'])
|
||||
|
||||
|
||||
def parse_item(text):
    '''
    Args:
        text : str - html text

    Returns:
        tuple: (dict, list)
        dict - meta data for this item
        list - tags for this item
    '''
    html = HTML(html=text)
    # CSS selectors below are tied to the javbus-style item page layout
    title_css = 'body > div.container > h3'
    title = html.find(title_css)[0].text
    cover_img_css = 'body > div.container > div.row.movie > div.col-md-9.screencap > a'
    cover_img_url = html.find(cover_img_css)[0].attrs['href']
    tags_css = 'body > div.container > div.row.movie > div.col-md-3.info'
    tags = html.find(tags_css)[0].find('p')
    # assumes a fixed paragraph order in the info panel:
    # [1] release date, [2] length — TODO confirm against the live page
    release_date = tags[1].text
    length = tags[2].text
    # meta data
    meta = {}
    # the page title is '<FANHAO> <title text>'
    meta['fanhao'], meta['title'] = title.split(maxsplit=1)
    meta['cover_img_url'] = cover_img_url
    meta['release_date'] = release_date.split()[1]
    meta['length'] = re.search(r'\d+', length).group()

    tag_list = []
    for tag in tags[3:]:
        tag_type = ''
        tag_value = ''
        tag_link = ''
        links = tag.find('a')
        spans = tag.find('span.header')
        if spans and len(links) == 1:
            # labelled row, e.g. a header span followed by one link
            tag_type = (spans[0].text)
            tag_link = links[0].attrs['href']
            tag_value = links[0].text
            if tag_type != '' and tag_value != '':
                tag_list.append(create_tag(tag_type, tag_value, tag_link))
        else:
            # unlabelled rows: derive the tag type from the link target
            for link in links:
                tag_link = link.attrs['href']
                tag_value = link.text
                if 'genre' in tag_link:
                    tag_type = 'genre'
                if 'star' in tag_link:
                    tag_type = 'star'
                if tag_type != '' and tag_value != '':
                    tag_list.append(create_tag(tag_type, tag_value, tag_link))

    return meta, tag_list
|
||||
|
||||
|
||||
def create_tag(tag_type, tag_value, tag_link):
    """Build a Tag tuple, normalizing the link to a router-relative path."""
    relative_link = router.get_url_path(tag_link)
    return Tag(tag_type, tag_value, relative_link)
|
@ -0,0 +1,121 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import configparser
|
||||
import pytz
|
||||
import datetime
|
||||
from urllib.parse import urljoin
|
||||
|
||||
logger = logging.getLogger('bustag')
# True when the TESTING env var is set (see check_testing)
TESTING = False
DATA_PATH = 'data/'
CONFIG_FILE = 'config.ini'
MODEL_PATH = 'model/'
# flattened config mapping filled by load_config:
# '<section>' -> dict and '<section>.<key>' -> str
APP_CONFIG = {}
# fallbacks merged before config.ini is read
DEFAULT_CONFIG = {
    'download': {
        'count': 100,
        'interval': 3600
    }
}
|
||||
|
||||
|
||||
def get_cwd():
    """Return the app base dir (PyInstaller bundle dir when frozen)."""
    frozen = getattr(sys, 'frozen', False)
    return sys._MEIPASS if frozen else os.getcwd()
|
||||
|
||||
|
||||
def check_testing():
    """Enable test mode when the TESTING environment variable is set."""
    global TESTING
    if not os.environ.get('TESTING'):
        return
    TESTING = True
    print('*** in test mode ***')
|
||||
|
||||
|
||||
def setup_logging():
    # One stream handler on the app logger: WARNING by default,
    # DEBUG (plus raw peewee SQL logging) when in test mode.
    fmt = '%(asctime)s - %(name)s - %(levelname)s - %(filename)s - %(funcName)s \n %(message)s '
    formatter = logging.Formatter(fmt)
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.setLevel(logging.WARNING)
    if TESTING:
        logger.setLevel(logging.DEBUG)
        pw_logger = logging.getLogger('peewee')
        pw_logger.addHandler(logging.StreamHandler())
        pw_logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
def get_data_path(file):
    """Return the path of *file* inside the app data directory."""
    return os.path.join(get_cwd(), DATA_PATH, file)
|
||||
|
||||
|
||||
def get_now_time():
    """Return the current local time as a naive datetime."""
    now = datetime.datetime.now()
    return now
|
||||
|
||||
|
||||
def get_full_url(path):
    """Join *path* onto the configured download root URL."""
    base = APP_CONFIG['download.root_path']
    return urljoin(base, path)
|
||||
|
||||
|
||||
def check_config():
    """Exit the program when the config file is missing."""
    abs_path = os.path.abspath(get_data_path(CONFIG_FILE))
    if os.path.exists(abs_path):
        return
    logger.error(
        f'file {abs_path} not exists, please make sure file exists and configed, system quit now!')
    logger.error(f'文件 {abs_path} 不存在, 请检查文件存在并已配置, 系统退出!')
    sys.exit(1)
|
||||
|
||||
|
||||
def load_config():
    # Read config.ini layered over DEFAULT_CONFIG and flatten the result
    # into APP_CONFIG under both '<section>' (dict) and
    # '<section>.<key>' (str) keys.
    check_config()
    config_path = get_data_path(CONFIG_FILE)
    conf = configparser.ConfigParser()
    conf.read_dict(DEFAULT_CONFIG)
    conf.read(config_path)
    for section in conf.sections():
        APP_CONFIG[section.lower()] = dict(conf[section])
        for key in conf.options(section):
            value = conf.get(section, key)
            key = section + '.' + key
            APP_CONFIG[key.lower()] = value
    logger.debug(APP_CONFIG)
|
||||
|
||||
|
||||
def format_datetime(dt):
    """Format *dt* as 'YYYY-MM-DD HH:MM:SS'."""
    # renamed the local from 'format' to avoid shadowing the builtin
    fmt = '%Y-%m-%d %H:%M:%S'
    return dt.strftime(fmt)
|
||||
|
||||
|
||||
def to_localtime(utc_dt):
    # Convert a naive UTC datetime into an 'Asia/Shanghai' local time string.
    # assumes utc_dt is naive UTC — TODO confirm all callers pass utcnow()
    local_tz = pytz.timezone('Asia/Shanghai')
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    format = '%Y-%m-%d %H:%M:%S'
    local_dt = local_tz.normalize(local_dt)
    return local_dt.strftime(format)
|
||||
|
||||
|
||||
def check_model_folder():
    """Ensure the model folder exists under the data directory."""
    model_path = os.path.join(DATA_PATH, MODEL_PATH)
    abs_path = os.path.abspath(model_path)
    if not os.path.exists(abs_path):
        print(f'created model folder: {abs_path}')
        # makedirs creates missing parents (os.mkdir failed when data/ was
        # absent) and exist_ok avoids a race if the dir appears meanwhile
        os.makedirs(abs_path, exist_ok=True)
|
||||
|
||||
|
||||
def init():
    # Module bootstrap, executed once at import time (see call below):
    # detect test mode, wire logging, load config, ensure the model folder.
    print(f'CWD: {get_cwd()}')
    check_testing()
    setup_logging()
    load_config()
    check_model_folder()


init()
|
@ -0,0 +1,4 @@
|
||||
[download]
|
||||
root_path = https://www.busdmm.work
|
||||
count = 100
|
||||
interval = 10800
|
@ -0,0 +1,2 @@
|
||||
*/30 * * * * /app/docker/run_download.sh >> /var/log/bustag.log 2>&1
|
||||
|
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Launch the bustag web UI.
PYTHON=python3
# check if crontab.txt exists

# log the working directory for debugging
echo `pwd`

${PYTHON} -m bustag.app.index
|
@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Cron entry point: download new items, then run the recommender.
cd /app
/usr/local/bin/python -m bustag.main download
/usr/local/bin/python -m bustag.main recommend
|
@ -0,0 +1,8 @@
|
||||
deb http://mirrors.163.com/debian/ buster main non-free contrib
|
||||
deb http://mirrors.163.com/debian/ buster-updates main non-free contrib
|
||||
deb http://mirrors.163.com/debian/ buster-backports main non-free contrib
|
||||
deb-src http://mirrors.163.com/debian/ buster main non-free contrib
|
||||
deb-src http://mirrors.163.com/debian/ buster-updates main non-free contrib
|
||||
deb-src http://mirrors.163.com/debian/ buster-backports main non-free contrib
|
||||
deb http://mirrors.163.com/debian-security/ buster/updates main non-free contrib
|
||||
deb-src http://mirrors.163.com/debian-security/ buster/updates main non-free contrib
|
After Width: | Height: | Size: 58 KiB |
After Width: | Height: | Size: 127 KiB |
After Width: | Height: | Size: 91 KiB |
After Width: | Height: | Size: 101 KiB |
After Width: | Height: | Size: 134 KiB |
After Width: | Height: | Size: 127 KiB |
@ -0,0 +1,123 @@
|
||||
aiohttp==3.5.4
|
||||
alabaster==0.7.12
|
||||
altgraph==0.16.1
|
||||
appdirs==1.4.3
|
||||
appnope==0.1.0
|
||||
APScheduler==3.6.1
|
||||
aspider==0.1.2
|
||||
astroid==2.2.5
|
||||
async-timeout==3.0.1
|
||||
atomicwrites==1.3.0
|
||||
attrs==19.1.0
|
||||
autopep8==1.4.4
|
||||
Babel==2.7.0
|
||||
backcall==0.1.0
|
||||
beautifulsoup4==4.8.0
|
||||
bleach==3.1.0
|
||||
bottle==0.12.17
|
||||
bs4==0.0.1
|
||||
category-encoders==2.0.0
|
||||
certifi==2019.6.16
|
||||
chardet==3.0.4
|
||||
Click==7.0
|
||||
coverage==4.5.4
|
||||
cssselect==1.0.3
|
||||
decorator==4.4.0
|
||||
defusedxml==0.6.0
|
||||
docopt==0.6.2
|
||||
docutils==0.15.2
|
||||
entrypoints==0.3
|
||||
fake-useragent==0.1.11
|
||||
gitdb==0.6.4
|
||||
GitPython==0.3.6
|
||||
gunicorn==19.9.0
|
||||
idna==2.8
|
||||
imagesize==1.1.0
|
||||
importlib-metadata==0.19
|
||||
ipykernel==5.1.1
|
||||
ipython-genutils==0.2.0
|
||||
ipywidgets==7.5.1
|
||||
isort==4.3.21
|
||||
jedi==0.14.1
|
||||
Jinja2==2.10.1
|
||||
joblib==0.13.2
|
||||
jsonschema==3.0.2
|
||||
jupyter-client==5.3.1
|
||||
jupyter-console==6.0.0
|
||||
jupyter-core==4.5.0
|
||||
lazy-object-proxy==1.4.1
|
||||
lxml==4.4.0
|
||||
macholib==1.11
|
||||
MarkupSafe==1.1.1
|
||||
mccabe==0.6.1
|
||||
mistune==0.8.4
|
||||
more-itertools==7.2.0
|
||||
multidict==4.5.2
|
||||
nbconvert==5.5.0
|
||||
nbformat==4.4.0
|
||||
numpy==1.17.1
|
||||
packaging==19.1
|
||||
pandas==0.25.0
|
||||
pandocfilters==1.4.2
|
||||
parse==1.12.0
|
||||
parso==0.5.1
|
||||
Paste==3.1.1
|
||||
patsy==0.5.1
|
||||
peewee==3.9.6
|
||||
pexpect==4.7.0
|
||||
pickleshare==0.7.5
|
||||
pluggy==0.12.0
|
||||
prometheus-client==0.7.1
|
||||
prompt-toolkit==2.0.9
|
||||
ptyprocess==0.6.0
|
||||
py==1.8.0
|
||||
pycodestyle==2.5.0
|
||||
pyee==6.0.0
|
||||
Pygments==2.4.2
|
||||
pylint==2.3.1
|
||||
pyparsing==2.4.2
|
||||
pypi-publisher==0.0.4
|
||||
pyppeteer==0.0.25
|
||||
pyquery==1.4.0
|
||||
pyrsistent==0.15.4
|
||||
pytest==5.0.1
|
||||
python-dateutil==2.8.0
|
||||
pytz==2019.2
|
||||
pyzmq==18.0.2
|
||||
qtconsole==4.5.2
|
||||
requests==2.22.0
|
||||
requests-html==0.10.0
|
||||
scikit-learn==0.21.3
|
||||
scipy==1.3.0
|
||||
Send2Trash==1.5.0
|
||||
six==1.12.0
|
||||
sklearn==0.0
|
||||
smmap==0.9.0
|
||||
snowballstemmer==1.9.0
|
||||
soupsieve==1.9.2
|
||||
Sphinx==2.1.2
|
||||
sphinx-rtd-theme==0.4.3
|
||||
sphinxcontrib-applehelp==1.0.1
|
||||
sphinxcontrib-devhelp==1.0.1
|
||||
sphinxcontrib-htmlhelp==1.0.2
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
sphinxcontrib-qthelp==1.0.2
|
||||
sphinxcontrib-serializinghtml==1.1.3
|
||||
statsmodels==0.10.1
|
||||
terminado==0.8.2
|
||||
testpath==0.4.2
|
||||
tqdm==4.32.2
|
||||
traitlets==4.3.2
|
||||
typed-ast==1.4.0
|
||||
tzlocal==2.0.0
|
||||
urllib3==1.25.3
|
||||
w3lib==1.20.0
|
||||
wcwidth==0.1.7
|
||||
webencodings==0.5.1
|
||||
websockets==8.0.2
|
||||
widgetsnbextension==3.5.1
|
||||
wrapt==1.11.2
|
||||
yarg==0.1.9
|
||||
yarl==1.3.0
|
||||
zipp==0.5.2
|
||||
-e git+https://github.com/gxtrobot/bustag.git@v0.2.1#egg=bustag
|
@ -0,0 +1,45 @@
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
|
||||
from setuptools import find_packages
|
||||
from setuptools import setup
|
||||
from bustag import __version__
|
||||
|
||||
|
||||
def read(filename):
    """Read *filename* (relative to this file) and strip Sphinx role markup."""
    path = os.path.join(os.path.dirname(__file__), filename)
    text_type = type(u"")
    pattern = text_type(r':[a-z]+:`~?(.*?)`')
    replacement = text_type(r'``\1``')
    with io.open(path, mode="r", encoding='utf-8') as fd:
        content = fd.read()
    return re.sub(pattern, replacement, content)
|
||||
|
||||
|
||||
# package metadata; install_requires is intentionally empty — runtime deps
# are pinned in requirements.txt instead
setup(
    name="bustag",
    version=__version__,
    url="https://github.com/gxtrobot/bustag",
    license='MIT',

    author="gxtrobot",
    author_email="gxtrobot@gmail.com",

    description="a tag and recommend system for old bus driver",
    long_description=read("README.md"),

    packages=find_packages(exclude=('tests',)),

    install_requires=[],

    # NOTE(review): the 2.x/3.4 classifiers look stale — the codebase uses
    # f-strings (3.6+); confirm and prune.
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
)
|
@ -0,0 +1,6 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def start():
    '''Session-wide autouse fixture: just marks the start of a test run.'''
    print("\n **** start test ****")
|
@ -0,0 +1,85 @@
|
||||
from bustag.spider.db import get_items, Item, RATE_TYPE, RATE_VALUE
|
||||
from requests_html import HTMLSession, HTML
|
||||
from bustag.spider.parser import parse_item
|
||||
|
||||
|
||||
def test_get_items():
    '''System-disliked items should exist; prints the paging info.'''
    rate_type = RATE_TYPE.SYSTEM_RATE
    rate_value = RATE_VALUE.DISLIKE
    page = None
    items, page_info = get_items(
        rate_type=rate_type, rate_value=rate_value, page=page)
    assert len(items) > 0
    print(f'item count:{len(items)}')
    print(
        f'total_items: {page_info[0]}, total_page: {page_info[1]}, current_page: {page_info[2]}, page_size:{page_info[3]}')
|
||||
|
||||
|
||||
def test_get_items2():
    '''Unrated items (rate_type=None) should exist; prints the paging info.'''
    rate_type = None
    rate_value = None
    page = None
    items, page_info = get_items(
        rate_type=rate_type, rate_value=rate_value, page=page)
    assert len(items) > 0
    print(f'item count:{len(items)}')
    print(
        f'total_items: {page_info[0]}, total_page: {page_info[1]}, current_page: {page_info[2]}, page_size:{page_info[3]}')
|
||||
|
||||
|
||||
def test_getit():
    '''Item.getit should return the row with the given primary key.'''
    # assumes row id 100 exists in the local dev database
    id = 100
    item = Item.getit(id)
    print(repr(item))
    assert item is not None
|
||||
|
||||
|
||||
def test_load_item():
    '''Item.loadit should enrich a row for display.'''
    # assumes row id 1251 exists in the local dev database
    id = 1251
    item = Item.getit(id)
    Item.loadit(item)
    # NOTE(review): Item defines backref 'tags_list', not 'tags' — this
    # print likely raises AttributeError; confirm against the model.
    print(item.tags)
|
||||
|
||||
|
||||
def test_get_item_tags():
    '''get_tags_dict should attach a {type: [values]} mapping to the item.'''
    fanhao = 'JUY-981'
    item = Item.get_by_fanhao(fanhao)
    print(item)
    Item.get_tags_dict(item)
    print(item.tags_dict)
|
||||
|
||||
|
||||
def test_missed_tags():
    '''Re-fetch live pages and report tags present online but missing in DB.'''
    url_temp = 'https://www.cdnbus.bid/{}'
    session = HTMLSession()
    # cap the number of live requests
    num = 300
    i = 0
    fanhaos = ['PPPD-759', 'PPBD-166', 'XVSR-490', 'PPBD-162',
               'XVSR-478', 'BMW-188', 'GVG-935', 'TIKC-037', 'OVG-111', 'TIKF-037']
    # for item in Item.select().where(Item.fanhao.in_(fanhaos)):
    for item in Item.select():
        fanhao = item.fanhao
        url = url_temp.format(fanhao)
        r = session.get(url)
        meta, tags = parse_item(r.text)
        tags_set = {t.value for t in tags}
        # print(tags_set)
        tags_db = {t.tag.value for t in item.tags_list}
        # print(tags_db)
        diff = tags_set - tags_db
        if diff:
            print(f'{fanhao}tags not equal: {diff}')
        # else:
        #     print('tags are equal')
        i += 1
        if i > num:
            break
|
||||
|
||||
|
||||
def test_empty_tags():
    '''List the fanhaos of items that have no tags stored at all.'''
    empty = []
    for item in Item.select():
        tags_db = {t.tag.value for t in item.tags_list}
        if not tags_db:
            empty.append(item.fanhao)
    print(empty)
|
@ -0,0 +1,6 @@
|
||||
from bustag.main import recommend
|
||||
|
||||
|
||||
def test_recommend():
    '''recommend() should process at least one item.'''
    count, recommend_count = recommend()
    assert count > 0
|
@ -0,0 +1,41 @@
|
||||
import random
|
||||
from bustag.model import classifier as clf
|
||||
from bustag.model.prepare import prepare_predict_data
|
||||
from bustag.spider.db import Item, get_items, ItemRate
|
||||
|
||||
|
||||
def test_train_model():
    '''Training should run to completion on the current DB contents.'''
    clf.train()
||||
|
||||
|
||||
def test_recommend():
    '''The classifier should score unrated items and report the counts.'''
    total, count = clf.recommend()
    print('total:', total)
    print('recommended:', count)
|
||||
|
||||
|
||||
def test_make_model():
    '''
    tag random data to generate model

    Randomly "likes" ~40% of the unrated items, persists the ratings, then
    trains the classifier on them.
    '''
    page = 50
    no_rate_items = []
    for i in range(1, page):
        items, _ = get_items(None, None, i)
        no_rate_items.extend(items)
    size = len(no_rate_items)
    like_ratio = 0.4
    like_items = []
    unlike_items = []
    for item in no_rate_items:
        if random.random() < like_ratio:
            like_items.append(item)
        else:
            unlike_items.append(item)
    print(f'like items: {len(like_items)}, unlike items: {len(unlike_items)}')
    # rate_type=1 (user), rate_value 1=like / 0=dislike
    for item in like_items:
        ItemRate.saveit(1, 1, item.fanhao)
    for item in unlike_items:
        ItemRate.saveit(1, 0, item.fanhao)

    clf.train()
|
@ -0,0 +1,32 @@
|
||||
import asyncio
|
||||
import pytest
|
||||
import aiohttp
|
||||
from bustag.spider.parser import parse_item
|
||||
from aspider.routeing import get_router
|
||||
|
||||
|
||||
@pytest.fixture
def html():
    '''Fetch a live item page and return its html text.'''
    # url = 'https://www.cdnbus.bid/SHKD-875'
    url = 'https://www.busdmm.work/DVAJ-419'
    router = get_router()
    # NOTE(review): rsplit('/') with no maxsplit splits on every '/', so
    # [0] is 'https:' rather than the site root — looks suspect; confirm.
    router.add_root_path(url.rsplit('/')[0])

    async def fetch(session, url):
        async with session.get(url) as response:
            return await response.text(errors='ignore')

    async def main():
        async with aiohttp.ClientSession() as session:
            html = await fetch(session, url)
            return html

    html = asyncio.run(main())
    return html
|
||||
|
||||
|
||||
def test_process_item(html):
    '''parse_item should extract meta and tags from the fetched page.'''
    print('')
    meta, tags = parse_item(html)
    print(meta)
    print(tags)
|
@ -0,0 +1,8 @@
|
||||
from bustag.model.persist import load_model, dump_model
|
||||
|
||||
|
||||
def test_load_model():
    '''The persisted binarizer should load with a non-empty tag vocabulary.'''
    mlb = load_model()
    assert len(mlb.classes_) > 0
    print(mlb.classes_[:10])
    print(f'total tags: {len(mlb.classes_)}')
|
@ -0,0 +1,23 @@
|
||||
from bustag.model.prepare import load_data, process_data, prepare_predict_data
|
||||
|
||||
|
||||
def test_load_data():
    '''load_data should return a non-empty list of items with tags_dict.'''
    items = load_data()
    print(len(items))
    item = items[0]
    print(item.fanhao, item.tags_dict)
    assert len(items) > 0
|
||||
|
||||
|
||||
def test_process_data():
    '''process_data should yield feature matrix X and label vector y.'''
    df = load_data()
    X, y = process_data(df)
    print(X.shape)
    print(y.shape)
|
||||
|
||||
|
||||
def test_prepare_predict_data():
    '''prepare_predict_data should yield ids plus their feature rows.'''
    ids, X = prepare_predict_data()
    print(X.shape)
    print(X[0])
    print(ids)
|
@ -0,0 +1,15 @@
|
||||
from bustag.spider.bus_spider import process_item
|
||||
from aspider.routeing import get_router
|
||||
from requests_html import HTMLSession
|
||||
import logging
|
||||
|
||||
|
||||
def test_process_item():
    '''End-to-end: fetch one live item page and run the spider handler.'''
    root_path = 'https://www.cdnbus.bid'
    url = 'https://www.cdnbus.bid/CESD-797'
    session = HTMLSession()
    router = get_router()
    router.add_root_path(root_path)
    fanhao = 'CESD-797'
    r = session.get(url)
    process_item(r.text, url, fanhao)
|
@ -0,0 +1,50 @@
|
||||
from datetime import datetime
|
||||
import configparser
|
||||
from bustag import util
|
||||
|
||||
|
||||
def test_file_path():
    '''get_data_path should place the file under the data directory.'''
    file = 'bus.db'
    path = util.get_data_path(file)
    print(path)
|
||||
|
||||
|
||||
def test_read_config():
    '''load_config should populate util.APP_CONFIG.'''
    util.load_config()
    print(util.APP_CONFIG)
|
||||
|
||||
|
||||
def test_to_localtime():
    '''to_localtime should convert a UTC datetime to a local-time string.'''
    t = datetime.utcnow()
    local = util.to_localtime(t)
    print(local)
|
||||
|
||||
|
||||
def test_testing_mode():
    '''TESTING flag must be on when the TESTING env var is exported.'''
    import os
    print(f'env: {os.getenv("TESTING")}')
    # idiom fix: identity check instead of '== True' comparison
    assert util.TESTING is True
|
||||
|
||||
|
||||
def test_config_defaults():
    '''Defaults should survive the read of config.ini and stay queryable.'''
    config_path = util.get_data_path(util.CONFIG_FILE)
    conf = configparser.ConfigParser()
    defaults = {
        'options': {
            'proxy': 'http://localhost:7890'
        },
        'download': {
            'count': 100,
            'interval': 3600
        }
    }
    conf.read_dict(defaults)
    conf.read(config_path)
    for section in conf:
        print(f'[{section}]')
        for key, value in conf[section].items():
            print(f'{key} = {value}')
        print('')
    print(conf.get('download', 'count'))
    print(conf.get('download', 'interval'))
    print(conf.get('options', 'proxy'))
|