|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import time
|
|
|
import pickle
|
|
|
from config import myshixuns_data_path, mysubjects_data_path, RANDOM_SEED
|
|
|
from config import myshixuns_train_data, myshixuns_test_data
|
|
|
from config import mysubjects_train_data, mysubjects_test_data
|
|
|
from config import shixuns_data_path, subjects_data_path
|
|
|
from config import myshixuns_train_data_baseline, myshixuns_test_data_baseline
|
|
|
from config import mysubjects_train_data_baseline, mysubjects_test_data_baseline
|
|
|
from config import logger
|
|
|
from config import shixun_id_to_name_dict_data, subject_id_to_name_dict_data
|
|
|
from config import myshixuns_save_path, mysubjects_save_path, users_data_path
|
|
|
from config import data_parent_path
|
|
|
from tqdm import tqdm
|
|
|
from utils import is_number
|
|
|
from utils import get_before_date
|
|
|
from bs4 import BeautifulSoup
|
|
|
from config import mysql_database, mysql_passwd
|
|
|
from config import mysql_host, mysql_port, mysql_user
|
|
|
from config import root_path
|
|
|
from config import samples_mode
|
|
|
from sqlalchemy import create_engine
|
|
|
import pymysql
|
|
|
import datetime
|
|
|
from urllib.parse import quote
|
|
|
import re
|
|
|
|
|
|
|
|
|
tqdm.pandas()
|
|
|
|
|
|
|
|
|
def build_myshixuns_train_test_data_baseline():
    """
    Split the student-shixun (practice project) interaction data into
    baseline train/test sets (90% / 10%).
    """
    logger.info("加载物品行为数据")
    behavior_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    logger.info("生成itemcf baseline训练集和测试集数据")
    # Convert the creation time string into a unix timestamp
    behavior_df['created_timestamp'] = behavior_df['created_at'].progress_apply(
        lambda ts: time.mktime(time.strptime(ts, '%Y-%m-%d %H:%M:%S')))

    # 90% of the rows form the training set; the remainder is the test set
    train_part = behavior_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = behavior_df.loc[~behavior_df.index.isin(train_part.index)]

    train_part.to_csv(myshixuns_train_data_baseline, sep='\t', index=False, header=True)
    test_part.to_csv(myshixuns_test_data_baseline, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def build_myshixuns_train_test_data():
    """
    Split the student-shixun (practice project) interaction data into
    model train/test sets (90% / 10%), adding a min-max normalized
    timestamp column used later as an association-rule weight.
    """
    logger.info("加载物品行为数据")
    data_df = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    logger.info("生成模型训练集和测试集数据")
    # Convert the creation time string into a unix timestamp
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))

    # Min-max normalize the timestamp (used as a weight in association rules).
    # Fix: operate on the Series directly instead of assigning a single-column
    # DataFrame back to a column, and guard against a zero range (all rows
    # sharing one timestamp) which would otherwise yield NaN/inf.
    ts = data_df['created_timestamp']
    ts_range = ts.max() - ts.min()
    data_df['created_timestamp'] = (ts - ts.min()) / ts_range if ts_range != 0 else 0.0

    # 90% train / 10% test split
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(myshixuns_train_data, sep='\t', index=False, header=True)
    test_data.to_csv(myshixuns_test_data, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def build_myshixuns_samples_data(sample_nums=10000):
    """
    Build a sampled subset of the student-shixun interaction data.

    Samples up to ``sample_nums`` distinct users and keeps their
    (deduplicated) shixun selections.

    :param sample_nums: maximum number of distinct users to sample
    :return: the sampled DataFrame (also written to csv)
    """
    all_select = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')

    # Distinct user ids
    all_user_ids = all_select.user_id.unique()

    # Sample without replacement; clamp the size so np.random.choice never
    # raises when fewer distinct users exist than sample_nums.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)

    # Keep only the sampled users' selections
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]

    # Deduplicate by (user_id, shixun_id)
    all_select = all_select.drop_duplicates(['user_id', 'shixun_id'])

    all_select.to_csv(myshixuns_save_path + 'myshixuns_samples.csv', sep='\t', index=False, header=True)
    return all_select
|
|
|
|
|
|
|
|
|
def process_shixuns_data():
    """
    Clean the raw shixun (practice project) data: add a creation
    timestamp column, drop rows whose name is only digits/punctuation,
    dump an id->name lookup dict, and save the cleaned dataset.
    """
    data_df = pd.read_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', encoding='utf-8')

    data_df['created_at_ts'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))
    data_df['shixun_id'] = data_df['shixun_id'].astype(int)

    # Drop useless shixun names (pure digits/punctuation)
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]

    # id -> name lookup used at recall time.
    # Fix: use a context manager so the pickle file handle is closed
    # (the original pickle.dump(..., open(...)) leaked it).
    shixun_id_to_name_dict = dict(zip(data_df['shixun_id'], data_df['shixun_name']))
    with open(shixun_id_to_name_dict_data, 'wb') as fwrite:
        pickle.dump(shixun_id_to_name_dict, fwrite)

    data_df.to_csv(shixuns_data_path, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def process_subjects_data():
    """
    Clean the raw subject (practice course) data: drop rows without a
    subject_id, fill missing creation times, add a creation timestamp,
    drop useless names, dump an id->name lookup dict, and save the
    cleaned dataset.
    """
    data_df = pd.read_csv(data_parent_path + 'subjects_raw.csv', sep='\t', encoding='utf-8')

    # subject_id is mandatory
    data_df.dropna(axis=0, subset=['subject_id'], inplace=True)

    # Fix: assign the filled Series back instead of chained
    # fillna(..., inplace=True) on a column (chained-assignment hazard).
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['created_at_ts'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))
    data_df['subject_id'] = data_df['subject_id'].astype(int)

    # -1 marks a missing discipline id
    data_df['disciplines_id'] = data_df['disciplines_id'].fillna(-1).astype(int)

    # Drop useless subject names (pure digits/punctuation)
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['subject_name'].astype(str).str.match(regex_pattern)]

    # id -> name lookup used at recall time.
    # Fix: use a context manager so the pickle file handle is closed
    # (the original pickle.dump(..., open(...)) leaked it).
    subject_id_to_name_dict = dict(zip(data_df['subject_id'], data_df['subject_name']))
    with open(subject_id_to_name_dict_data, 'wb') as fwrite:
        pickle.dump(subject_id_to_name_dict, fwrite)

    data_df.to_csv(subjects_data_path, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def build_mysubjects_train_test_data_baseline():
    """
    Split the student-subject (practice course) interaction data into
    baseline train/test sets (90% / 10%).
    """
    logger.info("生成itemcf baseline训练集和测试集数据")
    behavior_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')

    # Convert the creation time string into a unix timestamp
    behavior_df['created_timestamp'] = behavior_df['created_at'].progress_apply(
        lambda ts: time.mktime(time.strptime(ts, '%Y-%m-%d %H:%M:%S')))

    # 90% of the rows form the training set; the remainder is the test set
    train_part = behavior_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = behavior_df.loc[~behavior_df.index.isin(train_part.index)]

    train_part.to_csv(mysubjects_train_data_baseline, sep='\t', index=False, header=True)
    test_part.to_csv(mysubjects_test_data_baseline, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def build_mysubjects_train_test_data():
    """
    Split the student-subject (practice course) interaction data into
    model train/test sets (90% / 10%), adding a min-max normalized
    timestamp column used later as an association-rule weight.
    """
    logger.info("加载物品行为数据")
    data_df = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')

    logger.info("生成模型训练集和测试集数据")
    # Convert the creation time string into a unix timestamp
    data_df['created_timestamp'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))

    # Min-max normalize the timestamp (used as a weight in association rules).
    # Fix: operate on the Series directly instead of assigning a single-column
    # DataFrame back to a column, and guard against a zero range (all rows
    # sharing one timestamp) which would otherwise yield NaN/inf.
    ts = data_df['created_timestamp']
    ts_range = ts.max() - ts.min()
    data_df['created_timestamp'] = (ts - ts.min()) / ts_range if ts_range != 0 else 0.0

    # 90% train / 10% test split
    train_data = data_df.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_data = data_df[~data_df.index.isin(train_data.index)]

    train_data.to_csv(mysubjects_train_data, sep='\t', index=False, header=True)
    test_data.to_csv(mysubjects_test_data, sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
def build_mysubjects_samples_data(sample_nums=10000):
    """
    Build a sampled subset of the student-subject interaction data.

    Samples up to ``sample_nums`` distinct users and keeps their
    (deduplicated) subject selections.

    :param sample_nums: maximum number of distinct users to sample
    :return: the sampled DataFrame (also written to csv)
    """
    all_select = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')

    # Distinct user ids
    all_user_ids = all_select.user_id.unique()

    # Sample without replacement; clamp the size so np.random.choice never
    # raises when fewer distinct users exist than sample_nums.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)

    # Keep only the sampled users' selections
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]

    # Deduplicate by (user_id, subject_id) — the original comment said
    # shixun_id, but subjects are what is deduplicated here
    all_select = all_select.drop_duplicates(['user_id', 'subject_id'])

    all_select.to_csv(mysubjects_save_path + 'mysubjects_samples.csv', sep='\t', index=False, header=True)
    return all_select
|
|
|
|
|
|
|
|
|
def clean_html(x):
    """
    Strip HTML tags from ``x`` and remove line breaks, tabs and '@'.

    Fix: request the stdlib 'html.parser' backend explicitly — the
    original feature name 'html' only resolves when lxml is installed
    and otherwise raises bs4.FeatureNotFound.
    """
    text = BeautifulSoup(str(x), 'html.parser').get_text().strip()
    # Remove control whitespace and '@' (same order as the original chain)
    for token in ("\r\n", "\r", "\t", "\n", "@"):
        text = text.replace(token, "")
    return text
|
|
|
|
|
|
|
|
|
def get_shixuns_data_from_mysql():
    """
    Download the raw shixun (practice project) data from MySQL and
    save it as a tab-separated csv.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实践项目数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/shixuns.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', index=False, header=True)

    logger.info("实践项目数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
|
|
|
def get_cold_start_shixuns_data_from_mysql():
    """
    Download the cold-start shixun (practice project) data from MySQL,
    filter out useless names, and save it as a tab-separated csv.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实践项目冷启动数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/cold_start_shixuns.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces)

    data_df = pd.read_sql_query(sql_text, engine)

    # Drop useless shixun names (pure digits/punctuation)
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\??\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df.loc[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]

    data_df.to_csv(data_parent_path + 'cold_start_shixuns.csv', sep='\t', index=False, header=True)

    logger.info("实践项目冷启动数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
|
|
|
def get_subjects_data_from_mysql():
    """
    Download the raw subject (practice course) data from MySQL and
    save it as a tab-separated csv.
    """
    start = datetime.datetime.now()

    logger.info("开始获取实践课程数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/subjects.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces)

    data_df = pd.read_sql_query(sql_text, engine)

    data_df.to_csv(data_parent_path + 'subjects_raw.csv', sep='\t', index=False, header=True)

    logger.info("实践课程数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
|
|
|
def get_myshixuns_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """
    Download students' shixun (practice project) selection behavior
    from MySQL and save it as a tab-separated csv.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL
    :param last_created_time: substituted for LAST_CREATED_TIME in the SQL
    :param data_save_path: output csv path
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生选择的实践项目行为数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/myshixuns.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces)

    # Inject the time-window placeholders
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生实践项目行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
|
|
|
def get_mysubjects_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """
    Download students' subject (practice course) selection behavior
    from MySQL and save it as a tab-separated csv.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL
    :param last_created_time: substituted for LAST_CREATED_TIME in the SQL
    :param data_save_path: output csv path
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生选择的实践课程行为数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/mysubjects.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces)

    # Inject the time-window placeholders
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)

    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生实践课程行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
def process_users_data_null(user_info_df):
    """
    Fill missing values in the user info DataFrame (in place) and
    return it.

    Numeric-ish fields default to 0/0.0, gender to -1, and free-text
    fields to the empty string.

    :param user_info_df: user info DataFrame; mutated in place
    :return: the same DataFrame with NaNs filled
    """
    # Column -> fill value.
    # Fix: the original filled 'gid' from the 'gender' column
    # (user_info_df['gender'].fillna(0)), silently clobbering every gid
    # value — 'gid' is now filled from itself.
    fill_values = {
        'edu_background': 0.0,
        'edu_entry_year': 0.0,
        'experience': 0.0,
        'gender': -1,
        'gid': 0,
        'grade': 0.0,
        'identity': 0,
        'location': '',
        'location_city': '',
        'occupation': '',
        'school_id': 0,
        'school_name': '',
        'user_name': '',
        'logins': 0,
    }
    for column, value in fill_values.items():
        user_info_df[column] = user_info_df[column].fillna(value)

    return user_info_df
|
|
|
|
|
|
def get_users_data_from_mysql(last_login_time, data_save_path):
    """
    Download student info from MySQL, clean it, and save it as a
    tab-separated csv.

    :param last_login_time: substituted for LAST_LOGIN_TIME in the SQL
    :param data_save_path: output csv path
    """
    start = datetime.datetime.now()

    logger.info("开始获取学生信息数据...")

    conn_url = ('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    engine = create_engine(conn_url)

    # Load the query text, flattening all line breaks and tabs to spaces
    with open(root_path + '/data_analysis/users.sql', 'r', encoding='utf-8') as fread:
        pieces = [line.replace("\r\n", " ").replace("\r", " ")
                      .replace("\t", " ").replace("\n", " ")
                  for line in fread.readlines()]
    sql_text = ''.join(pieces).replace('LAST_LOGIN_TIME', last_login_time)

    data_df = pd.read_sql_query(sql_text, engine)

    # Keep only rows whose user_id parses as a number
    data_df = data_df.loc[data_df['user_id'].progress_apply(is_number)]

    # NOTE(review): process_users_data_null fills a 'logins' column, but the
    # rename from 'visits' to 'logins' only happens afterwards — confirm the
    # query already returns a 'logins' column, otherwise the fill would raise.
    data_df = process_users_data_null(data_df)

    # Rename the visit counter to match shixun/subject naming so later
    # processing can treat all three datasets uniformly
    data_df.rename(columns={'visits': 'logins'}, inplace=True)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)

    logger.info("学生信息数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str((datetime.datetime.now() - start).seconds) + "秒")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    if samples_mode:  # fix: truthiness test instead of '== True'
        # Sample mode: users who logged in within the last 150 days,
        # with their selections from the last 150 days.
        # (The original comment claimed "last 1 year", which did not
        # match the 150-day window actually used.)
        last_login_time = get_before_date(150)
        last_created_time = get_before_date(150)
    else:
        # Full mode: users who logged in within the last year,
        # with their selections from the last two years.
        last_login_time = get_before_date(365)
        last_created_time = get_before_date(365 * 2)

    # Fetch user data
    get_users_data_from_mysql(last_login_time, users_data_path)

    # Fetch all shixun (practice project) data
    get_shixuns_data_from_mysql()
    get_cold_start_shixuns_data_from_mysql()

    # Fetch all shixun selection behavior data
    get_myshixuns_data_from_mysql(last_login_time, last_created_time, myshixuns_data_path)

    # Process shixun selection behavior data
    process_shixuns_data()
    build_myshixuns_train_test_data_baseline()
    build_myshixuns_train_test_data()

    # Fetch all subject (practice course) data
    get_subjects_data_from_mysql()

    # Fetch all subject selection behavior data
    get_mysubjects_data_from_mysql(last_login_time, last_created_time, mysubjects_data_path)

    # Process subject selection behavior data
    process_subjects_data()
    build_mysubjects_train_test_data_baseline()
    build_mysubjects_train_test_data()