# NOTE: the lines below are web-page chrome captured when this file was scraped
# from a Git hosting UI; kept as comments so the module remains valid Python.
# "You can not select more than 25 topics Topics must start with a letter or
#  number, can include dashes ('-') and can be up to 35 characters long."
# "421 lines | 17 KiB | 5 months ago"
import pandas as pd
import numpy as np
import time
import pickle
from config import myshixuns_data_path, mysubjects_data_path, RANDOM_SEED
from config import myshixuns_train_data, myshixuns_test_data
from config import mysubjects_train_data, mysubjects_test_data
from config import shixuns_data_path, subjects_data_path
from config import myshixuns_train_data_baseline, myshixuns_test_data_baseline
from config import mysubjects_train_data_baseline, mysubjects_test_data_baseline
from config import logger
from config import shixun_id_to_name_dict_data, subject_id_to_name_dict_data
from config import myshixuns_save_path, mysubjects_save_path, users_data_path
from config import data_parent_path
from tqdm import tqdm
from utils import is_number
from utils import get_before_date
from bs4 import BeautifulSoup
from config import mysql_database, mysql_passwd
from config import mysql_host, mysql_port, mysql_user
from config import root_path
from config import samples_mode
from sqlalchemy import create_engine
import pymysql
import datetime
from urllib.parse import quote
import re
tqdm.pandas()
def build_myshixuns_train_test_data_baseline():
    """Split the student practice-project (shixun) behavior log into
    baseline train/test sets with a reproducible 90%/10% random split."""
    logger.info("加载物品行为数据")
    behavior = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
    logger.info("生成itemcf baseline训练集和测试集数据")
    # Convert the textual created_at into a POSIX timestamp column.
    parse_ts = lambda s: time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S'))
    behavior['created_timestamp'] = behavior['created_at'].progress_apply(parse_ts)
    # 90% random sample for training; everything else is the test set.
    train_part = behavior.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = behavior.loc[~behavior.index.isin(train_part.index)]
    train_part.to_csv(myshixuns_train_data_baseline, sep='\t', index=False, header=True)
    test_part.to_csv(myshixuns_test_data_baseline, sep='\t', index=False, header=True)
def build_myshixuns_train_test_data():
    """Split student practice-project (shixun) interactions into model
    train/test sets (90%/10%), with the timestamp min-max normalized so it
    can serve as a recency weight during association-rule mining."""
    logger.info("加载物品行为数据")
    interactions = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
    logger.info("生成模型训练集和测试集数据")
    parse_ts = lambda s: time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S'))
    interactions['created_timestamp'] = interactions['created_at'].progress_apply(parse_ts)
    # Min-max normalize the timestamp column into [0, 1].
    scale = lambda col: (col - np.min(col)) / (np.max(col) - np.min(col))
    interactions['created_timestamp'] = interactions[['created_timestamp']].progress_apply(scale)
    # Reproducible 90/10 split keyed on RANDOM_SEED.
    train_part = interactions.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = interactions.loc[~interactions.index.isin(train_part.index)]
    train_part.to_csv(myshixuns_train_data, sep='\t', index=False, header=True)
    test_part.to_csv(myshixuns_test_data, sep='\t', index=False, header=True)
def build_myshixuns_samples_data(sample_nums=10000):
    """Build a down-sampled shixun interaction dataset for quick experiments.

    Samples up to `sample_nums` distinct users (without replacement), keeps
    only their interactions, and de-duplicates per (user_id, shixun_id).

    Fix vs. original: `np.random.choice(..., replace=False)` raises
    ValueError when `sample_nums` exceeds the number of distinct users, so
    the sample size is clamped to the population size.

    :param sample_nums: maximum number of distinct users to keep
    :return: the sampled, de-duplicated DataFrame (also written to disk)
    """
    all_select = pd.read_csv(myshixuns_data_path, sep='\t', encoding='utf-8')
    # Distinct user ids form the sampling population.
    all_user_ids = all_select['user_id'].unique()
    # Clamp so sampling without replacement cannot exceed the population.
    size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=size, replace=False)
    # Keep only interactions belonging to the sampled users.
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
    # One row per (user, shixun) pair.
    all_select = all_select.drop_duplicates(['user_id', 'shixun_id'])
    all_select.to_csv(myshixuns_save_path + 'myshixuns_samples.csv', sep='\t', index=False, header=True)
    return all_select
def process_shixuns_data():
    """Clean the raw shixun (practice project) dataset and persist it.

    - Parses `created_at` into a POSIX timestamp column `created_at_ts`.
    - Drops rows whose name consists only of digits/punctuation.
    - Dumps an id -> name dict used by the recall stage.

    Fix vs. original: the pickle file is opened via a context manager so the
    handle is always closed (the original leaked an open file object).
    """
    data_df = pd.read_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', encoding='utf-8')
    data_df['created_at_ts'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))
    data_df['shixun_id'] = data_df['shixun_id'].astype(int)
    # Drop useless names made up solely of digits/punctuation/symbols.
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]
    # Mapping used at recall time to resolve shixun ids into display names.
    shixun_id_to_name_dict = dict(zip(data_df['shixun_id'], data_df['shixun_name']))
    with open(shixun_id_to_name_dict_data, 'wb') as fwrite:
        pickle.dump(shixun_id_to_name_dict, fwrite)
    data_df.to_csv(shixuns_data_path, sep='\t', index=False, header=True)
def process_subjects_data():
    """Clean the raw subject (course) dataset and persist it.

    - Drops rows lacking `subject_id`; backfills missing `created_at` with a
      fixed sentinel date.
    - Parses `created_at` into a POSIX timestamp column `created_at_ts`.
    - Drops rows whose name consists only of digits/punctuation.
    - Dumps a subject_id -> subject_name dict used by the recall stage.

    Fix vs. original: the pickle file is opened via a context manager so the
    handle is always closed (the original leaked an open file object).
    """
    data_df = pd.read_csv(data_parent_path + 'subjects_raw.csv', sep='\t', encoding='utf-8')
    data_df.dropna(axis=0, subset=['subject_id'], inplace=True)
    # Missing creation times get a fixed early sentinel date.
    data_df['created_at'] = data_df['created_at'].fillna('2017-01-01 00:00:00')
    data_df['created_at_ts'] = data_df['created_at'].progress_apply(
        lambda x: time.mktime(time.strptime(x, '%Y-%m-%d %H:%M:%S')))
    data_df['subject_id'] = data_df['subject_id'].astype(int)
    # -1 marks an unknown discipline.
    data_df['disciplines_id'] = data_df['disciplines_id'].fillna(-1)
    data_df['disciplines_id'] = data_df['disciplines_id'].astype(int)
    # Drop useless names made up solely of digits/punctuation/symbols.
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['subject_name'].astype(str).str.match(regex_pattern)]
    # Mapping used at recall time to resolve subject ids into display names.
    subject_id_to_name_dict = dict(zip(data_df['subject_id'], data_df['subject_name']))
    with open(subject_id_to_name_dict_data, 'wb') as fwrite:
        pickle.dump(subject_id_to_name_dict, fwrite)
    data_df.to_csv(subjects_data_path, sep='\t', index=False, header=True)
def build_mysubjects_train_test_data_baseline():
    """Split the student course (subject) behavior log into baseline
    train/test sets with a reproducible 90%/10% random split."""
    logger.info("生成itemcf baseline训练集和测试集数据")
    records = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
    # Parse created_at into a POSIX timestamp column.
    records['created_timestamp'] = records['created_at'].progress_apply(
        lambda ts: time.mktime(time.strptime(ts, '%Y-%m-%d %H:%M:%S')))
    # 90% for training, remainder for testing; seeded for reproducibility.
    train_part = records.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = records.loc[~records.index.isin(train_part.index)]
    train_part.to_csv(mysubjects_train_data_baseline, sep='\t', index=False, header=True)
    test_part.to_csv(mysubjects_test_data_baseline, sep='\t', index=False, header=True)
def build_mysubjects_train_test_data():
    """Split student course (subject) interactions into model train/test
    sets (90%/10%), min-max normalizing the timestamp so it can serve as a
    recency weight during association-rule mining."""
    logger.info("加载物品行为数据")
    records = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
    logger.info("生成模型训练集和测试集数据")
    parse_ts = lambda s: time.mktime(time.strptime(s, '%Y-%m-%d %H:%M:%S'))
    records['created_timestamp'] = records['created_at'].progress_apply(parse_ts)
    # Min-max normalize the timestamp column into [0, 1].
    scale = lambda col: (col - np.min(col)) / (np.max(col) - np.min(col))
    records['created_timestamp'] = records[['created_timestamp']].progress_apply(scale)
    # Reproducible 90/10 split keyed on RANDOM_SEED.
    train_part = records.sample(frac=0.9, random_state=RANDOM_SEED, axis=0)
    test_part = records.loc[~records.index.isin(train_part.index)]
    train_part.to_csv(mysubjects_train_data, sep='\t', index=False, header=True)
    test_part.to_csv(mysubjects_test_data, sep='\t', index=False, header=True)
def build_mysubjects_samples_data(sample_nums=10000):
    """Build a down-sampled subject interaction dataset for quick experiments.

    Samples up to `sample_nums` distinct users (without replacement), keeps
    only their interactions, and de-duplicates per (user_id, subject_id).

    Fix vs. original: `np.random.choice(..., replace=False)` raises
    ValueError when `sample_nums` exceeds the number of distinct users, so
    the sample size is clamped to the population size.

    :param sample_nums: maximum number of distinct users to keep
    :return: the sampled, de-duplicated DataFrame (also written to disk)
    """
    all_select = pd.read_csv(mysubjects_data_path, sep='\t', encoding='utf-8')
    # Distinct user ids form the sampling population.
    all_user_ids = all_select['user_id'].unique()
    # Clamp so sampling without replacement cannot exceed the population.
    size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=size, replace=False)
    # Keep only interactions belonging to the sampled users.
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
    # One row per (user, subject) pair.
    all_select = all_select.drop_duplicates(['user_id', 'subject_id'])
    all_select.to_csv(mysubjects_save_path + 'mysubjects_samples.csv', sep='\t', index=False, header=True)
    return all_select
def clean_html(x):
    """Strip HTML markup from `x` and remove characters that would corrupt
    the tab-separated output files.

    Fix vs. original: the parser is pinned to the stdlib 'html.parser'
    backend. The original passed the generic feature 'html', which resolves
    to whichever capable parser happens to be installed (e.g. lxml), making
    the output environment-dependent.

    :param x: any value; coerced to str before parsing
    :return: plain text with CR/LF/TAB and '@' removed
    """
    text = BeautifulSoup(str(x), 'html.parser').get_text().strip()
    # Drop line breaks, tabs and '@' so the TSV files stay well-formed.
    for noise in ("\r\n", "\r", "\t", "\n", "@"):
        text = text.replace(noise, "")
    return text
def get_shixuns_data_from_mysql():
    """Download the raw practice-project (shixun) rows from MySQL and save
    them as a tab-separated file under the data directory."""
    start = datetime.datetime.now()
    logger.info("开始获取实践项目数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/shixuns.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_parent_path + 'shixuns_raw.csv', sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("实践项目数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
def get_cold_start_shixuns_data_from_mysql():
    """Download cold-start practice-project rows from MySQL, drop rows whose
    name is only digits/punctuation, and save the result as TSV."""
    start = datetime.datetime.now()
    logger.info("开始获取实践项目冷启动数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/cold_start_shixuns.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    data_df = pd.read_sql_query(sql_text, engine)
    # Filter out useless names made up solely of digits/punctuation/symbols.
    regex_pattern = '^[0-9"\'\+\-\/。!;:,、,.\!\@\?\[\\\*\&\(\\\\#¥\%\^\_=%`]+$'
    data_df = data_df[~data_df['shixun_name'].astype(str).str.match(regex_pattern)]
    data_df.to_csv(data_parent_path + 'cold_start_shixuns.csv', sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("实践项目冷启动数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
def get_subjects_data_from_mysql():
    """Download the raw course (subject) rows from MySQL and save them as a
    tab-separated file under the data directory."""
    start = datetime.datetime.now()
    logger.info("开始获取实践课程数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/subjects.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_parent_path + 'subjects_raw.csv', sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("实践课程数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
def get_myshixuns_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """Fetch student shixun-selection behavior from MySQL and save it as TSV.

    :param last_login_time: value substituted for the LAST_LOGIN_TIME
        placeholder in the SQL template
    :param last_created_time: value substituted for the LAST_CREATED_TIME
        placeholder in the SQL template
    :param data_save_path: destination path for the TSV output
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生选择的实践项目行为数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/myshixuns.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    # Inject the time-window parameters into the query template.
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)
    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("学生实践项目行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
def get_mysubjects_data_from_mysql(last_login_time, last_created_time, data_save_path):
    """Fetch student subject-selection behavior from MySQL and save it as TSV.

    :param last_login_time: value substituted for the LAST_LOGIN_TIME
        placeholder in the SQL template
    :param last_created_time: value substituted for the LAST_CREATED_TIME
        placeholder in the SQL template
    :param data_save_path: destination path for the TSV output
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生选择的实践课程行为数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/mysubjects.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    # Inject the time-window parameters into the query template.
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    sql_text = sql_text.replace('LAST_CREATED_TIME', last_created_time)
    data_df = pd.read_sql_query(sql_text, engine)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("学生实践课程行为数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
def process_users_data_null(user_info_df):
    """Fill missing values in the raw user-profile DataFrame.

    Numeric-ish columns default to 0 / 0.0, `gender` to -1 (unknown), and
    free-text columns to the empty string.

    Fix vs. original: `gid` was mistakenly filled from the `gender` column
    (`user_info_df['gid'] = user_info_df['gender'].fillna(0)`, a copy-paste
    error that overwrote every gid with gender values); it is now filled
    from its own column.

    :param user_info_df: DataFrame containing the columns referenced below
    :return: the same DataFrame with NaNs replaced
    """
    # Numeric columns that default to 0.0.
    for col in ('edu_background', 'edu_entry_year', 'experience', 'grade'):
        user_info_df[col] = user_info_df[col].fillna(0.0)
    # -1 marks an unspecified gender.
    user_info_df['gender'] = user_info_df['gender'].fillna(-1)
    # BUGFIX: fill gid from its own column, not from gender.
    user_info_df['gid'] = user_info_df['gid'].fillna(0)
    # Numeric columns that default to 0.
    for col in ('identity', 'school_id', 'logins'):
        user_info_df[col] = user_info_df[col].fillna(0)
    # Free-text columns default to the empty string.
    for col in ('location', 'location_city', 'occupation', 'school_name', 'user_name'):
        user_info_df[col] = user_info_df[col].fillna('')
    return user_info_df
def get_users_data_from_mysql(last_login_time, data_save_path):
    """Fetch student profile rows from MySQL, clean them, and save as TSV.

    :param last_login_time: value substituted for the LAST_LOGIN_TIME
        placeholder in the SQL template
    :param data_save_path: destination path for the TSV output
    """
    start = datetime.datetime.now()
    logger.info("开始获取学生信息数据...")
    engine = create_engine('mysql+pymysql://' + str(mysql_user) + ':' + quote(mysql_passwd) + '@'
                           + str(mysql_host) + ':' + str(mysql_port) + '/' + str(mysql_database))
    # Read the SQL template and collapse it onto a single line.
    with open(root_path + '/data_analysis/users.sql', 'r', encoding='utf-8') as fread:
        sql_text = fread.read()
    for ws in ("\r\n", "\r", "\t", "\n"):
        sql_text = sql_text.replace(ws, " ")
    sql_text = sql_text.replace('LAST_LOGIN_TIME', last_login_time)
    data_df = pd.read_sql_query(sql_text, engine)
    # Keep only rows with a numeric user_id.
    data_df = data_df.loc[data_df['user_id'].progress_apply(lambda x: is_number(x))]
    # NOTE(review): process_users_data_null fills a 'logins' column, yet the
    # visits -> logins rename only happens afterwards; this works only if
    # users.sql already exposes a 'logins' column — confirm against the SQL.
    data_df = process_users_data_null(data_df)
    # Rename to align with the naming used by the shixun/subject tables.
    data_df.rename(columns={'visits': 'logins'}, inplace=True)
    data_df.to_csv(data_save_path, sep='\t', index=False, header=True)
    cost = (datetime.datetime.now() - start).seconds
    logger.info("学生信息数据下载完毕,共" + str(data_df.shape[0]) + "条数据,总耗时" + str(cost))
if __name__ == '__main__':
    # Fixes vs. original: idiomatic truthiness test instead of `== True`,
    # and comments corrected to match the actual time windows (the old
    # comments claimed 1 year in the sampled branch but used 150 days).
    if samples_mode:
        # Sampled mode: users who logged in within the last 150 days, with
        # their selection records from the last 150 days.
        last_login_time = get_before_date(150)
        last_created_time = get_before_date(150)
    else:
        # Full mode: users who logged in within the last year, with their
        # selection records from the last two years.
        last_login_time = get_before_date(365)
        last_created_time = get_before_date(365 * 2)
    # User profiles.
    get_users_data_from_mysql(last_login_time, users_data_path)
    # Practice-project (shixun) catalog data, including cold-start items.
    get_shixuns_data_from_mysql()
    get_cold_start_shixuns_data_from_mysql()
    # Shixun selection behavior.
    get_myshixuns_data_from_mysql(last_login_time, last_created_time, myshixuns_data_path)
    # Clean shixun data, then build the train/test splits.
    process_shixuns_data()
    build_myshixuns_train_test_data_baseline()
    build_myshixuns_train_test_data()
    # Course (subject) catalog data.
    get_subjects_data_from_mysql()
    # Subject selection behavior.
    get_mysubjects_data_from_mysql(last_login_time, last_created_time, mysubjects_data_path)
    # Clean subject data, then build the train/test splits.
    process_subjects_data()
    build_mysubjects_train_test_data_baseline()
    build_mysubjects_train_test_data()