|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
import warnings
|
|
|
|
|
import pickle
|
|
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
from config import mysubjects_train_data, mysubjects_test_data
|
|
|
|
|
from config import subjects_data_path, subjects_embed_path, subject_save_path
|
|
|
|
|
from config import subject_user_item_time_dict_data
|
|
|
|
|
from config import subject_item_user_time_dict, data_parent_path
|
|
|
|
|
from config import subjects_emb_dict
|
|
|
|
|
from config import users_data_path
|
|
|
|
|
from utils import reduce_mem
|
|
|
|
|
from config import logger
|
|
|
|
|
from joblib import Parallel, delayed
|
|
|
|
|
from data_process import process_users_data_null
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Register tqdm's `progress_apply` on pandas objects; functions in this module call it.
tqdm.pandas()

# Suppress every warning category for the whole process.
warnings.filterwarnings(action='ignore')
|
|
|
|
|
|
|
|
|
|
def get_all_select_sample(sample_nums=10000):
    """
    Carve a small debugging subset out of the training selections.

    A random set of distinct users is drawn and all of their selections are
    kept, deduplicated on (user_id, subject_id). The subset is cached as
    ``mysubjects_train_sample.csv`` under ``data_parent_path``; when that
    file already exists it is loaded and returned, and ``sample_nums`` is
    ignored.

    :param sample_nums: number of distinct users to sample; capped at the
        number of distinct users present in the training data.
    :return: DataFrame holding the sampled users' selections.
    """
    sample_path = data_parent_path + 'mysubjects_train_sample.csv'

    # Reuse a previously generated sample when one is on disk.
    if os.path.exists(sample_path):
        all_select = pd.read_csv(sample_path, sep='\t', encoding='utf-8')
        reduce_mem(all_select)  # return value is not used here
        return all_select

    all_select = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')

    # Distinct users available for sampling.
    all_user_ids = all_select.user_id.unique()

    # Sampling without replacement raises ValueError when more users are
    # requested than exist, so cap the request at the population size.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)

    # Keep only the selections made by the sampled users.
    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]

    # One row per (user, subject) pair.
    all_select = all_select.drop_duplicates(['user_id', 'subject_id'])

    all_select.to_csv(sample_path, sep='\t', index=False, header=True)
    reduce_mem(all_select)  # return value is not used here
    return all_select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_all_select_df(offline = False):
    """
    Load the selection (user -> subject) records.

    Offline mode is meant for validating models/features and uses only the
    training file. Online mode (the default) additionally stacks the test
    file underneath the training rows so prediction sees both.

    :param offline: when True, read only ``mysubjects_train_data``.
    :return: DataFrame deduplicated on (user_id, subject_id).
    """
    all_select = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')

    if not offline:
        test_mysubjects = pd.read_csv(mysubjects_test_data, sep='\t', encoding='utf-8')
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent (original row labels are kept, as append did).
        all_select = pd.concat([all_select, test_mysubjects])

    all_select = all_select.drop_duplicates(['user_id', 'subject_id'])
    reduce_mem(all_select)  # return value is not used here
    return all_select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_info_df():
    """
    Load the user attribute table.

    The tab-separated file at ``users_data_path`` is read with quoting
    disabled, passed through ``process_users_data_null`` and then handed to
    ``reduce_mem``.

    :return: the DataFrame returned by ``process_users_data_null``.
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    raw_users = pd.read_csv(users_data_path, sep='\t', encoding='utf-8', quoting=3)
    user_info_df = process_users_data_null(raw_users)
    reduce_mem(user_info_df)
    return user_info_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_info_df():
    """
    Load the subject (course) attribute table.

    Reads the tab-separated file at ``subjects_data_path`` and hands the
    frame to ``reduce_mem`` before returning it.

    :return: DataFrame of subject attributes.
    """
    subjects = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')
    reduce_mem(subjects)
    return subjects
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_recall_item_info_df(item_info_df, subject_id_list):
    """
    Restrict the subject attribute table to a given set of subject ids.

    :param item_info_df: DataFrame with a ``subject_id`` column.
    :param subject_id_list: ids of the subjects to keep.
    :return: the rows of ``item_info_df`` whose ``subject_id`` is listed,
        in their original order and with their original index labels.
    """
    wanted = item_info_df['subject_id'].isin(subject_id_list)
    return item_info_df[wanted]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_select_item_info(all_select_df, user_id):
    """
    Return the selection records of a single user.

    :param all_select_df: DataFrame with a ``user_id`` column.
    :param user_id: the user whose rows are wanted.
    :return: the matching rows with a fresh 0..n-1 index; the previous row
        labels are preserved in an added ``index`` column.
    """
    belongs_to_user = all_select_df['user_id'] == user_id
    # Boolean-mask row selection through .loc, all columns kept.
    user_rows = all_select_df.loc[belongs_to_user]
    return user_rows.reset_index()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_emb_dict():
    """
    Build (or load) the subject embedding lookup.

    When the pickle at ``subjects_emb_dict`` exists it is loaded and
    returned. Otherwise the embedding table at ``subjects_embed_path`` is
    read, every column whose name contains ``'emb'`` is taken as one
    embedding dimension, each row is scaled to unit L2 norm, and the
    resulting ``{subject_id: vector}`` mapping is pickled for later calls.

    :return: dict mapping subject_id to a 1-D numpy array.
    """
    # Load the previously saved embeddings; `with` closes the file handle.
    if os.path.exists(subjects_emb_dict):
        with open(subjects_emb_dict, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Generate the embeddings from the raw table.
    item_emb_df = pd.read_csv(subjects_embed_path, sep='\t', encoding='utf-8')

    item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]
    item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])

    # Row-wise L2 normalisation.
    # NOTE(review): an all-zero row divides by zero and yields NaN - confirm
    # the input never contains one.
    item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)

    item_emb_dict = dict(zip(item_emb_df['subject_id'], item_emb_np))

    with open(subjects_emb_dict, 'wb') as cache_file:
        pickle.dump(item_emb_dict, cache_file)

    return item_emb_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_item_time_dict(select_df):
    """
    Build each user's time-ordered sequence of selected subjects.

    Result shape: ``{user1: [(item1, time1), (item2, time2), ...], ...}``.

    When the pickle at ``subject_user_item_time_dict_data`` exists it is
    loaded and returned, and ``select_df`` is not consulted. Otherwise the
    mapping is computed from ``select_df`` and pickled to that path.

    :param select_df: DataFrame with ``user_id``, ``subject_id`` and
        ``created_timestamp`` columns.
    :return: dict mapping user_id to a list of (subject_id, timestamp) pairs
        in ascending timestamp order.
    """
    def make_item_time_pair(df):
        """Pair every selected subject with its selection time."""
        return list(zip(df['subject_id'], df['created_timestamp']))

    # Load the previously pickled result; `with` closes the file handle.
    if os.path.exists(subject_user_item_time_dict_data):
        with open(subject_user_item_time_dict_data, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Order by selection time so each user's list comes out chronological.
    select_df = select_df.sort_values('created_timestamp')

    # Group by user. The column subset must be a list: selecting several
    # columns with a bare tuple is rejected by current pandas.
    user_item_time_df = (
        select_df.groupby('user_id')[['subject_id', 'created_timestamp']]
        .progress_apply(make_item_time_pair)
        .reset_index()
        .rename(columns={0: 'item_time_list'})
    )

    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))

    with open(subject_user_item_time_dict_data, 'wb') as cache_file:
        pickle.dump(user_item_time_dict, cache_file)

    return user_item_time_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_user_time_dict(select_df):
    """
    Build each subject's time-ordered sequence of selecting users.

    Result shape: ``{item1: [(user1, time1), (user2, time2), ...], ...}``.

    When the pickle at ``subject_item_user_time_dict`` exists it is loaded
    and returned, and ``select_df`` is not consulted. Otherwise the mapping
    is computed from ``select_df`` and pickled to that path.

    :param select_df: DataFrame with ``user_id``, ``subject_id`` and
        ``created_timestamp`` columns.
    :return: dict mapping subject_id to a list of (user_id, timestamp) pairs
        in ascending timestamp order.
    """
    def make_user_time_pair(df):
        """Pair every selecting user with the selection time."""
        return list(zip(df['user_id'], df['created_timestamp']))

    # Load the previously pickled result; `with` closes the file handle.
    if os.path.exists(subject_item_user_time_dict):
        with open(subject_item_user_time_dict, 'rb') as cache_file:
            return pickle.load(cache_file)

    select_df = select_df.sort_values('created_timestamp')

    # The column subset must be a list: selecting several columns with a
    # bare tuple is rejected by current pandas.
    item_user_time_df = (
        select_df.groupby('subject_id')[['user_id', 'created_timestamp']]
        .progress_apply(make_user_time_pair)
        .reset_index()
        .rename(columns={0: 'user_time_list'})
    )

    item_user_time_dict = dict(zip(item_user_time_df['subject_id'], item_user_time_df['user_time_list']))

    with open(subject_item_user_time_dict, 'wb') as cache_file:
        pickle.dump(item_user_time_dict, cache_file)

    return item_user_time_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_hist_and_last_select(all_select):
    """
    Split the selection records into per-user history and last selection.

    Results are cached as ``select_last_df.csv`` / ``select_hist_df.csv``
    under ``subject_save_path``; when both files exist they are loaded and
    ``all_select`` is not consulted.

    :param all_select: DataFrame with ``user_id`` and ``created_timestamp``
        columns.
    :return: ``(select_hist_df, select_last_df)``.
    """
    last_csv = subject_save_path + 'select_last_df.csv'
    hist_csv = subject_save_path + 'select_hist_df.csv'

    def drop_final_row(user_df):
        # A user with a single selection would have an empty history and
        # become invisible at training time, so that one row is kept
        # (a deliberate leak).
        return user_df if len(user_df) == 1 else user_df[:-1]

    def run_per_group(groups, worker):
        # Fan the per-user work out over 20 joblib workers, then stitch
        # the pieces back together.
        pieces = Parallel(n_jobs=20)(delayed(worker)(frame) for _, frame in groups)
        return pd.concat(pieces)

    if not (os.path.exists(last_csv) and os.path.exists(hist_csv)):
        ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
        per_user = ordered.groupby('user_id')

        # Latest row of every user.
        select_last_df = per_user.tail(1)

        # Everything but the latest row, computed in parallel for speed.
        select_hist_df = run_per_group(per_user, drop_final_row)
        select_hist_df = select_hist_df.reset_index(drop=True)

        select_last_df.to_csv(last_csv, sep='\t', index=False, header=True)
        select_hist_df.to_csv(hist_csv, sep='\t', index=False, header=True)
    else:
        select_last_df = pd.read_csv(last_csv, sep='\t', encoding='utf-8')
        select_hist_df = pd.read_csv(hist_csv, sep='\t', encoding='utf-8')

    reduce_mem(select_last_df)
    reduce_mem(select_hist_df)
    return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
def get_all_hist_and_last_select(all_select):
    """
    Split the selection records into per-user history and last selection.

    Results are cached as ``select_all_last_df.csv`` /
    ``select_all_hist_df.csv`` under ``subject_save_path``; when both files
    exist they are loaded and ``all_select`` is not consulted.

    :param all_select: DataFrame with ``user_id`` and ``created_timestamp``
        columns.
    :return: ``(select_hist_df, select_last_df)``.
    """
    last_csv = subject_save_path + 'select_all_last_df.csv'
    hist_csv = subject_save_path + 'select_all_hist_df.csv'
    cached = os.path.exists(last_csv) and os.path.exists(hist_csv)

    def history_of(user_df):
        # Users with one selection keep it; otherwise their history would
        # be empty and they would be invisible at training time.
        if len(user_df) != 1:
            return user_df[:-1]
        return user_df

    def parallel_apply(groups, worker):
        # One joblib task per user group, 20 workers.
        jobs = (delayed(worker)(frame) for _, frame in groups)
        return pd.concat(Parallel(n_jobs=20)(jobs))

    if cached:
        select_last_df = pd.read_csv(last_csv, sep='\t', encoding='utf-8')
        select_hist_df = pd.read_csv(hist_csv, sep='\t', encoding='utf-8')
    else:
        ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
        by_user = ordered.groupby('user_id')

        select_last_df = by_user.tail(1)

        # Parallelised to speed up the per-user slicing.
        select_hist_df = parallel_apply(by_user, history_of).reset_index(drop=True)

        select_last_df.to_csv(last_csv, sep='\t', index=False, header=True)
        select_hist_df.to_csv(hist_csv, sep='\t', index=False, header=True)

    reduce_mem(select_last_df)
    reduce_mem(select_hist_df)
    return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
def get_rank_hist_and_last_select(all_select):
    """
    Split selection records into per-user history and last selection for
    the ranking stage (no on-disk caching).

    :param all_select: DataFrame with ``user_id`` and ``created_timestamp``
        columns.
    :return: ``(select_hist_df, select_last_df)`` where the last frame holds
        each user's latest row and the history frame holds the rest.
    """
    ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
    per_user = ordered.groupby('user_id')

    def history_rows(user_df):
        # A user with a single selection keeps that row; an empty history
        # would hide the user at training time, so the leak is accepted.
        if len(user_df) != 1:
            return user_df[:-1]
        return user_df

    select_hist_df = per_user.apply(history_rows).reset_index(drop=True)
    select_last_df = per_user.tail(1)

    return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_recall_item_info_dict(item_info_df):
    """
    Build (or load) a ``{subject_id: {attribute: value}}`` lookup of subject
    attributes for direct use by the recall and cold-start stages.

    When ``subjects_info_recall_dict.pkl`` exists under
    ``subject_save_path`` it is loaded and returned and ``item_info_df`` is
    left untouched. Otherwise ``item_info_df`` is cleaned IN PLACE (missing
    counts become 0 and are cast to int, ``averge_star`` to float,
    ``created_at_ts`` is min-max scaled), the dictionary is built and
    pickled.

    :param item_info_df: subject attribute DataFrame; modified in place on
        a cache miss.
    :return: dict keyed by subject_id. Note the visit count is stored under
        the (misspelled) key ``'visists'``, kept as-is for existing readers.
    """
    cache_path = subject_save_path + 'subjects_info_recall_dict.pkl'
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    count_cols = [
        'visits',
        'study_count',
        'course_study_count',
        'passed_count',
        'course_used_count',
        'school_used_count',
        'challenge_count',
        'evaluate_count',
        'video_study_time',
        'study_pdf_attachment_count',
    ]
    # Each count column is filled and cast from ITSELF. The earlier code
    # cast 'visits' into 'study_count', silently overwriting the latter.
    for col in count_cols:
        item_info_df[col] = item_info_df[col].fillna(0).astype(int)

    item_info_df['averge_star'] = item_info_df['averge_star'].fillna(0.0).astype(float)

    # Min-max normalise the creation time.
    # NOTE(review): yields NaN when every created_at_ts is identical.
    created = item_info_df['created_at_ts']
    item_info_df['created_at_ts'] = (created - created.min()) / (created.max() - created.min())

    # (key in the output dict, source column), in output order.
    field_columns = [
        ('subject_name', 'subject_name'),                              # subject name
        ('visists', 'visits'),                                         # visit count
        ('created_at_ts', 'created_at_ts'),                            # creation time
        ('averge_star', 'averge_star'),                                # rating stars
        ('study_count', 'study_count'),                                # number of learners
        ('course_study_count', 'course_study_count'),                  # classroom learners
        ('passed_count', 'passed_count'),                              # learners who passed
        ('course_used_count', 'course_used_count'),                    # classrooms using it
        ('school_used_count', 'school_used_count'),                    # schools using it
        ('challenge_count', 'challenge_count'),                        # number of challenges
        ('evaluate_count', 'evaluate_count'),                          # challenge evaluations
        ('video_study_time', 'video_study_time'),                      # video study time
        ('study_pdf_attachment_count', 'study_pdf_attachment_count'),  # PDF attachments
    ]
    out_keys = [key for key, _ in field_columns]
    columns = [item_info_df[col] for _, col in field_columns]

    item_info_dict = {}
    for subject_id, *values in tqdm(zip(item_info_df['subject_id'], *columns)):
        # For a repeated subject_id the later row overwrites the earlier one.
        item_info_dict.setdefault(subject_id, {}).update(zip(out_keys, values))

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(item_info_dict, cache_file)
    return item_info_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_hist_item_info_dict(all_select):
    """
    Build (or load) per-user statistics over the subjects each user selected.

    When ``user_hist_item_info_dict.pkl`` exists under ``subject_save_path``
    it is loaded and returned and ``all_select`` is left untouched.
    Otherwise missing values in the numeric attribute columns of
    ``all_select`` are filled with 0 IN PLACE, the statistics are computed
    and the result is pickled.

    Each user's entry contains:
      * ``hist_item_ids``: set of selected subject ids
      * ``hist_item_averge_star``: set of the selected subjects' star ratings
      * ``hist_item_<col>``: mean of ``<col>`` over the user's selections for
        visits, study_count, course_study_count, passed_count,
        course_used_count, school_used_count, challenge_count,
        evaluate_count, video_study_time, study_pdf_attachment_count
      * ``last_item_created_time``: the largest ``created_at_ts`` among the
        user's selected subjects, min-max scaled across users

    :param all_select: selections joined with subject attributes; modified
        in place on a cache miss.
    :return: dict keyed by user_id.
    """
    logger.info("获取用户历史选择的课程信息字典")

    cache_path = subject_save_path + 'user_hist_item_info_dict.pkl'
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    mean_cols = [
        'visits',
        'study_count',
        'course_study_count',
        'passed_count',
        'course_used_count',
        'school_used_count',
        'challenge_count',
        'evaluate_count',
        'video_study_time',
        'study_pdf_attachment_count',
    ]
    for col in mean_cols:
        all_select[col] = all_select[col].fillna(0)
    all_select['averge_star'] = all_select['averge_star'].fillna(0.0)

    # One groupby serves every aggregate, so all results share the same
    # user_id index in the same order.
    per_user = all_select.groupby('user_id')
    item_id_sets = per_user['subject_id'].agg(set)
    star_sets = per_user['averge_star'].agg(set)
    mean_table = per_user[mean_cols].mean()

    # Newest subject creation time per user, then min-max scaled.
    # NOTE(review): scaling yields NaN when all users share the same value.
    newest_created = per_user['created_at_ts'].max()
    newest_created = (newest_created - newest_created.min()) / \
        (newest_created.max() - newest_created.min())

    user_hist_item_info_dict = {}
    rows = zip(
        item_id_sets.index,
        item_id_sets,
        star_sets,
        mean_table.itertuples(index=False, name=None),
        newest_created,
    )
    for user_id, item_ids, stars, mean_values, last_created in tqdm(rows):
        # Key order follows the historical layout of this dictionary.
        info = {
            'hist_item_ids': item_ids,
            'hist_item_visits': mean_values[0],
            'hist_item_averge_star': stars,
        }
        for col, value in zip(mean_cols[1:], mean_values[1:]):
            info['hist_item_' + col] = value
        info['last_item_created_time'] = last_created
        user_hist_item_info_dict[user_id] = info

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(user_hist_item_info_dict, cache_file)
    return user_hist_item_info_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_topk_select(select_df, k):
    """
    Return the ids of the most frequently selected subjects, used to pad
    recall results.

    :param select_df: DataFrame with a ``subject_id`` column.
    :param k: how many ids to return.
    :return: pandas Index with the first ``k`` subject ids, most frequent
        first.
    """
    # value_counts orders its index by descending frequency.
    popularity = select_df['subject_id'].value_counts()
    ranked_ids = popularity.index
    return ranked_ids[:k]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_activate_degree_dict(all_select_df):
    """
    Build (or load) a per-user activity score based on selection counts.

    When ``user_activate_degree_dict.pkl`` exists under
    ``subject_save_path`` it is loaded and returned and ``all_select_df`` is
    not consulted. Otherwise each user's number of selections is counted,
    min-max scaled with ``MinMaxScaler`` and the mapping is pickled.

    :param all_select_df: DataFrame with ``user_id`` and ``subject_id``
        columns.
    :return: dict mapping user_id to the scaled selection count.
    """
    cache_path = subject_save_path + 'user_activate_degree_dict.pkl'

    # Load a previously saved activity dictionary; `with` closes the handle.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Count selections per user (non-null subject_id values).
    all_select_df_ = all_select_df.groupby('user_id')['subject_id'].count().reset_index()

    # Normalise the counts.
    mm = MinMaxScaler()
    all_select_df_['subject_id'] = mm.fit_transform(all_select_df_[['subject_id']])
    user_activate_degree_dict = dict(zip(all_select_df_['user_id'], all_select_df_['subject_id']))

    # Persist the freshly built dictionary.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(user_activate_degree_dict, cache_file)
    return user_activate_degree_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def metrics_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """
    Print the recall hit rate at cut-offs 10, 20, ... up to ``topk``.

    A user counts as a hit at cut-off k when the subject of their last
    selection appears among their first k recalled items.

    :param user_recall_items_dict: ``{user_id: [(subject_id, ...), ...]}``;
        only the first element of every recalled entry is used.
    :param train_last_select_df: DataFrame with ``user_id`` and
        ``subject_id`` columns giving each user's last selection.
    :param topk: largest cut-off to evaluate (nothing is printed when it is
        below 10).
    :return: None; results are written to stdout.
    :raises KeyError: when a recalled user has no row in
        ``train_last_select_df``.
    """
    # Map every user to the subject of their last selection.
    last_select_item_dict = dict(zip(train_last_select_df['user_id'], train_last_select_df['subject_id']))

    # Number of users being evaluated.
    user_num = len(user_recall_items_dict)

    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user_id, item_list in user_recall_items_dict.items():
            # Ids of the first k recalled items.
            tmp_recall_items = {x[0] for x in item_list[:k]}
            if last_select_item_dict[user_id] in tmp_recall_items:
                hit_num += 1

        # An empty recall dict would otherwise divide by zero.
        hit_rate = round(hit_num * 1.0 / user_num, 5) if user_num else 0.0
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
|
|
|
|
|
|
|
|
|
|
def metrics_pinsage_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """
    Print the recall hit rate at cut-offs 10, 20, ... up to ``topk`` for
    data that uses ``user`` / ``item`` column names.

    A user counts as a hit at cut-off k when the item of their last
    selection appears among their first k recalled items. The number of
    evaluated users is printed once before the per-cut-off lines.

    :param user_recall_items_dict: ``{user: [(item, ...), ...]}``; only the
        first element of every recalled entry is used.
    :param train_last_select_df: DataFrame with ``user`` and ``item``
        columns giving each user's last selection.
    :param topk: largest cut-off to evaluate (no per-cut-off line is printed
        when it is below 10).
    :return: None; results are written to stdout.
    :raises KeyError: when a recalled user has no row in
        ``train_last_select_df``.
    """
    # Map every user to the item of their last selection.
    last_select_item_dict = dict(zip(train_last_select_df['user'], train_last_select_df['item']))

    # Number of users being evaluated.
    user_num = len(user_recall_items_dict)
    print(user_num)

    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user_id, item_list in user_recall_items_dict.items():
            # Ids of the first k recalled items.
            tmp_recall_items = {x[0] for x in item_list[:k]}
            if last_select_item_dict[user_id] in tmp_recall_items:
                hit_num += 1

        # An empty recall dict would otherwise divide by zero.
        hit_rate = round(hit_num * 1.0 / user_num, 5) if user_num else 0.0
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Step 1: load the subject attribute table.
    logger.info('获取课程信息')
    item_info_df = get_item_info_df()

    # Step 2: turn it into the per-subject attribute dictionary
    # (loaded from its pickle cache when one exists).
    logger.info('生成课程信息字典')
    item_info_dict = get_recall_item_info_dict(item_info_df)