import os
import pickle
import sys
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed  # kept: other code may import these via this module
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Project-local modules are resolved relative to the working directory, so the
# path tweak has to run before they are imported.
sys.path.append(os.getcwd())

from config import mysubjects_train_data, mysubjects_test_data
from config import subjects_data_path, subjects_embed_path, subject_save_path
from config import subject_user_item_time_dict_data
from config import subject_item_user_time_dict, data_parent_path
from config import subjects_emb_dict
from config import users_data_path
from config import logger
from utils import reduce_mem
from data_process import process_users_data_null

tqdm.pandas()
warnings.filterwarnings('ignore')

# Count-like course attributes: missing values are treated as 0.
_ITEM_COUNT_COLUMNS = (
    'visits',
    'study_count',
    'course_study_count',
    'passed_count',
    'course_used_count',
    'school_used_count',
    'challenge_count',
    'evaluate_count',
    'video_study_time',
    'study_pdf_attachment_count',
)


def _load_pickle(path):
    """Load a pickled object from ``path``, closing the file afterwards.

    NOTE: pickle executes arbitrary code on load; only use this on cache
    files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _min_max_scale(values):
    """Min-max scale a Series.

    When every value is identical the range is zero; the result is then all
    zeros instead of the NaN produced by a 0/0 division.
    """
    lowest = values.min()
    span = values.max() - lowest
    shifted = values - lowest
    if span == 0:
        return shifted.astype(float)
    return shifted / span


def _split_hist_and_last(all_select):
    """Split selections into per-user history and per-user last selection.

    Rows are ordered by ``user_id`` then ``created_timestamp``. The last row
    of each user goes to the "last" frame; all earlier rows go to the
    "history" frame. A user with a single selection keeps that row in the
    history as well (a deliberate leak, otherwise the user would be invisible
    during training).

    Returns:
        (select_hist_df, select_last_df)
    """
    ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
    by_user = ordered.groupby('user_id')
    select_last_df = by_user.tail(1)

    # Position of each row inside its user group, counted from both ends.
    from_start = by_user.cumcount()
    from_end = by_user.cumcount(ascending=False)
    # Keep everything except the last row, unless it is also the first row.
    keep = (from_end > 0) | (from_start == 0)
    select_hist_df = ordered[keep.values].reset_index(drop=True)
    return select_hist_df, select_last_df


def _cached_hist_and_last(all_select, last_csv, hist_csv):
    """Return (history, last) frames, using CSV caches under ``subject_save_path``.

    If both cache files exist they are read back; otherwise the split is
    computed from ``all_select`` and written out. ``reduce_mem`` is applied to
    both frames before returning.
    """
    last_path = subject_save_path + last_csv
    hist_path = subject_save_path + hist_csv

    if os.path.exists(last_path) and os.path.exists(hist_path):
        select_last_df = pd.read_csv(last_path, sep='\t', encoding='utf-8')
        select_hist_df = pd.read_csv(hist_path, sep='\t', encoding='utf-8')
    else:
        select_hist_df, select_last_df = _split_hist_and_last(all_select)
        select_last_df.to_csv(last_path, sep='\t', index=False, header=True)
        select_hist_df.to_csv(hist_path, sep='\t', index=False, header=True)

    reduce_mem(select_last_df)
    reduce_mem(select_hist_df)
    return select_hist_df, select_last_df


def _report_hit_rates(user_recall_items_dict, last_select_item_dict, topk):
    """Print the recall hit rate for k = 10, 20, ... up to ``topk``.

    A user counts as a hit when their last selected item appears among the
    first k recalled ``(item, score)`` entries. Users that have no entry in
    ``last_select_item_dict`` count as misses. An empty recall dict yields a
    hit rate of 0.0 instead of a division by zero.
    """
    user_num = len(user_recall_items_dict)

    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user_id, item_list in user_recall_items_dict.items():
            if user_id not in last_select_item_dict:
                continue
            top_items = {entry[0] for entry in item_list[:k]}
            if last_select_item_dict[user_id] in top_items:
                hit_num += 1

        hit_rate = round(hit_num * 1.0 / user_num, 5) if user_num else 0.0
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)


def get_all_select_sample(sample_nums=10000):
    """Return a user-level sample of the training selections for debugging.

    If a previously saved sample file exists it is returned as-is and
    ``sample_nums`` is ignored. Otherwise up to ``sample_nums`` distinct users
    are drawn at random, their rows are de-duplicated on
    ``(user_id, subject_id)``, and the sample is saved for later runs.
    """
    sample_path = data_parent_path + 'mysubjects_train_sample.csv'

    if os.path.exists(sample_path):
        all_select = pd.read_csv(sample_path, sep='\t', encoding='utf-8')
        reduce_mem(all_select)
        return all_select

    all_select = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')

    all_user_ids = all_select.user_id.unique()
    # Sampling without replacement cannot draw more users than exist.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)

    all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
    all_select = all_select.drop_duplicates((['user_id', 'subject_id']))

    all_select.to_csv(sample_path, sep='\t', index=False, header=True)
    reduce_mem(all_select)
    return all_select


def get_all_select_df(offline=False):
    """Read the selection data.

    Args:
        offline: when True only the training set is read (offline validation
            of models/features). When False the test set is appended to the
            training set (online prediction).

    Returns:
        Selections de-duplicated on ``(user_id, subject_id)``.
    """
    if offline:
        all_select = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')
    else:
        train_mysubjects = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')
        test_mysubjects = pd.read_csv(mysubjects_test_data, sep='\t', encoding='utf-8')
        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        all_select = pd.concat([train_mysubjects, test_mysubjects])

    all_select = all_select.drop_duplicates((['user_id', 'subject_id']))
    reduce_mem(all_select)
    return all_select


def get_user_info_df():
    """Read the user attribute table and fill its missing values."""
    user_info_df = pd.read_csv(users_data_path, sep='\t', encoding='utf-8', quoting=3)
    user_info_df = process_users_data_null(user_info_df)
    reduce_mem(user_info_df)
    return user_info_df


def get_item_info_df():
    """Read the course attribute table."""
    item_info_df = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')
    reduce_mem(item_info_df)
    return item_info_df


def get_recall_item_info_df(item_info_df, subject_id_list):
    """Return the rows of ``item_info_df`` whose ``subject_id`` is in ``subject_id_list``."""
    wanted = item_info_df['subject_id'].isin(subject_id_list)
    return item_info_df[wanted]


def get_select_item_info(all_select_df, user_id):
    """Return the selection rows of one user.

    The index is reset without ``drop``, so the previous index is kept as an
    ``index`` column in the result.
    """
    is_user = all_select_df['user_id'] == user_id
    return all_select_df.loc[is_user, :].reset_index()


def get_item_emb_dict():
    """Build (or load from cache) ``{subject_id: L2-normalised embedding}``.

    Embedding columns are those whose name contains ``'emb'``. The result is
    cached as a pickle at ``subjects_emb_dict``.
    """
    if os.path.exists(subjects_emb_dict):
        return _load_pickle(subjects_emb_dict)

    item_emb_df = pd.read_csv(subjects_embed_path, sep='\t', encoding='utf-8')
    item_emb_cols = [col for col in item_emb_df.columns if 'emb' in col]
    item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])

    norms = np.linalg.norm(item_emb_np, axis=1, keepdims=True)
    # An all-zero vector has norm 0; leave it as zeros rather than NaN.
    norms[norms == 0] = 1.0
    item_emb_np = item_emb_np / norms

    item_emb_dict = dict(zip(item_emb_df['subject_id'], item_emb_np))
    _dump_pickle(item_emb_dict, subjects_emb_dict)
    return item_emb_dict


def get_user_item_time_dict(select_df):
    """Build (or load from cache) each user's time-ordered selection sequence.

    Returns:
        ``{user_id: [(subject_id, created_timestamp), ...]}``
    """
    if os.path.exists(subject_user_item_time_dict_data):
        return _load_pickle(subject_user_item_time_dict_data)

    def make_item_time_pair(df):
        return list(zip(df['subject_id'], df['created_timestamp']))

    select_df = select_df.sort_values('created_timestamp')

    # Column selection must be a list; tuple selection was removed in pandas 2.0.
    user_item_time_df = (
        select_df.groupby('user_id')[['subject_id', 'created_timestamp']]
        .progress_apply(make_item_time_pair)
        .reset_index()
        .rename(columns={0: 'item_time_list'})
    )

    user_item_time_dict = dict(zip(user_item_time_df['user_id'],
                                   user_item_time_df['item_time_list']))
    _dump_pickle(user_item_time_dict, subject_user_item_time_dict_data)
    return user_item_time_dict


def get_item_user_time_dict(select_df):
    """Build (or load from cache) each course's time-ordered list of users.

    Returns:
        ``{subject_id: [(user_id, created_timestamp), ...]}``
    """
    if os.path.exists(subject_item_user_time_dict):
        return _load_pickle(subject_item_user_time_dict)

    def make_user_time_pair(df):
        return list(zip(df['user_id'], df['created_timestamp']))

    select_df = select_df.sort_values('created_timestamp')

    # Column selection must be a list; tuple selection was removed in pandas 2.0.
    item_user_time_df = (
        select_df.groupby('subject_id')[['user_id', 'created_timestamp']]
        .progress_apply(make_user_time_pair)
        .reset_index()
        .rename(columns={0: 'user_time_list'})
    )

    item_user_time_dict = dict(zip(item_user_time_df['subject_id'],
                                   item_user_time_df['user_time_list']))
    _dump_pickle(item_user_time_dict, subject_item_user_time_dict)
    return item_user_time_dict


def get_hist_and_last_select(all_select):
    """Return (history, last selection) frames, cached as CSV files.

    Uses ``select_hist_df.csv`` / ``select_last_df.csv`` under
    ``subject_save_path``. Users with a single selection keep that row in the
    history too.
    """
    return _cached_hist_and_last(all_select, 'select_last_df.csv', 'select_hist_df.csv')


def get_all_hist_and_last_select(all_select):
    """Return (history, last selection) frames, cached as CSV files.

    Same as ``get_hist_and_last_select`` but uses the
    ``select_all_hist_df.csv`` / ``select_all_last_df.csv`` cache files.
    """
    return _cached_hist_and_last(all_select, 'select_all_last_df.csv', 'select_all_hist_df.csv')


def get_rank_hist_and_last_select(all_select):
    """Return (history, last selection) frames for the ranking stage (no caching)."""
    return _split_hist_and_last(all_select)


def get_recall_item_info_dict(item_info_df):
    """Build (or load from cache) ``{subject_id: {attribute: value}}``.

    The dictionary is used directly by the recall and cold-start stages.

    Side effects when the cache is absent: ``item_info_df`` is modified in
    place (count columns filled with 0 and cast to int, ``averge_star`` filled
    with 0.0, ``created_at_ts`` min-max scaled), and the result is pickled to
    ``subjects_info_recall_dict.pkl`` under ``subject_save_path``.
    """
    cache_path = subject_save_path + 'subjects_info_recall_dict.pkl'
    if os.path.exists(cache_path):
        return _load_pickle(cache_path)

    # Each count column is cleaned from itself (study_count used to be
    # overwritten with the values of visits).
    for column in _ITEM_COUNT_COLUMNS:
        item_info_df[column] = item_info_df[column].fillna(0).astype(int)
    item_info_df['averge_star'] = item_info_df['averge_star'].fillna(0.0).astype(float)

    item_info_df['created_at_ts'] = _min_max_scale(item_info_df['created_at_ts'])

    # (output key, source column). The 'visists' spelling is the key that
    # existing caches and consumers use, so it is kept as-is.
    fields = (
        ('subject_name', 'subject_name'),
        ('visists', 'visits'),
        ('created_at_ts', 'created_at_ts'),
        ('averge_star', 'averge_star'),
        ('study_count', 'study_count'),
        ('course_study_count', 'course_study_count'),
        ('passed_count', 'passed_count'),
        ('course_used_count', 'course_used_count'),
        ('school_used_count', 'school_used_count'),
        ('challenge_count', 'challenge_count'),
        ('evaluate_count', 'evaluate_count'),
        ('video_study_time', 'video_study_time'),
        ('study_pdf_attachment_count', 'study_pdf_attachment_count'),
    )
    keys = [key for key, _ in fields]
    columns = [item_info_df[column] for _, column in fields]

    item_info_dict = {}
    rows = zip(item_info_df['subject_id'], *columns)
    for subject_id, *values in tqdm(rows, total=len(item_info_df)):
        # For a repeated subject_id the last row wins.
        item_info_dict[subject_id] = dict(zip(keys, values))

    _dump_pickle(item_info_dict, cache_path)
    return item_info_dict


def get_user_hist_item_info_dict(all_select):
    """Build (or load from cache) per-user statistics of the selected courses.

    For each user the result holds: the set of selected ``subject_id`` values,
    the set of ``averge_star`` values, the mean of every count attribute, and
    the min-max scaled maximum ``created_at_ts`` among the selected courses.

    Side effects when the cache is absent: missing count/star values in
    ``all_select`` are filled in place, and the result is pickled to
    ``user_hist_item_info_dict.pkl`` under ``subject_save_path``.
    """
    logger.info("获取用户历史选择的课程信息字典")

    cache_path = subject_save_path + 'user_hist_item_info_dict.pkl'
    if os.path.exists(cache_path):
        return _load_pickle(cache_path)

    for column in _ITEM_COUNT_COLUMNS:
        all_select[column] = all_select[column].fillna(0)
    all_select['averge_star'] = all_select['averge_star'].fillna(0.0)

    # Every aggregate below comes from the same grouping, so all resulting
    # Series share one user_id index and can be zipped positionally.
    by_user = all_select.groupby('user_id')

    hist_item_ids = by_user['subject_id'].agg(set).reset_index()
    aggregates = [
        ('hist_item_ids', hist_item_ids['subject_id']),
        ('hist_item_visits', by_user['visits'].agg('mean')),
        ('hist_item_averge_star', by_user['averge_star'].agg(set)),
    ]
    for column in _ITEM_COUNT_COLUMNS[1:]:
        aggregates.append(('hist_item_' + column, by_user[column].agg('mean')))
    aggregates.append(
        ('last_item_created_time', _min_max_scale(by_user['created_at_ts'].max()))
    )

    keys = [key for key, _ in aggregates]
    series = [values for _, values in aggregates]

    user_hist_item_info_dict = {}
    rows = zip(hist_item_ids['user_id'], *series)
    for user_id, *values in tqdm(rows, total=len(hist_item_ids)):
        user_hist_item_info_dict[user_id] = dict(zip(keys, values))

    _dump_pickle(user_hist_item_info_dict, cache_path)
    return user_hist_item_info_dict


def get_item_topk_select(select_df, k):
    """Return the ``k`` most frequently selected ``subject_id`` values (for recall back-fill)."""
    popularity = select_df['subject_id'].value_counts()
    return popularity.index[:k]


def get_user_activate_degree_dict(all_select_df):
    """Build (or load from cache) ``{user_id: activity}``.

    Activity is the user's selection count, min-max scaled across all users.
    The result is cached as ``user_activate_degree_dict.pkl`` under
    ``subject_save_path``.
    """
    cache_path = subject_save_path + 'user_activate_degree_dict.pkl'
    if os.path.exists(cache_path):
        return _load_pickle(cache_path)

    select_counts = all_select_df.groupby('user_id')['subject_id'].count().reset_index()

    scaler = MinMaxScaler()
    # fit_transform returns an (n, 1) array; flatten it for column assignment.
    select_counts['subject_id'] = scaler.fit_transform(select_counts[['subject_id']]).ravel()

    user_activate_degree_dict = dict(zip(select_counts['user_id'], select_counts['subject_id']))
    _dump_pickle(user_activate_degree_dict, cache_path)
    return user_activate_degree_dict


def metrics_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """Print recall hit rates at k = 10, 20, ... up to ``topk``.

    Args:
        user_recall_items_dict: ``{user_id: [(subject_id, score), ...]}``.
        train_last_select_df: frame with ``user_id`` and ``subject_id``
            columns holding each user's last selection.
        topk: largest cut-off to evaluate.
    """
    last_select_item_dict = dict(zip(train_last_select_df['user_id'],
                                     train_last_select_df['subject_id']))
    _report_hit_rates(user_recall_items_dict, last_select_item_dict, topk)


def metrics_pinsage_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """Print recall hit rates at k = 10, 20, ... up to ``topk``.

    Same as ``metrics_recall`` but the last-selection frame uses ``user`` and
    ``item`` columns, and the number of evaluated users is printed first.
    """
    last_select_item_dict = dict(zip(train_last_select_df['user'],
                                     train_last_select_df['item']))
    print(len(user_recall_items_dict))
    _report_hit_rates(user_recall_items_dict, last_select_item_dict, topk)


if __name__ == '__main__':
    logger.info('获取课程信息')
    item_info_df = get_item_info_df()

    logger.info('生成课程信息字典')
    item_info_dict = get_recall_item_info_dict(item_info_df)