You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

638 lines
29 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import sys
sys.path.append(os.getcwd())
import warnings
import pickle
from sklearn.preprocessing import MinMaxScaler
from config import mysubjects_train_data, mysubjects_test_data
from config import subjects_data_path, subjects_embed_path, subject_save_path
from config import subject_user_item_time_dict_data
from config import subject_item_user_time_dict, data_parent_path
from config import subjects_emb_dict
from config import users_data_path
from utils import reduce_mem
from config import logger
from joblib import Parallel, delayed
from data_process import process_users_data_null
tqdm.pandas()
warnings.filterwarnings('ignore')
def get_all_select_sample(sample_nums=10000):
    """
    Carve a small random sample out of the training set for debugging the
    pipeline. The sample is cached as a CSV on first creation and simply
    reloaded on later calls.

    :param sample_nums: number of distinct users to sample
    :return: DataFrame of the sampled users' deduplicated selections
    """
    sample_path = data_parent_path + 'mysubjects_train_sample.csv'
    # Fast path: reuse the cached sample from a previous run.
    if os.path.exists(sample_path):
        cached = pd.read_csv(sample_path, sep='\t', encoding='utf-8')
        reduce_mem(cached)
        return cached
    full_train = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')
    # Distinct user ids, then draw sample_nums of them without replacement.
    unique_users = full_train.user_id.unique()
    chosen_users = np.random.choice(unique_users, size=sample_nums, replace=False)
    # Keep only the selections made by the sampled users.
    sampled = full_train[full_train['user_id'].isin(chosen_users)]
    # One row per (user, subject) pair.
    sampled = sampled.drop_duplicates((['user_id', 'subject_id']))
    sampled.to_csv(sample_path, sep='\t', index=False, header=True)
    reduce_mem(sampled)
    return sampled
def get_all_select_df(offline = False):
    """
    Read the course-selection data, split by online/offline mode.

    Online prediction (offline=False): merge the test-set selections into the
    training data. Offline validation of the model or of new features
    (offline=True): use the training set only.

    :param offline: True to load only the training data
    :return: deduplicated DataFrame of (user_id, subject_id) selections
    """
    if offline:
        all_select = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')
    else:
        train_mysubjects = pd.read_csv(mysubjects_train_data, sep='\t', encoding='utf-8')
        test_mysubjects = pd.read_csv(mysubjects_test_data, sep='\t', encoding='utf-8')
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack the two frames.
        all_select = pd.concat([train_mysubjects, test_mysubjects], ignore_index=True)
    # A user/subject pair may appear in both files; keep one row per pair.
    all_select = all_select.drop_duplicates((['user_id', 'subject_id']))
    reduce_mem(all_select)
    return all_select
def get_user_info_df():
    """Load the users' basic attribute table and fill its null values."""
    # quoting=3 (QUOTE_NONE) because user fields may contain stray quotes.
    users = pd.read_csv(users_data_path, sep='\t', encoding='utf-8', quoting=3)
    # Delegate null handling to the shared preprocessing helper.
    users = process_users_data_null(users)
    reduce_mem(users)
    return users
def get_item_info_df():
    """Load the courses' (subjects') basic attribute table."""
    subjects = pd.read_csv(subjects_data_path, sep='\t', encoding='utf-8')
    reduce_mem(subjects)
    return subjects
def get_recall_item_info_df(item_info_df, subject_id_list):
    """
    Return the subset of course attributes restricted to the recalled ids.

    :param item_info_df: full course-attribute DataFrame
    :param subject_id_list: iterable of subject ids to keep
    :return: rows of item_info_df whose subject_id is in subject_id_list
    """
    recall_mask = item_info_df['subject_id'].isin(subject_id_list)
    return item_info_df[recall_mask]
def get_select_item_info(all_select_df, user_id):
    """
    Return one user's course-selection rows.

    The original index is kept as an 'index' column (reset_index without
    drop), matching the original behaviour.
    """
    # .loc is a bit faster here than plain boolean indexing on the frame.
    user_mask = all_select_df['user_id'] == user_id
    return all_select_df.loc[user_mask, :].reset_index()
def get_item_emb_dict():
    """
    Load (or build and cache) the normalised course embedding dict:
    {subject_id: unit-L2-norm embedding vector}.
    """
    # Fast path: the dict was pickled by a previous run.
    if os.path.exists(subjects_emb_dict):
        with open(subjects_emb_dict, 'rb') as fh:
            return pickle.load(fh)
    # Build the embeddings from the raw embedding table.
    emb_df = pd.read_csv(subjects_embed_path, sep='\t', encoding='utf-8')
    emb_cols = [c for c in emb_df.columns if 'emb' in c]
    emb_matrix = np.ascontiguousarray(emb_df[emb_cols])
    # L2-normalise every embedding row.
    emb_matrix = emb_matrix / np.linalg.norm(emb_matrix, axis=1, keepdims=True)
    item_emb_dict = dict(zip(emb_df['subject_id'], emb_matrix))
    with open(subjects_emb_dict, 'wb') as fh:
        pickle.dump(item_emb_dict, fh)
    return item_emb_dict
def get_user_item_time_dict(select_df):
    """
    Build each user's time-ordered course-selection sequence:
    {user1: [(item1, time1), (item2, time2) ...] ...}

    The result is pickled on first build and reloaded afterwards.
    """
    def make_item_time_pair(df):
        # Pair every selected course with its selection timestamp.
        return list(zip(df['subject_id'], df['created_timestamp']))
    # Reuse the previously pickled result when available.
    if os.path.exists(subject_user_item_time_dict_data):
        user_item_time_dict = pickle.load(open(subject_user_item_time_dict_data, 'rb'))
        return user_item_time_dict
    # Sort by selection time so each user's list is chronological.
    select_df = select_df.sort_values('created_timestamp')
    # FIX: selecting multiple groupby columns with a bare tuple
    # (g['a', 'b']) was deprecated and removed in pandas 2.0 — use a list.
    user_item_time_df = select_df.groupby('user_id')[['subject_id', 'created_timestamp']]. \
        progress_apply(lambda x: make_item_time_pair(x)).reset_index(). \
        rename(columns = {0: 'item_time_list'})
    # Turn the grouped frame into a plain {user_id: item_time_list} dict.
    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
    pickle.dump(user_item_time_dict, open(subject_user_item_time_dict_data, 'wb'))
    return user_item_time_dict
def get_item_user_time_dict(select_df):
    """
    Build each course's time-ordered sequence of selecting users:
    {item1: [(user1, time1), (user2, time2) ...] ...}

    The result is pickled on first build and reloaded afterwards.
    """
    def make_user_time_pair(df):
        # Pair every selecting user with the selection timestamp.
        return list(zip(df['user_id'], df['created_timestamp']))
    # Reuse the previously pickled result when available.
    if os.path.exists(subject_item_user_time_dict):
        item_user_time_dict = pickle.load(open(subject_item_user_time_dict, 'rb'))
        return item_user_time_dict
    select_df = select_df.sort_values('created_timestamp')
    # FIX: selecting multiple groupby columns with a bare tuple
    # (g['a', 'b']) was deprecated and removed in pandas 2.0 — use a list.
    item_user_time_df = select_df.groupby('subject_id')[['user_id', 'created_timestamp']]. \
        progress_apply(lambda x: make_user_time_pair(x)).reset_index(). \
        rename(columns = {0: 'user_time_list'})
    item_user_time_dict = dict(zip(item_user_time_df['subject_id'], item_user_time_df['user_time_list']))
    pickle.dump(item_user_time_dict, open(subject_item_user_time_dict, 'wb'))
    return item_user_time_dict
def get_hist_and_last_select(all_select):
    """
    Split selection behaviour into per-user history (all but the last
    selection) and the per-user last selection. Results are cached as CSVs.

    :return: (select_hist_df, select_last_df)
    """
    # Users with a single selection keep that row in the history too:
    # otherwise they would be invisible at training time (deliberate leak).
    def hist_func(user_df):
        return user_df if len(user_df) == 1 else user_df[:-1]
    def apply_parallel(df_grouped, func):
        # Process user groups in parallel to speed things up.
        frames = Parallel(n_jobs=20)(delayed(func)(grp) for _, grp in df_grouped)
        return pd.concat(frames)
    last_path = subject_save_path + 'select_last_df.csv'
    hist_path = subject_save_path + 'select_hist_df.csv'
    if os.path.exists(last_path) and os.path.exists(hist_path):
        # Reload the cached split from a previous run.
        select_last_df = pd.read_csv(last_path, sep='\t', encoding='utf-8')
        select_hist_df = pd.read_csv(hist_path, sep='\t', encoding='utf-8')
    else:
        ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
        # Last (most recent) selection of every user.
        select_last_df = ordered.groupby('user_id').tail(1)
        # Everything before the last selection, computed per group in parallel.
        select_hist_df = apply_parallel(ordered.groupby('user_id'), hist_func)
        select_hist_df = select_hist_df.reset_index(drop=True)
        select_last_df.to_csv(last_path, sep='\t', index=False, header=True)
        select_hist_df.to_csv(hist_path, sep='\t', index=False, header=True)
    reduce_mem(select_last_df)
    reduce_mem(select_hist_df)
    return select_hist_df, select_last_df
def get_all_hist_and_last_select(all_select):
    """
    Same split as get_hist_and_last_select (per-user history vs. last
    selection) but cached under the 'select_all_*' CSV names, i.e. for the
    full (train + test) selection data.

    :return: (select_hist_df, select_last_df)
    """
    # Users with a single selection keep that row in the history too:
    # otherwise they would be invisible at training time (deliberate leak).
    def hist_func(user_df):
        return user_df if len(user_df) == 1 else user_df[:-1]
    def apply_parallel(df_grouped, func):
        # Process user groups in parallel to speed things up.
        frames = Parallel(n_jobs=20)(delayed(func)(grp) for _, grp in df_grouped)
        return pd.concat(frames)
    last_path = subject_save_path + 'select_all_last_df.csv'
    hist_path = subject_save_path + 'select_all_hist_df.csv'
    if os.path.exists(last_path) and os.path.exists(hist_path):
        # Reload the cached split from a previous run.
        select_last_df = pd.read_csv(last_path, sep='\t', encoding='utf-8')
        select_hist_df = pd.read_csv(hist_path, sep='\t', encoding='utf-8')
    else:
        ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
        # Last (most recent) selection of every user.
        select_last_df = ordered.groupby('user_id').tail(1)
        # Everything before the last selection, computed per group in parallel.
        select_hist_df = apply_parallel(ordered.groupby('user_id'), hist_func)
        select_hist_df = select_hist_df.reset_index(drop=True)
        select_last_df.to_csv(last_path, sep='\t', index=False, header=True)
        select_hist_df.to_csv(hist_path, sep='\t', index=False, header=True)
    reduce_mem(select_last_df)
    reduce_mem(select_hist_df)
    return select_hist_df, select_last_df
def get_rank_hist_and_last_select(all_select):
    """
    For the ranking stage: split selections into per-user history and the
    per-user last selection (no caching, no parallelism).

    :return: (select_hist_df, select_last_df)
    """
    ordered = all_select.sort_values(by=['user_id', 'created_timestamp'])
    # Most recent selection of every user.
    select_last_df = ordered.groupby('user_id').tail(1)
    # Users with one selection keep that row in the history too, so they
    # stay visible at training time (deliberate leak).
    def drop_last(user_df):
        return user_df if len(user_df) == 1 else user_df[:-1]
    select_hist_df = ordered.groupby('user_id').apply(drop_last).reset_index(drop=True)
    return select_hist_df, select_last_df
def get_recall_item_info_dict(item_info_df):
    """
    Build a {subject_id: {attribute: value}} dict of basic course properties
    so the recall and cold-start stages can look them up directly.

    The dict is pickled on first build and reloaded afterwards.

    NOTE: mutates item_info_df in place (fillna/astype/normalisation), the
    same as the original implementation.
    """
    cache_path = subject_save_path + 'subjects_info_recall_dict.pkl'
    if os.path.exists(cache_path):
        return pickle.load(open(cache_path, 'rb'))
    # Integer count columns: fill missing values with 0 and cast to int.
    # FIX: the original overwrote study_count with the visits column
    # (copy-paste bug); iterating a column list removes that class of error.
    int_cols = ['visits', 'study_count', 'course_study_count', 'passed_count',
                'course_used_count', 'school_used_count', 'challenge_count',
                'evaluate_count', 'video_study_time', 'study_pdf_attachment_count']
    for col in int_cols:
        item_info_df[col] = item_info_df[col].fillna(0).astype(int)
    item_info_df['averge_star'] = item_info_df['averge_star'].fillna(0.0).astype(float)
    # Min-max normalise the course creation timestamp.
    ts = item_info_df['created_at_ts']
    item_info_df['created_at_ts'] = (ts - np.min(ts)) / (np.max(ts) - np.min(ts))
    # Output-dict field name -> source column. The 'visists' typo is kept
    # deliberately: downstream consumers read that key.
    field_to_col = [('subject_name', 'subject_name'),
                    ('visists', 'visits'),
                    ('created_at_ts', 'created_at_ts'),
                    ('averge_star', 'averge_star'),
                    ('study_count', 'study_count'),
                    ('course_study_count', 'course_study_count'),
                    ('passed_count', 'passed_count'),
                    ('course_used_count', 'course_used_count'),
                    ('school_used_count', 'school_used_count'),
                    ('challenge_count', 'challenge_count'),
                    ('evaluate_count', 'evaluate_count'),
                    ('video_study_time', 'video_study_time'),
                    ('study_pdf_attachment_count', 'study_pdf_attachment_count')]
    item_info_dict = {}
    columns = [item_info_df[col] for _, col in field_to_col]
    # One pass over the frame, zipping the columns in lockstep (replaces the
    # original 13 parallel dicts that relied on identical insertion order).
    for subject_id, *values in tqdm(zip(item_info_df['subject_id'], *columns)):
        item_info_dict[subject_id] = {field: value
                                      for (field, _), value in zip(field_to_col, values)}
    pickle.dump(item_info_dict, open(cache_path, 'wb'))
    return item_info_dict
def get_user_hist_item_info_dict(all_select):
    """
    Build per-user statistics over the courses the user selected
    historically: {user_id: {feature_name: value}}.

    Features are the set of selected subject ids, the set of their star
    ratings, the per-user mean of each numeric course attribute, and the
    (min-max normalised across users) creation time of the most recently
    created selected course. Pickled on first build, reloaded afterwards.

    NOTE: mutates all_select in place via fillna, same as the original.
    """
    logger.info("获取用户历史选择的课程信息字典")
    cache_path = subject_save_path + 'user_hist_item_info_dict.pkl'
    if os.path.exists(cache_path):
        return pickle.load(open(cache_path, 'rb'))
    # Numeric attributes averaged per user; missing values count as zero.
    mean_cols = ['visits', 'study_count', 'course_study_count', 'passed_count',
                 'course_used_count', 'school_used_count', 'challenge_count',
                 'evaluate_count', 'video_study_time', 'study_pdf_attachment_count']
    for col in mean_cols:
        all_select[col] = all_select[col].fillna(0)
    all_select['averge_star'] = all_select['averge_star'].fillna(0.0)
    grouped = all_select.groupby('user_id')
    # Per-user mean of every numeric course attribute.
    mean_dicts = {col: grouped[col].mean().to_dict() for col in mean_cols}
    # Set of course ids each user selected.
    hist_ids_dict = grouped['subject_id'].agg(set).to_dict()
    # Set of star ratings of the user's selected courses (set, as before).
    star_dict = grouped['averge_star'].agg(set).to_dict()
    # Creation time of the most recently created selected course per user,
    # min-max normalised across users (max == original sort + take-last).
    last_created = grouped['created_at_ts'].max()
    last_created = (last_created - np.min(last_created)) / \
        (np.max(last_created) - np.min(last_created))
    last_created_dict = last_created.to_dict()
    # Output field name -> per-user lookup table. Replaces the original
    # 13-way zip over separate dicts that relied on identical key order.
    field_sources = [
        ('hist_item_ids', hist_ids_dict),
        ('hist_item_visits', mean_dicts['visits']),
        ('hist_item_averge_star', star_dict),
        ('hist_item_study_count', mean_dicts['study_count']),
        ('hist_item_course_study_count', mean_dicts['course_study_count']),
        ('hist_item_passed_count', mean_dicts['passed_count']),
        ('hist_item_course_used_count', mean_dicts['course_used_count']),
        ('hist_item_school_used_count', mean_dicts['school_used_count']),
        ('hist_item_challenge_count', mean_dicts['challenge_count']),
        ('hist_item_evaluate_count', mean_dicts['evaluate_count']),
        ('hist_item_video_study_time', mean_dicts['video_study_time']),
        ('hist_item_study_pdf_attachment_count', mean_dicts['study_pdf_attachment_count']),
        ('last_item_created_time', last_created_dict),
    ]
    user_hist_item_info_dict = {}
    for user_id in tqdm(hist_ids_dict):
        user_hist_item_info_dict[user_id] = {field: table[user_id]
                                             for field, table in field_sources}
    pickle.dump(user_hist_item_info_dict, open(cache_path, 'wb'))
    return user_hist_item_info_dict
def get_item_topk_select(select_df, k):
    """
    Return the k most frequently selected course ids, used to pad the
    recall list when it is too short.
    """
    # value_counts sorts by count, descending; its index holds the ids.
    counts = select_df['subject_id'].value_counts()
    return counts.index[:k]
def get_user_activate_degree_dict(all_select_df):
    """
    Use each user's number of selections, min-max scaled to [0, 1], as an
    activity score: {user_id: activity}. Pickled on first build.
    """
    cache_path = subject_save_path + 'user_activate_degree_dict.pkl'
    # Reuse a previously saved activity dict.
    if os.path.exists(cache_path):
        return pickle.load(open(cache_path, 'rb'))
    counts_df = all_select_df.groupby('user_id')['subject_id'].count().reset_index()
    # Normalise the raw selection counts to [0, 1].
    scaler = MinMaxScaler()
    counts_df['subject_id'] = scaler.fit_transform(counts_df[['subject_id']])
    user_activate_degree_dict = dict(zip(counts_df['user_id'], counts_df['subject_id']))
    # Persist the freshly built activity dict.
    pickle.dump(user_activate_degree_dict, open(cache_path, 'wb'))
    return user_activate_degree_dict
def metrics_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """
    Evaluate recall: hit rate of the top 10, 20, ..., topk recalled courses.

    A "hit" means the user's actual last-selected course appears among the
    user's first k recalled items.

    :param user_recall_items_dict: {user_id: [(subject_id, score), ...]}
    :param train_last_select_df: DataFrame with 'user_id'/'subject_id' of
        each user's last selection
    :param topk: evaluate k = 10, 20, ..., topk
    :return: {k: hit_rate} — previously the rates were only printed; the
        return value is new and backward compatible.
    """
    # Map each user to their actual last-selected course.
    last_select_item_dict = dict(zip(train_last_select_df['user_id'], train_last_select_df['subject_id']))
    user_num = len(user_recall_items_dict)
    # Guard: an empty recall dict would otherwise divide by zero below.
    if user_num == 0:
        return {}
    hit_rates = {}
    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user_id, item_list in user_recall_items_dict.items():
            # Course ids among the user's first k recalled (item, score) pairs.
            tmp_recall_items = {x[0] for x in item_list[:k]}
            if last_select_item_dict[user_id] in tmp_recall_items:
                hit_num += 1
        hit_rate = round(hit_num * 1.0 / user_num, 5)
        hit_rates[k] = hit_rate
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
    return hit_rates
def metrics_pinsage_recall(user_recall_items_dict, train_last_select_df, topk=10):
    """
    Evaluate PinSAGE recall: hit rate of the top 10, 20, ..., topk items.

    Same as metrics_recall but the last-selection frame uses the PinSAGE
    column names 'user' / 'item'.

    :return: {k: hit_rate} — previously the rates were only printed; the
        return value is new and backward compatible.
    """
    # Map each user to their actual last-selected item.
    last_select_item_dict = dict(zip(train_last_select_df['user'], train_last_select_df['item']))
    user_num = len(user_recall_items_dict)
    print(user_num)
    # Guard: an empty recall dict would otherwise divide by zero below.
    if user_num == 0:
        return {}
    hit_rates = {}
    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user_id, item_list in user_recall_items_dict.items():
            # Item ids among the user's first k recalled (item, score) pairs.
            tmp_recall_items = {x[0] for x in item_list[:k]}
            if last_select_item_dict[user_id] in tmp_recall_items:
                hit_num += 1
        hit_rate = round(hit_num * 1.0 / user_num, 5)
        hit_rates[k] = hit_rate
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
    return hit_rates
if __name__ == '__main__':
    # Entry point: load the course attribute table and build the
    # {subject_id: attributes} dict used by the recall / cold-start stages.
    logger.info('获取课程信息')
    item_info_df = get_item_info_df()
    logger.info('生成课程信息字典')
    item_info_dict = get_recall_item_info_dict(item_info_df)