import os
import sys

sys.path.append(os.getcwd())

import pickle
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

from config import subject_features_save_path
from config import subject_save_path
from config import subject_bert_emb_dict
from config import subjects_bert_em_path

# Register tqdm's progress bar with pandas (enables progress_apply) and silence warnings.
tqdm.pandas()
warnings.filterwarnings('ignore')


def save_rank_results(recall_df, topk=5, model_name=None):
    """
    Convert ranked (user_id, item, pred_score) rows into the submission format:
    one row per user with its top-k items as columns, saved as a TSV file.
    """
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Check that every user has at least topk candidates
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk

    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()

    # rank() returns floats, so cast the numeric column labels back to int
    submit.columns = [int(col) if isinstance(col, (int, float)) else col for col in submit.columns.droplevel(0)]

    # Rename the columns to match the submission format
    item_columns = {'': 'user_id'}
    for i in range(1, topk + 1):
        item_columns[i] = 'item_' + str(i)
    submit = submit.rename(columns=item_columns)

    save_name = subject_features_save_path + model_name + '_results.csv'
    submit.to_csv(save_name, sep='\t', index=False, header=True)
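
# Usage sketch (illustrative, not part of the original pipeline): how
# save_rank_results is expected to be called once a ranking model has scored
# the recalled candidates. 'rank_df', 'model' and 'feature_cols' are assumed
# names; the columns 'user_id', 'subject_id' and 'pred_score' are the ones the
# function above relies on.
#
#   rank_df['pred_score'] = model.predict(rank_df[feature_cols])
#   save_rank_results(rank_df[['user_id', 'subject_id', 'pred_score']],
#                     topk=5, model_name='lgb_ranker')

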
def norm_sim(sim_df, weight=0.0):
    """
    Min-max normalize the ranking scores, then add an optional constant offset.
    """
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        # All scores are identical: map everything to 1.0
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)
    return sim_df
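
# Usage sketch (illustrative): min-max scale one recall source's scores before
# blending them with another source's scores. 'recall_df' is an assumed
# DataFrame with a 'pred_score' column.
#
#   recall_df['pred_score'] = norm_sim(recall_df['pred_score'], weight=0.1)

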
def get_kfold_users(train_df, n=5):
    """
    K-fold cross-validation (default five folds); the folds are split over
    user_id rather than over individual samples. This part is separate from
    the earlier single train/validation split.
    """
    user_ids = train_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set
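
# Usage sketch (illustrative): hold one user fold out per round. 'trn_df' is an
# assumed training DataFrame with a 'user_id' column.
#
#   user_folds = get_kfold_users(trn_df, n=5)
#   for k, valid_users in enumerate(user_folds):
#       valid_part = trn_df[trn_df['user_id'].isin(valid_users)]
#       train_part = trn_df[~trn_df['user_id'].isin(valid_users)]
#       # ... fit the ranker on train_part and score valid_part ...

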
def fill_is_disciplines_hab(x):
    """
    Check whether the item's discipline is among the disciplines in the user's
    historical preference set (x.disciplines_list).
    """
    result = 0
    if isinstance(x.disciplines_list, set):
        if str(float(x.disciplines_id)) in x.disciplines_list:
            result = 1
    return result
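
# Usage sketch (illustrative): the function is applied row by row after the
# user's discipline set ('disciplines_list') has been merged onto the candidate
# table. 'rank_df' is an assumed DataFrame name.
#
#   rank_df['is_disciplines_hab'] = rank_df.apply(fill_is_disciplines_hab, axis=1)

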
def make_subject_tuple_func(df):
    """
    Collect each subject's feature columns into a list of tuples
    (one tuple per row, in the column order below).
    """
    row_data = []
    for name, row_df in df.iterrows():
        row_data.append((row_df['created_at_ts'],
                         row_df['visits'],
                         row_df['stages_count'],
                         row_df['stage_shixuns_count'],
                         row_df['study_count'],
                         row_df['course_study_count'],
                         row_df['passed_count'],
                         row_df['course_used_count'],
                         row_df['school_used_count'],
                         row_df['challenge_count'],
                         row_df['evaluate_count'],
                         row_df['video_study_time'],
                         row_df['study_pdf_attachment_count'],
                         row_df['disciplines_id'],
                         row_df['averge_star']))

    return row_data
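
# Note: make_subject_tuple_func is consumed below via
# item_info_df.groupby('subject_id').progress_apply(make_subject_tuple_func),
# so each subject_id ends up mapped to a list of feature tuples in the column
# order listed above.

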
def get_rank_item_info_dict(item_info_df):
    """
    Build (or load from cache) the item info dictionary used for ranking:
    subject_id -> list of feature tuples.
    """
    # Fill missing values and fix dtypes: count-like columns become int,
    # continuous columns become float
    int_cols = ['visits', 'stages_count', 'stage_shixuns_count', 'study_count',
                'course_study_count', 'passed_count', 'course_used_count',
                'school_used_count', 'challenge_count', 'evaluate_count',
                'study_pdf_attachment_count']
    float_cols = ['video_study_time', 'averge_star']

    for col in int_cols:
        item_info_df[col] = item_info_df[col].fillna(0).astype(int)
    for col in float_cols:
        item_info_df[col] = item_info_df[col].fillna(0.0).astype(float)

    # Load the cached dictionary if it exists, otherwise build and cache it
    if os.path.exists(subject_save_path + 'subjects_info_rank_dict.pkl'):
        item_info_dict = pickle.load(open(subject_save_path + 'subjects_info_rank_dict.pkl', 'rb'))
    else:
        item_info_tuples = item_info_df.groupby('subject_id').progress_apply(make_subject_tuple_func).reset_index()
        item_info_dict = dict(zip(item_info_tuples['subject_id'], item_info_tuples[0]))
        pickle.dump(item_info_dict, open(subject_save_path + 'subjects_info_rank_dict.pkl', 'wb'))

    return item_info_dict
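
# Usage sketch (illustrative): build or load the per-subject feature dictionary
# and look up one subject. 'subjects_df' and 'sid' are assumed names.
#
#   item_info_dict = get_rank_item_info_dict(subjects_df)
#   subject_features = item_info_dict.get(sid, [])

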
def get_item_bert_emb_dict():
    """
    Build (or load from cache) the BERT embedding dictionary for items:
    subject_id -> L2-normalized embedding vector.
    """
    # Load the previously saved embedding dictionary if it exists
    if os.path.exists(subject_bert_emb_dict):
        item_emb_dict = pickle.load(open(subject_bert_emb_dict, 'rb'))
        return item_emb_dict

    # Otherwise build the embeddings from the raw BERT embedding file
    item_emb_df = pd.read_csv(subjects_bert_em_path, sep='\t', encoding='utf-8')

    item_emb_cols = [x for x in item_emb_df.columns if 'bert_em' in x]
    item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])
    # L2-normalize each embedding vector
    item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)

    item_emb_dict = dict(zip(item_emb_df['subject_id'], item_emb_np))
    pickle.dump(item_emb_dict, open(subject_bert_emb_dict, 'wb'))

    return item_emb_dict
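

if __name__ == '__main__':
    # Minimal smoke test on synthetic data (a sketch, not part of the original
    # pipeline): it only exercises the pure helpers above and does not touch
    # any of the configured cache or output paths.
    demo_scores = pd.Series([0.2, 0.8, 0.5, 0.8])
    print(norm_sim(demo_scores, weight=0.1))

    demo_train = pd.DataFrame({'user_id': list(range(10))})
    print(get_kfold_users(demo_train, n=5))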