You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

172 lines
6.7 KiB

import os
import sys
sys.path.append(os.getcwd())
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import warnings
from config import subject_features_save_path
from config import subject_save_path
from config import subject_bert_emb_dict
from config import subjects_bert_em_path
# Register tqdm's progress-bar variants of the pandas apply methods; the
# `progress_apply` call further down in this module relies on this.
tqdm.pandas()
# Suppress all Python warnings from this point on (process-wide filter).
warnings.filterwarnings('ignore')
def save_rank_results(recall_df, topk=5, model_name=None):
    """Write each user's top-k ranked items to a tab-separated results file.

    Rows are ranked per user by ``pred_score`` (highest score gets rank 1,
    ties broken by order of appearance after sorting). The best ``topk`` rows
    of every user are pivoted into a single wide row with the columns
    ``user_id, item_1, ..., item_<topk>`` and written to
    ``subject_features_save_path + model_name + '_results.csv'``.

    Args:
        recall_df: DataFrame with ``user_id`` and ``pred_score`` columns.
            Assumes exactly one further column holding the item id (the
            pivot below produces one output column per rank) -- verify
            against the caller. The caller's frame is not modified.
        topk: Number of ranked items kept per user.
        model_name: Prefix of the output file name. It is concatenated with
            string paths, so the default ``None`` raises ``TypeError``.

    Raises:
        AssertionError: If the smallest per-user maximum rank is not >= 1
            (this is what happens for an empty input frame).
    """
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(
        ascending=False, method='first')

    # Sanity check: every user must own at least one ranked candidate.
    max_rank_per_user = ranked.groupby('user_id')['rank'].max()
    assert max_rank_per_user.min() >= 1

    del ranked['pred_score']
    submit = (
        ranked[ranked['rank'] <= topk]
        .set_index(['user_id', 'rank'])
        .unstack(-1)
        .reset_index()
    )

    # After the pivot the column labels are '' (the user_id column) plus the
    # rank values. rank() yields floats (1.0, 2.0, ...), so cast every
    # non-string label to int to get clean 1..topk labels.
    submit.columns = [
        col if isinstance(col, str) else int(col)
        for col in submit.columns.droplevel(0)
    ]

    # Column names of the submission format. The range must include ``topk``
    # itself, otherwise the last item column keeps its numeric label.
    item_columns = {'': 'user_id'}
    for position in range(1, topk + 1):
        item_columns[position] = 'item_' + str(position)
    submit = submit.rename(columns=item_columns)

    save_name = subject_features_save_path + model_name + '_results.csv'
    submit.to_csv(save_name, sep='\t', index=False, header=True)
def norm_sim(sim_df, weight=0.0):
    """Min-max normalise a Series of scores and add a constant offset.

    Args:
        sim_df: pandas Series of numeric scores (despite the name, the
            scalar comparison of its min and max requires a Series).
        weight: Constant added to every normalised value.

    Returns:
        A new Series with the same index. Values are scaled to [0, 1] and
        then shifted by ``weight``. When all scores are equal, every
        element (including missing ones) becomes ``1.0 + weight``.
    """
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        # Degenerate range: scaling would divide by zero, so use 1.0 for
        # every position instead.
        normalised = pd.Series(1.0, index=sim_df.index, name=sim_df.name)
    else:
        # Vectorised arithmetic replaces per-element lambda calls.
        normalised = (sim_df - min_sim) / (max_sim - min_sim)
    return normalised + weight
def get_kfold_users(train_df, n=5):
    """Split the distinct users of ``train_df`` into ``n`` folds.

    The folds are built per user rather than per row: the unique values of
    the ``user_id`` column are dealt out round-robin, so fold ``i`` holds
    every ``n``-th user starting at position ``i``.

    Returns:
        A list of ``n`` arrays of user ids.
    """
    unique_users = train_df['user_id'].unique()
    folds = []
    for offset in range(n):
        folds.append(unique_users[offset::n])
    return folds
def fill_is_disciplines_hab(x):
    """Flag whether a row's discipline id is contained in its discipline set.

    ``x`` is a row-like object exposing ``disciplines_list`` and
    ``disciplines_id``. The id is looked up as the string form of its float
    value (e.g. ``3`` -> ``'3.0'``).

    Returns:
        1 if ``disciplines_list`` is a set containing that string,
        otherwise 0 (also when ``disciplines_list`` is not a set).
    """
    if not isinstance(x.disciplines_list, set):
        return 0
    lookup_key = str(float(x.disciplines_id))
    return 1 if lookup_key in x.disciplines_list else 0
def make_subject_tuple_func(df):
    """Turn every row of ``df`` into a fixed-order tuple of feature values.

    Returns:
        A list with one tuple per row, holding the values of the feature
        columns listed below, in that order.
    """
    feature_columns = (
        'created_at_ts',
        'visits',
        'stages_count',
        'stage_shixuns_count',
        'study_count',
        'course_study_count',
        'passed_count',
        'course_used_count',
        'school_used_count',
        'challenge_count',
        'evaluate_count',
        'video_study_time',
        'study_pdf_attachment_count',
        'disciplines_id',
        'averge_star',
    )
    return [
        tuple(record[column] for column in feature_columns)
        for _, record in df.iterrows()
    ]
def get_rank_item_info_dict(item_info_df):
    """Build, or load from the pickle cache, the per-subject feature dict.

    The numeric feature columns of ``item_info_df`` are cleaned first:
    missing values become zero and each column is cast to its target type.
    This cleaning is written back into ``item_info_df`` itself and happens
    whether or not the cache exists.

    If ``subject_save_path + 'subjects_info_rank_dict.pkl'`` exists, the
    dictionary stored there is returned. Otherwise the frame is grouped by
    ``subject_id``, ``make_subject_tuple_func`` is applied to each group,
    and the resulting dictionary is pickled to that path before returning.

    Args:
        item_info_df: DataFrame with a ``subject_id`` column and the feature
            columns listed in ``column_types`` below. Modified in place.

    Returns:
        dict mapping each ``subject_id`` to the value returned by
        ``make_subject_tuple_func`` for that subject's rows.
    """
    # (column, target type) in processing order; the fill value is the zero
    # of the target type (0 for int, 0.0 for float).
    column_types = (
        ('visits', int),
        ('stages_count', int),
        ('stage_shixuns_count', int),
        ('study_count', int),
        ('course_study_count', int),
        ('passed_count', int),
        ('course_used_count', int),
        ('school_used_count', int),
        ('challenge_count', int),
        ('evaluate_count', int),
        ('video_study_time', float),
        ('study_pdf_attachment_count', int),
        ('averge_star', float),
    )
    for column, target_type in column_types:
        item_info_df[column] = item_info_df[column].fillna(target_type(0))
        item_info_df[column] = item_info_df[column].astype(target_type)

    cache_path = subject_save_path + 'subjects_info_rank_dict.pkl'
    if os.path.exists(cache_path):
        # NOTE(review): pickle executes arbitrary code on load -- this is
        # only safe because the cache is a locally generated file.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    # progress_apply is the tqdm-instrumented apply; it is only available
    # after tqdm.pandas() has been called.
    item_info_tuples = (
        item_info_df.groupby('subject_id')
        .progress_apply(make_subject_tuple_func)
        .reset_index()
    )
    # reset_index() exposes the applied results under the column label 0.
    item_info_dict = dict(zip(item_info_tuples['subject_id'], item_info_tuples[0]))
    # The context manager guarantees the cache is flushed and closed.
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(item_info_dict, cache_file)
    return item_info_dict
def get_item_bert_emb_dict():
    """Load, or build and cache, the subject-id -> BERT embedding mapping.

    If the pickle file at ``subject_bert_emb_dict`` exists it is loaded and
    returned. Otherwise the tab-separated file at ``subjects_bert_em_path``
    is read, every column whose name contains ``'bert_em'`` is taken as an
    embedding dimension, each row is scaled to unit L2 norm, and the result
    is pickled to ``subject_bert_emb_dict``.

    Returns:
        dict mapping each ``subject_id`` to its normalised embedding vector
        (one row of a numpy array). Rows that are entirely zero stay zero
        vectors instead of turning into NaN.
    """
    # Reuse the cached embeddings when they have already been generated.
    if os.path.exists(subject_bert_emb_dict):
        # NOTE(review): pickle executes arbitrary code on load -- this is
        # only safe because the cache is a locally generated file.
        with open(subject_bert_emb_dict, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Build the embedding matrix from the raw table.
    item_emb_df = pd.read_csv(subjects_bert_em_path, sep='\t', encoding='utf-8')
    item_emb_cols = [col for col in item_emb_df.columns if 'bert_em' in col]
    item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])

    # L2-normalise each row. A zero norm is replaced by 1 so that an
    # all-zero row divides to zeros rather than 0/0 = NaN.
    row_norms = np.linalg.norm(item_emb_np, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0
    item_emb_np = item_emb_np / row_norms

    item_emb_dict = dict(zip(item_emb_df['subject_id'], item_emb_np))
    # The context manager guarantees the cache is flushed and closed.
    with open(subject_bert_emb_dict, 'wb') as cache_file:
        pickle.dump(item_emb_dict, cache_file)
    return item_emb_dict