|
|
|
|
#项目一些参数配置
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
import torch
|
|
|
|
|
import multiprocessing
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from utils import create_logger
|
|
|
|
|
|
|
|
|
|
# Project root: absolute path of the directory that contains this module.
root_path = os.path.abspath(os.path.dirname(__file__))

# Data-volume switch. True selects the 'sample' directories, False the 'full'
# ones (the original comment calls this the "incremental data" mode).
samples_mode = True

# Directory-name fragment derived from the switch; it is embedded in the
# data / result / feature / model paths defined further down in this file.
samples_mode_flag = 'sample' if samples_mode else 'full'

# Number of users to sample.
samples_user_nums = 10000

# Flag: evaluate the recall stage.
need_metric_recall = True

# Offline model-validation mode: only the training data set is used.
offline_mode = True
|
|
|
|
|
|
|
|
|
|
# Shared data directory (always ends with '/').
data_path = f'{root_path}/data/'

# Data directory of the active mode ('sample' or 'full'); ends with '/'.
data_parent_path = f'{data_path}{samples_mode_flag}/'

# Shixun item tables, embeddings and cold-start data.
# The 'merage' spelling is part of the on-disk file name and is kept as is.
shixuns_data_path = f'{data_parent_path}shixuns.csv'
shixuns_embed_path = f'{data_parent_path}shixuns_emb.csv'
shixuns_bert_em_path = f'{data_parent_path}shixuns_bert_emb.csv'
shixun_merge_emb_path = f'{data_parent_path}shixun_merage_emb.csv'
cold_start_shixuns_data_path = f'{data_parent_path}cold_start_shixuns.csv'
cold_start_shixuns_parent_path = f'{data_path}cold_start_data/shixun/{samples_mode_flag}/'

# Subject item tables, embeddings and cold-start data.
subjects_data_path = f'{data_parent_path}subjects.csv'
subjects_embed_path = f'{data_parent_path}subjects_emb.csv'
subjects_bert_em_path = f'{data_parent_path}subjects_bert_emb.csv'
subjects_merge_emb_path = f'{data_parent_path}subjects_merage_emb.csv'
cold_start_subjects_data_path = f'{data_parent_path}cold_start_subjects.csv'
cold_start_subjects_parent_path = f'{data_path}cold_start_data/subject/{samples_mode_flag}/'
|
|
|
|
|
|
|
|
|
|
# User/item interaction tables. All of them live in the mode-specific data
# directory, i.e. <root_path>/data/<samples_mode_flag>/, which is exactly what
# data_parent_path holds.
myshixuns_data_path = f'{data_parent_path}myshixuns.csv'
myshixuns_data = f'{data_parent_path}myshixuns_full.csv'
myshixuns_train_data = f'{data_parent_path}myshixuns_train.csv'
myshixuns_test_data = f'{data_parent_path}myshixuns_test.csv'

# NOTE(review): 'mysubjectsfull.csv' has no underscore, unlike
# 'myshixuns_full.csv' above; the file name is kept unchanged.
mysubjects_data_path = f'{data_parent_path}mysubjects.csv'
mysubjects_data = f'{data_parent_path}mysubjectsfull.csv'
mysubjects_train_data = f'{data_parent_path}mysubjects_train.csv'
mysubjects_test_data = f'{data_parent_path}mysubjects_test.csv'

# Train/test splits used by the baseline.
myshixuns_train_data_baseline = f'{data_parent_path}myshixuns_train_baseline.csv'
myshixuns_test_data_baseline = f'{data_parent_path}myshixuns_test_baseline.csv'
mysubjects_train_data_baseline = f'{data_parent_path}mysubjects_train_baseline.csv'
mysubjects_test_data_baseline = f'{data_parent_path}mysubjects_test_baseline.csv'

# User table.
users_data_path = f'{data_parent_path}users.csv'
|
|
|
|
|
|
|
|
|
|
# Output directories, one per item type and per data mode. Every value ends
# with '/', so file names can be appended directly.
shixun_save_path = f'{root_path}/results/shixun/{samples_mode_flag}/'
subject_save_path = f'{root_path}/results/subject/{samples_mode_flag}/'

shixun_features_save_path = f'{root_path}/features/shixun/{samples_mode_flag}/'
subject_features_save_path = f'{root_path}/features/subject/{samples_mode_flag}/'

shixun_model_save_path = f'{root_path}/models/shixun/{samples_mode_flag}/'
subject_model_save_path = f'{root_path}/models/subject/{samples_mode_flag}/'

myshixuns_save_path = f'{root_path}/data/shixun/{samples_mode_flag}/'
mysubjects_save_path = f'{root_path}/data/subject/{samples_mode_flag}/'
|
|
|
|
|
|
|
|
|
|
# User-defined dictionary file (data_path already ends with '/').
user_dict_path = f'{data_path}user_dict.txt'

# Keywords extracted from the shixun data.
shixuns_keywords_path = f'{data_parent_path}shixuns_keywords.txt'

# Keywords extracted from the subject (course) data.
subjects_keywords_path = f'{data_parent_path}subjects_keywords.txt'
|
|
|
|
|
|
|
|
|
|
# Artifacts produced while training the shixun (practice project)
# recommendation models. Everything is stored under shixun_save_path unless a
# line says otherwise.
shixun_id_to_name_dict_data = f'{shixun_save_path}shixun_id_to_name_dict.pkl'

# Similarity data and interaction dictionaries.
shixun_itemcf_i2i_sim_data_baseline = f'{shixun_save_path}itemcf_i2i_sim_baseline.pkl'
shixun_itemcf_recall_baseline_dict = f'{shixun_save_path}itemcf_recall_baseline_dict.pkl'
shixun_itemcf_i2i_sim_data = f'{shixun_save_path}itemcf_i2i_sim.pkl'
shixun_user_item_time_dict_data = f'{shixun_save_path}user_item_time_dict.pkl'
shixun_user_item_dict_data = f'{shixun_save_path}user_item_dict.pkl'
shixun_item_user_time_dict = f'{shixun_save_path}item_user_time_dict.pkl'
shixun_usercf_u2u_sim_data = f'{shixun_save_path}usercf_u2u_sim.pkl'
shixun_emb_i2i_sim_data = f'{shixun_save_path}emb_i2i_sim.pkl'
shixun_wordemb_i2i_sim_data = f'{shixun_save_path}wordemb_i2i_sim.pkl'

# Recall result dictionaries.
shixun_itemcf_recall_dict = f'{shixun_save_path}itemcf_recall_dict.pkl'
shixun_usercf_recall_dict = f'{shixun_save_path}usercf_recall_dict.pkl'
shixun_item_embedding_recall_dict = f'{shixun_save_path}item_embedding_recall_dict.pkl'
shixun_pinsage_recall_dict = f'{shixun_save_path}pinsage_recall_dict.pkl'
shixun_youtubednn_recall_dict = f'{shixun_save_path}youtubednn_recall_dict.pkl'
shixun_youtubednn_usercf_recall_dict = f'{shixun_save_path}youtubednn_usercf_recall_dict.pkl'
shixun_dssm_usercf_recall_dict = f'{shixun_save_path}dssm_usercf_recall_dict.pkl'
shixun_cold_start_recall_dict = f'{shixun_save_path}cold_start_recall_dict.pkl'
shixun_final_recall_items_dict = f'{shixun_save_path}final_recall_items_dict.pkl'

# Ranking result dictionaries.
shixun_xdeepfm_rank_dict = f'{shixun_save_path}xdeepfm_rank_dict.pkl'
shixun_difm_rank_dict = f'{shixun_save_path}difm_rank_dict.pkl'
shixun_bst_rank_dict = f'{shixun_save_path}bst_rank_dict.pkl'
shixun_din_rank_dict = f'{shixun_save_path}din_rank_dict.pkl'

# Word2Vec embedding dictionaries.
shixun_item_w2v_emb_dict = f'{shixun_save_path}item_w2v_emb_dict.pkl'
shixun_user_w2v_emb_dict = f'{shixun_save_path}user_w2v_emb_dict.pkl'

# Feature tables (stored under shixun_features_save_path).
shixun_train_user_item_feats = f'{shixun_features_save_path}train_user_item_feats_df.csv'
shixun_val_user_item_feats = f'{shixun_features_save_path}val_user_item_feats_df.csv'
shixun_test_user_item_feats = f'{shixun_features_save_path}test_user_item_feats_df.csv'
shixun_all_user_item_feats = f'{shixun_features_save_path}all_user_item_feats_df.csv'

# Embedding dictionaries and index dictionaries.
shixuns_emb_dict = f'{shixun_save_path}shixuns_emb_dict.pkl'
shixuns_bert_emb_dict = f'{shixun_save_path}shixuns_bert_emb_dict.pkl'
shixun_youtube_item_emb_dict = f'{shixun_save_path}youtube_item_emb_dict.pkl'
shixun_youtube_user_emb_dict = f'{shixun_save_path}youtube_user_emb_dict.pkl'
shixun_user_embedding_index_dict = f'{shixun_save_path}user_embedding_index_dict.pkl'
shixun_youtube_user_embedding_index_dict = f'{shixun_save_path}youtube_user_embedding_index_dict.pkl'
shixun_youtube_item_embedding_index_dict = f'{shixun_save_path}youtube_item_embedding_index_dict.pkl'

# YoutubeDNN inputs, labels, data sets and embeddings.
shixun_youtubednn_train_input_data = f'{shixun_save_path}youtubednn_train_input.pkl'
shixun_youtubednn_train_label_data = f'{shixun_save_path}youtubednn_train_label.pkl'
shixun_youtubednn_test_input_data = f'{shixun_save_path}youtubednn_test_input.pkl'
shixun_youtubednn_test_label_data = f'{shixun_save_path}youtubednn_test_label.pkl'
shixun_youtubednn_train_set_data = f'{shixun_save_path}youtubednn_train_set.pkl'
shixun_youtubednn_test_set_data = f'{shixun_save_path}youtubednn_test_set.pkl'
shixun_youtube_user_embedding_data = f'{shixun_save_path}youtube_user_embedding_data.pkl'
shixun_youtube_item_embedding_data = f'{shixun_save_path}youtube_item_embedding_data.pkl'

# Cold-start mapping and MIND data sets.
shixun_cold_start_user_shixun_dict = f'{shixun_save_path}cold_start_user_shixun_dict.pkl'
shixun_mind_train_set_data = f'{shixun_save_path}mind_train_set.pkl'
shixun_mind_test_set_data = f'{shixun_save_path}mind_test_set.pkl'
|
|
|
|
|
|
|
|
|
|
# DSSM artifacts for shixun recommendation (all under shixun_save_path).
shixun_dssm_train_set_data = f'{shixun_save_path}dssm_train_set.pkl'
shixun_dssm_test_set_data = f'{shixun_save_path}dssm_test_set.pkl'
shixun_dssm_train_input_data = f'{shixun_save_path}dssm_train_input.pkl'
shixun_dssm_train_label_data = f'{shixun_save_path}dssm_train_label.pkl'
shixun_dssm_test_input_data = f'{shixun_save_path}dssm_test_input.pkl'
shixun_dssm_test_label_data = f'{shixun_save_path}dssm_test_label.pkl'
shixun_dssm_user_embedding_data = f'{shixun_save_path}dssm_user_embedding_data.pkl'
shixun_dssm_item_embedding_data = f'{shixun_save_path}dssm_item_embedding_data.pkl'
shixun_dssm_item_embedding_index_dict = f'{shixun_save_path}dssm_item_embedding_index_dict.pkl'
shixun_dssm_user_embedding_index_dict = f'{shixun_save_path}dssm_user_embedding_index_dict.pkl'
shixun_dssm_user_emb_dict = f'{shixun_save_path}dssm_user_emb_dict.pkl'
shixun_dssm_item_emb_dict = f'{shixun_save_path}dssm_item_emb_dict.pkl'
shixun_dssm_recall_dict = f'{shixun_save_path}dssm_recall_dict.pkl'
|
|
|
|
|
|
|
|
|
|
# FM artifacts for shixun recommendation (all under shixun_save_path).
shixun_fm_train_set_data = f'{shixun_save_path}fm_train_set.pkl'
shixun_fm_test_set_data = f'{shixun_save_path}fm_test_set.pkl'
shixun_fm_train_input_data = f'{shixun_save_path}fm_train_input.pkl'
shixun_fm_train_label_data = f'{shixun_save_path}fm_train_label.pkl'
shixun_fm_test_input_data = f'{shixun_save_path}fm_test_input.pkl'
shixun_fm_test_label_data = f'{shixun_save_path}fm_test_label.pkl'
shixun_fm_user_embedding_data = f'{shixun_save_path}fm_user_embedding_data.pkl'
shixun_fm_item_embedding_data = f'{shixun_save_path}fm_item_embedding_data.pkl'
shixun_fm_item_embedding_index_dict = f'{shixun_save_path}fm_item_embedding_index_dict.pkl'
shixun_fm_user_embedding_index_dict = f'{shixun_save_path}fm_user_embedding_index_dict.pkl'
shixun_fm_user_emb_dict = f'{shixun_save_path}fm_user_emb_dict.pkl'
shixun_fm_item_emb_dict = f'{shixun_save_path}fm_item_emb_dict.pkl'
shixun_fm_recall_dict = f'{shixun_save_path}fm_recall_dict.pkl'
|
|
|
|
|
|
|
|
|
|
# MIND artifacts for shixun recommendation (all under shixun_save_path).
# The 0/1 suffixes distinguish two user embeddings and their recall results.
shixun_mind_recall0_dict = f'{shixun_save_path}mind_recall0_dict.pkl'
shixun_mind_recall1_dict = f'{shixun_save_path}mind_recall1_dict.pkl'
shixun_mind_user_emb0_dict = f'{shixun_save_path}mind_user_emb0_dict.pkl'
shixun_mind_user_emb1_dict = f'{shixun_save_path}mind_user_emb1_dict.pkl'
shixun_mind_item_emb_dict = f'{shixun_save_path}mind_item_emb_dict.pkl'
shixun_mind_user_embedding0_index_dict = f'{shixun_save_path}mind_user_embedding0_index_dict.pkl'
shixun_mind_user_embedding1_index_dict = f'{shixun_save_path}mind_user_embedding1_index_dict.pkl'
shixun_mind_item_embedding_index_dict = f'{shixun_save_path}mind_item_embedding_index_dict.pkl'
shixun_mind_train_input_data = f'{shixun_save_path}mind_train_input.pkl'
shixun_mind_train_label_data = f'{shixun_save_path}mind_train_label.pkl'
shixun_mind_test_input_data = f'{shixun_save_path}mind_test_input.pkl'
shixun_mind_test_label_data = f'{shixun_save_path}mind_test_label.pkl'
# NOTE(review): the next two names are also assigned earlier in this file
# with the same value; one of the two assignments is redundant.
shixun_mind_train_set_data = f'{shixun_save_path}mind_train_set.pkl'
shixun_mind_test_set_data = f'{shixun_save_path}mind_test_set.pkl'
shixun_mind_user_embedding0_data = f'{shixun_save_path}mind_user_embedding0_data.pkl'
shixun_mind_user_embedding1_data = f'{shixun_save_path}mind_user_embedding1_data.pkl'
shixun_mind_item_embedding_data = f'{shixun_save_path}mind_item_embedding_data.pkl'
|
|
|
|
|
|
|
|
|
|
# Artifacts produced while training the subject (practice course)
# recommendation models. Everything is stored under subject_save_path unless
# a line says otherwise.
subject_id_to_name_dict_data = f'{subject_save_path}subject_id_to_name_dict0419.pkl'

# Similarity data and interaction dictionaries.
subject_itemcf_i2i_sim_data_baseline = f'{subject_save_path}itemcf_i2i_sim_baseline.pkl'
subject_itemcf_recall_baseline_dict = f'{subject_save_path}itemcf_recall_baseline_dict.pkl'
subject_itemcf_i2i_sim_data = f'{subject_save_path}itemcf_i2i_sim.pkl'
subject_user_item_time_dict_data = f'{subject_save_path}user_item_time_dict.pkl'
subject_item_user_time_dict = f'{subject_save_path}item_user_time_dict.pkl'
subject_usercf_u2u_sim_data = f'{subject_save_path}usercf_u2u_sim.pkl'
subject_emb_i2i_sim_data = f'{subject_save_path}emb_i2i_sim.pkl'

# Recall result dictionaries.
subject_itemcf_recall_dict = f'{subject_save_path}itemcf_recall_dict.pkl'
subject_usercf_recall_dict = f'{subject_save_path}usercf_recall_dict.pkl'
subject_item_embedding_recall_dict = f'{subject_save_path}item_embedding_recall_dict.pkl'
subject_youtubednn_recall_dict = f'{subject_save_path}youtubednn_recall_dict.pkl'
subject_mind_recall_dict = f'{subject_save_path}mind_recall_dict.pkl'
subject_youtubednn_usercf_recall_dict = f'{subject_save_path}youtubednn_usercf_recall_dict.pkl'
# Assigned once here; the original repeated this assignment with the same
# value a second time near the end of this section.
subject_dssm_usercf_recall_dict = f'{subject_save_path}dssm_usercf_recall_dict.pkl'
subject_cold_start_recall_dict = f'{subject_save_path}cold_start_recall_dict.pkl'
subject_final_recall_items_dict = f'{subject_save_path}final_recall_items_dict.pkl'

# Ranking result dictionaries.
subject_xdeepfm_rank_dict = f'{subject_save_path}xdeepfm_rank_dict.pkl'
subject_difm_rank_dict = f'{subject_save_path}difm_rank_dict.pkl'
subject_bst_rank_dict = f'{subject_save_path}bst_rank_dict.pkl'
subject_din_rank_dict = f'{subject_save_path}din_rank_dict.pkl'

# Word2Vec embedding dictionaries.
subject_item_w2v_emb_dict = f'{subject_save_path}item_w2v_emb_dict.pkl'
subject_user_w2v_emb_dict = f'{subject_save_path}user_w2v_emb_dict.pkl'

# Feature tables (stored under subject_features_save_path).
subject_train_user_item_feats = f'{subject_features_save_path}train_user_item_feats_df.csv'
subject_val_user_item_feats = f'{subject_features_save_path}val_user_item_feats_df.csv'
subject_test_user_item_feats = f'{subject_features_save_path}test_user_item_feats_df.csv'
subject_all_user_item_feats = f'{subject_features_save_path}all_user_item_feats_df.csv'

# Embedding dictionaries, raw embeddings and index dictionaries.
subjects_emb_dict = f'{subject_save_path}subjects_emb_dict.pkl'
subject_bert_emb_dict = f'{subject_save_path}subjects_bert_emb_dict.pkl'
subject_youtube_item_emb_dict = f'{subject_save_path}youtube_item_emb_dict.pkl'
subject_mind_item_emb_dict = f'{subject_save_path}mind_item_emb_dict.pkl'
subject_youtube_user_emb_dict = f'{subject_save_path}youtube_user_emb_dict.pkl'
subject_mind_user_emb_dict = f'{subject_save_path}mind_user_emb_dict.pkl'
subject_youtube_item_emb_data = f'{subject_save_path}youtube_item_emb.pkl'
subject_youtube_user_emb_data = f'{subject_save_path}youtube_user_emb.pkl'
subject_user_embedding_index_dict = f'{subject_save_path}user_embedding_index_dict.pkl'
subject_youtube_user_embedding_index_dict = f'{subject_save_path}youtube_user_embedding_index_dict.pkl'
subject_mind_user_embedding_index_dict = f'{subject_save_path}mind_user_embedding_index_dict.pkl'
subject_youtube_item_embedding_index_dict = f'{subject_save_path}youtube_item_embedding_index_dict.pkl'
subject_mind_item_embedding_index_dict = f'{subject_save_path}mind_item_embedding_index_dict.pkl'

# YoutubeDNN / MIND inputs, labels, data sets and embeddings.
subject_youtubednn_train_input_data = f'{subject_save_path}youtubednn_train_input.pkl'
subject_mind_train_input_data = f'{subject_save_path}mind_train_input.pkl'
subject_youtubednn_train_label_data = f'{subject_save_path}youtubednn_train_label.pkl'
subject_mind_train_label_data = f'{subject_save_path}mind_train_label.pkl'
subject_youtubednn_test_input_data = f'{subject_save_path}youtubednn_test_input.pkl'
subject_mind_test_input_data = f'{subject_save_path}mind_test_input.pkl'
subject_youtubednn_test_label_data = f'{subject_save_path}youtubednn_test_label.pkl'
subject_mind_test_label_data = f'{subject_save_path}mind_test_label.pkl'
subject_youtubednn_train_set_data = f'{subject_save_path}youtubednn_train_set.pkl'
subject_mind_train_set_data = f'{subject_save_path}mind_train_set.pkl'
subject_youtubednn_test_set_data = f'{subject_save_path}youtubednn_test_set.pkl'
subject_mind_test_set_data = f'{subject_save_path}mind_test_set.pkl'
subject_youtube_user_embedding_data = f'{subject_save_path}youtube_user_embedding_data.pkl'
subject_mind_user_embedding_data = f'{subject_save_path}mind_user_embedding_data.pkl'
subject_youtube_item_embedding_data = f'{subject_save_path}youtube_item_embedding_data.pkl'
subject_mind_item_embedding_data = f'{subject_save_path}mind_item_embedding_data.pkl'

# Cold-start mapping and PinSage recall results.
subject_cold_start_user_subject_dict = f'{subject_save_path}cold_start_user_subject_dict.pkl'
subject_pinsage_recall_dict = f'{subject_save_path}pinsage_recall_dict.pkl'
|
|
|
|
|
|
|
|
|
|
# MIND artifacts for subject recommendation (all under subject_save_path).
# The 0/1 suffixes distinguish two user embeddings and their recall results.
# NOTE(review): several names below (item_emb_dict, item_embedding_index_dict,
# the train/test input, label and set paths, item_embedding_data) are also
# assigned earlier in this file with the same values; one copy is redundant.
subject_mind_recall0_dict = f'{subject_save_path}mind_recall0_dict.pkl'
subject_mind_recall1_dict = f'{subject_save_path}mind_recall1_dict.pkl'
subject_mind_user_emb0_dict = f'{subject_save_path}mind_user_emb0_dict.pkl'
subject_mind_user_emb1_dict = f'{subject_save_path}mind_user_emb1_dict.pkl'
subject_mind_item_emb_dict = f'{subject_save_path}mind_item_emb_dict.pkl'
subject_mind_user_embedding0_index_dict = f'{subject_save_path}mind_user_embedding0_index_dict.pkl'
subject_mind_user_embedding1_index_dict = f'{subject_save_path}mind_user_embedding1_index_dict.pkl'
subject_mind_item_embedding_index_dict = f'{subject_save_path}mind_item_embedding_index_dict.pkl'
subject_mind_train_input_data = f'{subject_save_path}mind_train_input.pkl'
subject_mind_train_label_data = f'{subject_save_path}mind_train_label.pkl'
subject_mind_test_input_data = f'{subject_save_path}mind_test_input.pkl'
subject_mind_test_label_data = f'{subject_save_path}mind_test_label.pkl'
subject_mind_train_set_data = f'{subject_save_path}mind_train_set.pkl'
subject_mind_test_set_data = f'{subject_save_path}mind_test_set.pkl'
subject_mind_user_embedding0_data = f'{subject_save_path}mind_user_embedding0_data.pkl'
subject_mind_user_embedding1_data = f'{subject_save_path}mind_user_embedding1_data.pkl'
subject_mind_item_embedding_data = f'{subject_save_path}mind_item_embedding_data.pkl'
|
|
|
|
|
|
|
|
|
|
# DSSM artifacts for subject recommendation (all under subject_save_path).
subject_dssm_train_set_data = f'{subject_save_path}dssm_train_set.pkl'
subject_dssm_test_set_data = f'{subject_save_path}dssm_test_set.pkl'
subject_dssm_train_input_data = f'{subject_save_path}dssm_train_input.pkl'
subject_dssm_train_label_data = f'{subject_save_path}dssm_train_label.pkl'
subject_dssm_test_input_data = f'{subject_save_path}dssm_test_input.pkl'
subject_dssm_test_label_data = f'{subject_save_path}dssm_test_label.pkl'
subject_dssm_user_embedding_data = f'{subject_save_path}dssm_user_embedding_data.pkl'
subject_dssm_item_embedding_data = f'{subject_save_path}dssm_item_embedding_data.pkl'
subject_dssm_item_embedding_index_dict = f'{subject_save_path}dssm_item_embedding_index_dict.pkl'
subject_dssm_user_embedding_index_dict = f'{subject_save_path}dssm_user_embedding_index_dict.pkl'
subject_dssm_user_emb_dict = f'{subject_save_path}dssm_user_emb_dict.pkl'
subject_dssm_item_emb_dict = f'{subject_save_path}dssm_item_emb_dict.pkl'
subject_dssm_recall_dict = f'{subject_save_path}dssm_recall_dict.pkl'
|
|
|
|
|
|
|
|
|
|
# FM artifacts for subject recommendation (all under subject_save_path).
subject_fm_train_set_data = f'{subject_save_path}fm_train_set.pkl'
subject_fm_test_set_data = f'{subject_save_path}fm_test_set.pkl'
subject_fm_train_input_data = f'{subject_save_path}fm_train_input.pkl'
subject_fm_train_label_data = f'{subject_save_path}fm_train_label.pkl'
subject_fm_test_input_data = f'{subject_save_path}fm_test_input.pkl'
subject_fm_test_label_data = f'{subject_save_path}fm_test_label.pkl'
subject_fm_user_embedding_data = f'{subject_save_path}fm_user_embedding_data.pkl'
subject_fm_item_embedding_data = f'{subject_save_path}fm_item_embedding_data.pkl'
subject_fm_item_embedding_index_dict = f'{subject_save_path}fm_item_embedding_index_dict.pkl'
subject_fm_user_embedding_index_dict = f'{subject_save_path}fm_user_embedding_index_dict.pkl'
subject_fm_user_emb_dict = f'{subject_save_path}fm_user_emb_dict.pkl'
subject_fm_item_emb_dict = f'{subject_save_path}fm_item_emb_dict.pkl'
subject_fm_recall_dict = f'{subject_save_path}fm_recall_dict.pkl'
|
|
|
|
|
|
|
|
|
|
# Word2Vec vectors used by the faiss recall.
shixun_faiss_w2v_path = f'{shixun_model_save_path}faiss_word2vec.model'
subject_faiss_w2v_path = f'{subject_model_save_path}faiss_word2vec.model'

# faiss models trained on item names. The 'fassi' spelling is part of both the
# variable names and the file names and is kept unchanged.
shixuns_fassi_model_path = f'{shixun_model_save_path}item_content_fassi.model'
subjects_fassi_model_path = f'{subject_model_save_path}item_content_fassi.model'

# PinSage models are saved directly into the model directories.
shixun_pinsage_model_path = shixun_model_save_path
subject_pinsage_model_path = subject_model_save_path

# faiss models built from YoutubeDNN training.
shixun_youtube_item_faiss_model_path = f'{shixun_model_save_path}youtube_item_faiss.model'
shixun_youtube_user_faiss_model_path = f'{shixun_model_save_path}youtube_user_faiss.model'
subject_youtube_item_faiss_model_path = f'{subject_model_save_path}youtube_item_faiss.model'
subject_youtube_user_faiss_model_path = f'{subject_model_save_path}youtube_user_faiss.model'

# faiss models built from DSSM training.
shixun_dssm_item_faiss_model_path = f'{shixun_model_save_path}dssm_item_faiss.model'
shixun_dssm_user_faiss_model_path = f'{shixun_model_save_path}dssm_user_faiss.model'
subject_dssm_item_faiss_model_path = f'{subject_model_save_path}dssm_item_faiss.model'
subject_dssm_user_faiss_model_path = f'{subject_model_save_path}dssm_user_faiss.model'

# faiss models built from FM training.
shixun_fm_item_faiss_model_path = f'{shixun_model_save_path}fm_item_faiss.model'
shixun_fm_user_faiss_model_path = f'{shixun_model_save_path}fm_user_faiss.model'
subject_fm_item_faiss_model_path = f'{subject_model_save_path}fm_item_faiss.model'
subject_fm_user_faiss_model_path = f'{subject_model_save_path}fm_user_faiss.model'

# faiss models built from MIND training (two user models, suffixed 0 and 1).
shixun_mind_item_faiss_model_path = f'{shixun_model_save_path}mind_item_faiss.model'
shixun_mind_user0_faiss_model_path = f'{shixun_model_save_path}mind_user0_faiss.model'
shixun_mind_user1_faiss_model_path = f'{shixun_model_save_path}mind_user1_faiss.model'

subject_mind_item_faiss_model_path = f'{subject_model_save_path}mind_item_faiss.model'
subject_mind_user0_faiss_model_path = f'{subject_model_save_path}mind_user0_faiss.model'
subject_mind_user1_faiss_model_path = f'{subject_model_save_path}mind_user1_faiss.model'

# Word2Vec vectors generated from the item sequences selected by users.
shixuns_item_word2vec_model_path = f'{shixun_model_save_path}item_word2vec.model'
subjects_item_word2vec_model_path = f'{subject_model_save_path}item_word2vec.model'

shixuns_user_word2vec_faiss_model_path = f'{shixun_model_save_path}user_word2vec_faiss.model'
subjects_user_word2vec_faiss_model_path = f'{subject_model_save_path}user_word2vec_faiss.model'
|
|
|
|
|
|
|
|
|
|
# Root of the pretrained-model store. Every path below used to repeat this
# prefix literally; it now lives in one place so relocating the store is a
# one-line change. The resulting strings are unchanged.
_plm_root = '/home/plm_models'

# Wikipedia corpus file used to train word2vec.
word2vec_input_path = f'{_plm_root}/word2vec/wiki.txt'

# Where the trained word2vec vectors are saved, and their dimension.
word2vec_model_path = f'{_plm_root}/word2vec/wiki.model'
word2vec_dim = 100

# Embedding dimension.
embedding_dim = 100

# ELMo pretrained language model.
elmo_model_path = f'{_plm_root}/elmo/zhs.model'

# BERT-family pretrained language models.
bert_pretrain_path = f'{_plm_root}/chinese-bert-wwm-ext'
roberta_pretrain_path = f'{_plm_root}/chinese-roberta-wwm-ext'
ernie20_pretrain_path = f'{_plm_root}/ernie-2.0'
bert_base_chinese = f'{_plm_root}/bert-base-chinese'

# sentence-transformers pretrained language model.
sentence_transformers_model_path = f'{_plm_root}/paraphrase-multilingual-MiniLM-L12-v2'

# LTP model.
ltp_model_path = f'{_plm_root}/ltp/v4/base2'
|
|
|
|
|
|
|
|
|
|
# Whether to use multiple GPUs.
use_multi_gpu = False

# Identifiers of the two tokenization methods.
JIEBA_TOKEN = 'jieba'
LTP_TOKEN = 'ltp'

# Stop-word file.
stop_words_path = f'{root_path}/stopwords/cn_stopwords.txt'

# HNSW parameters.
# Size of the dynamic nearest-neighbour candidate list. The original comment
# says "while searching"; in HNSW this parameter applies while the index is
# being built (the search-time counterpart is a separate `ef` setting).
ef_construction = 3000
# Maximum number of neighbours kept per node of the graph.
M = 64

# Random seed.
RANDOM_SEED = 2023
|
|
|
|
|
|
|
|
|
|
# 创建日志对象
|
|
|
|
|
# One log file per calendar day: <root_path>/logs/<YYYY-MM-DD>.log
logger = create_logger(f"{root_path}/logs/{datetime.now().date()}.log")
|
|
|
|
|
|
|
|
|
|
# 检测cuda是否可用
|
|
|
|
|
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
|
|
|
|
|
|
|
|
|
# 使用的CPU核数
|
|
|
|
|
# Leave two cores free, but never report fewer than one worker: on a machine
# with one or two cores the plain subtraction would give -1 or 0, which is not
# a usable process count.
cpu_count = max(1, multiprocessing.cpu_count() - 2)
|
|
|
|
|
|
|
|
|
|
# Feature columns of the LightGBM ranking model (shixun).
# Spellings such as 'averge', 'hob' and 'hbo' are column names used at runtime
# and must not be "corrected" here.
shixun_lgb_cols = [
    'sim0', 'time_diff0', 'visit_diff0', 'myshixuns_diff0', 'challenges_diff0',
    'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'user_item_sim', 'score',
    'select_size', 'user_time_diff_mean', 'active_level', 'item_time_diff_mean',
    'user_num', 'hot_level', 'user_time_hob1', 'user_time_hob2',
    'visits_hbo', 'myshixuns_hbo', 'challenges_hbo', 'averge_star_hbo',
    'visits', 'trainee', 'myshixuns_count', 'challenges_count', 'averge_star',
    'created_at_ts', 'user_visits', 'grade', 'experience', 'is_trainee_hab',
]
|
|
|
|
|
|
|
|
|
|
# Feature columns of the LightGBM ranking model (subject).
# Column-name spellings ('averge', 'hob', 'hbo') are runtime keys; keep as is.
subject_lgb_cols = [
    'sim0', 'time_diff0', 'visit_diff0', 'study_count_diff0', 'course_study_count_diff0',
    'passed_count_diff0', 'course_used_count_diff0', 'school_used_count_diff0',
    'challenge_count_diff0', 'evaluate_count_diff0', 'video_study_time_diff0',
    'study_pdf_attachment_count_diff0', 'averge_star_diff0',
    'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'user_item_sim', 'score',
    'select_size', 'user_time_diff_mean', 'active_level', 'item_time_diff_mean',
    'user_num', 'hot_level', 'user_time_hob1', 'user_time_hob2',
    'visits_hbo', 'study_count_hbo', 'course_study_count_hbo', 'passed_count_hbo',
    'course_used_count_hbo', 'school_used_count_hbo', 'challenge_count_hbo',
    'evaluate_count_hbo', 'video_study_time_hbo', 'study_pdf_attachment_count_hbo',
    'averge_star_hbo', 'visits', 'study_count', 'course_study_count', 'passed_count',
    'course_used_count', 'school_used_count', 'challenge_count', 'evaluate_count',
    'video_study_time', 'study_pdf_attachment_count', 'averge_star',
    'created_at_ts', 'user_visits', 'grade', 'experience', 'is_disciplines_hab',
]
|
|
|
|
|
|
|
|
|
|
# Dense features of the ranking models (shixun).
# Column-name spellings ('averge', 'hob', 'hbo') are runtime keys; keep as is.
# NOTE(review): 'edu_background' is listed as a dense feature here, while the
# subject configuration lists it among the sparse features — confirm intent.
shixun_rank_dense_fea = [
    'sim0', 'time_diff0', 'visit_diff0', 'myshixuns_diff0', 'challenges_diff0',
    'trainee_diff0', 'averge_star_diff0', 'task_pass_diff0',
    'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank',
    'select_size', 'user_time_diff_mean', 'active_level', 'item_time_diff_mean',
    'seq_length',
    'user_num', 'hot_level', 'user_time_hob1', 'user_time_hob2', 'task_pass_hbo',
    'visits_hbo', 'myshixuns_hbo', 'challenges_hbo', 'averge_star_hbo',
    'visits', 'myshixuns_count', 'challenges_count', 'averge_star', 'task_pass',
    'logins', 'grade', 'edu_background', 'experience',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dense features of the ranking models (subject).
# Column-name spellings ('averge', 'hob', 'hbo') are runtime keys; keep as is.
# NOTE(review): 'stages_shixuns_count_diff0' and 'stage_shixuns_count' differ
# in the 'stage'/'stages' prefix — verify both against the feature table.
subject_rank_dense_fea = [
    'sim0', 'time_diff0', 'visit_diff0', 'study_count_diff0', 'course_study_count_diff0',
    'passed_count_diff0', 'course_used_count_diff0', 'school_used_count_diff0',
    'stages_count_diff0',
    'challenge_count_diff0', 'evaluate_count_diff0', 'video_study_time_diff0',
    'stages_shixuns_count_diff0',
    'study_pdf_attachment_count_diff0', 'averge_star_diff0',
    'stages_count', 'stage_shixuns_count',
    'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank',
    'select_size', 'user_time_diff_mean', 'active_level', 'item_time_diff_mean',
    'seq_length',
    'user_num', 'hot_level', 'user_time_hob1', 'user_time_hob2',
    'visits_hbo', 'study_count_hbo', 'course_study_count_hbo', 'passed_count_hbo',
    'course_used_count_hbo', 'school_used_count_hbo', 'challenge_count_hbo',
    'evaluate_count_hbo', 'video_study_time_hbo', 'study_pdf_attachment_count_hbo',
    'averge_star_hbo', 'visits', 'study_count', 'course_study_count', 'passed_count',
    'course_used_count', 'school_used_count', 'challenge_count', 'evaluate_count',
    'video_study_time', 'study_pdf_attachment_count', 'averge_star',
    'logins', 'grade', 'experience',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 排序模型的sparse特征
|
|
|
|
|
# Sparse (categorical) features of the shixun ranking models.
shixun_rank_sparse_fea = [
    'user_id',
    'shixun_id',
    'gender',
    'school_id',
    'identity',
    'trainee',
    'is_trainee_hab',
]
|
|
|
|
|
# Sparse (categorical) features of the subject ranking models.
subject_rank_sparse_fea = [
    'user_id',
    'subject_id',
    'gender',
    'school_id',
    'identity',
    'edu_background',
    'disciplines_id',
    'is_disciplines_hab',
]
|
|
|
|
|
|
|
|
|
|
# 排序模型特征字段
|
|
|
|
|
# Feature fields of the shixun ranking models.
shixun_rank_feats_columns = [
    'visits',
    'myshixuns_count',
    'challenges_count',
    'averge_star',
]
|
|
|
|
|
|
|
|
|
|
# Feature fields of the subject ranking models.
# 'averge_star' is the runtime column name; keep the spelling.
subject_rank_feats_columns = [
    'visits', 'study_count', 'course_study_count', 'passed_count',
    'course_used_count', 'school_used_count', 'challenge_count', 'evaluate_count',
    'video_study_time', 'study_pdf_attachment_count', 'averge_star',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 用户选择实训最大序列长度
|
|
|
|
|
# Maximum length of a user's shixun selection sequence.
shixun_max_seq_len = 500
|
|
|
|
|
|
|
|
|
|
# 用户选择课程最大序列长度
|
|
|
|
|
# Maximum length of a user's subject (course) selection sequence.
subject_max_seq_len = 300
|
|
|
|
|
|
|
|
|
|
# MySQL connection settings.
# SECURITY NOTE(review): host names and credentials are committed in this
# file. Each setting can now be overridden through an environment variable of
# the same name in upper case (e.g. MYSQL_PASSWD); the literals below remain
# only as backward-compatible defaults and should be rotated and removed from
# version control.
mysql_host = os.environ.get('MYSQL_HOST', "rm-bp13v5020p7828r5rso.mysql.rds.aliyuncs.com")
mysql_user = os.environ.get('MYSQL_USER', "testeducoder")
mysql_passwd = os.environ.get('MYSQL_PASSWD', "TEST@123")
mysql_port = int(os.environ.get('MYSQL_PORT', 3306))
mysql_database = os.environ.get('MYSQL_DATABASE', "preeducoderweb")

# Second MySQL instance.
# NOTE(review): the original comment labels this one "online" (production),
# while the variable names and the host name say "test" — confirm which
# environment it really is.
mysql_test_host = os.environ.get('MYSQL_TEST_HOST', "testeducoder-public.mysql.polardb.rds.aliyuncs.com")
mysql_test_user = os.environ.get('MYSQL_TEST_USER', "testeducoder")
mysql_test_passwd = os.environ.get('MYSQL_TEST_PASSWD', "TEST@123")
mysql_test_port = int(os.environ.get('MYSQL_TEST_PORT', 3306))
mysql_test_database = os.environ.get('MYSQL_TEST_DATABASE', "educoder_kg_search")

# Fixed user / item identifiers used for testing.
test_user_id = 73547
test_shixun_id = 8222
test_shixun_name = 'Pytorch深度学习 - 多尺度目标检测'

test_subject_id = 4
test_subject_name = '数据结构与算法(C语言)'
|
|
|
|
|
|
|
|
|
|
# Learning-path recommendation settings.
# NOTE(review): the 'sample' directory is hard-coded in the two data paths
# below and does not follow samples_mode_flag — confirm this is intended.
study_path = f'{root_path}/study_path_rs/'
study_path_data = f'{study_path}data/sample/'
kg_data = f'{study_path}knowledge_forest_data/sample/'
initial_learning = f'{study_path}knowledge_forest_data/initial_learning/'
|
|
|
|
|
# Neo4j connection settings.
# SECURITY NOTE(review): the address and credentials are committed in this
# file. Each setting can be overridden through the environment variable named
# below; the literals remain only as backward-compatible defaults and should
# be rotated and removed from version control.
neo4j_url = os.environ.get('NEO4J_URL', 'http://47.93.43.185:57474/')
neo4j_username = os.environ.get('NEO4J_USERNAME', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'yongge123123')
|