You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

168 lines
6.6 KiB

5 months ago
import os
import sys
sys.path.append(os.getcwd())
from tqdm import tqdm
import warnings
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
import collections
import pickle
from libreco.algorithms import PinSage
from libreco.data import DatasetFeat
from config import logger, offline_mode
from config import need_metric_recall
from config import subject_pinsage_model_path
from config import subject_pinsage_recall_dict
from matching.subject.recall_comm import get_all_select_df
from matching.subject.recall_comm import get_user_info_df,get_item_info_df
from matching.subject.recall_comm import metrics_pinsage_recall,get_all_hist_and_last_select
K.set_learning_phase(True)
if tf.__version__ >= '2.0.0':
tf.compat.v1.disable_eager_execution()
tqdm.pandas()
warnings.filterwarnings('ignore')
def reset_state(name):
tf.compat.v1.reset_default_graph()
print("\n", "=" * 30, name, "=" * 30)
def pinsage_recall_train():
"""
pinsage召回训练
"""
# 需要召回的数量
recall_item_num = 100
logger.info("加载物品行为数据")
all_select_df = get_all_select_df(offline=offline_mode)
logger.info("获取物品信息数据")
item_info = get_item_info_df()
logger.info("获取用户信息数据")
users_info = get_user_info_df()
all_select_df = all_select_df.merge(users_info, on='user_id')
all_select_df = all_select_df.merge(item_info,on='subject_id')
# 为了召回评估,提取最后一次选择作为召回评估
# 如果不需要做召回评估直接使用全量的训练集进行召回
if need_metric_recall:
logger.info('获取物品行为数据历史和最后一次选择')
train_hist_select_df, train_last_select_df = get_all_hist_and_last_select(all_select_df)
train_hist_select_df['label'] = 1
train_last_select_df['label'] = 1
# pinsage模型需要
train_hist_select_df.rename(columns={'user_id': 'user','subject_id': 'item'}, inplace=True)
train_last_select_df.rename(columns={'user_id': 'user','subject_id': 'item'}, inplace=True)
else:
train_hist_select_df = all_select_df
train_hist_select_df['label'] = 1
train_hist_select_df.rename(columns={'user_id': 'user','subject_id': 'item'}, inplace=True)
train_last_select_df = all_select_df.sample(frac=0.001)
train_last_select_df['label'] = 1
train_last_select_df.rename(columns={'user_id': 'user','subject_id': 'item'}, inplace=True)
#调试程序简单采样,注意删除临时文件
# train_hist_select_df= train_hist_select_df.sample(frac=0.001)
# print(train_hist_select_df.head())
# 定义特征,指定完整列信息
# sparse_col = ['user_id','subject_id']
dense_col = ['gender', 'identity', 'edu_background','logins','grade','experience',
'visits','stages_count','stage_shixuns_count','shixuns_count','study_count',
'course_study_count','passed_count','challenge_count','evaluate_count',
'study_pdf_attachment_count','averge_star']
user_col = ['gender', 'identity', 'edu_background','logins','grade','experience']
item_col = ['visits','stages_count','stage_shixuns_count','shixuns_count','study_count',
'course_study_count','passed_count','challenge_count','evaluate_count',
'study_pdf_attachment_count','averge_star']
train_data = train_hist_select_df[['user', 'item','gender', 'identity', 'edu_background','logins','grade','experience',
'visits','stages_count','stage_shixuns_count','shixuns_count','study_count',
'course_study_count','passed_count','challenge_count','evaluate_count',
'study_pdf_attachment_count','averge_star','label']]
eval_data = train_last_select_df[['user', 'item','gender', 'identity', 'edu_background','logins','grade','experience',
'visits','stages_count','stage_shixuns_count','shixuns_count','study_count',
'course_study_count','passed_count','challenge_count','evaluate_count',
'study_pdf_attachment_count','averge_star','label']]
# print(type(train_data))
# print(type(eval_data))
# input()
train_data, data_info = DatasetFeat.build_trainset(
train_data, user_col, item_col, dense_col,seed=2023,shuffle=False
)
eval_data = DatasetFeat.build_testset(eval_data)
reset_state("PinSage")
pinsage = PinSage(
"ranking",
data_info,
loss_type="cross_entropy",
paradigm="u2i",
embed_size=32,
n_epochs=10,
lr=3e-4,
lr_decay=False,
reg=None,
batch_size=256,
num_neg=1,
dropout_rate=0.01,
remove_edges=False,
num_layers=1,
num_neighbors=10,
num_walks=10,
neighbor_walk_len=2,
sample_walk_len=5,
termination_prob=0.5,
margin=1.0,
sampler="random",
start_node="random",
focus_start=False,
seed=2023,
)
pinsage.fit(
train_data,
neg_sampling=True,
verbose=2,
shuffle=True
)
# save data_info, 指定模型保存文件夹
data_info.save(path=subject_pinsage_model_path, model_name="pinsage_model")
# 设置 manual=True 使用 `numpy` 保存模型
# 设置 manual=False 使用 `tf.train.Saver` 保存模型
# 设置 inference=True 将只保留预测和推荐所需的变量
pinsage.save(
path=subject_pinsage_model_path, model_name="pinsage_model", manual=True, inference_only=True
)
print("训练结束,模型已保存")
user_recall_items_dict = collections.defaultdict(dict)
logger.info('生成item所有用户的召回列表有得分')
for user_id in tqdm(train_hist_select_df['user'].unique()):
item_list = list(pinsage.recommend_user(user=user_id, n_rec=recall_item_num).values())[0].tolist()
score = []
for i in item_list:
score.append(pinsage.predict(user=user_id, item=i)[0])
user_recall_items_dict[user_id] = list(zip(item_list,tuple(score)))
logger.info('保存pinsage召回结果')
pickle.dump(user_recall_items_dict, open(subject_pinsage_recall_dict, 'wb'))
logger.info('pinsage召回效果评估')
metrics_pinsage_recall(user_recall_items_dict, train_last_select_df, topk=recall_item_num)
if __name__ == '__main__':
print("召回开始训练")
pinsage_recall_train()