|
|
|
|
import pandas as pd
|
|
|
|
|
import numpy as np
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
import warnings
|
|
|
|
|
import pickle
|
|
|
|
|
from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
from config import myshixuns_train_data, myshixuns_test_data
|
|
|
|
|
from config import shixuns_data_path, shixuns_embed_path, shixun_save_path
|
|
|
|
|
from config import shixun_user_item_time_dict_data,shixun_user_item_dict_data
|
|
|
|
|
from config import shixun_item_user_time_dict, data_parent_path
|
|
|
|
|
from config import shixuns_emb_dict,shixuns_bert_em_path
|
|
|
|
|
from config import users_data_path,shixun_merge_emb_path
|
|
|
|
|
from utils import reduce_mem
|
|
|
|
|
from config import logger
|
|
|
|
|
from joblib import Parallel, delayed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tqdm.pandas()
|
|
|
|
|
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
def get_all_select_sample(sample_nums=10000):
|
|
|
|
|
"""
|
|
|
|
|
从训练集中划出一部分数据来调试代码
|
|
|
|
|
"""
|
|
|
|
|
if os.path.exists(data_parent_path + 'myshixuns_train_sample.csv'):
|
|
|
|
|
all_select = pd.read_csv(data_parent_path + 'myshixuns_train_sample.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
reduce_mem(all_select)
|
|
|
|
|
return all_select
|
|
|
|
|
|
|
|
|
|
all_select = pd.read_csv(myshixuns_train_data, sep='\t', encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
# user_id过滤重复
|
|
|
|
|
all_user_ids = all_select.user_id.unique()
|
|
|
|
|
|
|
|
|
|
# 只采集指定数量的user_id
|
|
|
|
|
sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False)
|
|
|
|
|
|
|
|
|
|
# 取出这些user_id选择的物品
|
|
|
|
|
all_select = all_select[all_select['user_id'].isin(sample_user_ids)]
|
|
|
|
|
|
|
|
|
|
# 根据user_id, shixun_id去重
|
|
|
|
|
all_select = all_select.drop_duplicates((['user_id', 'shixun_id']))
|
|
|
|
|
|
|
|
|
|
all_select.to_csv(data_parent_path + 'myshixuns_train_sample.csv', sep='\t', index=False, header=True)
|
|
|
|
|
reduce_mem(all_select)
|
|
|
|
|
return all_select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_all_select_df(offline=True):
|
|
|
|
|
"""
|
|
|
|
|
读取物品数据时分成线上和线下
|
|
|
|
|
线上预测时将测试集中的物品数据合并到总的数据中
|
|
|
|
|
线下验证模型的有效性或者特征的有效性时只使用训练集
|
|
|
|
|
"""
|
|
|
|
|
if offline:
|
|
|
|
|
all_select = pd.read_csv(myshixuns_train_data, sep='\t', encoding='utf-8')
|
|
|
|
|
else:
|
|
|
|
|
train_myshixuns = pd.read_csv(myshixuns_train_data, sep='\t', encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
test_myshixuns = pd.read_csv(myshixuns_test_data, sep='\t', encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
all_select = train_myshixuns.append(test_myshixuns)
|
|
|
|
|
|
|
|
|
|
all_select = all_select.drop_duplicates((['user_id', 'shixun_id']))
|
|
|
|
|
reduce_mem(all_select)
|
|
|
|
|
return all_select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_info_df():
|
|
|
|
|
"""
|
|
|
|
|
读取用户的基本属性
|
|
|
|
|
"""
|
|
|
|
|
user_info_df = pd.read_csv(users_data_path, sep='\t', encoding='utf-8', quoting=3)
|
|
|
|
|
reduce_mem(user_info_df)
|
|
|
|
|
|
|
|
|
|
return user_info_df
|
|
|
|
|
|
|
|
|
|
def get_item_eminfo_df():
|
|
|
|
|
"""
|
|
|
|
|
读取实训的属性和em
|
|
|
|
|
"""
|
|
|
|
|
item_eminfo_df = pd.read_csv(shixun_merge_emb_path, sep='\t', encoding='utf-8', quoting=3)
|
|
|
|
|
reduce_mem(item_eminfo_df)
|
|
|
|
|
|
|
|
|
|
return item_eminfo_df
|
|
|
|
|
|
|
|
|
|
def get_item_info_df():
|
|
|
|
|
"""
|
|
|
|
|
读取物品的基本属性
|
|
|
|
|
"""
|
|
|
|
|
item_info_df = pd.read_csv(shixuns_data_path, sep='\t', encoding='utf-8')
|
|
|
|
|
reduce_mem(item_info_df)
|
|
|
|
|
|
|
|
|
|
return item_info_df
|
|
|
|
|
|
|
|
|
|
def get_item_bert_info_df():
|
|
|
|
|
"""
|
|
|
|
|
读取物品的基本属性和bert编码
|
|
|
|
|
"""
|
|
|
|
|
item_info_df = pd.read_csv(shixuns_bert_em_path, sep='\t', encoding='utf-8')
|
|
|
|
|
reduce_mem(item_info_df)
|
|
|
|
|
|
|
|
|
|
return item_info_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_recall_item_info_df(item_info_df, shixun_id_list):
|
|
|
|
|
"""
|
|
|
|
|
读取物品的基本属性
|
|
|
|
|
"""
|
|
|
|
|
recall_item_info_df = item_info_df[item_info_df['shixun_id'].isin(shixun_id_list)]
|
|
|
|
|
return recall_item_info_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_select_item_info(all_select_df, user_id):
|
|
|
|
|
"""
|
|
|
|
|
获取某个用户选择的物品行为数据
|
|
|
|
|
"""
|
|
|
|
|
select_item_df = all_select_df[all_select_df['user_id'] == user_id]
|
|
|
|
|
|
|
|
|
|
return select_item_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_emb_dict():
|
|
|
|
|
"""
|
|
|
|
|
生成和读取物品的Embedding数据
|
|
|
|
|
"""
|
|
|
|
|
# 加载已经保存的Embedding数据
|
|
|
|
|
if os.path.exists(shixuns_emb_dict):
|
|
|
|
|
item_emb_dict = pickle.load(open(shixuns_emb_dict, 'rb'))
|
|
|
|
|
return item_emb_dict
|
|
|
|
|
|
|
|
|
|
# 生成物品的Embedding数据
|
|
|
|
|
item_emb_df = pd.read_csv(shixuns_embed_path, sep='\t', encoding='utf-8')
|
|
|
|
|
|
|
|
|
|
item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]
|
|
|
|
|
item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])
|
|
|
|
|
# 进行归一化
|
|
|
|
|
item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)
|
|
|
|
|
|
|
|
|
|
item_emb_dict = dict(zip(item_emb_df['shixun_id'], item_emb_np))
|
|
|
|
|
pickle.dump(item_emb_dict, open(shixuns_emb_dict, 'wb'))
|
|
|
|
|
|
|
|
|
|
return item_emb_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_item_time_dict(select_df):
|
|
|
|
|
"""
|
|
|
|
|
根据时间获取用户选择的物品序列
|
|
|
|
|
{user1: [(item1, time1), (item2, time2)..]...}
|
|
|
|
|
"""
|
|
|
|
|
def make_item_time_pair(df):
|
|
|
|
|
"""
|
|
|
|
|
构造选物品,选择时间列表
|
|
|
|
|
"""
|
|
|
|
|
return list(zip(df['shixun_id'], df['created_timestamp']))
|
|
|
|
|
|
|
|
|
|
# 加载之前pickel保存的
|
|
|
|
|
if os.path.exists(shixun_user_item_time_dict_data):
|
|
|
|
|
user_item_time_dict = pickle.load(open(shixun_user_item_time_dict_data, 'rb'))
|
|
|
|
|
return user_item_time_dict
|
|
|
|
|
|
|
|
|
|
# 按选择时间排序
|
|
|
|
|
select_df = select_df.sort_values('created_timestamp')
|
|
|
|
|
|
|
|
|
|
# 按用户分组生成用户选择的物品
|
|
|
|
|
user_item_time_df = select_df.groupby('user_id')['shixun_id', 'created_timestamp']. \
|
|
|
|
|
progress_apply(lambda x: make_item_time_pair(x)).reset_index(). \
|
|
|
|
|
rename(columns = {0: 'item_time_list'})
|
|
|
|
|
|
|
|
|
|
# 生成用户选择的物品字典
|
|
|
|
|
user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
|
|
|
|
|
|
|
|
|
|
pickle.dump(user_item_time_dict, open(shixun_user_item_time_dict_data, 'wb'))
|
|
|
|
|
|
|
|
|
|
return user_item_time_dict
|
|
|
|
|
|
|
|
|
|
def get_user_item_dict(select_df):
|
|
|
|
|
"""
|
|
|
|
|
根据时间获取用户选择的物品序列
|
|
|
|
|
{user1: [(item1, visits,challenges_count,averge_star,task_pass,time1), (item2, time2)..]...}
|
|
|
|
|
"""
|
|
|
|
|
def make_item_time_pair(df):
|
|
|
|
|
"""
|
|
|
|
|
构造选物品,选择时间列表
|
|
|
|
|
"""
|
|
|
|
|
return list(zip(df['shixun_id'], df['visits'],df['challenges_count'],
|
|
|
|
|
df['averge_star'],df['task_pass'],df['created_timestamp']))
|
|
|
|
|
|
|
|
|
|
# 加载之前pickel保存的
|
|
|
|
|
if os.path.exists(shixun_user_item_dict_data):
|
|
|
|
|
user_item_dict = pickle.load(open(shixun_user_item_dict_data, 'rb'))
|
|
|
|
|
return user_item_dict
|
|
|
|
|
|
|
|
|
|
# 按选择时间排序
|
|
|
|
|
select_df = select_df.sort_values('created_timestamp')
|
|
|
|
|
|
|
|
|
|
# 按用户分组生成用户选择的物品
|
|
|
|
|
user_item_time_df = select_df.groupby('user_id')['shixun_id','visits' ,'challenges_count',
|
|
|
|
|
'averge_star','task_pass','created_timestamp']. \
|
|
|
|
|
progress_apply(lambda x: make_item_time_pair(x)).reset_index().rename(columns={0: 'item_time_list'})
|
|
|
|
|
|
|
|
|
|
# 生成用户选择的物品字典
|
|
|
|
|
user_item_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
|
|
|
|
|
|
|
|
|
|
pickle.dump(user_item_dict, open(shixun_user_item_dict_data, 'wb'))
|
|
|
|
|
|
|
|
|
|
return user_item_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_user_time_dict(select_df):
|
|
|
|
|
"""
|
|
|
|
|
根据选择时间获取物品被选择的用户序列
|
|
|
|
|
{item1: [(user1, time1), (user2, time2)...]...}
|
|
|
|
|
"""
|
|
|
|
|
def make_user_time_pair(df):
|
|
|
|
|
return list(zip(df['user_id'], df['created_timestamp']))
|
|
|
|
|
|
|
|
|
|
# 加载之前pickel保存的
|
|
|
|
|
if os.path.exists(shixun_item_user_time_dict):
|
|
|
|
|
item_user_time_dict = pickle.load(open(shixun_item_user_time_dict, 'rb'))
|
|
|
|
|
return item_user_time_dict
|
|
|
|
|
|
|
|
|
|
select_df = select_df.sort_values('created_timestamp')
|
|
|
|
|
|
|
|
|
|
item_user_time_df = select_df.groupby('shixun_id')['user_id', 'created_timestamp']. \
|
|
|
|
|
progress_apply(lambda x: make_user_time_pair(x)).reset_index(). \
|
|
|
|
|
rename(columns = {0: 'user_time_list'})
|
|
|
|
|
|
|
|
|
|
item_user_time_dict = dict(zip(item_user_time_df['shixun_id'], item_user_time_df['user_time_list']))
|
|
|
|
|
pickle.dump(item_user_time_dict, open(shixun_item_user_time_dict, 'wb'))
|
|
|
|
|
|
|
|
|
|
return item_user_time_dict
|
|
|
|
|
|
|
|
|
|
def get_hist_and_last_em_select(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取物品行为数据的历史选择和最后一次选择
|
|
|
|
|
"""
|
|
|
|
|
# 如果用户只有一个选择,hist为空,会导致训练的时候这个用户不可见,此时默认泄露一下
|
|
|
|
|
def hist_func(user_df):
|
|
|
|
|
if len(user_df) == 1:
|
|
|
|
|
return user_df
|
|
|
|
|
else:
|
|
|
|
|
return user_df[:-1]
|
|
|
|
|
|
|
|
|
|
def apply_parallel(df_grouped, func):
|
|
|
|
|
results = Parallel(n_jobs=20)(delayed(func)(group) for name, group in df_grouped)
|
|
|
|
|
return pd.concat(results)
|
|
|
|
|
|
|
|
|
|
if os.path.exists(shixun_save_path + 'select_last_df.csv') and \
|
|
|
|
|
os.path.exists(shixun_save_path + 'select_hist_df.csv'):
|
|
|
|
|
select_last_df = pd.read_csv(shixun_save_path + 'select_last_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
select_hist_df = pd.read_csv(shixun_save_path + 'select_hist_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
else:
|
|
|
|
|
all_select = all_select.sort_values(by=['user_id', 'created_timestamp'])
|
|
|
|
|
select_last_df = all_select.groupby('user_id').tail(1)
|
|
|
|
|
|
|
|
|
|
# 多进程并行提升处理速度
|
|
|
|
|
df_grouped = all_select.groupby('user_id')
|
|
|
|
|
select_hist_df = apply_parallel(df_grouped, hist_func)
|
|
|
|
|
select_hist_df = select_hist_df.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
#select_hist_df = all_select.groupby('user_id').progress_apply(hist_func).reset_index(drop = True)
|
|
|
|
|
|
|
|
|
|
select_last_df.to_csv(shixun_save_path + 'select_last_em_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
select_hist_df.to_csv(shixun_save_path + 'select_hist_em_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
|
reduce_mem(select_last_df)
|
|
|
|
|
reduce_mem(select_hist_df)
|
|
|
|
|
return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
def get_hist_and_last_select(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取物品行为数据的历史选择和最后一次选择
|
|
|
|
|
"""
|
|
|
|
|
# 如果用户只有一个选择,hist为空,会导致训练的时候这个用户不可见,此时默认泄露一下
|
|
|
|
|
def hist_func(user_df):
|
|
|
|
|
if len(user_df) == 1:
|
|
|
|
|
return user_df
|
|
|
|
|
else:
|
|
|
|
|
return user_df[:-1]
|
|
|
|
|
|
|
|
|
|
def apply_parallel(df_grouped, func):
|
|
|
|
|
results = Parallel(n_jobs=20)(delayed(func)(group) for name, group in df_grouped)
|
|
|
|
|
return pd.concat(results)
|
|
|
|
|
|
|
|
|
|
if os.path.exists(shixun_save_path + 'select_last_df.csv') and \
|
|
|
|
|
os.path.exists(shixun_save_path + 'select_hist_df.csv'):
|
|
|
|
|
select_last_df = pd.read_csv(shixun_save_path + 'select_last_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
select_hist_df = pd.read_csv(shixun_save_path + 'select_hist_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
else:
|
|
|
|
|
all_select = all_select.sort_values(by=['user_id', 'created_timestamp'])
|
|
|
|
|
select_last_df = all_select.groupby('user_id').tail(1)
|
|
|
|
|
|
|
|
|
|
# 多进程并行提升处理速度
|
|
|
|
|
df_grouped = all_select.groupby('user_id')
|
|
|
|
|
select_hist_df = apply_parallel(df_grouped, hist_func)
|
|
|
|
|
select_hist_df = select_hist_df.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
#select_hist_df = all_select.groupby('user_id').progress_apply(hist_func).reset_index(drop = True)
|
|
|
|
|
|
|
|
|
|
select_last_df.to_csv(shixun_save_path + 'select_last_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
select_hist_df.to_csv(shixun_save_path + 'select_hist_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
|
reduce_mem(select_last_df)
|
|
|
|
|
reduce_mem(select_hist_df)
|
|
|
|
|
return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
def get_cf_hist_and_last_select(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取cf算法所需物品行为数据的历史选择和最后一次选择
|
|
|
|
|
"""
|
|
|
|
|
# 如果用户只有一个选择,hist为空,会导致训练的时候这个用户不可见,此时默认泄露一下
|
|
|
|
|
def hist_func(user_df):
|
|
|
|
|
if len(user_df) == 1:
|
|
|
|
|
return user_df
|
|
|
|
|
else:
|
|
|
|
|
return user_df[:-1]
|
|
|
|
|
|
|
|
|
|
def apply_parallel(df_grouped, func):
|
|
|
|
|
results = Parallel(n_jobs=20)(delayed(func)(group) for name, group in df_grouped)
|
|
|
|
|
return pd.concat(results)
|
|
|
|
|
|
|
|
|
|
if os.path.exists(shixun_save_path + 'cf_select_last_df.csv') and \
|
|
|
|
|
os.path.exists(shixun_save_path + 'cf_select_hist_df.csv'):
|
|
|
|
|
select_last_df = pd.read_csv(shixun_save_path + 'cf_select_last_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
select_hist_df = pd.read_csv(shixun_save_path + 'cf_select_hist_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
else:
|
|
|
|
|
all_select = all_select.sort_values(by=['user_id', 'created_timestamp'])
|
|
|
|
|
select_last_df = all_select.groupby('user_id').tail(1)
|
|
|
|
|
|
|
|
|
|
# 多进程并行提升处理速度
|
|
|
|
|
df_grouped = all_select.groupby('user_id')
|
|
|
|
|
select_hist_df = apply_parallel(df_grouped, hist_func)
|
|
|
|
|
select_hist_df = select_hist_df.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
#select_hist_df = all_select.groupby('user_id').progress_apply(hist_func).reset_index(drop = True)
|
|
|
|
|
|
|
|
|
|
select_last_df.to_csv(shixun_save_path + 'cf_select_last_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
select_hist_df.to_csv(shixun_save_path + 'cf_select_hist_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
|
reduce_mem(select_last_df)
|
|
|
|
|
reduce_mem(select_hist_df)
|
|
|
|
|
return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_hist_and_last_em_select(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取物品行为数据的历史选择和最后一次选择
|
|
|
|
|
"""
|
|
|
|
|
# 如果用户只有一个选择,hist为空,会导致训练的时候这个用户不可见,此时默认泄露一下
|
|
|
|
|
def hist_func(user_df):
|
|
|
|
|
if len(user_df) == 1:
|
|
|
|
|
return user_df
|
|
|
|
|
else:
|
|
|
|
|
return user_df[:-1]
|
|
|
|
|
|
|
|
|
|
def apply_parallel(df_grouped, func):
|
|
|
|
|
results = Parallel(n_jobs=20)(delayed(func)(group) for name, group in df_grouped)
|
|
|
|
|
return pd.concat(results)
|
|
|
|
|
|
|
|
|
|
if os.path.exists(shixun_save_path + 'select_last_em_df.csv') and \
|
|
|
|
|
os.path.exists(shixun_save_path + 'select_hist_em_df.csv'):
|
|
|
|
|
select_last_df = pd.read_csv(shixun_save_path + 'select_last_em_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
select_hist_df = pd.read_csv(shixun_save_path + 'select_hist_em_df.csv', sep='\t', encoding='utf-8')
|
|
|
|
|
else:
|
|
|
|
|
all_select = all_select.sort_values(by=['user_id', 'created_timestamp'])
|
|
|
|
|
select_last_df = all_select.groupby('user_id').tail(1)
|
|
|
|
|
|
|
|
|
|
# 多进程并行提升处理速度
|
|
|
|
|
df_grouped = all_select.groupby('user_id')
|
|
|
|
|
select_hist_df = apply_parallel(df_grouped, hist_func)
|
|
|
|
|
select_hist_df = select_hist_df.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
#select_hist_df = all_select.groupby('user_id').progress_apply(hist_func).reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
select_last_df.to_csv(shixun_save_path + 'select_last_em_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
select_hist_df.to_csv(shixun_save_path + 'select_hist_em_df.csv', sep='\t', index=False, header=True)
|
|
|
|
|
|
|
|
|
|
reduce_mem(select_last_df)
|
|
|
|
|
reduce_mem(select_hist_df)
|
|
|
|
|
return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_rank_hist_and_last_select(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取排序物品行为数据的历史选择和最后一次选择
|
|
|
|
|
"""
|
|
|
|
|
all_select = all_select.sort_values(by=['user_id', 'created_timestamp'])
|
|
|
|
|
select_last_df = all_select.groupby('user_id').tail(1)
|
|
|
|
|
|
|
|
|
|
# 如果用户只有一个选择,hist为空,会导致训练的时候这个用户不可见,此时默认泄露一下
|
|
|
|
|
def hist_func(user_df):
|
|
|
|
|
if len(user_df) == 1:
|
|
|
|
|
return user_df
|
|
|
|
|
else:
|
|
|
|
|
return user_df[:-1]
|
|
|
|
|
|
|
|
|
|
select_hist_df = all_select.groupby('user_id').progress_apply(hist_func).reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
return select_hist_df, select_last_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_info_dict(item_info_df):
|
|
|
|
|
"""
|
|
|
|
|
获取物品id对应的基本属性,保存成字典的形式
|
|
|
|
|
方便后面召回阶段,冷启动阶段直接使用
|
|
|
|
|
"""
|
|
|
|
|
# 创建时间进行归一化
|
|
|
|
|
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
|
|
|
|
|
item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].progress_apply(max_min_scaler)
|
|
|
|
|
|
|
|
|
|
# 实训访问次数
|
|
|
|
|
item_visists_dict = dict(zip(item_info_df['shixun_id'], item_info_df['visits']))
|
|
|
|
|
|
|
|
|
|
# 实训难易程度
|
|
|
|
|
item_trainee_dict = dict(zip(item_info_df['shixun_id'], item_info_df['trainee']))
|
|
|
|
|
|
|
|
|
|
# 实训创建时间
|
|
|
|
|
item_created_time_dict = dict(zip(item_info_df['shixun_id'], item_info_df['created_at_ts']))
|
|
|
|
|
|
|
|
|
|
# 实训评价星数
|
|
|
|
|
item_averge_star_dict = dict(zip(item_info_df['shixun_id'], item_info_df['averge_star']))
|
|
|
|
|
|
|
|
|
|
# 实训选择人数
|
|
|
|
|
item_myshixuns_count_dict = dict(zip(item_info_df['shixun_id'], item_info_df['myshixuns_count']))
|
|
|
|
|
|
|
|
|
|
# 实训关卡数量
|
|
|
|
|
item_challenges_count_dict = dict(zip(item_info_df['shixun_id'], item_info_df['challenges_count']))
|
|
|
|
|
|
|
|
|
|
return item_visists_dict, item_trainee_dict, item_created_time_dict, \
|
|
|
|
|
item_averge_star_dict, item_myshixuns_count_dict, item_challenges_count_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_hist_item_info_dict(all_select):
|
|
|
|
|
"""
|
|
|
|
|
获取用户历史选择的物品信息
|
|
|
|
|
"""
|
|
|
|
|
logger.info("获取用户历史选择的物品信息")
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户历史选择物品平均访问数量的集合字典
|
|
|
|
|
user_hist_item_visits = all_select.groupby('user_id')['visits'].agg('mean').reset_index()
|
|
|
|
|
user_hist_item_visits_dict = dict(zip(user_hist_item_visits['user_id'], user_hist_item_visits['visits']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户选择物品的集合
|
|
|
|
|
user_hist_item_ids_dict = all_select.groupby('user_id')['shixun_id'].agg(set).reset_index()
|
|
|
|
|
user_hist_item_ids_dict = dict(zip(user_hist_item_ids_dict['user_id'], user_hist_item_ids_dict['shixun_id']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户历史选择的物品的难易字典
|
|
|
|
|
user_hist_item_trainee = all_select.groupby('user_id')['trainee'].agg(set).reset_index()
|
|
|
|
|
user_hist_item_trainee_dict = dict(zip(user_hist_item_trainee['user_id'], user_hist_item_trainee['trainee']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户历史选择的物品的评价的星数
|
|
|
|
|
user_averge_star = all_select.groupby('user_id')['averge_star'].agg(set).reset_index()
|
|
|
|
|
user_averge_star_dict = dict(zip(user_averge_star['user_id'], user_averge_star['averge_star']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户历史选择物品平均选择人数的集合字典
|
|
|
|
|
user_hist_item_myshixuns_count = all_select.groupby('user_id')['myshixuns_count'].agg('mean').reset_index()
|
|
|
|
|
user_hist_item_myshixuns_count_dict = dict(zip(user_hist_item_myshixuns_count['user_id'], user_hist_item_myshixuns_count['myshixuns_count']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户历史选择物品平均关卡数量的集合字典
|
|
|
|
|
user_hist_item_challenges_count = all_select.groupby('user_id')['challenges_count'].agg('mean').reset_index()
|
|
|
|
|
user_hist_item_challenges_count_dict = dict(zip(user_hist_item_challenges_count['user_id'], user_hist_item_challenges_count['challenges_count']))
|
|
|
|
|
|
|
|
|
|
# 获取user_id对应的用户最后一次选择的物品的创建时间
|
|
|
|
|
all_select_ = all_select.sort_values('created_at_ts')
|
|
|
|
|
user_last_item_created_time = all_select_.groupby('user_id')['created_at_ts']. \
|
|
|
|
|
progress_apply(lambda x: x.iloc[-1]).reset_index()
|
|
|
|
|
|
|
|
|
|
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
|
|
|
|
|
user_last_item_created_time['created_at_ts'] = user_last_item_created_time[['created_at_ts']]. \
|
|
|
|
|
progress_apply(max_min_scaler)
|
|
|
|
|
|
|
|
|
|
user_last_item_created_time_dict = dict(zip(user_last_item_created_time['user_id'], \
|
|
|
|
|
user_last_item_created_time['created_at_ts']))
|
|
|
|
|
|
|
|
|
|
return user_hist_item_visits_dict, user_hist_item_ids_dict, \
|
|
|
|
|
user_hist_item_trainee_dict, user_last_item_created_time_dict, \
|
|
|
|
|
user_averge_star_dict, user_hist_item_myshixuns_count_dict, \
|
|
|
|
|
user_hist_item_challenges_count_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_item_topk_select(select_df, k):
|
|
|
|
|
"""
|
|
|
|
|
获取被选择次数最多的物品,用来做召回补全
|
|
|
|
|
"""
|
|
|
|
|
topk_select = select_df['shixun_id'].value_counts().index[:k]
|
|
|
|
|
return topk_select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_user_activate_degree_dict(all_select_df):
|
|
|
|
|
"""
|
|
|
|
|
将用户的选择次数作为获取用户活跃度的指标
|
|
|
|
|
"""
|
|
|
|
|
all_select_df_ = all_select_df.groupby('user_id')['shixun_id'].count().reset_index()
|
|
|
|
|
|
|
|
|
|
# 用户活跃度归一化
|
|
|
|
|
mm = MinMaxScaler()
|
|
|
|
|
all_select_df_['shixun_id'] = mm.fit_transform(all_select_df_[['shixun_id']])
|
|
|
|
|
user_activate_degree_dict = dict(zip(all_select_df_['user_id'], all_select_df_['shixun_id']))
|
|
|
|
|
|
|
|
|
|
return user_activate_degree_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def metrics_recall(user_recall_items_dict, train_last_select_df, topk=10):
|
|
|
|
|
"""
|
|
|
|
|
召回评估,依次评估召回的前 10, 20, 30,...topk/10 个物品的击中率
|
|
|
|
|
"""
|
|
|
|
|
# 生成用户最后选择物品的字典
|
|
|
|
|
last_select_item_dict = dict(zip(train_last_select_df['user_id'], train_last_select_df['shixun_id']))
|
|
|
|
|
|
|
|
|
|
# 用户数量
|
|
|
|
|
user_num = len(user_recall_items_dict)
|
|
|
|
|
print(user_num)
|
|
|
|
|
|
|
|
|
|
for k in range(10, topk + 1, 10):
|
|
|
|
|
hit_num = 0
|
|
|
|
|
for user_id, item_list in user_recall_items_dict.items():
|
|
|
|
|
# 获取前k个召回的结果
|
|
|
|
|
tmp_recall_items = [x[0] for x in user_recall_items_dict[user_id][:k]]
|
|
|
|
|
|
|
|
|
|
if last_select_item_dict[user_id] in set(tmp_recall_items):
|
|
|
|
|
hit_num += 1
|
|
|
|
|
|
|
|
|
|
hit_rate = round(hit_num * 1.0 / user_num, 5)
|
|
|
|
|
print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
|
|
|
|
|
|
|
|
|
|
def metrics_pinsage_recall(user_recall_items_dict, train_last_select_df, topk=10):
|
|
|
|
|
"""
|
|
|
|
|
召回评估,依次评估召回的前 10, 20, 30,...topk/10 个物品的击中率
|
|
|
|
|
"""
|
|
|
|
|
# 生成用户最后选择物品的字典
|
|
|
|
|
last_select_item_dict = dict(zip(train_last_select_df['user'], train_last_select_df['item']))
|
|
|
|
|
|
|
|
|
|
# 用户数量
|
|
|
|
|
user_num = len(user_recall_items_dict)
|
|
|
|
|
print(user_num)
|
|
|
|
|
|
|
|
|
|
for k in range(10, topk + 1, 10):
|
|
|
|
|
hit_num = 0
|
|
|
|
|
for user_id, item_list in user_recall_items_dict.items():
|
|
|
|
|
# 获取前k个召回的结果
|
|
|
|
|
tmp_recall_items = [x[0] for x in user_recall_items_dict[user_id][:k]]
|
|
|
|
|
|
|
|
|
|
if last_select_item_dict[user_id] in set(tmp_recall_items):
|
|
|
|
|
hit_num += 1
|
|
|
|
|
|
|
|
|
|
hit_rate = round(hit_num * 1.0 / user_num, 5)
|
|
|
|
|
print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|