import random
from copy import deepcopy

import numpy as np
import pandas as pd

random.seed(0)


class DataProcess(object):
    """Prepare implicit-feedback training and evaluation samples from a ratings file.

    The input file is read with ``::`` as the separator and four columns:
    ``uid``, ``mid``, ``rating`` and ``timestamp``.  On construction the
    class:

    1. re-indexes users and items to contiguous integers starting at 0,
    2. binarizes the ratings (every rating > 0 becomes 1),
    3. computes, for every user, the set of items the user never interacted
       with plus a fixed random sample of them,
    4. performs a leave-one-out split: the most recent interaction of each
       user goes to ``test_ratings``, everything else to ``train_ratings``.

    Sampling uses the module-level ``random`` generator, which this module
    seeds with 0 at import time.

    Args:
        filename: Path of the ratings file.
        num_test_negatives: Size of the fixed per-user negative sample stored
            in the ``negative_samples`` column of ``self._negatives``.
            Defaults to 99, the value that was previously hard-coded.

    Raises:
        ValueError: If some user has fewer than ``num_test_negatives``
            non-interacted items (raised by ``random.sample``).
        AssertionError: If the train and test splits do not cover the same
            number of users (e.g. a user with a single interaction).
    """

    def __init__(self, filename, num_test_negatives=99):
        self._filename = filename
        self._num_test_negatives = num_test_negatives

        self._loadData()
        self._preProcess()
        self._binarize(self._originalRatings)

        # Pools of all distinct (re-indexed) user and item ids.
        self._userPool = set(self._originalRatings['userId'].unique())
        self._itemPool = set(self._originalRatings['itemId'].unique())

        self._select_Negatives(self._originalRatings)
        self._split_pool(self._preprocessRatings)

    def _loadData(self):
        """Read the raw ratings file into ``self._originalRatings`` and return it."""
        self._originalRatings = pd.read_csv(
            self._filename,
            sep='::',
            header=None,
            names=['uid', 'mid', 'rating', 'timestamp'],
            engine='python',  # multi-character separators need the python engine
        )
        return self._originalRatings

    def _preProcess(self):
        """Re-index users and items to contiguous ids starting at 0.

        Per the original author's note, the model consumes user/item indices
        through embeddings whose sizes equal the number of users and items,
        so every id must lie in ``[0, count)``.  Ids are assigned in order of
        first appearance in the file.  Afterwards ``self._originalRatings``
        has the columns ``['userId', 'itemId', 'rating', 'timestamp']``.
        """
        ratings = self._originalRatings

        # 1. Map each distinct raw uid to a new contiguous userId.
        user_id = ratings[['uid']].drop_duplicates()
        user_id = user_id.assign(userId=np.arange(len(user_id)))
        ratings = pd.merge(ratings, user_id, on=['uid'], how='left')

        # 2. Map each distinct raw mid to a new contiguous itemId.
        item_id = ratings[['mid']].drop_duplicates()
        item_id = item_id.assign(itemId=np.arange(len(item_id)))
        ratings = pd.merge(ratings, item_id, on=['mid'], how='left')

        # Keep only the re-indexed columns, in a fixed order.
        self._originalRatings = ratings[['userId', 'itemId', 'rating', 'timestamp']]

    def _binarize(self, ratings):
        """Store a binarized copy of ``ratings`` in ``self._preprocessRatings``.

        Every rating greater than 0 is replaced by 1 (implicit feedback).
        The frame passed in is left untouched.
        """
        ratings = deepcopy(ratings)
        # Use .loc instead of chained indexing: chained assignment may write
        # to a temporary and is a no-op under pandas Copy-on-Write.
        ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
        self._preprocessRatings = ratings

    def _select_Negatives(self, ratings):
        """Compute every user's negative items and a fixed random sample of them.

        An item counts as negative for a user when the user has no
        interaction with it in ``ratings``.  The result is stored in
        ``self._negatives`` with the columns ``userId``, ``negative_items``
        (set of all negatives) and ``negative_samples`` (list of
        ``num_test_negatives`` sampled negatives).
        """
        # One row per user with the set of items that user interacted with.
        interact_status = (
            ratings.groupby('userId')['itemId']
            .apply(set)
            .reset_index()
            .rename(columns={'itemId': 'interacted_items'})
        )
        # Everything the user did not interact with is treated as negative.
        interact_status['negative_items'] = interact_status['interacted_items'].apply(
            lambda seen: self._itemPool - seen
        )
        # random.sample requires a sequence (sets raise TypeError on 3.11+).
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda pool: random.sample(tuple(pool), self._num_test_negatives)
        )
        self._negatives = interact_status[['userId', 'negative_items', 'negative_samples']]

    def _split_pool(self, ratings):
        """Leave-one-out train/test split.

        For each user the interaction with the largest timestamp (ties broken
        by row order) becomes the test sample; all other interactions form
        the training set.  Results are stored in ``self.train_ratings`` and
        ``self.test_ratings`` with the columns ``userId``, ``itemId``,
        ``rating``.  The frame passed in is not modified.
        """
        # Rank 1 == most recent interaction of the user.
        rank_latest = ratings.groupby(['userId'])['timestamp'].rank(
            method='first', ascending=False
        )
        test = ratings[rank_latest == 1]
        train = ratings[rank_latest > 1]

        # Both splits must cover the same number of users.
        assert train['userId'].nunique() == test['userId'].nunique(), (
            "train and test splits cover a different number of users"
        )
        self.train_ratings = train[['userId', 'itemId', 'rating']]
        self.test_ratings = test[['userId', 'itemId', 'rating']]

    def _build_samples(self, ratings, num_negatives):
        """Expand each row of ``ratings`` into one positive and sampled negatives.

        For every row, in order, the output lists receive the row's
        ``(userId, itemId, rating)`` followed by ``num_negatives`` entries
        ``(userId, negative itemId, 0.0)`` drawn without replacement from the
        user's full negative set.

        Returns:
            Three parallel lists ``(users, items, ratings)``.
        """
        negatives_by_user = dict(
            zip(self._negatives['userId'], self._negatives['negative_items'])
        )
        # random.sample needs a sequence; convert each user's set only once
        # instead of once per rating row.
        pools = {}

        users, items, labels = [], [], []
        for row in ratings.itertuples(index=False):
            user = int(row.userId)
            pool = pools.get(user)
            if pool is None:
                pool = pools[user] = tuple(negatives_by_user[user])

            # Positive sample: the observed interaction and its rating.
            users.append(user)
            items.append(int(row.itemId))
            labels.append(float(row.rating))

            # Negative samples: the target score is always 0.
            for negative in random.sample(pool, num_negatives):
                users.append(user)
                items.append(int(negative))
                labels.append(0.0)
        return users, items, labels

    def sample_generator(self, num_negatives):
        """Build training samples from ``self.train_ratings``.

        Args:
            num_negatives: Number of negatives sampled per positive row.

        Returns:
            Three parallel lists ``(users, items, ratings)``; each positive
            is immediately followed by its ``num_negatives`` negatives.
        """
        return self._build_samples(self.train_ratings, num_negatives)

    def test_generator(self, num_negatives):
        """Build evaluation samples from ``self.test_ratings``.

        Negatives are freshly sampled from each user's full negative set;
        the pre-computed ``negative_samples`` column is not used here.

        Args:
            num_negatives: Number of negatives sampled per test row.

        Returns:
            Three parallel lists ``(users, items, ratings)``; each positive
            is immediately followed by its ``num_negatives`` negatives.
        """
        return self._build_samples(self.test_ratings, num_negatives)