first commit

master
abel 4 years ago
parent 190452e862
commit 224f293b55

@@ -0,0 +1,56 @@
import torch
import numpy as np
import torch.utils.data as Data


def dataProcess(filename, num_users, num_items, train_ratio):
    with open(filename, 'r') as fp:
        lines = fp.readlines()
    num_total_ratings = len(lines)
    user_train_set = set()
    user_test_set = set()
    item_train_set = set()
    item_test_set = set()
    train_r = np.zeros((num_users, num_items))
    test_r = np.zeros((num_users, num_items))
    train_mask_r = np.zeros((num_users, num_items))
    test_mask_r = np.zeros((num_users, num_items))
    # Generate a random permutation of the indices 0..num_total_ratings-1
    random_perm_idx = np.random.permutation(num_total_ratings)
    # Split the ratings into a training set and a test set
    train_idx = random_perm_idx[0:int(num_total_ratings * train_ratio)]
    test_idx = random_perm_idx[int(num_total_ratings * train_ratio):]
    ''' Train '''
    for itr in train_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        train_r[user_idx][item_idx] = int(rating)
        train_mask_r[user_idx][item_idx] = 1
        user_train_set.add(user_idx)
        item_train_set.add(item_idx)
    ''' Test '''
    for itr in test_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        test_r[user_idx][item_idx] = int(rating)
        test_mask_r[user_idx][item_idx] = 1
        user_test_set.add(user_idx)
        item_test_set.add(item_idx)
    return train_r, train_mask_r, test_r, test_mask_r, user_train_set, item_train_set, user_test_set, item_test_set


def Construct_DataLoader(train_r, train_mask_r, batchsize):
    # Each sample is one user's full rating vector together with its observation mask
    torch_dataset = Data.TensorDataset(torch.from_numpy(train_r), torch.from_numpy(train_mask_r))
    return Data.DataLoader(dataset=torch_dataset, batch_size=batchsize, shuffle=True)

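A brief usage sketch for the two helpers above; the sizes and file path are the ones used in the run script at the bottom of this commit, and the shape check is purely illustrative:

# Illustrative only: build the MovieLens 1M rating/mask matrices and batch them.
from AutoRec.dataloader import dataProcess, Construct_DataLoader
train_r, train_mask_r, test_r, test_mask_r, \
    user_train_set, item_train_set, user_test_set, item_test_set = \
    dataProcess('../Data/ml-1m/ratings.dat', num_users=6040, num_items=3952, train_ratio=0.9)
loader = Construct_DataLoader(train_r, train_mask_r, batchsize=100)
for batch_x, batch_mask_x in loader:
    print(batch_x.shape, batch_mask_x.shape)  # torch.Size([100, 3952]) for a full batch
    break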
@@ -0,0 +1,97 @@
import torch
import numpy as np
import torch.nn as nn


class AutoRec(nn.Module):
    """
    AutoRec model: an autoencoder over rating vectors
    (here each input is one user's ratings over all items).
    """
    def __init__(self, config):
        super(AutoRec, self).__init__()
        self._num_items = config['num_items']
        self._hidden_units = config['hidden_units']
        self._lambda_value = config['lambda']
        self._config = config
        # Encoder: rating vector -> hidden representation
        self._encoder = nn.Sequential(
            nn.Linear(self._num_items, self._hidden_units),
            nn.Sigmoid()
        )
        # Decoder: hidden representation -> reconstructed rating vector
        self._decoder = nn.Sequential(
            nn.Linear(self._hidden_units, self._num_items)
        )

    def forward(self, input):
        return self._decoder(self._encoder(input))
    def loss(self, res, input, mask, optimizer):
        cost = 0
        temp = 0
        res = res.double()
        # Squared reconstruction error, restricted to observed ratings by the mask
        cost += ((res - input) * mask).pow(2).sum()
        rmse = cost
        for i in optimizer.param_groups:
            # Find the weight matrices W and V and sum their squared entries for the
            # regularization term; using the parameters directly (rather than .data)
            # keeps this term inside the autograd graph.
            for j in i['params']:
                if j.dim() == 2:
                    temp += j.pow(2).sum()
        cost += temp * self._config['lambda'] * 0.5
        return cost, rmse
    def recommend_user(self, r_u, N):
        """
        :param r_u: a single user's rating vector over all items
        :param N: number of items to recommend
        """
        # Predicted ratings of this user for every item
        predict = self.forward(torch.from_numpy(r_u).float())
        predict = predict.detach().numpy()
        indexs = np.argsort(-predict)[:N]
        return indexs
    def recommend_item(self, user, test_r, N):
        """
        :param user: index of the target user
        :param test_r: full rating matrix (num_users x num_items)
        :param N: number of items to recommend
        """
        # Predict the user's ratings for all items from their rating vector
        predict = self.forward(torch.from_numpy(test_r[user]).float())
        predict = predict.detach().numpy()
        # Indices of the N items with the highest predicted ratings
        indexs = np.argsort(-predict)[:N]
        # Return the predicted ratings of the top-N items, in descending order
        return predict[indexs]
    def evaluate(self, test_r, test_mask_r, user_test_set, user_train_set, item_test_set, item_train_set):
        test_r_tensor = torch.from_numpy(test_r).type(torch.FloatTensor)
        test_mask_r_tensor = torch.from_numpy(test_mask_r).type(torch.FloatTensor)
        res = self.forward(test_r_tensor)
        unseen_user_test_list = list(user_test_set - user_train_set)
        unseen_item_test_list = list(item_test_set - item_train_set)
        for user in unseen_user_test_list:
            for item in unseen_item_test_list:
                if test_mask_r[user, item] == 1:
                    res[user, item] = 3
        mse = ((res - test_r_tensor) * test_mask_r_tensor).pow(2).sum()
        RMSE = mse.detach().cpu().numpy() / (test_mask_r == 1).sum()
        RMSE = np.sqrt(RMSE)
        print('test RMSE : ', RMSE)

    def saveModel(self):
        torch.save(self.state_dict(), self._config['model_name'])

    def loadModel(self, map_location):
        state_dict = torch.load(self._config['model_name'], map_location=map_location)
        self.load_state_dict(state_dict, strict=False)

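For reference, the loss method above implements the regularized AutoRec objective: the squared reconstruction error over observed ratings plus lambda/2 times the summed squared entries of the encoder and decoder weight matrices. A minimal standalone sketch of the same quantity (the function and argument names here are illustrative, not part of the module above):

import torch

def autorec_objective(pred, target, mask, weight_matrices, lam):
    # Squared error over observed entries only (mask is 1 where a rating exists)
    recon = ((pred - target) * mask).pow(2).sum()
    # L2 penalty on the 2-D weight matrices (encoder W and decoder V)
    reg = sum(w.pow(2).sum() for w in weight_matrices)
    return recon + 0.5 * lam * reg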
@@ -0,0 +1,89 @@
import torch
import numpy as np
from AutoRec.dataloader import Construct_DataLoader


def pick_optimizer(network, params):
    optimizer = None
    if params['optimizer'] == 'sgd':
        optimizer = torch.optim.SGD(network.parameters(),
                                    lr=params['sgd_lr'],
                                    momentum=params['sgd_momentum'],
                                    weight_decay=params['l2_regularization'])
    elif params['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(network.parameters(),
                                     lr=params['adam_lr'],
                                     weight_decay=params['l2_regularization'])
    elif params['optimizer'] == 'rmsprop':
        optimizer = torch.optim.RMSprop(network.parameters(),
                                        lr=params['rmsprop_lr'],
                                        alpha=params['rmsprop_alpha'],
                                        momentum=params['rmsprop_momentum'])
    return optimizer
class Trainer(object):
    def __init__(self, model, config):
        self._model = model
        self._config = config
        self._optimizer = pick_optimizer(self._model, self._config)

    def _train_single_batch(self, batch_x, batch_mask_x):
        """
        Train on a single mini-batch.
        """
        if self._config['use_cuda'] is True:
            # Move the batch from CPU to GPU
            batch_x, batch_mask_x = batch_x.cuda(), batch_mask_x.cuda()
        # The model input is a batch of rating vectors; run the forward pass
        ratings_pred = self._model(batch_x.float())
        # Masked squared-error loss with L2 regularization on the weight matrices
        loss, rmse = self._model.loss(res=ratings_pred, input=batch_x, mask=batch_mask_x, optimizer=self._optimizer)
        # Clear gradients first; otherwise they accumulate from the previous mini-batch
        self._optimizer.zero_grad()
        # Backpropagate to compute gradients
        loss.backward()
        # Take an optimizer step to update the parameters
        self._optimizer.step()
        # Extract the loss value as a Python float
        loss = loss.item()
        return loss, rmse
    def _train_an_epoch(self, train_loader, epoch_id, train_mask):
        """
        Train for one epoch, i.e. pass once over every sample in the training set.
        """
        # Put the model in training mode (enables dropout and batch normalization, if any)
        self._model.train()
        total_loss = 0
        total_rmse = 0
        # Iterate over mini-batches produced by the DataLoader
        for batch_id, (batch_x, batch_mask_x) in enumerate(train_loader):
            assert isinstance(batch_x, torch.Tensor)
            assert isinstance(batch_mask_x, torch.Tensor)
            loss, rmse = self._train_single_batch(batch_x, batch_mask_x)
            print('[Training Epoch: {}] Batch: {}, Loss: {}, RMSE: {}'.format(epoch_id, batch_id, loss, rmse))
            total_loss += loss
            total_rmse += rmse
        # Epoch RMSE: accumulated squared error over all observed training ratings
        rmse = np.sqrt(total_rmse.detach().cpu().numpy() / (train_mask == 1).sum())
        print('Training Epoch: {}, Total Loss: {}, total RMSE: {}'.format(epoch_id, total_loss, rmse))
    def train(self, train_r, train_mask_r):
        # Move the model to the GPU if configured to do so
        self.use_cuda()
        for epoch in range(self._config['num_epoch']):
            print('-' * 20 + ' Epoch {} starts '.format(epoch) + '-' * 20)
            # Build a DataLoader over the training ratings and their mask
            data_loader = Construct_DataLoader(train_r, train_mask_r, batchsize=self._config['batch_size'])
            # Train for one epoch
            self._train_an_epoch(data_loader, epoch_id=epoch, train_mask=train_mask_r)

    def use_cuda(self):
        if self._config['use_cuda'] is True:
            assert torch.cuda.is_available(), 'CUDA is not available'
            torch.cuda.set_device(self._config['device_id'])
            self._model.cuda()

    def save(self):
        self._model.saveModel()

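A minimal, CPU-only smoke test of the Trainer above. The configuration values and the random stand-in data are assumptions for illustration only, not the settings used in the run script further down:

import numpy as np
from AutoRec.network import AutoRec
from AutoRec.trainer import Trainer

# Illustrative configuration for a quick CPU run (assumed values)
config = {'num_items': 50, 'hidden_units': 16, 'lambda': 1,
          'optimizer': 'adam', 'adam_lr': 1e-3, 'l2_regularization': 1e-4,
          'num_epoch': 1, 'batch_size': 8, 'use_cuda': False,
          'model_name': 'autorec_smoke_test.model'}
# Random ratings in {0..5}; the mask marks the nonzero entries as observed
train_r = np.random.randint(0, 6, size=(32, 50)).astype(np.float64)
train_mask_r = (train_r > 0).astype(np.float64)
trainer = Trainer(model=AutoRec(config=config), config=config)
trainer.train(train_r, train_mask_r)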
@@ -0,0 +1,170 @@
SUMMARY
================================================================================
These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
made by 6,040 MovieLens users who joined MovieLens in 2000.
USAGE LICENSE
================================================================================
Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:
* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.
* The user must acknowledge the use of the data set in
publications resulting from the use of the data set
(see below for citation information).
* The user may not redistribute the data without separate
permission.
* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.
If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.
CITATION
================================================================================
To acknowledge use of the dataset in publications, please cite the following
paper:
F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872
ACKNOWLEDGEMENTS
================================================================================
Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
set.
FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
================================================================================
The GroupLens Research Project is a research group in the Department of
Computer Science and Engineering at the University of Minnesota. Members of
the GroupLens Research Project are involved in many research projects related
to the fields of information filtering, collaborative filtering, and
recommender systems. The project is led by professors John Riedl and Joseph
Konstan. The project began to explore automated collaborative filtering in
1992, but is most well known for its world wide trial of an automated
collaborative filtering system for Usenet news in 1996. Since then the project
has expanded its scope to research overall information filtering solutions,
integrating in content-based methods as well as improving current collaborative
filtering technology.
Further information on the GroupLens Research project, including research
publications, can be found at the following web site:
http://www.grouplens.org/
GroupLens Research currently operates a movie recommender based on
collaborative filtering:
http://www.movielens.org/
RATINGS FILE DESCRIPTION
================================================================================
All ratings are contained in the file "ratings.dat" and are in the
following format:
UserID::MovieID::Rating::Timestamp
- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings
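As a concrete example, one record of ratings.dat can be parsed in Python like this (the sample line is illustrative; the 0-based index shift matches the dataProcess function earlier in this commit):

line = "1::1193::5::978300760"  # illustrative ratings.dat record
user, item, rating, timestamp = line.split("::")
user_idx, item_idx = int(user) - 1, int(item) - 1  # 0-based indices, as in dataProcess
print(user_idx, item_idx, int(rating))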
USERS FILE DESCRIPTION
================================================================================
User information is in the file "users.dat" and is in the following
format:
UserID::Gender::Age::Occupation::Zip-code
All demographic information is provided voluntarily by the users and is
not checked for accuracy. Only users who have provided some demographic
information are included in this data set.
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:
* 1: "Under 18"
* 18: "18-24"
* 25: "25-34"
* 35: "35-44"
* 45: "45-49"
* 50: "50-55"
* 56: "56+"
- Occupation is chosen from the following choices:
* 0: "other" or not specified
* 1: "academic/educator"
* 2: "artist"
* 3: "clerical/admin"
* 4: "college/grad student"
* 5: "customer service"
* 6: "doctor/health care"
* 7: "executive/managerial"
* 8: "farmer"
* 9: "homemaker"
* 10: "K-12 student"
* 11: "lawyer"
* 12: "programmer"
* 13: "retired"
* 14: "sales/marketing"
* 15: "scientist"
* 16: "self-employed"
* 17: "technician/engineer"
* 18: "tradesman/craftsman"
* 19: "unemployed"
* 20: "writer"
MOVIES FILE DESCRIPTION
================================================================================
Movie information is in the file "movies.dat" and is in the following
format:
MovieID::Title::Genres
- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:
* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist
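Similarly, a movies.dat record splits on "::" with pipe-separated genres (the sample line is illustrative):

line = "1::Toy Story (1995)::Animation|Children's|Comedy"  # illustrative movies.dat record
movie_id, title, genres = line.split("::")
print(int(movie_id), title, genres.split("|"))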

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary file not shown.

@@ -0,0 +1,64 @@
import sys
import os.path as osp

# Make the parent directory importable so the AutoRec package can be found
this_dir = osp.dirname(__file__)
lib_path = osp.join(this_dir, '..')
sys.path.insert(0, lib_path)

import torch
from AutoRec.trainer import Trainer
from AutoRec.network import AutoRec
from AutoRec.dataloader import dataProcess

autorec_config = \
{
    'train_ratio': 0.9,
    'num_epoch': 100,
    'batch_size': 100,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'l2_regularization': 1e-4,
    'num_users': 6040,
    'num_items': 3952,
    'hidden_units': 500,
    'lambda': 1,
    'device_id': 0,
    'use_cuda': True,
    'data_file': '../Data/ml-1m/ratings.dat',
    'model_name': '../TrainedModels/AutoRec.model'
}

if __name__ == "__main__":
    ####################################################################################
    # AutoRec: autoencoder-based collaborative filtering
    ####################################################################################
    train_r, train_mask_r, test_r, test_mask_r, \
        user_train_set, item_train_set, user_test_set, item_test_set = \
        dataProcess(autorec_config['data_file'], autorec_config['num_users'], autorec_config['num_items'], autorec_config['train_ratio'])
    # Instantiate the AutoRec model
    autorec = AutoRec(config=autorec_config)
    ####################################################################################
    # Training phase
    ####################################################################################
    # Instantiate the trainer
    trainer = Trainer(model=autorec, config=autorec_config)
    # Start training
    trainer.train(train_r, train_mask_r)
    # Save the trained model
    trainer.save()
    ####################################################################################
    # Evaluation phase
    ####################################################################################
    # Re-instantiate the model and load the saved weights on the CPU
    autorec = AutoRec(config=autorec_config)
    autorec.loadModel(map_location=torch.device('cpu'))
    # Evaluate performance on the test set
    autorec.evaluate(test_r, test_mask_r, user_test_set=user_test_set, user_train_set=user_train_set, \
                     item_test_set=item_test_set, item_train_set=item_train_set)
    # Recommend 5 items to the first user of the test matrix
    print(autorec.recommend_user(test_r[0], 5))
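Note that recommend_user returns 0-based item indices; since dataProcess stored each rating at MovieID - 1, adding 1 recovers the MovieIDs for lookup in movies.dat. A small illustrative follow-up, continuing from the script above:

top_idx = autorec.recommend_user(test_r[0], 5)
print([int(i) + 1 for i in top_idx])  # MovieIDs of the five recommended movies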