import numpy as np
import pandas as pd


class DataLoader:
    """Load a CSV file and turn it into sliding windows for the LSTM model.

    The selected columns are split row-wise into a training part and a test
    part.  Every window is ``seq_len`` consecutive rows: the first
    ``seq_len - 1`` rows form the input ``x`` and column 0 of the last row
    forms the target ``y``.
    """

    def __init__(self, filename, split, cols):
        """Read ``filename`` and split the selected columns into train/test.

        Args:
            filename: Path of the data file, in '.csv' format.
            split: Fraction of the rows used for training; the first
                ``int(len(dataframe) * split)`` rows are the training data
                and the remaining rows are the test data.
            cols: One or more column names to analyse, e.g. Close and
                Volume.  The windowing methods index rows *and* columns, so
                they expect a list of names (2-D data).
        """
        dataframe = pd.read_csv(filename)
        i_split = int(len(dataframe) * split)
        # Pick the requested columns once, then cut the raw (unprocessed)
        # values into the training and test parts.
        values = dataframe.get(cols).values
        self.data_train = values[:i_split]
        self.data_test = values[i_split:]
        self.len_train = len(self.data_train)
        self.len_test = len(self.data_test)
        self.len_train_windows = None

    def get_test_data(self, seq_len, normalise):
        """Create x, y test data windows.

        Warning: batch method, not generative; make sure you have enough
        memory to load the data, otherwise reduce the size of the training
        split.

        Args:
            seq_len: Number of rows per window.
            normalise: If true, normalise every window with
                ``normalise_windows``.

        Returns:
            Tuple ``(x, y)``: ``x`` holds the first ``seq_len - 1`` rows of
            every window, ``y`` holds column 0 of every window's last row.

        Raises:
            IndexError: If there are no windows at all
                (``len_test <= seq_len``), because the empty window array
                cannot be indexed along rows and columns.
        """
        # Each element is one window of seq_len consecutive rows.
        data_windows = [
            self.data_test[i:i + seq_len]
            for i in range(self.len_test - seq_len)
        ]
        data_windows = np.array(data_windows).astype(float)
        if normalise:
            data_windows = self.normalise_windows(
                data_windows, single_window=False
            )

        x = data_windows[:, :-1]
        y = data_windows[:, -1, [0]]
        return x, y

    def get_train_data(self, seq_len, normalise):
        """Create x, y train data windows.

        Warning: batch method, not generative; make sure you have enough
        memory to load the data, otherwise use ``generate_train_batch``.

        Args:
            seq_len: Number of rows per window.
            normalise: If true, normalise every window individually.

        Returns:
            Tuple ``(x, y)`` of arrays with one entry per training window.
        """
        pairs = [
            self._next_window(i, seq_len, normalise)
            for i in range(self.len_train - seq_len)
        ]
        data_x = [x for x, _ in pairs]
        data_y = [y for _, y in pairs]
        return np.array(data_x), np.array(data_y)

    def generate_train_batch(self, seq_len, batch_size, normalise):
        """Yield the training windows in batches of at most ``batch_size``.

        Every training window is yielded exactly once.  When the number of
        windows does not divide evenly by ``batch_size`` the last batch is
        smaller; the generator then stops.

        Args:
            seq_len: Number of rows per window.
            batch_size: Maximum number of windows per batch (at least 1).
            normalise: If true, normalise every window individually.

        Yields:
            Tuple ``(x_batch, y_batch)`` of arrays.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 (raised when the
                generator is first advanced).
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        n_windows = self.len_train - seq_len
        # Walk the window indices in chunks so the final, possibly smaller,
        # batch is emitted once and no index past the last window is used.
        for start in range(0, n_windows, batch_size):
            stop = min(start + batch_size, n_windows)
            x_batch = []
            y_batch = []
            for i in range(start, stop):
                x, y = self._next_window(i, seq_len, normalise)
                x_batch.append(x)
                y_batch.append(y)
            yield np.array(x_batch), np.array(y_batch)

    def _next_window(self, i, seq_len, normalise):
        """Generate the training window that starts at row index ``i``.

        Returns:
            Tuple ``(x, y)``: ``x`` is every row of the window except the
            last, ``y`` is a length-1 array with column 0 of the last row.
        """
        window = self.data_train[i:i + seq_len]
        if normalise:
            window = self.normalise_windows(window, single_window=True)[0]
        x = window[:-1]
        # Indexing with the list [0] keeps y as an array holding element 0 of
        # the last row ([0, 2] would pick elements 0 and 2; a plain 0 would
        # return just the scalar).  The label is therefore the first selected
        # column (e.g. Close when it is listed first in cols).
        y = window[-1, [0]]
        return x, y

    def normalise_windows(self, window_data, single_window=False):
        """Normalise windows relative to their first row (base value zero).

        Every value ``p`` in a column becomes ``p / p0 - 1`` where ``p0`` is
        that column's value in the first row of the same window.

        Args:
            window_data: A sequence of 2-D windows, or one 2-D window when
                ``single_window`` is true.
            single_window: Treat ``window_data`` as a single window.

        Returns:
            Float array with one normalised window per input window (a
            leading axis of length 1 for a single window).

        Raises:
            ZeroDivisionError: If the first row of a window contains a zero.
        """
        windows = [window_data] if single_window else window_data
        normalised_data = []
        for window in windows:
            window = np.asarray(window, dtype=float)
            if window.shape[0] == 0:
                # Nothing to scale in an empty window.
                normalised_data.append(window)
                continue
            base = window[0]
            if np.any(base == 0):
                # Fail loudly instead of letting NumPy produce inf/nan.
                raise ZeroDivisionError(
                    "cannot normalise a window whose first row contains zero"
                )
            # Broadcasting divides every row by the first row, column-wise.
            normalised_data.append(window / base - 1)
        return np.array(normalised_data)