# -*- coding: utf-8 -*-
"""Titanic dataset preprocessing pipeline.

Fills missing values (Embarked, Fare, Cabin, Age), one-hot encodes the
categorical features and standardizes Age/Fare, then writes the cleaned
train/test CSVs.

Feature glossary (translated from the original notes):
    PassengerID - passenger id
    Survived    - survival flag (target)
    Pclass      - cabin class (important)
    Name        - name (titles can be extracted from it)
    Sex         - sex (important)
    Age         - age (important)
    Parch       - number of parents/children aboard
    SibSp       - number of siblings/spouses aboard
    Ticket      - ticket number
    Fare        - ticket fare
    Cabin       - cabin number
    Embarked    - port of embarkation
"""
import argparse

import pandas as pd
from sklearn import preprocessing
# Random-forest regressor used to impute the missing ages.
from sklearn.ensemble import RandomForestRegressor


def set_age(df):
    """Fill missing 'Age' values by regressing age on the numeric features.

    Trains a RandomForestRegressor on rows with a known age, using
    Fare/Parch/SibSp/Pclass as predictors, and writes the predictions back
    into the rows where 'Age' is null.

    :param df: DataFrame with Age/Fare/Parch/SibSp/Pclass columns (Fare,
        Parch, SibSp, Pclass must already be free of NaN).
    :return: the same DataFrame with 'Age' fully populated.
    """
    # Use the already-numeric features as predictors.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # Nothing to impute — sklearn would reject a 0-sample predict call.
    if len(unknown_age) == 0:
        return df

    y = known_age[:, 0]   # target: the known ages
    x = known_age[:, 1:]  # predictors
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)

    # Predict the missing ages and write them back in place.
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[df.Age.isnull(), 'Age'] = predicted_ages
    return df


def set_carbin(df):
    """Binarize 'Cabin': 1 if a cabin number was recorded, 0 otherwise.

    (Name kept as-is for backward compatibility — 'carbin' is a historical
    typo for 'cabin'.)
    """
    # .loc selects the matching rows and assigns only the 'Cabin' column.
    df.loc[df.Cabin.notnull(), 'Cabin'] = 1
    df.loc[df.Cabin.isnull(), 'Cabin'] = 0
    return df


def set_fare(df):
    """Fill missing 'Fare' values.

    Fare correlates mainly with Pclass, so each missing fare is filled with
    the median fare of that passenger's cabin class.
    """
    for pclass in (1, 2, 3):
        mask = df.Pclass == pclass
        df.loc[mask, 'Fare'] = df.loc[mask, 'Fare'].fillna(
            df.loc[mask, 'Fare'].median())
    # Show any fares that are still missing (should be an empty frame).
    print(df[df.Fare.isnull()])
    return df


def age_fare_scaler(df):
    """One-hot encode the categorical features and standardize Age/Fare."""
    # Factorize: turn each categorical column into dummy indicator columns.
    dummies_cabin = pd.get_dummies(df['Cabin'], prefix='Cabin')
    dummies_embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
    dummies_sex = pd.get_dummies(df['Sex'], prefix='Sex')
    dummies_pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')

    # Merge the dummies in and drop the original categorical columns.
    df = pd.concat(
        [df, dummies_cabin, dummies_embarked, dummies_sex, dummies_pclass],
        axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
            axis=1, inplace=True)

    # Age and Fare vary over a much larger range than the dummy columns;
    # for logistic regression / gradient descent that hurts (or prevents)
    # convergence, so scale them to mean 0 / std 1.
    scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = scaler.fit_transform(df[['Age']])
    df['Fare_scaled'] = scaler.fit_transform(df[['Fare']])
    return df


def set_data(df):
    """Run the full cleaning pipeline on one DataFrame and return it."""
    # Fill missing 'Embarked' with the mode (which is 'S').
    print(df.Embarked.mode())
    # Explicit assignment instead of chained inplace fillna, which is
    # deprecated and may silently operate on a copy in modern pandas.
    df['Embarked'] = df['Embarked'].fillna('S')

    # Fill missing fares (class-median).
    df = set_fare(df)
    # Replace Cabin by a has-cabin indicator.
    df = set_carbin(df)
    # Impute missing ages.
    df = set_age(df)
    # Encode categoricals and scale Age/Fare.
    df = age_fare_scaler(df)
    return df


def getTitle(name):
    """Extract the title from a name like 'Braund, Mr. Owen Harris' -> 'Mr'."""
    str1 = name.split(',')[1]    # ' Mr. Owen Harris'
    str2 = str1.split('.')[0]    # ' Mr'
    # strip() removes the surrounding whitespace.
    str3 = str2.strip()
    return str3


def set_name(df):
    """Map each passenger's title to a coarse category and one-hot encode it."""
    titleDf = pd.DataFrame()
    # map applies getTitle to every value of the 'Name' Series.
    titleDf['Title'] = df['Name'].map(getTitle)
    title_mapDict = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }
    # Collapse the raw titles into the coarse categories above.
    titleDf['Title'] = titleDf['Title'].map(title_mapDict)
    # One-hot encode the categories.
    titleDf = pd.get_dummies(titleDf['Title'])
    # Append the dummy columns to the dataset.
    df = pd.concat([df, titleDf], axis=1)
    return df


def pre_data(path):
    """Load, clean and save both train and test sets; return the cleaned frames.

    NOTE(review): the input CSVs are read from the relative 'Data/' folder
    while the outputs are written under `path` — confirm that asymmetry is
    intended before reusing this helper.
    """
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')
    print("\n原始数据缺失情况:")
    print(train_data.isnull().sum())  # show the missing-value counts

    # Clean and save the training set.
    train_datapath = path + '/Data/new_train.csv'
    train_data = set_data(train_data)
    train_data.to_csv(train_datapath, index=False)

    # Clean and save the test set.
    test_datapath = path + '/Data/new_test.csv'
    test_data = set_data(test_data)
    test_data.to_csv(test_datapath, index=False)

    pd.set_option('display.max_columns', None)
    print(train_data)
    print("\n数据处理后情况:")
    print(train_data.isnull().sum())  # verify nothing is missing any more
    return train_data, test_data


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Defaults point at the source data and the save locations.
    parser.add_argument("--dest_path", default='E:/python1/123',
                        help="files save path")
    parser.add_argument('--train', default='Data/train.csv', help="train data")
    parser.add_argument('--test', default='Data/test.csv', help="test data")
    parser.add_argument("--testoutput",
                        default="E:/python1/123/Data/new_test.csv",
                        help="new test data output path")
    parser.add_argument("--trainoutput",
                        default="E:/python1/123/Data/new_train.csv",
                        help="new train data output path")
    args = parser.parse_args()

    train = pd.read_csv(args.train)
    test = pd.read_csv(args.test)
    print("\n原始数据缺失情况:")
    print(train.isnull().sum())  # show the missing-value counts

    # Clean and save the training set.
    train_data_path = args.dest_path + '/Data/new_train.csv'
    train = set_data(train)
    train.to_csv(train_data_path, index=False)

    # Clean and save the test set.
    test_data_path = args.dest_path + '/Data/new_test.csv'
    test = set_data(test)
    test.to_csv(test_data_path, index=False)

    pd.set_option('display.max_columns', None)
    print(train)
    print("\n原始数据处理后情况:")
    print(train.isnull().sum())  # verify nothing is missing any more