# -*- coding: utf-8 -*-

import argparse

import pandas as pd
from sklearn import preprocessing

"""
|
|
|
|
|
•PassengerID(ID)
|
|
|
|
|
•Survived(存活与否)
|
|
|
|
|
•Pclass(客舱等级,较为重要)
|
|
|
|
|
•Name(姓名,可提取出更多信息)
|
|
|
|
|
•Sex(性别,较为重要)
|
|
|
|
|
•Age(年龄,较为重要)
|
|
|
|
|
•Parch(直系亲友)
|
|
|
|
|
•SibSp(旁系)
|
|
|
|
|
•Ticket(票编号)
|
|
|
|
|
•Fare(票价)
|
|
|
|
|
•Cabin(客舱编号)
|
|
|
|
|
•Embarked(上船的港口编号)
|
|
|
|
|
"""

# Random forest, used below to impute missing Age values
from sklearn.ensemble import RandomForestRegressor

|
def set_age(df):
    # Pull out the existing numeric features and feed them into a
    # RandomForestRegressor to predict the missing ages
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    y = known_age[:, 0]
    x = known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)

    # Use the fitted model to predict the unknown ages
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[df.Age.isnull(), 'Age'] = predicted_ages

    return df
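
# Example usage (hypothetical, not executed here; assumes Data/train.csv exists):
#   train = set_age(pd.read_csv('Data/train.csv'))
#   assert train.Age.isnull().sum() == 0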


def set_cabin(df):
    # .loc selects the rows matching the condition and the 'Cabin' column,
    # then assigns to those cells: keep only whether a cabin number is present
    df.loc[df.Cabin.notnull(), 'Cabin'] = 1
    df.loc[df.Cabin.isnull(), 'Cabin'] = 0
    print(df.Cabin.isnull().sum())  # verify the fill: should print 0
    return df


def set_fare(df):
    """
    Fill the missing Fare values.

    Fare is mainly determined by Pclass, so fill each missing fare with
    the median fare of that passenger's cabin class.
    """
    df.loc[df.Pclass == 1, 'Fare'] = df[df.Pclass == 1]['Fare'].fillna(df[df.Pclass == 1]['Fare'].median())
    df.loc[df.Pclass == 2, 'Fare'] = df[df.Pclass == 2]['Fare'].fillna(df[df.Pclass == 2]['Fare'].median())
    df.loc[df.Pclass == 3, 'Fare'] = df[df.Pclass == 3]['Fare'].fillna(df[df.Pclass == 3]['Fare'].median())
    print(df[df.Fare.isnull()])  # should now be empty
    return df
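
# The three per-class assignments above can be collapsed into one groupby
# (an equivalent sketch, not what the function above executes):
#   df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda s: s.fillna(s.median()))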


def age_fare_scaler(df):
    # Dummy-encode the categorical features with get_dummies
    dummies_Cabin = pd.get_dummies(df['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')

    # Merge the dummy columns into the dataset and drop the original
    # categorical columns
    df = pd.concat([df, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

    # Age and Fare span much larger ranges than the other features. For
    # logistic regression trained with gradient descent, features on very
    # different scales slow convergence or can prevent it altogether, so
    # standardize them with preprocessing.StandardScaler: rescale each
    # feature to zero mean and unit standard deviation.
    scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = scaler.fit_transform(df[['Age']])
    df['Fare_scaled'] = scaler.fit_transform(df[['Fare']])

    return df
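
# What get_dummies produces, on a toy Series (illustrative sketch):
#   pd.get_dummies(pd.Series(['S', 'C', 'S']), prefix='Embarked')
# yields indicator columns Embarked_C and Embarked_S, with exactly one
# hot entry per row.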


def set_data(df):
    """
    Fill the missing Embarked values with the mode.
    """
    print(df.Embarked.mode())  # the mode is 'S'
    df['Embarked'] = df['Embarked'].fillna('S')

    # Fill the missing Fare values
    df = set_fare(df)

    # Fill Cabin: only whether a cabin number is present is kept
    df = set_cabin(df)

    # Fill the missing Age values
    df = set_age(df)

    df = age_fare_scaler(df)

    return df


def getTitle(name):
    str1 = name.split(',')[1]   # ' Mr. Owen Harris'
    str2 = str1.split('.')[0]   # ' Mr'
    # strip() removes leading/trailing characters (whitespace by default)
    str3 = str2.strip()
    return str3
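
# Example: getTitle('Braund, Mr. Owen Harris') returns 'Mr'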


def set_name(df):
    titleDf = pd.DataFrame()
    # map applies the given function to every element of the Series
    titleDf['Title'] = df['Name'].map(getTitle)

    # Collapse the raw titles into a handful of categories
    title_mapDict = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }

    # map also accepts a dict, replacing each value by its mapped category
    titleDf['Title'] = titleDf['Title'].map(title_mapDict)

    # One-hot encode the title categories with get_dummies
    titleDf = pd.get_dummies(titleDf['Title'])
    # Append the resulting dummy variables to the Titanic dataset
    df = pd.concat([df, titleDf], axis=1)
    return df
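
# Note: set_name is not called from set_data above. If title features are
# wanted, one hypothetical wiring is to run it at the start of set_data,
# before age_fare_scaler (which drops the 'Name' column):
#   df = set_name(df)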


def pre_data(path):
    # Inputs are read from the working directory; outputs are written under `path`
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')

    print("\nMissing values in the raw data:")
    print(train_data.isnull().sum())  # inspect the missing values

    # Process the training set
    train_datapath = path + '/Data/new_train.csv'
    train_data = set_data(train_data)
    train_data.to_csv(train_datapath, index=False)

    # Process the test set
    test_datapath = path + '/Data/new_test.csv'
    test_data = set_data(test_data)
    test_data.to_csv(test_datapath, index=False)

    pd.set_option('display.max_columns', None)
    print(train_data)
    print("\nAfter preprocessing:")
    print(train_data.isnull().sum())  # verify the fills

    return train_data, test_data


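# To run this script from a shell (filename hypothetical; the defaults
# below point at one machine's paths and will likely need adjusting):
#   python preprocess.py --dest_path E:/python1/123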
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Defaults point at the source data and the save locations
    parser.add_argument("--dest_path", default='E:/python1/123', help="files save path")
    parser.add_argument('--train', default='Data/train.csv', help="train data")
    parser.add_argument('--test', default='Data/test.csv', help="test data")
    # Note: --testoutput and --trainoutput are parsed but never used below;
    # the output paths are derived from --dest_path instead
    parser.add_argument("--testoutput", default="E:/python1/123/Data/new_test.csv",
                        help="new test data output path")
    parser.add_argument("--trainoutput", default="E:/python1/123/Data/new_train.csv",
                        help="new train data output path")
    args = parser.parse_args()

    train = pd.read_csv(args.train)
    test = pd.read_csv(args.test)

    print("\nMissing values in the raw data:")
    print(train.isnull().sum())  # inspect the missing values

    # Process the training set
    train_data_path = args.dest_path + '/Data/new_train.csv'
    train = set_data(train)
    train.to_csv(train_data_path, index=False)

    # Process the test set
    test_data_path = args.dest_path + '/Data/new_test.csv'
    test = set_data(test)
    test.to_csv(test_data_path, index=False)

    pd.set_option('display.max_columns', None)
    print(train)
    print("\nAfter preprocessing:")
    print(train.isnull().sum())  # verify the fills