# -*- coding: utf-8 -*-
import argparse
import pandas as pd
from sklearn import preprocessing
"""
•PassengerIDID
•Survived(存活与否)
•Pclass客舱等级较为重要
•Name姓名可提取出更多信息
•Sex性别较为重要
•Age年龄较为重要
•Parch直系亲友
•SibSp旁系
•Ticket票编号
•Fare票价
•Cabin客舱编号
•Embarked上船的港口编号
"""
# Random forest, used below to impute missing ages
from sklearn.ensemble import RandomForestRegressor


def set_age(df):
    # Feed the existing numeric features into a RandomForestRegressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    y = known_age[:, 0]   # target: the known ages
    x = known_age[:, 1:]  # inputs: the remaining numeric features
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)
    # Use the fitted model to predict the missing ages
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[df.Age.isnull(), 'Age'] = predicted_ages
    return df
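

# A minimal usage sketch for set_age (hypothetical, assuming the standard
# Kaggle Titanic columns). Fare is one of the regressor's input features,
# so it must be filled before set_age runs:
#   raw = pd.read_csv('Data/train.csv')
#   raw = set_fare(raw)
#   raw = set_age(raw)
#   assert raw.Age.isnull().sum() == 0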
def set_cabin(df):
    # Reduce Cabin to a binary flag: 1 if a cabin number is recorded, 0 otherwise.
    # df.loc selects the rows matching the condition and assigns to the 'Cabin' column.
    df.loc[df.Cabin.notnull(), 'Cabin'] = 1
    df.loc[df.Cabin.isnull(), 'Cabin'] = 0
    print(df.Cabin.isnull().sum())  # verify that no missing values remain
    return df
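

# Equivalent one-liner (a sketch, not used above): casting the notnull() mask
# to int produces the same 0/1 flag in a single assignment:
#   df['Cabin'] = df['Cabin'].notnull().astype(int)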
def set_fare(df):
    """
    Fill missing Fare values.
    Fare depends mainly on Pclass, so each missing fare is filled with
    the median fare of the passenger's cabin class.
    """
    df.loc[df.Pclass == 1, 'Fare'] = df[df.Pclass == 1]['Fare'].fillna(df[df.Pclass == 1]['Fare'].median())
    df.loc[df.Pclass == 2, 'Fare'] = df[df.Pclass == 2]['Fare'].fillna(df[df.Pclass == 2]['Fare'].median())
    df.loc[df.Pclass == 3, 'Fare'] = df[df.Pclass == 3]['Fare'].fillna(df[df.Pclass == 3]['Fare'].median())
    print(df[df.Fare.isnull()])  # should print an empty frame once filling succeeds
    return df
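

# The three per-class statements above can also be written (sketch only) as a
# single groupby-transform, filling each missing fare with its class median:
#   df['Fare'] = df.groupby('Pclass')['Fare'].transform(
#       lambda s: s.fillna(s.median()))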
def age_fare_scaler(df):
    # One-hot encode the categorical features with pd.get_dummies
    dummies_Cabin = pd.get_dummies(df['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')
    # Concatenate the dummy columns onto the frame, then drop the original categorical columns
    df = pd.concat([df, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    """
    Age and Fare vary over much larger numeric ranges than the other features.
    For logistic regression trained with gradient descent, such scale
    differences slow convergence and can prevent it altogether, so both
    columns are standardized.
    """
    # Standardize via preprocessing.StandardScaler.fit_transform
    scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = scaler.fit_transform(df[['Age']])
    df['Fare_scaled'] = scaler.fit_transform(df[['Fare']])
    # Each column now has zero mean and unit variance, which speeds up convergence.
    return df
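

# StandardScaler computes z = (x - mean) / std per column. Note that this
# script calls fit_transform separately on the train and test sets, so each
# set is scaled by its own statistics; a common alternative (sketch only,
# assuming hypothetical train_df/test_df frames) is to fit on the training
# set and reuse that scaler for the test set:
#   scaler = preprocessing.StandardScaler().fit(train_df[['Age', 'Fare']])
#   train_df[['Age_scaled', 'Fare_scaled']] = scaler.transform(train_df[['Age', 'Fare']])
#   test_df[['Age_scaled', 'Fare_scaled']] = scaler.transform(test_df[['Age', 'Fare']])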
def set_data(df):
    """
    Fill missing Embarked values with the mode ('S').
    """
    print(df.Embarked.mode())  # the mode is 'S', used as the fill value below
    df['Embarked'] = df['Embarked'].fillna('S')
    # Fill missing Fare values
    df = set_fare(df)
    # Reduce Cabin to a present/absent flag
    df = set_cabin(df)
    # Fill missing Age values
    df = set_age(df)
    df = age_fare_scaler(df)
    return df
def getTitle(name):
    str1 = name.split(',')[1]  # e.g. ' Mr. Owen Harris'
    str2 = str1.split('.')[0]  # e.g. ' Mr'
    # strip() removes leading/trailing characters (whitespace by default)
    str3 = str2.strip()
    return str3
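

# Worked example for getTitle:
#   getTitle('Braund, Mr. Owen Harris')
#   split(',')[1]  -> ' Mr. Owen Harris'
#   split('.')[0]  -> ' Mr'
#   strip()        -> 'Mr'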
def set_name(df):
    titleDf = pd.DataFrame()
    # map applies getTitle to every entry of the Name Series
    titleDf['Title'] = df['Name'].map(getTitle)
    # Collapse the raw titles into a handful of categories
    title_mapDict = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }
    titleDf['Title'] = titleDf['Title'].map(title_mapDict)
    # One-hot encode the title categories with get_dummies
    titleDf = pd.get_dummies(titleDf['Title'])
    # Append the resulting dummy variables to the dataset
    df = pd.concat([df, titleDf], axis=1)
    return df
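

# Note: set_name is defined but never called from set_data. To use the Title
# dummies, it would have to run before age_fare_scaler drops the 'Name'
# column, e.g. (sketch):
#   df = set_name(df)
#   df = age_fare_scaler(df)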
def pre_data(path):
    # Inputs are read from the relative 'Data/' directory; `path` is only
    # used to build the output paths.
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')
    print("\nMissing values in the raw data:")
    print(train_data.isnull().sum())  # inspect missing values
    # Process the training set
    train_datapath = path + '/Data/new_train.csv'
    train_data = set_data(train_data)
    train_data.to_csv(train_datapath, index=False)
    # Process the test set
    test_datapath = path + '/Data/new_test.csv'
    test_data = set_data(test_data)
    test_data.to_csv(test_datapath, index=False)
    pd.set_option('display.max_columns', None)
    print(train_data)
    print("\nMissing values after processing:")
    print(train_data.isnull().sum())
    return train_data, test_data
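

# Example call (hypothetical destination directory; 'Data/' must contain the
# Kaggle train.csv and test.csv):
#   train_df, test_df = pre_data('E:/python1/123')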
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Defaults point at the input files and the output directory.
    # (--trainoutput/--testoutput are parsed but unused; the output paths
    # below are built from --dest_path instead.)
    parser.add_argument("--dest_path", default='E:/python1/123', help="files save path")
    parser.add_argument('--train', default='Data/train.csv', help="train data")
    parser.add_argument('--test', default='Data/test.csv', help="test data")
    parser.add_argument("--testoutput", default="E:/python1/123/Data/new_test.csv",
                        help="new test data output path")
    parser.add_argument("--trainoutput", default="E:/python1/123/Data/new_train.csv",
                        help="new train data output path")
    args = parser.parse_args()
    train = pd.read_csv(args.train)
    test = pd.read_csv(args.test)
    print("\nMissing values in the raw data:")
    print(train.isnull().sum())  # inspect missing values
    # Process the training set
    train_data_path = args.dest_path + '/Data/new_train.csv'
    train = set_data(train)
    train.to_csv(train_data_path, index=False)
    # Process the test set
    test_data_path = args.dest_path + '/Data/new_test.csv'
    test = set_data(test)
    test.to_csv(test_data_path, index=False)
    pd.set_option('display.max_columns', None)
    print(train)
    print("\nMissing values after processing:")
    print(train.isnull().sum())