# -*- coding: utf-8 -*-
import argparse
import pandas as pd
from sklearn import preprocessing
"""
•PassengerIDID
•Survived(存活与否)
•Pclass客舱等级较为重要
•Name姓名可提取出更多信息
•Sex性别较为重要
•Age年龄较为重要
•Parch直系亲友
•SibSp旁系
•Ticket票编号
•Fare票价
•Cabin客舱编号
•Embarked上船的港口编号
"""
# Random forest, used below to impute missing ages
from sklearn.ensemble import RandomForestRegressor
def set_age(df):
    # Take the existing numeric features and feed them into a RandomForestRegressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    y = known_age[:, 0]   # target: the known ages
    x = known_age[:, 1:]  # features: Fare, Parch, SibSp, Pclass
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)
    # Use the fitted model to predict the unknown ages
    predictedAges = rfr.predict(unknown_age[:, 1:])
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df
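# A minimal sketch of set_age on hypothetical rows (not part of the pipeline):
# the single missing Age is replaced by the forest's prediction.
def _demo_set_age():
    demo = pd.DataFrame({'Age': [22.0, 38.0, None, 35.0],
                         'Fare': [7.25, 71.28, 8.05, 53.10],
                         'Parch': [0, 0, 0, 0],
                         'SibSp': [1, 1, 0, 1],
                         'Pclass': [3, 1, 3, 1]})
    demo = set_age(demo)
    assert demo.Age.isnull().sum() == 0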
def set_carbin(df):
    # loc selects the rows matching the condition; naming 'Cabin' narrows the
    # selection to that column, and the assignment overwrites just those cells,
    # turning Cabin into a has-cabin indicator (1 = known, 0 = missing)
    df.loc[df.Cabin.notnull(), 'Cabin'] = 1
    df.loc[df.Cabin.isnull(), 'Cabin'] = 0
    df.Cabin.isnull().sum()  # sanity check: should now be 0 missing values
    return df
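# Sketch of the Cabin binarization on toy values (hypothetical, unused):
def _demo_set_carbin():
    demo = pd.DataFrame({'Cabin': ['C85', None, 'E46']})
    demo = set_carbin(demo)
    assert list(demo.Cabin) == [1, 0, 1]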
def set_fare(df):
    """
    Fill missing Fare values.
    Fare is mainly driven by Pclass: for a passenger of a given class,
    fill with the median fare of that class.
    """
    df.loc[df.Pclass == 1, 'Fare'] = df[df.Pclass == 1]['Fare'].fillna(df[df.Pclass == 1]['Fare'].median())
    df.loc[df.Pclass == 2, 'Fare'] = df[df.Pclass == 2]['Fare'].fillna(df[df.Pclass == 2]['Fare'].median())
    df.loc[df.Pclass == 3, 'Fare'] = df[df.Pclass == 3]['Fare'].fillna(df[df.Pclass == 3]['Fare'].median())
    print(df[df.Fare.isnull()])  # should print an empty frame
    return df
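# The same per-class median imputation can be written more compactly with
# groupby/transform; a sketch of an equivalent alternative, not used here:
def _fill_fare_by_class(df):
    df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda s: s.fillna(s.median()))
    return df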
def age_fare_scaler(df):
    # One-hot encode the categorical features
    dummies_Cabin = pd.get_dummies(df['Cabin'], prefix='Cabin')
    dummies_Embarked = pd.get_dummies(df['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(df['Pclass'], prefix='Pclass')
    df = pd.concat([df, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
    # print(df)
    """
    Age and Fare vary over very different numeric ranges. For logistic
    regression trained with gradient descent, features on such different
    scales badly slow convergence or prevent it altogether, so we first use
    scikit-learn's preprocessing module to scale both of them. Scaling squashes
    a wide-ranging feature into a small standard range; StandardScaler
    standardizes each feature to zero mean and unit variance.
    """
    scaler = preprocessing.StandardScaler()
    df['Age_scaled'] = scaler.fit_transform(df[['Age']]).ravel()
    df['Fare_scaled'] = scaler.fit_transform(df[['Fare']]).ravel()
    return df
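# Quick StandardScaler demo on toy fares (hypothetical, unused): the scaled
# values have mean ~0 and unit variance, which is what keeps gradient descent
# well-conditioned.
def _demo_scaling():
    demo = pd.DataFrame({'Fare': [7.25, 71.28, 8.05, 512.33]})
    scaled = preprocessing.StandardScaler().fit_transform(demo[['Fare']]).ravel()
    assert abs(scaled.mean()) < 1e-6 and abs(scaled.std() - 1.0) < 1e-6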
def set_data(df):
    """
    Fill missing Embarked values with the mode ('S').
    """
    print(df.Embarked.mode())
    df['Embarked'].fillna('S', inplace=True)
    """
    Fill missing Fare values.
    """
    df = set_fare(df)
    """
    Fill Cabin: 1,014 values are missing across the train and test sets, so
    use the presence or absence of a Cabin value as the feature.
    """
    df = set_carbin(df)
    """
    Handle missing Age values.
    """
    df = set_age(df)
    df = age_fare_scaler(df)
    return df
def getTitle(name):
    str1 = name.split(',')[1]  # ' Mr. Owen Harris'
    str2 = str1.split('.')[0]  # ' Mr'
    # strip() removes leading/trailing characters (whitespace by default)
    str3 = str2.strip()
    return str3
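# Walkthrough of getTitle on the dataset's "Last, Title. First" name format:
# "Braund, Mr. Owen Harris" -> " Mr. Owen Harris" -> " Mr" -> "Mr"
assert getTitle("Braund, Mr. Owen Harris") == "Mr"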
def set_name(df):
    titleDf = pd.DataFrame()
    # map applies a function to every element of the Series
    titleDf['Title'] = df['Name'].map(getTitle)
    # Collapse the raw titles into a handful of categories
    title_mapDict = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }
    titleDf['Title'] = titleDf['Title'].map(title_mapDict)
    # One-hot encode the categories with get_dummies
    titleDf = pd.get_dummies(titleDf['Title'])
    # Append the resulting dummy variables to the Titanic dataframe
    df = pd.concat([df, titleDf], axis=1)
    return df
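# Sketch of the columns set_name adds, on a hypothetical two-row frame (note
# that set_name is defined here but never wired into set_data):
def _demo_set_name():
    demo = pd.DataFrame({'Name': ['Braund, Mr. Owen Harris',
                                  'Cumings, Mrs. John Bradley (Florence Briggs Thayer)']})
    demo = set_name(demo)
    print(demo.columns.tolist())  # ['Name', 'Mr', 'Mrs'] for these two rows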
def pre_data(path):
    # The CSVs are read relative to the working directory; outputs go under path
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')
    print("\nMissing values in the raw data:")
    print(train_data.isnull().sum())  # inspect the missing-value counts
    # Process the training set
    train_datapath = path + '/Data/new_train.csv'
    train_data = set_data(train_data)
    train_data.to_csv(train_datapath, index=False)
    # Process the test set
    test_datapath = path + '/Data/new_test.csv'
    test_data = set_data(test_data)
    test_data.to_csv(test_datapath, index=False)
    pd.set_option('display.max_columns', None)
    print(train_data)
    print("\nMissing values after processing:")
    print(train_data.isnull().sum())  # confirm no missing values remain
    return train_data, test_data
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Defaults give the input data locations and the save paths for results
    parser.add_argument("--dest_path", default='E:/python1/123', help="files save path")
    parser.add_argument('--train', default='Data/train.csv', help="train data")
    parser.add_argument('--test', default='Data/test.csv', help="test data")
    parser.add_argument("--testoutput", default="E:/python1/123/Data/new_test.csv",
                        help="new test data output path")
    parser.add_argument("--trainoutput", default="E:/python1/123/Data/new_train.csv",
                        help="new train data output path")
    args = parser.parse_args()
    train = pd.read_csv(args.train)
    test = pd.read_csv(args.test)
    print("\nMissing values in the raw data:")
    print(train.isnull().sum())  # inspect the missing-value counts
    # Process the training set (note: the output paths are built from
    # --dest_path; --trainoutput/--testoutput are parsed but not used below)
    train_data_path = args.dest_path + '/Data/new_train.csv'
    train = set_data(train)
    train.to_csv(train_data_path, index=False)
    # Process the test set
    test_data_path = args.dest_path + '/Data/new_test.csv'
    test = set_data(test)
    test.to_csv(test_data_path, index=False)
    pd.set_option('display.max_columns', None)
    print(train)
    print("\nMissing values after processing:")
    print(train.isnull().sum())  # confirm no missing values remain