Delete 'dataset_pre.py'

main
p7jo5irft 10 months ago
parent b1becd5e05
commit cd09059932

@ -1,220 +0,0 @@
# -*-coding:utf-8-*-
import argparse
import pandas as pd
from sklearn import preprocessing
"""
PassengerIDID
Survived(存活与否)
Pclass客舱等级较为重要
Name姓名可提取出更多信息
Sex性别较为重要
Age年龄较为重要
Parch直系亲友
SibSp旁系
Ticket票编号
Fare票价
Cabin客舱编号
Embarked上船的港口编号
"""
# 随机森林算法
from sklearn.ensemble import RandomForestRegressor
def set_age(df):
    """Fill missing 'Age' values by predicting them with a random forest.

    Trains a RandomForestRegressor on the rows whose age is known, using the
    numeric features Fare/Parch/SibSp/Pclass, then predicts the missing ages
    and writes them back into `df` in place.

    :param df: DataFrame with Age, Fare, Parch, SibSp and Pclass columns.
    :return: the same DataFrame, with 'Age' fully populated.
    """
    # Pull out the numeric features to feed the RandomForestRegressor.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # Nothing to impute -> skip training entirely; predict() on an empty
    # feature matrix would raise.
    if len(unknown_age) == 0:
        return df
    y = known_age[:, 0]   # target: the known ages
    x = known_age[:, 1:]  # features: everything except Age
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(x, y)
    # Predict the missing ages with the fitted model and write them back.
    predicted_ages = rfr.predict(unknown_age[:, 1:])
    df.loc[df.Age.isnull(), 'Age'] = predicted_ages
    return df
def set_carbin(df):
    """Binarize 'Cabin' in place: 1 when a cabin value is present, else 0.

    Cabin is mostly missing, so only the presence/absence signal is kept.

    :param df: DataFrame with a 'Cabin' column.
    :return: the same DataFrame with 'Cabin' mapped to 0/1.
    """
    has_cabin = df.Cabin.notnull()
    df.loc[has_cabin, 'Cabin'] = 1
    df.loc[~has_cabin, 'Cabin'] = 0
    df.Cabin.isnull().sum()  # sanity check: no nulls should remain
    return df
def set_fare(df):
    """Fill missing 'Fare' values with the median fare of the same Pclass.

    Fare correlates strongly with cabin class, so each missing fare is
    replaced by the median fare of that passenger's Pclass.  The original
    implementation repeated the same statement for classes 1/2/3; this loop
    is behaviorally identical and covers all three classes in one place.

    :param df: DataFrame with 'Pclass' and 'Fare' columns.
    :return: the same DataFrame with 'Fare' filled for classes 1-3.
    """
    for pclass in (1, 2, 3):
        mask = df.Pclass == pclass
        class_median = df.loc[mask, 'Fare'].median()
        df.loc[mask, 'Fare'] = df.loc[mask, 'Fare'].fillna(class_median)
    print(df[df.Fare.isnull()])  # should print an empty frame after filling
    return df
def age_fare_scaler(df):
    """One-hot encode the categorical columns and standardize Age/Fare.

    Cabin/Embarked/Sex/Pclass are expanded into dummy indicator columns and
    the original categorical columns (plus Name/Ticket) are dropped.  Age and
    Fare vary on very different numeric scales, which slows or breaks
    gradient-descent convergence for logistic regression, so both are
    standardized (roughly into [-1, 1]) via StandardScaler.

    :param df: cleaned DataFrame (no missing Age/Fare/Cabin/Embarked).
    :return: a new DataFrame with dummies plus Age_scaled / Fare_scaled.
    """
    dummy_frames = [
        pd.get_dummies(df[col], prefix=col)
        for col in ('Cabin', 'Embarked', 'Sex', 'Pclass')
    ]
    df = pd.concat([df] + dummy_frames, axis=1)
    df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
            axis=1, inplace=True)
    scaler = preprocessing.StandardScaler()
    for col in ('Age', 'Fare'):
        df[col + '_scaled'] = scaler.fit_transform(df[[col]])
    return df
def set_data(df):
    """Run the full cleaning pipeline on a raw Titanic DataFrame.

    Order of operations:
      Embarked (mode fill) -> Fare (class-median fill) -> Cabin (binary
      flag) -> Age (random-forest imputation) -> one-hot encoding + scaling.

    :param df: raw DataFrame as read from train.csv / test.csv.
    :return: the fully cleaned, encoded and scaled DataFrame.
    """
    # Embarked: only a couple of values are missing; fill with the mode ('S').
    print(df.Embarked.mode())
    df['Embarked'].fillna('S', inplace=True)
    # Fare: fill with the median fare of the passenger's class.
    df = set_fare(df)
    # Cabin: ~1014 values missing; keep only a present/absent flag.
    df = set_carbin(df)
    # Age: predict missing ages from the other numeric features.
    df = set_age(df)
    # Finally, encode categoricals and scale Age/Fare.
    df = age_fare_scaler(df)
    return df
def getTitle(name):
    """Extract the social title from a 'Last, Title. First' formatted name.

    e.g. "Braund, Mr. Owen Harris" -> " Mr. Owen Harris" -> " Mr" -> "Mr"

    :param name: full passenger name string.
    :return: the bare title with surrounding whitespace removed.
    """
    after_comma = name.split(',')[1]
    title = after_comma.split('.')[0]
    return title.strip()
def set_name(df):
    """Derive a coarse title category from each name and one-hot encode it.

    The raw title (Mr, Mlle, Capt, ...) is extracted with getTitle and then
    collapsed into a small set of buckets (Officer/Royalty/Mr/Mrs/Miss/
    Master); the buckets are one-hot encoded and appended to `df`.

    :param df: DataFrame with a 'Name' column.
    :return: a new DataFrame with the title dummy columns concatenated.
    """
    # Raw title -> bucket. Unlisted titles map to NaN and get no dummy.
    title_map = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }
    # map applies the function/dict element-wise over the Series.
    bucketed_titles = df['Name'].map(getTitle).map(title_map)
    title_dummies = pd.get_dummies(bucketed_titles)
    # Append the dummy variables to the original frame.
    return pd.concat([df, title_dummies], axis=1)
def pre_data(path):
    """Load the raw CSVs, clean both splits, and persist the results.

    NOTE(review): the raw CSVs are read from the relative 'Data/' directory
    while the outputs are written under `path` — confirm this asymmetry is
    intended (reads ignore the `path` argument).

    :param path: destination directory containing a 'Data' subfolder.
    :return: (train_data, test_data) — the cleaned DataFrames.
    """
    train_data = pd.read_csv('Data/train.csv')
    test_data = pd.read_csv('Data/test.csv')
    print("\n原始数据缺失情况:")
    print(train_data.isnull().sum())
    # Clean and save the training split.
    train_data = set_data(train_data)
    train_data.to_csv(path + '/Data/new_train.csv', index=False)
    # Clean and save the test split.
    test_data = set_data(test_data)
    test_data.to_csv(path + '/Data/new_test.csv', index=False)
    pd.set_option('display.max_columns', None)
    print(train_data)
    print("\n数据处理后情况:")
    print(train_data.isnull().sum())
    return train_data, test_data
if __name__ == '__main__':
    # CLI defaults point at the local material and output locations.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dest_path", default='E:/python1/123', help="files save path")
    parser.add_argument('--train', default='Data/train.csv', help="train data")
    parser.add_argument('--test', default='Data/test.csv', help="test data")
    parser.add_argument("--testoutput", default="E:/python1/123/Data/new_test.csv",
                        help="new test data output path")
    parser.add_argument("--trainoutput", default="E:/python1/123/Data/new_train.csv",
                        help="new train data output path")
    opts = parser.parse_args()

    raw_train = pd.read_csv(opts.train)
    raw_test = pd.read_csv(opts.test)
    print("\n原始数据缺失情况:")
    print(raw_train.isnull().sum())
    # Clean both splits and persist them under dest_path/Data/.
    cleaned_train = set_data(raw_train)
    cleaned_train.to_csv(opts.dest_path + '/Data/new_train.csv', index=False)
    cleaned_test = set_data(raw_test)
    cleaned_test.to_csv(opts.dest_path + '/Data/new_test.csv', index=False)
    pd.set_option('display.max_columns', None)
    print(cleaned_train)
    print("\n原始数据处理后情况:")
    print(cleaned_train.isnull().sum())
Loading…
Cancel
Save