|
|
3 years ago | |
|---|---|---|
| README.md | 3 years ago | |
README.md
Titanic1
# 1. Import the training and test data and inspect basic information.
#    In this step the two frames are loaded and cleaned separately.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the data.
train = pd.read_csv("/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")
test = pd.read_csv("/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv")

# Missing-value handling (use train.info() to see which columns have gaps).
# Age: fill missing values with the column mean.
train['Age'] = train['Age'].fillna(train['Age'].mean())
train.drop(['Cabin'], axis=1, inplace=True)  # drop the Cabin column
train.drop(['Fare'], axis=1, inplace=True)  # drop the Fare column
train.Embarked = train.Embarked.fillna('S')  # fill missing ports with 'S'
# train.info()

# Same cleaning for the test frame; Age is filled with the test-set mean.
test['Age'] = test['Age'].fillna(test['Age'].mean())
test.drop(['Cabin'], axis=1, inplace=True)  # drop the Cabin column
test.drop(['Fare'], axis=1, inplace=True)  # drop the Fare column
test.Embarked = test.Embarked.fillna('S')  # fill missing ports with 'S'
# test.info()

# Disabled diagnostics kept from the original; `data` is not defined in this
# section, so they would need a real frame name before being re-enabled.
# Show the size:
# print("Dataset size:", data.shape)
# Detailed dataset information:
# print(data.info())
# 2. Show the first 10 rows of the training data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the data; nrows=10 makes pandas parse only the first 10 data rows.
data2 = pd.read_csv(
    "/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv",
    nrows=10,
)
print(data2)
# 3. Show the survival rate of male and female passengers as bar charts.

# Survival of men and women.
# Select the Sex and Survived columns as a new frame, group it by Sex, take
# the per-group mean of Survived, and draw the result with plot.bar().
# (Presumably Survived is a 0/1 flag, which makes the mean a survival rate.)
train[['Sex', 'Survived']].groupby('Sex').mean().plot.bar()
# Passenger counts for every (Sex, Survived) combination.
survive_sex = train.groupby(['Sex', 'Survived'])['Survived'].count()

# Relationship between survival and sex.
# The task statement does not pin down the exact chart parameters, so a second
# chart is drawn over all recorded rows: Survived on the x-axis, passenger
# count on the y-axis, with one bar per sex.
Survived_Sex = train['Sex'].groupby(train['Survived'])
Survived_Sex.value_counts().unstack().plot(kind='bar')
plt.show()
# 4. Show the survival rate for each passenger class as a bar chart.
# Group the Pclass/Survived columns by Pclass and plot the mean of Survived.
train[['Pclass', 'Survived']].groupby('Pclass').mean().plot.bar()
# No later statement displays this figure, so show it explicitly.
plt.show()
# 5. Predict survival: choose a model and evaluate it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the data again so this step starts from the raw, uncleaned frames.
train = pd.read_csv("/data/bigfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")
test = pd.read_csv("/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv")

# Missing-value handling.
# The per-frame cleaning below was disabled by the original author (it was
# wrapped in a triple-quoted string); it is kept here for reference only.
# train_age = train[train['Age'].notnull()]  # used to decide how to treat gaps
# train['Age'] = train['Age'].fillna(train['Age'].mean())  # fill Age with the mean
# train.drop(['Cabin'], axis=1, inplace=True)  # drop the Cabin column
# train.Embarked = train.Embarked.fillna('S')  # only two ports missing; fill with 'S'
# Merge the training and test frames so both are cleaned in one pass.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append, keeps each frame's original index labels.
dataset = pd.concat([train, test], sort=False)

# Rows with a known Age, kept to help decide how to treat the missing values.
dataset_age = dataset[dataset['Age'].notnull()]
# Fill missing Age values with the mean over the merged frame.
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
# Drop the columns that are not used as model features.
dataset.drop(
    ['Cabin', 'Fare', 'PassengerId', 'Name', 'Ticket', 'Embarked'],
    axis=1,
    inplace=True,
)
# Embarked is dropped above, so filling it is no longer needed:
# dataset.Embarked = dataset.Embarked.fillna('S')
dataset.info()

# Encode Sex numerically: male -> 1, female -> 0.
sexdict = {'male': 1, 'female': 0}
dataset.Sex = dataset.Sex.map(sexdict)

# Notebook-style preview of the first row; has no visible effect in a script.
dataset.head(1)
# Define the training and test sets.
# The merged frame holds the training rows first, so split at len(train)
# instead of hard-coding the row count (the original used 891).
n_train = len(train)

train_part = dataset.iloc[0:n_train, :]
y_train = train_part.Survived
# drop(columns=...) returns a new frame, which avoids pandas'
# SettingWithCopyWarning from an in-place drop on an iloc slice.
x_train = train_part.drop(columns=['Survived'])

x_test = dataset.iloc[n_train:, :].drop(columns=['Survived'])

# NOTE(review): this loads the same file as `test`, i.e. the test-set feature
# table, and presumably not ground-truth labels; confirm before using y_test
# for scoring. np.squeeze only changes the shape if an axis has length 1.
y_test = pd.read_csv('/data/bigfiles/7533b82eae4b582610cbd68aa636b017_1607607386511.csv')  # test set
y_test = np.squeeze(y_test)

# Notebook-style shape check; has no visible effect in a script.
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# Logistic Regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()

# Fit on all but the last 100 training rows; those 100 rows are held out
# for validation.
model.fit(x_train.iloc[0:-100, :], y_train.iloc[0:-100])

# Evaluate on the held-out rows with accuracy_score, passing the arguments in
# the documented (y_true, y_pred) order. The 1-D label Series is used directly
# instead of being reshaped into a column vector.
accuracy = accuracy_score(
    y_train.iloc[-100:],
    model.predict(x_train.iloc[-100:, :]),
)
print(accuracy)  # the original author recorded a result of 0.8