|
|
|
@ -0,0 +1,260 @@
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
"""
|
|
|
|
|
Created on Tue May 31 15:44:45 2022
|
|
|
|
|
|
|
|
|
|
@author: FADER
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
|
|
|
from sklearn.model_selection import cross_val_score
|
|
|
|
|
from time import time
|
|
|
|
|
import datetime
|
|
|
|
|
# Matplotlib text setup: use the SimHei font for sans-serif text (the plot
# labels in this script are Chinese; assumes SimHei is installed) and turn
# off the Unicode minus sign on the axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
|
|
|
|
|
# Read the two input tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack both tables (train rows first, index renumbered from 0) so every
# feature transformation is applied to them uniformly.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
full = pd.concat([train, test], ignore_index=True)

# Fill in missing values: column mean for the numeric columns, a fixed
# placeholder for the text columns.
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')

# Encode Sex numerically: male -> 1, female -> 0.
dict1 = {'male': 1, 'female': 0}
full['Sex'] = full['Sex'].map(dict1)
|
|
|
|
|
#将Embarked的数据分类后并提取为新的列
|
|
|
|
|
def embarkeddefyC(x):
    """Return 1 when ``x`` equals ``'C'``, otherwise 0."""
    if x == 'C':
        return 1
    return 0
|
|
|
|
|
def embarkeddefyQ(x):
    """Return 1 when ``x`` equals ``'Q'``, otherwise 0."""
    if x == 'Q':
        return 1
    return 0
|
|
|
|
|
def embarkeddefyS(x):
    """Return 1 when ``x`` equals ``'S'``, otherwise 0."""
    if x == 'S':
        return 1
    return 0
|
|
|
|
|
# Build one 0/1 indicator column per Embarked code (C, Q, S), then drop the
# raw Embarked column.
for column, encoder in (
    ('Embarked_C', embarkeddefyC),
    ('Embarked_Q', embarkeddefyQ),
    ('Embarked_S', embarkeddefyS),
):
    full[column] = full['Embarked'].map(encoder)
full = full.drop(columns=['Embarked'])
|
|
|
|
|
#print(full.head())
|
|
|
|
|
# One-hot encode Pclass with get_dummies (new column names are prefixed with
# "Pclass"), append the indicator columns to the table and drop the raw
# Pclass column.
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass')
full = pd.concat([full, pclassDf], axis=1).drop(columns=['Pclass'])
|
|
|
|
|
#print(full.head())
|
|
|
|
|
#提取名字的信息
|
|
|
|
|
def getTitle(name):
    """Extract the title part of a name string.

    Takes the second comma-separated field of ``name`` and returns the text
    before that field's first period, with surrounding whitespace removed;
    for example ``'Braund, Mr. Owen Harris'`` gives ``'Mr'``.

    Raises:
        IndexError: if ``name`` contains no comma.
    """
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()
|
|
|
|
|
# Pull the raw title (e.g. "Mr", "the Countess") out of every passenger name.
Name = pd.DataFrame()
Name['Title'] = full['Name'].map(getTitle)

# Collapse the raw titles into six coarse categories.  Series.map leaves NaN
# for any title that is not a key of this dictionary.
title_mapDict = {
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Lady': 'Royalty',
}
Name['Title'] = Name['Title'].map(title_mapDict)

# One-hot encode the categories, append them to the table and drop the raw
# Name column.
Name = pd.get_dummies(Name['Title'])
full = pd.concat([full, Name], axis=1)
full.drop('Name', axis=1, inplace=True)

# Names of the six title categories.  The original list was missing the comma
# between 'Mr' and 'Master', which Python silently joined into 'MrMaster'.
name = ['Officer', 'Royalty', 'Miss', 'Mrs', 'Mr', 'Master']
|
|
|
|
|
#print(full.head())
|
|
|
|
|
|
|
|
|
|
#提取Cabin的信息
|
|
|
|
|
#print(full['Cabin'].value_counts())
|
|
|
|
|
def Cabinchange(x):
    """Return the first element of ``x`` (the first character of a string).

    Raises:
        IndexError: if ``x`` is empty.
    """
    first = x[0]
    return first
|
|
|
|
|
# Reduce every Cabin value to its first character and keep that in the table.
cabin_initial = full['Cabin'].map(Cabinchange)
full['Cabin'] = cabin_initial

# One-hot encode the initials (column names prefixed with "Cabin") and append
# the indicator columns; the Cabin column itself is kept.
Cabinin = pd.get_dummies(cabin_initial, prefix='Cabin')
full = pd.concat([full, Cabinin], axis=1)
|
|
|
|
|
# Family-size features: FamilySize is Parch + SibSp + 1, bucketed into three
# 0/1 flags (exactly 1, 2 to 4, 5 or more).
family_size = full['Parch'] + full['SibSp'] + 1
familyDf = pd.DataFrame({
    'FamilySize': family_size,
    'Family_Single': family_size.map(lambda size: 1 if size == 1 else 0),
    'Family_Small': family_size.map(lambda size: 1 if 2 <= size <= 4 else 0),
    'Family_Large': family_size.map(lambda size: 1 if 5 <= size else 0),
})
full = pd.concat([full, familyDf], axis=1)
|
|
|
|
|
# Absolute correlation of every numeric column with Survived, largest first.
# The table still holds string columns (for example Cabin); since pandas 2.0
# DataFrame.corr no longer drops those silently and raises instead, so keep
# only numeric and boolean columns before correlating.
numeric_full = full.select_dtypes(include=['number', 'bool'])
corrDf = numeric_full.corr().abs()
sort = corrDf['Survived'].sort_values(ascending=False)

# Horizontal bar chart of the sorted correlations on a fixed 0..1 axis.
plt.figure('fig')
plt.xlim(0, 1)
plt.barh(sort.index, sort.values)
|
|
|
|
|
|
|
|
|
|
#(corrDf['Survived'])
|
|
|
|
|
# Build the model inputs: assemble the feature matrix from the engineered
# columns.
full_X = pd.concat(
    [
        Name,  # title indicators
        pclassDf,
        familyDf,
        full['Fare'],
        Cabinin,
        full['Embarked_C'],
        full['Embarked_Q'],
        full['Embarked_S'],
        full['Sex'],
    ],
    axis=1,
)

# The combined table holds the train rows first, so the number of train rows
# is the boundary between labelled rows and rows to predict.  Deriving it
# from `train` replaces the hard-coded 891.
sourceRow = len(train)

# .loc slices by label and includes the end label, hence sourceRow - 1.
source_X = full_X.loc[0:sourceRow - 1, :]
source_y = full.loc[0:sourceRow - 1, 'Survived']

# Copy the prediction rows: a column is added to pred_X later, and doing that
# on a slice of full_X triggers pandas' SettingWithCopyWarning.
pred_X = full_X.loc[sourceRow:, :].copy()

print('原始数据集有多少行:', source_X.shape[0])
print('预测数据集有多少行:', pred_X.shape[0])
|
|
|
|
|
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible.
split = train_test_split(source_X, source_y, train_size=0.8, random_state=33)
train_X, test_X, train_y, test_y = split

# Report the shapes of the feature matrices, then of the label vectors.
print(
    '原始数据集的特征:', source_X.shape,
    '训练数据集特征:', train_X.shape,
    '测试数据集特征:', test_X.shape,
)
print(
    '原始数据集的标签:', source_y.shape,
    '训练数据集的标签:', train_y.shape,
    '测试数据集的标签:', test_y.shape,
)
|
|
|
|
|
# Try odd n_neighbors values from 1 to 19 and record the mean 10-fold
# cross-validated accuracy of each, to see which value fits best.
k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)  # progress indicator
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, train_X, train_y, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Print the elapsed time as minutes:seconds:microseconds.  The duration is
# formatted directly: passing it through datetime.fromtimestamp treats it as
# a point in time, so the local UTC offset leaks into the minutes field in
# half-hour time zones and anything beyond an hour wraps around.
elapsed = time() - time0
minutes, seconds = divmod(elapsed, 60)
microseconds = int((seconds % 1) * 1000000)
print('计算所用时长:%02d:%02d:%06d' % (minutes, int(seconds), microseconds))

best_score = max(cv_scores)
print('最高准确率:', best_score, ',对应的k值为:', k_range[cv_scores.index(best_score)])

# Accuracy as a function of k.
plt.figure()
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
|
|
|
|
|
# Fit the final classifier on the training split, using the k that achieved
# the highest cross-validated accuracy.
model = KNeighborsClassifier(n_neighbors=k_range[cv_scores.index(max(cv_scores))])
model.fit(train_X, train_y)

# For a classifier, score() returns the accuracy on the given data.
print('模型得拟合程度为:', model.score(test_X, test_y))

from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve on the held-out split; column 1 of predict_proba is the
# probability of class 1.
model_y_score = model.predict_proba(test_X)
model_fpr, model_tpr, _ = roc_curve(test_y, model_y_score[:, 1], pos_label=1)
model_auc = auc(model_fpr, model_tpr)

plt.plot(model_fpr, model_tpr,
         label='micro-average ROC curve',
         color='b', linewidth=4)
# Dashed red diagonal as the chance-level reference.  The format string
# carries only the line style: the original 'k--' also set a colour, which
# the c='r' keyword overrode with a "redundantly defined" warning.
plt.plot([0, 1], [0, 1], '--', lw=2, c='r')
# Shade the area under the ROC curve.
plt.stackplot(model_fpr, model_tpr, color=['#ff0000'])
plt.text(0.5, 0.3, 'ROC', ha='center', fontsize=50, c='black', alpha=0.4)
plt.title('model roc')
plt.show()
|
|
|
|
|
# Use the fitted model to predict survival for the prediction rows.
pred_Y = model.predict(pred_X)

# The predictions come back as floats (0.0, 1.0); convert them to integers.
pred_Y = pred_Y.astype(int)

# Attach the predictions as a 'predict' column for the survival plots that
# follow.  assign() builds a new frame, so this never writes into a slice of
# full_X (in-place column assignment here raised SettingWithCopyWarning).
pred_X = pred_X.assign(predict=pred_Y)
|
|
|
|
|
#print(pred_X.head())
|
|
|
|
|
# Predicted survival by sex: a bar chart of head counts, then one pie chart
# per sex.
x3 = [0, 1, 2, 3]
index = ['男性', '男性存活人数', '女性', '女性存活人数']
df = pred_X.groupby(by=['Sex', 'predict']).count()

# Compute the counts from the predictions instead of hard-coding the numbers
# of one particular run.  Sex is encoded earlier in the script as
# male -> 1, female -> 0.
is_male = pred_X['Sex'] == 1
is_female = pred_X['Sex'] == 0
predicted_survivor = pred_X['predict'] == 1
male_total = int(is_male.sum())
male_survived = int((is_male & predicted_survivor).sum())
female_total = int(is_female.sum())
female_survived = int((is_female & predicted_survivor).sum())

# Order matches `index`: male, male survivors, female, female survivors.
height = [male_total, male_survived, female_total, female_survived]

plt.figure('fig1')
plt.title('男性与女性乘客生存比例')
plt.ylabel('人数')
plt.bar(x3, height)
# Write each count just above its bar.
for x, y in zip(x3, height):
    plt.text(x, y, '%.f' % (y), ha='center', va='bottom')
plt.xticks(x3, index)

# Two pies on one axes: female at x=0, male at x=1; each shows the share of
# predicted deaths versus predicted survivors.
fig = plt.figure()
plt.title('男女生存乘客生存比例')
ax = fig.gca()
explode = [0, 0.1]
labels1 = ['女性死亡', '女性存活']
labels2 = ['男性死亡', '男性存活']
colors = ['yellow', 'green']
ax.pie(
    [(female_total - female_survived) / female_total, female_survived / female_total],
    explode=explode, labels=labels1, colors=colors, autopct='%1.1f%%',
    shadow=True, startangle=90, radius=0.35, center=(0, 0), frame=True,
)
ax.pie(
    [(male_total - male_survived) / male_total, male_survived / male_total],
    explode=explode, labels=labels2, colors=colors, autopct='%1.1f%%',
    shadow=True, startangle=90, radius=0.35, center=(1, 0), frame=True,
)
ax.set_xticks([0, 1])
ax.set_yticks([0])
ax.set_xticklabels(["女", "男"])
ax.set_xlim((-0.5, 1.5))
ax.set_ylim((-0.5, 0.5))
|
|
|
|
|
# 4. Predicted survival by passenger class: a grouped bar chart of head
# counts, then one pie chart per class.
plt.figure('fig2')
plt.title('不同客舱乘客生存比例')
plt.ylabel('人数')
x = np.array([0, 1, 2])
width = 0.1
index = ['一等舱', '二等舱', '三等舱']

# Compute the counts from the predictions instead of hard-coding the numbers
# of one particular run.  Assumes the one-hot columns are named Pclass_1,
# Pclass_2 and Pclass_3, i.e. that Pclass holds the integers 1, 2, 3.
class_columns = ['Pclass_1', 'Pclass_2', 'Pclass_3']
predicted_survivor = pred_X['predict'] == 1
height1 = []  # passengers per class
height2 = []  # predicted survivors per class
for column in class_columns:
    in_class = pred_X[column].astype(bool)
    height1.append(int(in_class.sum()))
    height2.append(int((in_class & predicted_survivor).sum()))

plt.bar(x - width, height1, width)
plt.bar(x + width, height2, width)
# Write each count just above its bar.  The loop uses its own names so it
# does not overwrite the bar positions held in `x`.
for left, total, right, survivors in zip(x - width, height1, x + width, height2):
    plt.text(left, total, '%.f' % total, ha='center', va='bottom')
    plt.text(right, survivors, '%.f' % survivors, ha='center', va='bottom')
plt.xticks(x, index)

# One pie per class at x = 0, 1, 2, each showing predicted deaths versus
# predicted survivors.  The title now reads "客舱" (passenger cabin) to match
# the bar chart; the original "货舱" means cargo hold.
fig = plt.figure('fig3')
plt.title('不同客舱乘客生存比例')
ax = fig.gca()
explode = [0, 0.1]
labels = ['死亡', '存活']
colors = ['yellow', 'green']
for position, (total, survivors) in enumerate(zip(height1, height2)):
    ax.pie(
        [(total - survivors) / total, survivors / total],
        explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=90, radius=0.35, center=(position, 0),
        frame=True,
    )
ax.set_xticks([0, 1, 2])
ax.set_yticks([0])
ax.set_xticklabels(['一等舱', '二等舱', '三等舱'])
ax.set_xlim((-0.5, 2.5))
ax.set_ylim((-0.5, 0.5))
plt.show()
|
|
|
|
|
# Predicted survival rate per title category, shown as a bar chart sorted
# from highest to lowest rate.
name = ['Officer', 'Royalty', 'Miss', 'Mrs', 'Mr', 'Master']
df = pred_X.groupby(by=name + ['predict']).count()

# Compute each rate (predicted survivors / passengers holding the title)
# from the predictions instead of hard-coding the fractions of one run.
# A title with no passengers gets a rate of 0 rather than dividing by zero.
predicted_survivor = pred_X['predict'] == 1
cunhuolv = []
for title in name:
    has_title = pred_X[title].astype(bool)
    total = int(has_title.sum())
    survivors = int((has_title & predicted_survivor).sum())
    cunhuolv.append(survivors / total if total else 0)

# Pair titles with their rates and order them by rate, highest first.
dict1 = dict(zip(name, cunhuolv))
list1 = sorted(dict1.items(), key=lambda item: item[-1], reverse=True)
dict1 = dict(list1)

plt.figure('fig4')
plt.title('不同身份的乘客的生存比例')
x1 = [0, 1, 2, 3, 4, 5]
rates = list(dict1.values())
plt.bar(x1, rates)
# Write each rate as a percentage just above its bar.
for position, rate in zip(x1, rates):
    plt.text(position, rate, '%.2f%%' % (rate * 100), ha='center', va='bottom')
na = list(dict1.keys())
plt.xlabel('身份')
plt.ylabel('存活率')
plt.xticks(x1, na)
# Without a final show() this last figure is never displayed when the file is
# run as a plain script.
plt.show()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|