# -*- coding: utf-8 -*-
"""Titanic survival analysis: feature engineering, KNN model selection, plots.

Created on Tue May 31 15:44:45 2022

@author: FADER

The script reads ``train.csv`` and ``test.csv`` from the working directory,
merges them for joint feature engineering, picks ``n_neighbors`` for a KNN
classifier by 10-fold cross-validation, evaluates it on a hold-out split,
predicts survival for the unlabeled rows and plots survival statistics of
those predictions by sex, passenger class and title.
"""

import datetime
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression  # noqa: F401  (kept: original import)
from sklearn.metrics import auc, roc_auc_score, roc_curve  # noqa: F401
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Use a CJK-capable sans-serif font so the Chinese labels render, and keep the
# minus sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


# --------------------------------------------------------------------------
# Helper functions
# --------------------------------------------------------------------------
def embarkeddefyC(x):
    """Return 1 if the embarkation code ``x`` is 'C', else 0."""
    return int(x == 'C')


def embarkeddefyQ(x):
    """Return 1 if the embarkation code ``x`` is 'Q', else 0."""
    return int(x == 'Q')


def embarkeddefyS(x):
    """Return 1 if the embarkation code ``x`` is 'S', else 0."""
    return int(x == 'S')


def getTitle(name):
    """Extract the title from a name formatted as 'Surname, Title. Given names'.

    Returns the text between the first comma and the following period,
    stripped of whitespace. A name without a comma yields '' instead of
    raising, so a malformed row simply ends up with no title indicator.
    """
    after_surname = name.partition(',')[2]
    return after_surname.partition('.')[0].strip()


def Cabinchange(x):
    """Return the first character of the cabin string ``x`` (the deck letter)."""
    return x[0]


def count_survivors(group_mask, predicted):
    """Return ``(total, survived)`` for the rows selected by ``group_mask``.

    ``group_mask`` is a boolean Series and ``predicted`` a Series of 0/1
    predictions sharing the same index.
    """
    total = int(group_mask.sum())
    survived = int((group_mask & (predicted == 1)).sum())
    return total, survived


def survival_rate(total, survived):
    """Return ``survived / total``, or 0.0 for an empty group."""
    return survived / total if total else 0.0


def draw_survival_pies(ax, counts, slice_labels, tick_labels):
    """Draw one dead/alive pie per group, side by side along the x axis.

    ``counts`` is a list of ``(total, survived)`` pairs, ``slice_labels`` a
    list of ``[dead_label, alive_label]`` per pie and ``tick_labels`` the
    x-axis label of each pie.
    """
    for position, ((total, survived), labels) in enumerate(zip(counts, slice_labels)):
        if total == 0:
            # An empty group has no proportions to show.
            continue
        ax.pie([(total - survived) / total, survived / total],
               explode=[0, 0.1], labels=labels, colors=['yellow', 'green'],
               autopct='%1.1f%%', shadow=True, startangle=90,
               radius=0.35, center=(position, 0), frame=True)
    ax.set_xticks(list(range(len(counts))))
    ax.set_yticks([0])
    ax.set_xticklabels(tick_labels)
    ax.set_xlim((-0.5, len(counts) - 0.5))
    ax.set_ylim((-0.5, 0.5))


# --------------------------------------------------------------------------
# Load and merge the data
# --------------------------------------------------------------------------
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# DataFrame.append was removed in pandas 2.0; concat is the supported form.
full = pd.concat([train, test], ignore_index=True)

# --------------------------------------------------------------------------
# Fill in missing values
# --------------------------------------------------------------------------
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')

# --------------------------------------------------------------------------
# Feature engineering
# --------------------------------------------------------------------------
# Encode sex as 1 (male) / 0 (female).
full['Sex'] = full['Sex'].map({'male': 1, 'female': 0})

# Split Embarked into one indicator column per port.
full['Embarked_C'] = full['Embarked'].map(embarkeddefyC)
full['Embarked_Q'] = full['Embarked'].map(embarkeddefyQ)
full['Embarked_S'] = full['Embarked'].map(embarkeddefyS)
full = full.drop(columns=['Embarked'])

# One-hot encode Pclass with the column prefix "Pclass". dtype=int keeps the
# indicators as 0/1 integers on every pandas version (pandas >= 2.0 would
# otherwise produce bool columns).
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass', dtype=int)
full = pd.concat([full, pclassDf], axis=1)
full = full.drop(columns=['Pclass'])

# Extract the title from each name and collapse it into six categories.
title_mapDict = {
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Lady': 'Royalty',
}
title_categories = full['Name'].map(getTitle).map(title_mapDict)
Name = pd.get_dummies(title_categories, dtype=int)
full = pd.concat([full, Name], axis=1)
full = full.drop(columns=['Name'])

# Reduce Cabin to its first letter ('U' marks the filled-in missing values)
# and one-hot encode it.
full['Cabin'] = full['Cabin'].map(Cabinchange)
Cabinin = pd.get_dummies(full['Cabin'], prefix='Cabin', dtype=int)
full = pd.concat([full, Cabinin], axis=1)

# Family size (parents/children + siblings/spouses + the passenger) and three
# size-bucket indicators.
familyDf = pd.DataFrame()
familyDf['FamilySize'] = full['Parch'] + full['SibSp'] + 1
familyDf['Family_Single'] = familyDf['FamilySize'].map(lambda s: 1 if s == 1 else 0)
familyDf['Family_Small'] = familyDf['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
familyDf['Family_Large'] = familyDf['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
full = pd.concat([full, familyDf], axis=1)

# --------------------------------------------------------------------------
# Absolute correlation of every numeric column with Survived
# --------------------------------------------------------------------------
# Text columns (e.g. Ticket, Cabin) are excluded explicitly: recent pandas
# raises instead of silently dropping them in corr().
corrDf = full.select_dtypes(include=['number', 'bool']).corr().abs()
survived_corr = corrDf['Survived'].sort_values(ascending=False)
plt.figure('fig')
plt.xlim(0, 1)
plt.barh(survived_corr.index, survived_corr.values)

# --------------------------------------------------------------------------
# Build the feature matrix and split it
# --------------------------------------------------------------------------
full_X = pd.concat([Name,  # title indicators
                    pclassDf,
                    familyDf,
                    full['Fare'],
                    Cabinin,
                    full['Embarked_C'],
                    full['Embarked_Q'],
                    full['Embarked_S'],
                    full['Sex'],
                    ], axis=1)

# The first len(train) rows of the merged frame are the labeled rows.
sourceRow = len(train)
source_X = full_X.iloc[:sourceRow, :]
source_y = full['Survived'].iloc[:sourceRow]
pred_X = full_X.iloc[sourceRow:, :]
print('原始数据集有多少行:', source_X.shape[0])
print('预测数据集有多少行:', pred_X.shape[0])

train_X, test_X, train_y, test_y = train_test_split(
    source_X, source_y, train_size=0.8, random_state=33)
print('原始数据集的特征:', source_X.shape,
      '训练数据集特征:', train_X.shape,
      '测试数据集特征:', test_X.shape)
print('原始数据集的标签:', source_y.shape,
      '训练数据集的标签:', train_y.shape,
      '测试数据集的标签:', test_y.shape)

# --------------------------------------------------------------------------
# Choose n_neighbors by 10-fold cross-validation over odd k in [1, 19]
# --------------------------------------------------------------------------
k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, train_X, train_y, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Format the elapsed time as MM:SS:microseconds from a timedelta; treating the
# duration as a timestamp would make the result depend on the local timezone.
elapsed = datetime.timedelta(seconds=time() - time0)
elapsed_minutes, elapsed_seconds = divmod(int(elapsed.total_seconds()), 60)
print('计算所用时长:%s' % ('%02d:%02d:%06d' % (
    elapsed_minutes, elapsed_seconds, elapsed.microseconds)))

best_k = k_range[cv_scores.index(max(cv_scores))]
print('最高准确率:', max(cv_scores), ',对应的k值为:', best_k)
plt.figure()
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

# --------------------------------------------------------------------------
# Fit the final model and evaluate it on the hold-out split
# --------------------------------------------------------------------------
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(train_X, train_y)
# For a classifier, score() returns the accuracy.
print('模型得拟合程度为:', model.score(test_X, test_y))

model_y_score = model.predict_proba(test_X)
model_fpr, model_tpr, _ = roc_curve(test_y, model_y_score[:, 1], pos_label=1)
model_auc = auc(model_fpr, model_tpr)
# Start a fresh figure so the ROC curve never lands on the K/accuracy plot.
plt.figure()
plt.plot(model_fpr, model_tpr,
         label='ROC curve (AUC = %.3f)' % model_auc,
         color='b', linewidth=4)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, c='r')
plt.stackplot(model_fpr, model_tpr, colors=['#ff0000'])
plt.text(0.5, 0.3, 'ROC', ha='center', fontsize=50, c='black', alpha=0.4)
plt.title('model roc')
plt.legend(loc='lower right')
plt.show()

# --------------------------------------------------------------------------
# Predict the unlabeled rows
# --------------------------------------------------------------------------
# The predictions come back as floats (0.0 / 1.0); convert them to integers.
pred_Y = model.predict(pred_X).astype(int)
# Store the predictions in a copy so the feature slice itself is not mutated.
pred_result = pred_X.copy()
pred_result['predict'] = pred_Y
predicted = pred_result['predict']

# --------------------------------------------------------------------------
# 3. Predicted survival by sex (bar chart + pies)
# --------------------------------------------------------------------------
male_total, male_alive = count_survivors(pred_result['Sex'] == 1, predicted)
female_total, female_alive = count_survivors(pred_result['Sex'] == 0, predicted)

x3 = [0, 1, 2, 3]
index = ['男性', '男性存活人数', '女性', '女性存活人数']
height = [male_total, male_alive, female_total, female_alive]
plt.figure('fig1')
plt.title('男性与女性乘客生存比例')
plt.ylabel('人数')
plt.bar(x3, height)
for bar_x, bar_height in zip(x3, height):
    plt.text(bar_x, bar_height, '%.f' % bar_height, ha='center', va='bottom')
plt.xticks(x3, index)

fig = plt.figure()
plt.title('男女生存乘客生存比例')
draw_survival_pies(
    fig.gca(),
    [(female_total, female_alive), (male_total, male_alive)],
    [['女性死亡', '女性存活'], ['男性死亡', '男性存活']],
    ["女", "男"])

# --------------------------------------------------------------------------
# 4. Predicted survival by passenger class (bar chart + pies)
# --------------------------------------------------------------------------
class_counts = [count_survivors(pred_result[column] == 1, predicted)
                for column in ('Pclass_1', 'Pclass_2', 'Pclass_3')]
height1 = [total for total, _ in class_counts]
height2 = [alive for _, alive in class_counts]

plt.figure('fig2')
plt.title('不同客舱乘客生存比例')
plt.ylabel('人数')
class_x = np.array([0, 1, 2])
width = 0.1
index = ['一等舱', '二等舱', '三等舱']
plt.bar(class_x - width, height1, width)
plt.bar(class_x + width, height2, width)
for left_x, total, right_x, alive in zip(class_x - width, height1,
                                         class_x + width, height2):
    plt.text(left_x, total, '%.f' % int(total), ha='center', va='bottom')
    plt.text(right_x, alive, '%.f' % int(alive), ha='center', va='bottom')
plt.xticks([0, 1, 2], index)

fig = plt.figure('fig3')
plt.title('不同货舱乘客生存比例')
draw_survival_pies(
    fig.gca(),
    class_counts,
    [['死亡', '存活']] * len(class_counts),
    ['一等舱', '二等舱', '三等舱'])
plt.show()

# --------------------------------------------------------------------------
# 5. Predicted survival rate by title, highest first
# --------------------------------------------------------------------------
name = ['Officer', 'Royalty', 'Miss', 'Mrs', 'Mr', 'Master']
cunhuolv = [survival_rate(*count_survivors(pred_result[title] == 1, predicted))
            for title in name]
# sorted() is stable, so titles with equal rates keep their listed order.
dict1 = dict(sorted(zip(name, cunhuolv), key=lambda item: item[1], reverse=True))

plt.figure('fig4')
plt.title('不同身份的乘客的生存比例')
x1 = list(range(len(dict1)))
rates = list(dict1.values())
plt.bar(x1, rates)
for bar_x, rate in zip(x1, rates):
    plt.text(bar_x, rate, '%.2f%%' % (rate * 100), ha='center', va='bottom')
plt.xlabel('身份')
plt.ylabel('存活率')
plt.xticks(x1, list(dict1.keys()))
# Without this call the last figure is never displayed when run as a script.
plt.show()