You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Titanic/泰坦尼克号生存情况分析大作业.py

220 lines
7.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
"""
Created on Tue May 31 15:44:45 2022
@author: FADER
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from time import time
import datetime
# Use the SimHei font for sans-serif text so the Chinese labels on the
# figures can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs as plain hyphens instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Load the labelled training rows and the unlabelled rows to predict.
train = pd.read_csv(r'C:\Users\FADER\Desktop\python课件\train.csv')
test = pd.read_csv(r'C:\Users\FADER\Desktop\python课件\test.csv')

# Stack both tables (training rows first) under a fresh 0..n-1 index so the
# same preprocessing is applied to every row.  pd.concat is used because
# DataFrame.append no longer exists in pandas 2.x.
full = pd.concat([train, test], ignore_index=True)
# Fill in missing values: numeric columns get the column mean, Embarked
# gets 'S' and Cabin gets the placeholder 'U'.
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')

# Encode Sex as a number: male -> 1, female -> 0.
dict1 = {'male': 1, 'female': 0}
full['Sex'] = full['Sex'].map(dict1)
#将Embarked的数据分类后并提取为新的列
def embarkeddefyC(x):
    """Return 1 if the embarkation code ``x`` equals 'C', otherwise 0."""
    return 1 if x == 'C' else 0
def embarkeddefyQ(x):
    """Return 1 if the embarkation code ``x`` equals 'Q', otherwise 0."""
    return 1 if x == 'Q' else 0
def embarkeddefyS(x):
    """Return 1 if the embarkation code ``x`` equals 'S', otherwise 0."""
    return 1 if x == 'S' else 0
# Build one 0/1 indicator column per embarkation code by applying the
# matching encoder, then discard the original categorical column.
for port_column, port_encoder in (
    ('Embarked_C', embarkeddefyC),
    ('Embarked_Q', embarkeddefyQ),
    ('Embarked_S', embarkeddefyS),
):
    full[port_column] = full['Embarked'].map(port_encoder)
full.drop(columns='Embarked', inplace=True)
# One-hot encode Pclass into indicator columns whose names are prefixed
# with 'Pclass', append them to the full table and drop the original column.
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass')
full = pd.concat([full, pclassDf], axis=1)
full.drop('Pclass', axis=1, inplace=True)
#提取名字的信息
def getTitle(name):
    """Extract the title from a name formatted like 'Surname, Title. Given'.

    Takes the segment that follows the first comma, keeps what precedes the
    first period in that segment, and strips surrounding whitespace.

    Raises:
        IndexError: if ``name`` contains no comma.
    """
    after_comma = name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()
# Collapse the raw titles into six groups; titles that share a group are
# listed together.
title_mapDict = {
    **dict.fromkeys(('Capt', 'Col', 'Major', 'Dr', 'Rev'), 'Officer'),
    **dict.fromkeys(
        ('Jonkheer', 'Don', 'Sir', 'the Countess', 'Dona', 'Lady'),
        'Royalty',
    ),
    **dict.fromkeys(('Mme', 'Ms', 'Mrs'), 'Mrs'),
    **dict.fromkeys(('Mlle', 'Miss'), 'Miss'),
    'Mr': 'Mr',
    'Master': 'Master',
}

# Extract each passenger's title, translate it to its group and one-hot
# encode the groups (one unprefixed column per group).
grouped_titles = full['Name'].map(getTitle).map(title_mapDict).rename('Title')
Name = pd.get_dummies(grouped_titles)

# Attach the indicator columns and drop the free-text name column.
full = pd.concat([full, Name], axis=1)
full.drop(columns='Name', inplace=True)
#提取Cabin的信息
#print(full['Cabin'].value_counts())
def Cabinchange(x):
    """Return the first character of the cabin value ``x``.

    Raises:
        IndexError: if ``x`` is empty.
    """
    return x[0]
# Reduce every cabin value to its first character, one-hot encode those
# characters into columns prefixed with 'Cabin' and append them to the table.
full['Cabin'] = full['Cabin'].map(Cabinchange)
Cabinin = pd.get_dummies(full['Cabin'], prefix='Cabin')
full = pd.concat([full, Cabinin], axis=1)
# Family size = parents/children + siblings/spouses + the passenger.
family_size = full['Parch'] + full['SibSp'] + 1

# Alongside the raw size, flag which bucket each passenger falls into:
# alone (1), small family (2-4) or large family (5 or more).
familyDf = pd.DataFrame({
    'FamilySize': family_size,
    'Family_Single': family_size.map(lambda members: int(members == 1)),
    'Family_Small': family_size.map(lambda members: int(2 <= members <= 4)),
    'Family_Large': family_size.map(lambda members: int(members >= 5)),
})
full = pd.concat([full, familyDf], axis=1)
# Correlation of every numeric/boolean column with Survived, strongest
# positive correlation first.  Text columns (such as Cabin) are filtered
# out explicitly: pandas >= 2.0 raises on them in DataFrame.corr() instead
# of silently skipping them.
corrDf = full.select_dtypes(include=['number', 'bool']).corr()
# Keep the sorted result in a name so it can be inspected; the bare
# expression used before was discarded when run as a script.
survivedCorr = corrDf['Survived'].sort_values(ascending=False)
# Build the model's feature matrix from the engineered columns.
full_X = pd.concat(
    [
        Name,                # title-group indicators
        pclassDf,            # passenger-class indicators
        familyDf,            # family size and its buckets
        full['Fare'],
        Cabinin,             # cabin first-character indicators
        full['Embarked_C'],
        full['Embarked_Q'],
        full['Embarked_S'],
        full['Sex'],
    ],
    axis=1,
)

# The training rows were stacked first when `full` was built, so the first
# len(train) rows are the labelled ones and the remainder must be predicted.
sourceRow = len(train)
# .loc slicing is inclusive on both ends, hence the -1.
source_X = full_X.loc[0:sourceRow - 1, :]
source_y = full.loc[0:sourceRow - 1, 'Survived']
# Explicit copy: a column is added to pred_X later, which must not be done
# on a slice of full_X.
pred_X = full_X.loc[sourceRow:, :].copy()
print('原始数据集有多少行:', source_X.shape[0])
print('预测数据集有多少行:', pred_X.shape[0])
from sklearn.model_selection import train_test_split

# Keep 80% of the labelled rows for fitting and hold the rest out for
# evaluation; the fixed seed makes the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    source_X,
    source_y,
    train_size=0.8,
    random_state=33,
)

# Report the shapes of the whole labelled set and of both partitions,
# first for the features and then for the labels.
feature_report = (
    '原始数据集的特征:', source_X.shape,
    '训练数据集特征:', train_X.shape,
    '测试数据集特征:', test_X.shape,
)
label_report = (
    '原始数据集的标签:', source_y.shape,
    '训练数据集的标签:', train_y.shape,
    '测试数据集的标签:', test_y.shape,
)
print(*feature_report)
print(*label_report)
# Try the odd neighbour counts 1, 3, ..., 19 and record the mean 10-fold
# cross-validated accuracy of each to find the best-fitting value.
k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)  # progress indicator
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, train_X, train_y, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Format the elapsed time as minutes:seconds:microseconds by plain
# arithmetic.  Converting the duration through datetime.fromtimestamp()
# interprets it as a local wall-clock time, so the result depends on the
# machine's timezone offset.
elapsed = time() - time0
whole_seconds = int(elapsed)
minutes, seconds = divmod(whole_seconds, 60)
microseconds = int((elapsed - whole_seconds) * 1000000)
print('计算所用时长:%s' % ('%02d:%02d:%06d' % (minutes, seconds, microseconds)))

# Highest accuracy and the k that produced it (first one on ties).
best_score = max(cv_scores)
best_k = k_range[cv_scores.index(best_score)]
print('最高准确率:', best_score, ',对应的k值为:', best_k)

plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
# Fit the final classifier with the neighbour count that achieved the
# highest cross-validated accuracy.
model = KNeighborsClassifier(n_neighbors=k_range[cv_scores.index(max(cv_scores))])
model.fit(train_X, train_y)
# For a classifier, score() reports the accuracy on the given data.
print('模型得拟合程度为:', model.score(test_X, test_y))
# Predict survival for the rows of the prediction set.
pred_Y = model.predict(pred_X)
# The predictions come back as floats (0.0 / 1.0); convert them to integers.
pred_Y = pred_Y.astype(int)
# 3. Show the survival proportion of male and female passengers as a bar
# chart.
# assign() returns a new frame carrying the extra column, so nothing is
# written into a slice of the feature matrix (no SettingWithCopyWarning).
pred_X = pred_X.assign(predict=pred_Y)
# Bar labels: male, male survivors, female, female survivors.
index = ['男性', '男性存活人数', '女性', '女性存活人数']
def get_counts(sequence):
    """Count how often each distinct item occurs in ``sequence``.

    Returns a plain dict mapping item -> number of occurrences; an empty
    iterable yields an empty dict.
    """
    counts = {}
    for x in sequence:
        counts[x] = counts.get(x, 0) + 1
    return counts
ls = pred_X  # alias kept from the original script; not used below
# Passengers per sex, and predicted survivors per sex.
counts = get_counts(pred_X['Sex'])
survivor_counts = get_counts(pred_X.loc[pred_X['predict'] == 1, 'Sex'])
# Row counts per (Sex, predict) combination, kept for inspection.
df = pred_X.groupby(by=['Sex', 'predict']).count()

plt.figure('fig1')
plt.title('男性与女性乘客生存比例')
plt.ylabel('人数')
# Bar heights are computed from the current predictions instead of being
# hard-coded.  Sex is encoded as male -> 1, female -> 0, and the order
# matches the labels: male, male survivors, female, female survivors.
height = [
    counts.get(1, 0),
    survivor_counts.get(1, 0),
    counts.get(0, 0),
    survivor_counts.get(0, 0),
]
plt.bar(index, height)
# 4. Show the survival proportion per passenger class as a grouped bar
# chart.
plt.figure('fig2')
plt.title('不同客舱乘客生存比例')
plt.ylabel('人数')
index = ['Pclass_1', 'Pclass_2', 'Pclass_3']
x = np.arange(len(index))
width = 0.1
# Bar heights are computed from the current predictions instead of being
# hard-coded.  Assumes pred_X carries the Pclass_1..Pclass_3 indicator
# columns produced by the one-hot encoding, plus the 'predict' column.
predicted_survivors = pred_X[pred_X['predict'] == 1]
# height1: passengers per class; height2: predicted survivors per class.
height1 = [int(pred_X[column].sum()) for column in index]
height2 = [int(predicted_survivors[column].sum()) for column in index]
# Offset the two series left/right of each tick so they sit side by side.
plt.bar(x - width, height1, width)
plt.bar(x + width, height2, width)
plt.xticks(x, index)
plt.show()