Delete '泰坦尼克号生存情况分析大作业.py'

master
hnu202110040415 3 years ago
parent f1d2661d8f
commit 346f669525

@ -1,219 +0,0 @@
# -*- coding: utf-8 -*-
"""
Created on Tue May 31 15:44:45 2022
@author: FADER
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from time import time
import datetime
# Replace the default sans-serif font with SimHei (the plot titles and labels
# further down are Chinese) and draw the minus sign as a plain ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

train = pd.read_csv(r'C:\Users\FADER\Desktop\python课件\train.csv')
test = pd.read_csv(r'C:\Users\FADER\Desktop\python课件\test.csv')

# Stack the training rows on top of the test rows with a fresh 0..n-1 index so
# every feature transformation below is applied to both files at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
full = pd.concat([train, test], ignore_index=True)

# Fill in missing values: column mean for the numeric columns, a fixed
# placeholder for the categorical ones.
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')

# Encode Sex numerically: male -> 1, female -> 0.
dict1 = {'male': 1, 'female': 0}
full['Sex'] = full['Sex'].map(dict1)

# Split Embarked into one indicator column per category.
def embarkeddefyC(x):
    """Return 1 when ``x`` equals 'C', otherwise 0."""
    if x == 'C':
        return 1
    return 0
def embarkeddefyQ(x):
    """Return 1 when ``x`` equals 'Q', otherwise 0."""
    if x == 'Q':
        return 1
    return 0
def embarkeddefyS(x):
    """Return 1 when ``x`` equals 'S', otherwise 0."""
    if x == 'S':
        return 1
    return 0
# Add one 0/1 indicator column per Embarked value (C, Q, S, in that order),
# then discard the raw Embarked column.
for _port, _encoder in (('C', embarkeddefyC),
                        ('Q', embarkeddefyQ),
                        ('S', embarkeddefyS)):
    full['Embarked_' + _port] = full['Embarked'].map(_encoder)
full.drop(columns=['Embarked'], inplace=True)

# One-hot encode Pclass with get_dummies; the generated column names carry the
# "Pclass" prefix. The encoded columns are appended in place of the raw column.
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass')
full = pd.concat([full.drop(columns=['Pclass']), pclassDf], axis=1)

# Extract the title from each passenger name.
def getTitle(name):
    """Return the title embedded in a passenger name.

    Takes the segment that follows the first comma, cuts it at its first
    period and strips surrounding whitespace, e.g.
    'Braund, Mr. Owen Harris' -> 'Mr'.

    Raises IndexError when ``name`` contains no comma.
    """
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()
# Lookup table that collapses the raw titles found in the names into six
# broader categories.
title_mapDict = {
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Lady': 'Royalty',
}

# Pull the title out of every name, map it to its category and one-hot encode
# the result (one column per category).
_titles = full['Name'].map(getTitle).map(title_mapDict).rename('Title')
Name = pd.get_dummies(_titles)

# Append the title columns and drop the raw Name column.
full = pd.concat([full.drop(columns=['Name']), Name], axis=1)

# Extract information from the Cabin column.
def Cabinchange(x):
    """Return the first element of ``x`` (the leading character of a string).

    Raises IndexError when ``x`` is empty.
    """
    leading = x[0]
    return leading
# Reduce every cabin value to its first character, then one-hot encode it
# into columns prefixed with "Cabin".
full['Cabin'] = full['Cabin'].map(Cabinchange)
Cabinin = pd.get_dummies(full['Cabin'], prefix='Cabin')
full = pd.concat([full, Cabinin], axis=1)

# Family size = parents/children + siblings/spouses + the passenger, plus
# three 0/1 flags bucketing it into single (1), small (2-4) and large (5+).
familyDf = pd.DataFrame()
familyDf['FamilySize'] = full['Parch'] + full['SibSp'] + 1
familyDf['Family_Single'] = familyDf['FamilySize'].map(lambda s: 1 if s == 1 else 0)
familyDf['Family_Small'] = familyDf['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
familyDf['Family_Large'] = familyDf['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
full = pd.concat([full, familyDf], axis=1)

# Correlation of every column with Survived.
# `full` still holds string columns (e.g. the Cabin letters). Since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so
# restrict the frame to numeric/bool columns explicitly.
corrDf = full.select_dtypes(include=['number', 'bool']).corr()
# Keep the sorted result under a name so it can be inspected; the original
# computed it and threw it away.
survivedCorr = corrDf['Survived'].sort_values(ascending=False)
# Build the model inputs: concatenate the engineered feature groups side by
# side into one feature matrix.
full_X = pd.concat(
    [
        Name,        # title categories
        pclassDf,
        familyDf,
        full['Fare'],
        Cabinin,
        full['Embarked_C'],
        full['Embarked_Q'],
        full['Embarked_S'],
        full['Sex'],
    ],
    axis=1,
)

# `full` was built as train rows followed by test rows with a 0..n-1 index, so
# the first len(train) rows are the ones taken as the source set. Derive the
# boundary from the loaded file instead of hard-coding 891.
sourceRow = train.shape[0]
# .loc slices are label-based and inclusive, hence the -1.
source_X = full_X.loc[0:sourceRow - 1, :]
source_y = full.loc[0:sourceRow - 1, 'Survived']
# Everything after the boundary is the set to predict.
pred_X = full_X.loc[sourceRow:, :]
print('原始数据集有多少行:', source_X.shape[0])
print('预测数据集有多少行:', pred_X.shape[0])
from sklearn.model_selection import train_test_split

# Split the source rows 80/20 into a fitting set and a hold-out set; the fixed
# random_state makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    source_X,
    source_y,
    train_size=0.8,
    random_state=33,
)

# Report the shapes of the feature matrices ...
print(
    '原始数据集的特征:', source_X.shape,
    '训练数据集特征:', train_X.shape,
    '测试数据集特征:', test_X.shape,
)
# ... and of the matching label vectors.
print(
    '原始数据集的标签:', source_y.shape,
    '训练数据集的标签:', train_y.shape,
    '测试数据集的标签:', test_y.shape,
)
# Try the odd n_neighbors values 1, 3, ..., 19 and record the mean 10-fold
# cross-validated accuracy of each on the training split.
k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)  # progress indicator
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, train_X, train_y, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Format the elapsed time as MM:SS:microseconds. The duration is handled as a
# timedelta: passing it through datetime.fromtimestamp() would interpret it as
# a local wall-clock time, which shifts the minutes in time zones with a
# non-whole-hour offset and wraps after one hour.
elapsed = datetime.timedelta(seconds=time() - time0)
minutes, seconds = divmod(int(elapsed.total_seconds()), 60)
print('计算所用时长:%02d:%02d:%06d' % (minutes, seconds, elapsed.microseconds))

# Best accuracy and the k that produced it (first one on ties).
best_score = max(cv_scores)
print('最高准确率:', best_score, ',对应的k值为:', k_range[cv_scores.index(best_score)])

# Accuracy as a function of k.
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
# Refit a KNN classifier using the k with the highest cross-validated accuracy.
# (The original first assigned a LogisticRegression instance to `model` and
# overwrote it on the next line; that dead assignment is removed.)
model = KNeighborsClassifier(n_neighbors=k_range[cv_scores.index(max(cv_scores))])
model.fit(train_X, train_y)
# For a classifier, score() returns the accuracy on the given data.
print('模型得拟合程度为:', model.score(test_X, test_y))

# Predict survival for the prediction set with the fitted model.
pred_Y = model.predict(pred_X)
# The predictions come back as floats (0.0 / 1.0); convert them to integers.
pred_Y = pred_Y.astype(int)

# 3. Survival by sex, visualised as a bar chart.
# pred_X is a .loc slice of full_X; take an independent copy before adding a
# column so the assignment neither triggers SettingWithCopyWarning nor depends
# on view/copy semantics.
pred_X = pred_X.copy()
pred_X['predict'] = pred_Y
index = ['男性', '男性存活人数', '女性', '女性存活人数']
def get_counts(sequence):
    """Count how many times each distinct item occurs in ``sequence``.

    Args:
        sequence: Any iterable of hashable items.

    Returns:
        A plain ``dict`` mapping each item to its number of occurrences, with
        keys in first-seen order. Empty input yields an empty dict.
    """
    from collections import Counter

    # Counter performs the tally; convert back to dict so callers keep getting
    # exactly the type the hand-written loop returned.
    return dict(Counter(sequence))
# Bar heights are derived from pred_X so the charts always reflect the current
# predictions; the original plotted hard-coded numbers and ignored the counts
# it had just computed.
counts = get_counts(pred_X['Sex'])
survived = pred_X['predict'] == 1
is_male = pred_X['Sex'] == 1      # 'male' was encoded as 1
is_female = pred_X['Sex'] == 0    # 'female' was encoded as 0

plt.figure('fig1')
plt.title('男性与女性乘客生存比例')
plt.ylabel('人数')
# Order matches `index`: men, predicted male survivors, women, predicted
# female survivors.
height = [
    counts.get(1, 0),
    int((is_male & survived).sum()),
    counts.get(0, 0),
    int((is_female & survived).sum()),
]
plt.bar(index, height)

# 4. Survival by passenger class, visualised as a grouped bar chart.
plt.figure('fig2')
plt.title('不同客舱乘客生存比例')
plt.ylabel('人数')
x = np.array([0, 1, 2])
width = 0.1
index = ['Pclass_1', 'Pclass_2', 'Pclass_3']
# height1: passengers per class; height2: predicted survivors per class.
height1 = [int((pred_X[col] == 1).sum()) for col in index]
height2 = [int(((pred_X[col] == 1) & survived).sum()) for col in index]
plt.bar(x - width, height1, width)
plt.bar(x + width, height2, width)
plt.xticks(x, index)
plt.show()
Loading…
Cancel
Save