# -*- coding: utf-8 -*-
"""Titanic survival analysis: feature engineering, a KNN classifier, and charts.

The script loads the Titanic train/test CSV files, builds one feature table
for both, picks ``n_neighbors`` for a KNN classifier by 10-fold
cross-validation, predicts survival for the test passengers and draws two
bar charts of the predicted outcome (by sex and by passenger class).

Created on Tue May 31 15:44:45 2022

@author: FADER
"""

import datetime
from collections import Counter
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression  # noqa: F401 (imported, not used below)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Use the SimHei font so the Chinese chart labels render, and keep the minus
# sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

TRAIN_CSV = r'C:\Users\FADER\Desktop\python课件\train.csv'
TEST_CSV = r'C:\Users\FADER\Desktop\python课件\test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Stack train and test so every feature transformation is applied to both.
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
full = pd.concat([train, test], ignore_index=True)

# ---------------------------------------------------------------------------
# Fill in missing values
# ---------------------------------------------------------------------------
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna('S')
full['Cabin'] = full['Cabin'].fillna('U')  # 'U' marks an unknown cabin

# Encode sex as an integer: male -> 1, female -> 0.
dict1 = {'male': 1, 'female': 0}
full['Sex'] = full['Sex'].map(dict1)


# ---------------------------------------------------------------------------
# Embarked: one indicator column per port code
# ---------------------------------------------------------------------------
def embarkeddefyC(x):
    """Return 1 if the embarkation code ``x`` equals 'C', otherwise 0."""
    return 1 if x == 'C' else 0


def embarkeddefyQ(x):
    """Return 1 if the embarkation code ``x`` equals 'Q', otherwise 0."""
    return 1 if x == 'Q' else 0


def embarkeddefyS(x):
    """Return 1 if the embarkation code ``x`` equals 'S', otherwise 0."""
    return 1 if x == 'S' else 0


full['Embarked_C'] = full['Embarked'].map(embarkeddefyC)
full['Embarked_Q'] = full['Embarked'].map(embarkeddefyQ)
full['Embarked_S'] = full['Embarked'].map(embarkeddefyS)
full.drop('Embarked', axis=1, inplace=True)

# ---------------------------------------------------------------------------
# Pclass: one-hot encode with the column prefix "Pclass"
# ---------------------------------------------------------------------------
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (recent versions default to bool).
pclassDf = pd.get_dummies(full['Pclass'], prefix='Pclass', dtype=int)
full = pd.concat([full, pclassDf], axis=1)
full.drop('Pclass', axis=1, inplace=True)


# ---------------------------------------------------------------------------
# Name: extract the title and group it into a few categories
# ---------------------------------------------------------------------------
def getTitle(name):
    """Return the title part of a passenger name.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed, e.g. ``'Braund, Mr. Owen'`` -> ``'Mr'``.
    Raises IndexError if ``name`` contains no comma.
    """
    str1 = name.split(',')[1]
    str2 = str1.split('.')[0]
    str3 = str2.strip()
    return str3


title_mapDict = {
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'Dr': 'Officer',
    'Rev': 'Officer',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Mlle': 'Miss',
    'Ms': 'Mrs',
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Miss': 'Miss',
    'Master': 'Master',
    'Lady': 'Royalty',
}

# Titles missing from title_mapDict map to NaN and therefore get no indicator.
titles = full['Name'].map(getTitle).map(title_mapDict)
Name = pd.get_dummies(titles, dtype=int)
full = pd.concat([full, Name], axis=1)
full.drop('Name', axis=1, inplace=True)


# ---------------------------------------------------------------------------
# Cabin: keep only the first character and one-hot encode it
# ---------------------------------------------------------------------------
def Cabinchange(x):
    """Return the first character of the cabin string ``x``."""
    return x[0]


full['Cabin'] = full['Cabin'].map(Cabinchange)
Cabinin = pd.get_dummies(full['Cabin'], prefix='Cabin', dtype=int)
full = pd.concat([full, Cabinin], axis=1)

# ---------------------------------------------------------------------------
# Family size: Parch + SibSp + the passenger, bucketed into three groups
# ---------------------------------------------------------------------------
familyDf = pd.DataFrame()
familyDf['FamilySize'] = full['Parch'] + full['SibSp'] + 1
familyDf['Family_Single'] = familyDf['FamilySize'].map(lambda s: 1 if s == 1 else 0)
familyDf['Family_Small'] = familyDf['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
familyDf['Family_Large'] = familyDf['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
full = pd.concat([full, familyDf], axis=1)

# ---------------------------------------------------------------------------
# Correlation of every numeric column with Survived
# ---------------------------------------------------------------------------
# `full` still holds string columns (e.g. the Cabin letter), which corr()
# rejects on recent pandas versions, so restrict it to numeric/bool columns.
corrDf = full.select_dtypes(include=['number', 'bool']).corr()
survived_corr = corrDf['Survived'].sort_values(ascending=False)

# ---------------------------------------------------------------------------
# Build the feature matrix
# ---------------------------------------------------------------------------
full_X = pd.concat(
    [
        Name,  # title indicators
        pclassDf,
        familyDf,
        full['Fare'],
        Cabinin,
        full['Embarked_C'],
        full['Embarked_Q'],
        full['Embarked_S'],
        full['Sex'],
    ],
    axis=1,
)

# The first len(train) rows of `full` are the labelled training rows; the
# remaining rows are the passengers to predict.
sourceRow = len(train)
source_X = full_X.iloc[:sourceRow, :]
source_y = full.iloc[:sourceRow]['Survived']
# Explicit copy: a 'predict' column is added to pred_X further down.
pred_X = full_X.iloc[sourceRow:, :].copy()
print('原始数据集有多少行:', source_X.shape[0])
print('预测数据集有多少行:', pred_X.shape[0])

train_X, test_X, train_y, test_y = train_test_split(
    source_X, source_y, train_size=0.8, random_state=33
)
print('原始数据集的特征:', source_X.shape,
      '训练数据集特征:', train_X.shape,
      '测试数据集特征:', test_X.shape)
print('原始数据集的标签:', source_y.shape,
      '训练数据集的标签:', train_y.shape,
      '测试数据集的标签:', test_y.shape)

# ---------------------------------------------------------------------------
# Try odd n_neighbors values from 1 to 19 and keep the best CV accuracy
# ---------------------------------------------------------------------------
k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, train_X, train_y, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Interpret the elapsed seconds in UTC so the local time-zone offset cannot
# leak into the minutes field of the formatted duration.
elapsed = time() - time0
elapsed_text = datetime.datetime.fromtimestamp(
    elapsed, tz=datetime.timezone.utc
).strftime("%M:%S:%f")
print('计算所用时长:%s' % elapsed_text)

best_score = max(cv_scores)
best_k = k_range[cv_scores.index(best_score)]
print('最高准确率:', best_score, ',对应的k值为:', best_k)

plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()

model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(train_X, train_y)

# For a classifier, score() is the accuracy on the held-out split.
print('模型得拟合程度为:', model.score(test_X, test_y))

# Predict survival for the unlabelled passengers and store it as integers.
pred_Y = model.predict(pred_X)
pred_Y = pred_Y.astype(int)


# ---------------------------------------------------------------------------
# Chart 1: predicted survival by sex
# ---------------------------------------------------------------------------
def get_counts(sequence):
    """Return a dict mapping each distinct item of ``sequence`` to its count."""
    return dict(Counter(sequence))


pred_X['predict'] = pred_Y
predicted_survivor = pred_X['predict'] == 1
is_male = pred_X['Sex'] == 1
is_female = pred_X['Sex'] == 0

sex_labels = ['男性', '男性存活人数', '女性', '女性存活人数']
# Counts come from the predictions, so the chart follows the fitted model.
sex_heights = [
    int(is_male.sum()),
    int((is_male & predicted_survivor).sum()),
    int(is_female.sum()),
    int((is_female & predicted_survivor).sum()),
]
plt.figure('fig1')
plt.title('男性与女性乘客生存比例')
plt.ylabel('人数')
plt.bar(sex_labels, sex_heights)

# ---------------------------------------------------------------------------
# Chart 2: predicted survival by passenger class
# ---------------------------------------------------------------------------
class_labels = list(pclassDf.columns)  # e.g. Pclass_1, Pclass_2, Pclass_3
x = np.arange(len(class_labels))
width = 0.1
# Left bar: passengers in the class; right bar: predicted survivors among them.
height1 = [int(pred_X[col].sum()) for col in class_labels]
height2 = [int(pred_X.loc[predicted_survivor, col].sum()) for col in class_labels]

plt.figure('fig2')
plt.title('不同客舱乘客生存比例')
plt.ylabel('人数')
plt.bar(x - width, height1, width)
plt.bar(x + width, height2, width)
plt.xticks(x, class_labels)

plt.show()