#### Load the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# /data/shixunfiles/21979737119eb4fafd62cae509c0c571_1602468291676.csv

# Read in the data
data = pd.read_csv("/data/shixunfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")

# Show the size of the data set
print("Data set size:", data.shape)

# Detailed information about the data set
print(data.info())
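# A quick follow-up check suggested by the info() output: count the missing
# values per column (Age, Cabin and Embarked are the usual gaps in this dataset).
print(data.isnull().sum())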
# #### 2. Show the first 10 rows of the data

print(data.head(10))
# #### 3. Show the survival ratio of male vs. female passengers and visualize it with a bar chart

# Count survivors and deaths among the rows whose label is not missing
survived = 0
dead = 0
for i in data[pd.notnull(data['Survived'])]['Survived']:
    if i == 1.0:
        survived += 1
    else:
        dead += 1
plt.bar([0, 1], [dead, survived])
plt.xticks([0, 1], ['dead', 'survived'])
plt.show()
print('Number of survivors:', survived)
print('Number of deaths:', dead)
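# The same totals can be read off without a loop; a minimal vectorized
# alternative using value_counts (index 1.0 -> survived, 0.0 -> dead):
print(data['Survived'].value_counts(dropna=True))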
# Keep the rows whose label is missing
data_no = data[pd.isnull(data['Survived'])]
print(data_no.shape)

# Keep the rows whose label is not missing, and reset the index so the
# positional .loc lookups below remain valid
data = data[pd.notnull(data['Survived'])].reset_index(drop=True)
print(data.shape)
# Drop the features that will not be used
drops = ['Cabin', 'Embarked', 'Fare', 'Name', 'Ticket', 'SibSp', 'Parch', 'PassengerId']
for drop in drops:
    data = data.drop(drop, axis=1)

data.head()
survived_male = 0
survived_female = 0
dead = 0  # passengers who did not survive (either sex)
for i in range(len(data)):
    if data.loc[i, 'Survived'] == 1.0 and data.loc[i, 'Sex'] == 'male':
        survived_male += 1
    elif data.loc[i, 'Survived'] == 1.0 and data.loc[i, 'Sex'] == 'female':
        survived_female += 1
    else:
        dead += 1
print(survived_male, survived_female)
plt.bar([0, 1], [survived_male, survived_female])
plt.xticks([0, 1], ['survived_male', 'survived_female'])
# Annotate each bar with its count
for i, j in zip([0, 1], [survived_male, survived_female]):
    plt.text(i - 0.05, j + 1, j)
plt.title('Survival counts of male vs. female passengers')
plt.show()
# One-hot encode 'Sex': every category value becomes a new column appended to
# the data, named with the prefix 'Sex_' plus the category name
data = pd.concat([data, pd.get_dummies(data['Sex'], prefix='Sex')], axis=1)
data = data.drop('Sex', axis=1)  # axis=0 -> rows, axis=1 -> columns
data.head()
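# A minimal sketch of what get_dummies produces (toy input; recent pandas
# versions return booleans rather than 0/1 integers):
# pd.get_dummies(pd.Series(['male', 'female']), prefix='Sex')
#      Sex_female  Sex_male
#  0            0         1
#  1            1         0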
# Separate the label from the feature values
data.dropna(inplace=True)     # drop the remaining rows with missing values
print(data.isnull().any())    # sanity check: no missing values left
data_feature = data.drop('Survived', axis=1)
data_label = data['Survived']
data_label = np.array(data_label)
data_label.shape
data_feature.head()
# Standardize the data (zero mean, unit variance per feature)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_feature = scaler.fit_transform(data_feature)
data_feature = pd.DataFrame(data_feature)
data_feature.head()
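# Optional sanity check: after StandardScaler every column should have mean ~0
# and standard deviation ~1 (ddof=0 matches the scaler's population formula).
print(data_feature.mean(axis=0).round(6))
print(data_feature.std(axis=0, ddof=0).round(6))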
#### Load the data and labels

knn_data = data_feature
knn_label = data_label
knn_label.shape
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(knn_data, knn_label, random_state=2020, test_size=0.25)
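# Quick check on the resulting split; passing stratify=knn_label to
# train_test_split would preserve the class ratio exactly, at the cost of a
# different split than the one fixed by random_state=2020.
print(X_train.shape, X_test.shape)
print('train positive rate:', y_train.mean(), '| test positive rate:', y_test.mean())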
# Build the KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
score = knn.score(X_train, y_train)
print('Accuracy on the training set: {:.3}%'.format(score * 100))
score = knn.score(X_test, y_test)
print('Accuracy on the test set: {:.3}%'.format(score * 100))
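# Accuracy alone hides the per-class behaviour; a short sketch with sklearn's
# classification_report shows precision/recall/F1 for each class.
from sklearn.metrics import classification_report
print(classification_report(y_test, knn.predict(X_test)))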
# Cross-validation
from sklearn.model_selection import cross_val_score
from time import time
import datetime

k_range = range(1, 21, 2)
cv_scores = []
time0 = time()
# Try each odd k from 1 to 19 and record the mean cross-validation score
for n in k_range:
    print(n)
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
print('Elapsed time: %s' % (datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f")))
print('Best accuracy:', max(cv_scores), ', achieved at k =', k_range[cv_scores.index(max(cv_scores))])
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
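# The same search expressed with GridSearchCV (a sketch; it also refits the
# best model on the full training set automatically):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(KNeighborsClassifier(),
                    {'n_neighbors': list(range(1, 21, 2))},
                    cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)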
# Search a narrower range for the best k
k_range = range(1, 9, 2)
cv_scores = []
time0 = time()
for n in k_range:
    print(n)
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
print('Elapsed time: %s' % (datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f")))
print('Best accuracy:', max(cv_scores), ', achieved at k =', k_range[cv_scores.index(max(cv_scores))])
plt.plot(k_range, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
# Evaluate on the test set
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
score = knn.score(X_train, y_train)
print('Accuracy on the training set: {:.3}%'.format(score * 100))
score = knn.score(X_test, y_test)
print('Accuracy on the test set: {:.3}%'.format(score * 100))
# ROC
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
# y_test_hot = label_binarize(y_test, classes=(0, 1))
knn_y_score = knn.predict_proba(X_test)
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_y_score[:, 1], pos_label=1)
# FPR belongs on the x-axis, TPR on the y-axis
plt.plot(knn_fpr, knn_tpr,
         label='KNN ROC curve',
         color='g', linewidth=4)
plt.plot([0, 1], [0, 1], 'r--', lw=2)  # chance line
plt.legend()
plt.title('knn roc')
plt.show()
# AUC
knn_auc = auc(knn_fpr, knn_tpr)
knn_auc
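# roc_auc_score computes the same number directly from labels and scores,
# which makes a convenient cross-check:
print(roc_auc_score(y_test, knn_y_score[:, 1]))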
# #### 4. Show the survival ratio of passengers in different cabin classes and visualize it with a bar chart

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# /data/shixunfiles/21979737119eb4fafd62cae509c0c571_1602468291676.csv

# Read in the data
data = pd.read_csv("/data/shixunfiles/2309cc5f04782ed9bb6016d9f4e381cf_1607607386535.csv")

survival_pclass = data.groupby('Pclass')['Survived'].mean()
print("\nSurvival ratio of passengers in different cabin classes:")
print(survival_pclass)

# Bar chart visualization
survival_pclass.plot(kind='bar')
plt.title('Survival ratio of passengers in different cabin classes')
plt.ylabel('Survival ratio')
plt.show()
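# The raw counts behind those ratios, via a quick cross-tabulation:
print(pd.crosstab(data['Pclass'], data['Survived']))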
# Neural network
cnn_data = np.asarray(data_feature)
cnn_label = np.asarray(data_label)
cnn_data.shape
# Hold out everything after the first 500 samples as the test set
x_train = cnn_data[:500]
y_train = cnn_label[:500]
x_test = cnn_data[500:]
y_test = cnn_label[500:]
from keras import models
from keras import layers

model = models.Sequential()
## Input layer, ReLU activation
model.add(layers.Dense(16, activation='relu', input_shape=(4,)))
## Hidden layer, ReLU activation
model.add(layers.Dense(16, activation='relu'))
## Output layer: a single sigmoid unit for binary classification
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Carve a validation set out of the training data
x_val = x_train[:350]
partial_x_train = x_train[350:]

y_val = y_train[:350]
partial_y_train = y_train[350:]

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
history_dict = history.history
history_dict.keys()
results = model.evaluate(x_test, y_test)
results
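# results is [test loss, test accuracy]; individual survival probabilities can
# be inspected with predict, e.g. for the first five test passengers:
print(model.predict(x_test[:5]))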
## Loss values
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
## Epoch numbers
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
## Accuracy curves; note that newer Keras versions store these under
## 'accuracy'/'val_accuracy' instead of 'acc'/'val_acc' -- check history_dict.keys()
acc = history_dict['acc']
val_acc = history_dict['val_acc']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and Validation acc')
plt.xlabel('Epochs')
plt.ylabel('acc')
plt.legend()
plt.show()
# Candidate layer widths and epoch counts for a small grid search
nums1 = [16, 32, 64]
nums2 = [16, 32, 64]
epochs = list(range(7, 31))
# The full grid above takes a long time to run, so the best parameters we
# found are pinned in the three commented lines below; uncomment them to skip
# the full search, or leave them commented to run the complete training.
#nums1 = [32]
#nums2 = [32]
#epochs = [10]
best_result = 0
best_param = []
## Try different hidden-layer widths and epoch counts, keeping the parameters
## of the best test result
for num1 in nums1:
    for num2 in nums2:
        for epoch in epochs:
            model = models.Sequential()
            model.add(layers.Dense(num1, activation='relu', input_shape=(4,)))
            model.add(layers.Dense(num2, activation='relu'))
            model.add(layers.Dense(1, activation='sigmoid'))

            model.compile(optimizer='rmsprop',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            history = model.fit(x_train, y_train, epochs=epoch, batch_size=512)

            results = model.evaluate(x_test, y_test)
            if best_result < results[1]:
                best_result = results[1]
                best_param = [num1, num2, epoch]

print('Best accuracy:', best_result)
print('Best parameters:', best_param)
# Refine the best value for the first parameter
nums1 = [32, 64, 128, 256]
nums2 = [32]
epochs = [8]
# As above, the best parameters we found are pinned in the commented lines
# below; uncomment them to skip this search as well.
#nums1 = [32]
#nums2 = [32]
#epochs = [10]
best_result = 0
best_param = []
## Same search again, now varying only the first layer's width
for num1 in nums1:
    for num2 in nums2:
        for epoch in epochs:
            model = models.Sequential()
            model.add(layers.Dense(num1, activation='relu', input_shape=(4,)))
            model.add(layers.Dense(num2, activation='relu'))
            model.add(layers.Dense(1, activation='sigmoid'))

            model.compile(optimizer='rmsprop',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

            history = model.fit(x_train, y_train, epochs=epoch, batch_size=512)

            results = model.evaluate(x_test, y_test)
            if best_result < results[1]:
                best_result = results[1]
                best_param = [num1, num2, epoch]

print('Best accuracy:', best_result)
print('Best parameters:', best_param)
# Train the final model with the parameters found above
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(4,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=8, batch_size=512)
results = model.evaluate(x_test, y_test)
results
# Save the trained model
model.save('cnn_model.h5')
# Load it back
from keras.models import load_model
model = load_model('cnn_model.h5')
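# Optional round-trip check: the reloaded model should reproduce the saved
# model's evaluation results exactly.
print(model.evaluate(x_test, y_test))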
# Confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(x_test)
y_pred = y_pred.reshape(-1)
# Threshold the sigmoid outputs at 0.5 to get hard class labels
for i, pred in enumerate(y_pred):
    if pred > 0.5:
        y_pred[i] = 1.0
    else:
        y_pred[i] = 0.0
print(y_pred.shape)
print(y_test.shape)
print(confusion_matrix(y_test, y_pred))
# ROC
from sklearn.metrics import roc_curve, auc, roc_auc_score
# With a single sigmoid output, model.predict already returns P(class 1)
model_y_score = model.predict(x_test)
model_y_score = model_y_score.reshape(-1)
model_fpr, model_tpr, _ = roc_curve(y_test, model_y_score, pos_label=1)
plt.plot(model_fpr, model_tpr,
         label='model ROC curve',
         color='g', linewidth=4)
plt.plot([0, 1], [0, 1], 'r--', lw=2)  # chance line
plt.legend()
plt.title('model roc')
plt.show()
# AUC
model_auc = auc(model_fpr, model_tpr)
model_auc
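# Side-by-side AUC comparison of the two classifiers. Note the two models were
# evaluated on different test splits, so this is indicative rather than strict.
print('knn auc:', knn_auc, '| nn auc:', model_auc)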