|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from sklearn import ensemble
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Load the raw obesity dataset and rename columns to Chinese labels.
# Path uses '/' -- the original 'data\Obesity...' contained the invalid
# escape sequence '\O' (SyntaxWarning in modern Python, error in future).
db1 = pd.read_csv('data/ObesityDataSet_raw_and_data_sinthetic.csv', header=0,
                  names=['性别', '年龄', '身高', '体重', '家庭状况', '高热量食物',
                         '蔬菜食用频率', '主餐次数', '两餐之间的食物消耗量', '是否吸烟',
                         '每日水消耗量', '卡路里消耗', '身体活动频率', '使用技术设备的时间',
                         '酒精消耗量', '代步工具', '肥胖度'])

db1.head()  # no-op outside a notebook; kept for parity with the original

print(db1)

# Work on a copy so the raw frame stays untouched.
db2 = db1.copy()

# Drop exact duplicate rows and rebuild a dense 0..n-1 index.
db2.drop_duplicates(inplace=True)
db2.index = range(len(db2))

# Encode categorical columns as numeric strings (cast to float further down).
db2['性别'].replace(['Female', 'Male'], ['1', '0'], inplace=True)
db2['家庭状况'].replace(['yes', 'no'], ['1', '0'], inplace=True)
db2['高热量食物'].replace(['yes', 'no'], ['1', '0'], inplace=True)
db2['两餐之间的食物消耗量'].replace(['no', 'Frequently', 'Sometimes', 'Always'],
                         ['3', '2', '1', '0'], inplace=True)
db2['是否吸烟'].replace(['yes', 'no'], ['1', '0'], inplace=True)
db2['卡路里消耗'].replace(['yes', 'no'], ['1', '0'], inplace=True)
db2['酒精消耗量'].replace(['no', 'Frequently', 'Sometimes', 'Always'],
                    ['0', '1', '2', '3'], inplace=True)
db2['代步工具'].replace(['Walking', 'Public_Transportation', 'Bike', 'Motorbike', 'Automobile'],
                    ['1', '2', '3', '4', '5'], inplace=True)
# Target encoding: 1=Insufficient, 2=Normal, 3-5=Obesity I-III, 6-7=Overweight I-II.
db2['肥胖度'].replace(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
                    'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
                    'Overweight_Level_II'],
                   ['1', '2', '3', '4', '5', '6', '7'], inplace=True)

print(db2.head())

# Class counts of the target, ordered by the encoded label '1'..'7'.
db3 = db2['肥胖度'].value_counts()
db3.sort_index(inplace=True)
print(db3)
|
|
|
#########################################
|
|
|
import matplotlib.pyplot as plt

# Configure matplotlib for Chinese text (SimHei) and normal minus signs.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

p2 = plt.figure(figsize=(10, 10))

# Tick labels ordered to match the numeric encoding of 肥胖度:
# 1=体重不足, 2=正常体重, 3-5=肥胖症 I-III, 6-7=超重 I-II.
# BUG FIX: the original list put the 超重 labels before the 肥胖症 ones,
# mislabelling classes 3-7 on the x-axis.
db2_Lable = ['体重不足', '正常体重', 'I级肥胖症', 'II型肥胖症', 'III型肥胖症',
             '超重水平I', '超重水平II']

plt.title('肥胖度类别数据分类')
plt.bar(x=range(1, 8), height=db3, width=1, color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(1, 8), db2_Lable)
plt.xlabel('肥胖度')
plt.ylabel('人数')
plt.legend(db2_Lable)
# Forward slash avoids the unrecognised escape '\分' in the original path.
plt.savefig('data/分类平衡表.png')
plt.show()
|
|
|
# Pearson correlation on the (still partly string-typed) frame; pandas keeps
# only the columns it can treat as numeric here.
db_corr=db2.corr()

# Show dtypes before the cast so the string-encoded columns are visible.
print('原始数据的数据类型:','\n',db2.dtypes)

print('相关性矩阵:','\n',db_corr,'\n','相关性矩阵的数据类型:','\n',db_corr.dtypes)

# All columns are numeric strings by now, so a blanket float cast is safe.
db2=db2.astype(float)

# Re-compute on the full float frame with Spearman rank correlation
# (more appropriate for the ordinal-encoded categories).
db_corr=db2.corr(method='spearman')

# Correlation of every column with 体重 (weight) -- used for feature selection.
db_corr_height=db_corr['体重']

print(db_corr_height)
|
|
|
####################################
|
|
|
import seaborn as sb

# Spearman correlation heat-map of all numeric-encoded columns.
ax = sb.heatmap(db_corr, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# Forward slash avoids the unrecognised escape '\数' in the original path.
plt.savefig('data/数据相关性.png')
plt.show()

# Bar chart: each column's correlation with 体重 (17 columns -> x = 1..17).
plt.bar(range(1, 18), db_corr_height, width=0.5,
        color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(1, 18), db2.columns, rotation=90)
plt.savefig('data/体重相关性.png')
plt.show()
|
|
|
#####################################
|
|
|
# Drop the weakly-correlated features selected from the analysis above.
Finl_data = db2.drop(labels=['性别', '是否吸烟', '身体活动频率', '使用技术设备的时间',
                             '代步工具', '主餐次数'], axis=1)
print(Finl_data.columns)
print(Finl_data.head())

from sklearn.preprocessing import MinMaxScaler

# Feature matrix / target vector.
data_x = Finl_data.drop(labels='肥胖度', axis=1)
data_y = Finl_data['肥胖度']

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(data_x, data_y,
                                                    train_size=0.8,
                                                    random_state=6)

# 数据标准化 -- min-max scale to [0, 1].
# BUG FIX: fit the scaler on the TRAINING split only and re-use its min/max
# for the test split; the original called fit_transform on the test data,
# leaking test-set statistics into preprocessing.
scaler = MinMaxScaler()
train_x_scaler = scaler.fit_transform(train_x)
test_x_scaler = scaler.transform(test_x)
|
|
|
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# KNN hyper-parameter search: the uniform-weight sub-grid tunes k only,
# the distance-weight sub-grid additionally tunes the Minkowski power p.
base_knn = KNeighborsClassifier()

param_grid = [
    {'weights': ['uniform'],
     'n_neighbors': list(range(1, 20))},
    {'weights': ['distance'],
     'n_neighbors': list(range(1, 20)),
     'p': list(range(1, 6))},
]

# 5-fold cross-validated grid search over both sub-grids, all cores.
classifier = GridSearchCV(base_knn, param_grid, cv=5, verbose=1, n_jobs=-1)
classifier.fit(train_x_scaler, train_y)

print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)
|
|
|
# Evaluate the tuned KNN on the held-out test split.
classifier_pred = classifier.predict(test_x_scaler)

true = np.sum(classifier_pred == test_y)
print('预测准确的总数:', true)
print('准确率:', true / test_y.shape[0])

from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告:', '\n', classification_report(test_y, classifier_pred))

from sklearn.metrics import confusion_matrix
a = confusion_matrix(test_y, classifier_pred, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a)

# Normalise each column to sum to 1 (per-predicted-class proportions);
# vectorised equivalent of the original per-column loop.
a = a.astype(float)
a = a / a.sum(axis=0)

ax = sb.heatmap(a, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# Forward slash avoids the unrecognised escape '\混' in the original path.
plt.savefig('data/混淆矩阵.png')
plt.show()
|
|
|
#####################################
|
|
|
# Re-assemble the held-out set with its predictions for error inspection.
eval_df = pd.concat([test_x, test_y], axis=1)
eval_df.index = range(len(eval_df))

pred_df = pd.DataFrame(classifier_pred)
pred_df.columns = ['肥胖度预测']
pred_df.index = range(len(pred_df))

merged = pd.concat([eval_df, pred_df], axis=1)
print(merged)

# Rows whose true class is 2 but were predicted as 6 -- worst confusions.
mis_rows = merged.loc[(merged['肥胖度'] == 2) & (merged['肥胖度预测'] == 6)]
print(mis_rows[['家庭状况', '高热量食物', '两餐之间的食物消耗量', '每日水消耗量',
                '蔬菜食用频率', '卡路里消耗']])
|
|
|
# Distribution of 肥胖度 among respondents WITH a family history (家庭状况==1).
b = Finl_data.loc[Finl_data['家庭状况'] == 1]

# Mapping from the numeric target encoding to display labels.
# BUG FIX: the original flat list put the 超重 labels before the 肥胖症 ones,
# mislabelling classes 3-7; mapping by actual index value is also robust
# when a class is absent from this subset.
b_Lable = {1: '体重不足', 2: '正常体重', 3: 'I级肥胖症', 4: 'II型肥胖症',
           5: 'III型肥胖症', 6: '超重水平I', 7: '超重水平II'}

b_1 = b['肥胖度'].value_counts()
b_1.sort_index(inplace=True)

plt.title('肥胖度类别数据分类')
plt.bar(x=range(len(b_1)), height=b_1, width=1,
        color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(len(b_1)), [b_Lable[int(k)] for k in b_1.index])
plt.xlabel('肥胖度')
plt.ylabel('人数')
# Forward slash avoids the unrecognised escape in the original path.
plt.savefig('data/家中有肥胖人群.png')
plt.show()
|
|
|
###########
|
|
|
###########
|
|
|
###########
|
|
|
# Distribution of 肥胖度 among respondents WITHOUT a family history (家庭状况==0).
b_2_ = Finl_data.loc[Finl_data['家庭状况'] == 0]

b_2 = b_2_['肥胖度'].value_counts()
b_2.sort_index(inplace=True)

# Local label mapping keyed by the numeric target encoding.
# BUG FIX: the original reused a flat label list whose order did not match
# the encoding; mapping by the actual index values is also robust when a
# class is missing from this subset.
no_history_labels = {1: '体重不足', 2: '正常体重', 3: 'I级肥胖症', 4: 'II型肥胖症',
                     5: 'III型肥胖症', 6: '超重水平I', 7: '超重水平II'}

plt.title('肥胖度类别数据分类')
plt.bar(x=range(len(b_2)), height=b_2, width=1,
        color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(len(b_2)), [no_history_labels[int(k)] for k in b_2.index])
plt.xlabel('肥胖度')
plt.ylabel('人数')
# Forward slash avoids the unrecognised escape in the original path.
plt.savefig('data/家中无肥胖人群.png')
plt.show()
|
|
|
###############################
|
|
|
# ---- KNN model restricted to respondents WITH a family history (==1) ----
data_x_00 = Finl_data.loc[Finl_data["家庭状况"] == 1]

# BUG FIX: the original dropped '肥胖度', then immediately overwrote the
# result with a frame that only dropped '家庭状况' -- leaving the TARGET
# inside the feature matrix (label leakage). Drop both columns in one call.
data_x_0 = data_x_00.drop(labels=['肥胖度', '家庭状况'], axis=1)
data_y_0 = data_x_00['肥胖度']

train_x_0, test_x_0, train_y_0, test_y_0 = train_test_split(
    data_x_0, data_y_0, train_size=0.8, random_state=6)

# Fit the scaler on the training split only; re-use it for the test split
# (the original re-fit on the test data).
train_x_scaler_0 = scaler.fit_transform(train_x_0)
test_x_scaler_0 = scaler.transform(test_x_0)

classifier.fit(train_x_scaler_0, train_y_0)
print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)

classifier_pred_0 = classifier.predict(test_x_scaler_0)
true_0 = np.sum(classifier_pred_0 == test_y_0)
print('预测准确的总数:', true_0)
print('准确率:', true_0 / test_y_0.shape[0])

from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告:', '\n', classification_report(test_y_0, classifier_pred_0))

a_00 = confusion_matrix(test_y_0, classifier_pred_0, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a_00)

# Column-normalise: each column sums to 1 (per-predicted-class proportions).
a_00 = a_00.astype(float)
for i in range(0, 7):
    a_00[:, i] = a_00[:, i] / np.sum(a_00[:, i])

ax = sb.heatmap(a_00, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# BUG FIX: this subset is the family-history==1 group, so save as 家中有 --
# the original wrote 家中无, clobbering the other group's figure.
plt.savefig('data/家中有肥胖人群混淆矩阵.png')
plt.show()
|
|
|
#####################################
|
|
|
# ---- KNN model restricted to respondents WITHOUT a family history (==0) ----
data_x_11 = Finl_data.loc[Finl_data["家庭状况"] == 0]

# BUG FIX: drop both the target and the constant group column in one call;
# the original dropped '肥胖度' and then overwrote that result, leaving the
# target inside the feature matrix (label leakage).
data_x_1 = data_x_11.drop(labels=['肥胖度', '家庭状况'], axis=1)
data_y_1 = data_x_11['肥胖度']

train_x_1, test_x_1, train_y_1, test_y_1 = train_test_split(
    data_x_1, data_y_1, train_size=0.8, random_state=6)

# Fit the scaler on the training split only; re-use it for the test split
# (the original re-fit on the test data).
train_x_scaler_1 = scaler.fit_transform(train_x_1)
test_x_scaler_1 = scaler.transform(test_x_1)

classifier.fit(train_x_scaler_1, train_y_1)
print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)

classifier_pred_1 = classifier.predict(test_x_scaler_1)
true_1 = np.sum(classifier_pred_1 == test_y_1)
print('预测准确的总数:', true_1)
print('准确率:', true_1 / test_y_1.shape[0])

from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告:', '\n', classification_report(test_y_1, classifier_pred_1))

a_11 = confusion_matrix(test_y_1, classifier_pred_1, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a_11)

# BUG FIX: normalise ALL seven columns -- the original loop started at 1,
# leaving column 0 as raw counts (inconsistent with the other two
# confusion-matrix normalisations in this file).
a_11 = a_11.astype(float)
for i in range(0, 7):
    a_11[:, i] = a_11[:, i] / np.sum(a_11[:, i])

ax = sb.heatmap(a_11, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# Forward slash avoids the unrecognised escape in the original path.
plt.savefig('data/家中无肥胖人群混淆矩阵.png')
plt.show()
|
|
|
#####################################交叉验证得分
|
|
|
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Full feature matrix / target for the random-forest experiments; the whole
# matrix is min-max scaled at once (cross-validation handles the splitting).
data_RanForest_x = Finl_data.drop(labels='肥胖度', axis=1)
data_RanForest_y = Finl_data['肥胖度']
data_RanForest_x_scaler = scaler.fit_transform(data_RanForest_x)

# Baseline forest: 100 trees, scored by mean 10-fold CV accuracy.
rfc = RandomForestClassifier(n_estimators=100)
rfc_score_1 = cross_val_score(rfc, data_RanForest_x_scaler,
                              data_RanForest_y, cv=10).mean()
print('10折交叉验证得分:', rfc_score_1)
|
|
|
#################################
|
|
|
# Sweep n_estimators = 1..200 and record the mean 10-fold CV score for each.
rfc_score_n_estimators = []
for i in range(0, 200, 1):
    rfc = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1)
    score = cross_val_score(rfc, data_RanForest_x_scaler,
                            data_RanForest_y, cv=10).mean()
    rfc_score_n_estimators.append(score)

score_max_1 = max(rfc_score_n_estimators)
# BUG FIX: scores[i] was produced with n_estimators = i + 1, so the best
# tree count is index + 1 -- the original printed the raw list index,
# under-reporting the tree count by one.
rfc_max_n_estimators = rfc_score_n_estimators.index(score_max_1) + 1
print('最好得分:', format(score_max_1), '\n',
      '子树数量:', rfc_max_n_estimators)

# Score curve over the whole sweep.
plt.figure(figsize=[30, 5])
plt.plot(range(1, 201), rfc_score_n_estimators, 'bo-')
plt.xticks(range(1, 201), rotation=90)
# Forward slash avoids the unrecognised escape in the original path.
plt.savefig('data/选择子树得分统计表.png')
plt.show()
|
|
|
#################################
|
|
|
################################
|
|
|
# Stage-1 grid search: tune max_features and max_depth with the tree count
# fixed at 139 (chosen from the sweep above).
stage1_grid = [
    {
        'max_features': list(range(5, 12)),
        'max_depth': list(range(1, 20)),
    }
]

rfc = RandomForestClassifier(n_estimators=139, random_state=90)
rfcgs = GridSearchCV(rfc, stage1_grid, cv=10, verbose=1, n_jobs=-1)
rfcgs.fit(data_RanForest_x_scaler, data_RanForest_y)

print(rfcgs.best_estimator_, '\n', '准确度:', rfcgs.best_score_, '\n', rfcgs.best_params_)
|
|
|
################################
|
|
|
# Stage-2 grid search: with depth/features fixed from stage 1, tune the
# minimum split and leaf sizes.
stage2_grid = [
    {
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1, 2, 3],
    }
]

rfc = RandomForestClassifier(n_estimators=139, random_state=90,
                             max_depth=13, max_features=6)
rfcgs = GridSearchCV(rfc, stage2_grid, cv=10, verbose=1, n_jobs=-1)
rfcgs.fit(data_RanForest_x_scaler, data_RanForest_y)

print(rfcgs.best_estimator_, '\n', '准确度:', rfcgs.best_score_, '\n', rfcgs.best_params_)
|
|
|
################
|
|
|
# Final model with the tuned hyper-parameters; oob_score=True gives an
# accuracy estimate from the bootstrap out-of-bag samples.
rfc = RandomForestClassifier(n_estimators=139, random_state=90, max_depth=13,
                             max_features=6, oob_score=True)
rfc = rfc.fit(data_RanForest_x_scaler, data_RanForest_y)

from sklearn.tree import export_graphviz

# Render the first tree of the forest to 'tree.dot'.  NOTE: when out_file is
# given, export_graphviz returns None, so the original 'dot_data =' binding
# was dead and has been removed.
export_graphviz(rfc.estimators_[0], out_file='tree.dot',
                feature_names=data_x.columns.tolist(),
                class_names=['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
                             'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
                             'Overweight_Level_II'],
                special_characters=True, rounded=True, proportion=False,
                filled=True, precision=2)

import graphviz

# Swap the default font so the CJK feature names render correctly, then
# render the DOT source to 'tree' (PDF by default).
with open("tree.dot", encoding='utf-8') as f:
    dot_graph = f.read()

graph = graphviz.Source(dot_graph.replace("helvetica", "FangSong"))
graph.render('tree')

print('使用袋外数据预测准确度:', rfc.oob_score_)
|
|
|
######################################
|
|
|
|