You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

248 lines
12 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import numpy as np
import pandas as pd
from sklearn import ensemble
# Load the raw obesity dataset and give the columns Chinese names.
# BUG FIX: the original path used a backslash ('data\Obesity...'); '\O'
# is an invalid escape sequence (SyntaxWarning on modern Python) and the
# literal backslash breaks the path on non-Windows systems.
db1 = pd.read_csv('data/ObesityDataSet_raw_and_data_sinthetic.csv', header=0,
                  names=['性别', '年龄', '身高', '体重', '家庭状况', '高热量食物',
                         '蔬菜食用频率', '主餐次数', '两餐之间的食物消耗量', '是否吸烟',
                         '每日水消耗量', '卡路里消耗', '身体活动频率',
                         '使用技术设备的时间', '酒精消耗量', '代步工具', '肥胖度'])
db1.head()
print(db1)
# Work on a de-duplicated copy of the raw frame, then re-encode every
# categorical column as a numeric string.  The mappings are identical to
# the original chain of replace() calls, expressed as one dict per column.
db2 = db1.copy()
db2.drop_duplicates(inplace=True)
db2.index = range(len(db2))

_encodings = {
    '性别': {'Female': '1', 'Male': '0'},
    '家庭状况': {'yes': '1', 'no': '0'},
    '高热量食物': {'yes': '1', 'no': '0'},
    '两餐之间的食物消耗量': {'no': '3', 'Frequently': '2', 'Sometimes': '1', 'Always': '0'},
    '是否吸烟': {'yes': '1', 'no': '0'},
    '卡路里消耗': {'yes': '1', 'no': '0'},
    '酒精消耗量': {'no': '0', 'Frequently': '1', 'Sometimes': '2', 'Always': '3'},
    '代步工具': {'Walking': '1', 'Public_Transportation': '2', 'Bike': '3',
             'Motorbike': '4', 'Automobile': '5'},
    '肥胖度': {'Insufficient_Weight': '1', 'Normal_Weight': '2', 'Obesity_Type_I': '3',
            'Obesity_Type_II': '4', 'Obesity_Type_III': '5',
            'Overweight_Level_I': '6', 'Overweight_Level_II': '7'},
}
for _col, _mapping in _encodings.items():
    db2[_col].replace(_mapping, inplace=True)

print(db2.head())

# Per-class sample counts of the target, ordered by class code 1..7.
db3 = db2['肥胖度'].value_counts()
db3.sort_index(inplace=True)
print(db3)
#########################################
# Bar chart of the class distribution of the obesity-level target.
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'     # font that can render Chinese labels
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(10, 10))
# BUG FIX: tick labels must follow the numeric codes assigned above
# (1=Insufficient, 2=Normal, 3..5=Obesity I-III, 6..7=Overweight I-II);
# the original list placed the overweight labels before the obesity ones,
# mislabelling five of the seven bars.
db2_Lable = ['体重不足', '正常体重', 'I级肥胖症', 'II型肥胖症', 'III型肥胖症', '超重水平I', '超重水平II']
plt.title('肥胖度类别数据分类')
plt.bar(x=range(1, 8), height=db3, width=1, color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(1, 8), db2_Lable)
plt.xlabel('肥胖度')
plt.ylabel('人数')
plt.legend(db2_Lable)
# BUG FIX: forward-slash path ('\分' is an invalid escape sequence).
plt.savefig('data/分类平衡表.png')
plt.show()
# Correlation analysis.
# NOTE(review): at this point the encoded columns still hold strings, so
# this first corr() presumably covers only the originally-numeric columns
# — the dtypes are printed below precisely to check that.
db_corr=db2.corr()
print('原始数据的数据类型:','\n',db2.dtypes)
print('相关性矩阵:','\n',db_corr,'\n','相关性矩阵的数据类型:','\n',db_corr.dtypes)
# Cast every string-encoded column to float so all columns enter the
# Spearman correlation, then keep each feature's correlation with 体重.
db2=db2.astype(float)
db_corr=db2.corr(method='spearman')
db_corr_height=db_corr['体重']
print(db_corr_height)
####################################
# Heatmap of the full Spearman correlation matrix, plus a bar chart of
# each feature's correlation with body weight.
import seaborn as sb
ax = sb.heatmap(db_corr, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# BUG FIX: forward-slash paths ('\数' / '\体' are invalid escape sequences
# and the literal backslash is non-portable).
plt.savefig('data/数据相关性.png')
plt.show()
plt.bar(range(1, 18), db_corr_height, width=0.5, color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(1, 18), db2.columns, rotation=90)
plt.savefig('data/体重相关性.png')
plt.show()
#####################################
# Drop the weakly-related columns, then make an 80/20 train/test split
# of the remaining features against the obesity-level target.
Finl_data = db2.drop(
    labels=['性别', '是否吸烟', '身体活动频率', '使用技术设备的时间', '代步工具', '主餐次数'],
    axis=1)
print(Finl_data.columns)
print(Finl_data.head())
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
data_x = Finl_data.drop(labels='肥胖度', axis=1)
data_y = Finl_data['肥胖度']
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, train_size=0.8, random_state=6)
# Normalize the data.
# BUG FIX: fit the scaler on the training split only and apply the SAME
# fitted transform to the test split.  The original called fit_transform
# on the test set too, which leaks test-set statistics and scales the two
# splits inconsistently.
scaler = MinMaxScaler()
train_x_scaler = scaler.fit_transform(train_x)
test_x_scaler = scaler.transform(test_x)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
# Search uniform weights over k only, and distance weights over k and the
# Minkowski power p, with 5-fold cross-validation.
search_space = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 20)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 20)),
        'p': list(range(1, 6)),
    },
]
classifier = GridSearchCV(knn, search_space, cv=5, verbose=1, n_jobs=-1)
classifier.fit(train_x_scaler, train_y)
print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)
# Evaluate the best estimator on the held-out test split.
classifier_pred = classifier.predict(test_x_scaler)
true = np.sum(classifier_pred == test_y)
print('预测准确的总数:', true)
print('准确率:', true / test_y.shape[0])
from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告', '\n', classification_report(test_y, classifier_pred))
from sklearn.metrics import confusion_matrix
a = confusion_matrix(test_y, classifier_pred, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a)
# Column-normalize: each column then shows, among samples predicted as
# that class, the fraction belonging to each true class (vectorized
# replacement of the original per-column loop; identical values).
a = a.astype(float)
a /= a.sum(axis=0, keepdims=True)
ax = sb.heatmap(a, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# BUG FIX: forward-slash path ('\混' is an invalid escape sequence).
plt.savefig('data/混淆矩阵.png')
plt.show()
#####################################
# Error analysis: join the test features, the true label, and the KNN
# prediction; inspect samples whose true class is Normal (2) but were
# predicted Overweight_Level_I (6).
data_6 = pd.concat([test_x, test_y], axis=1)
data_6.index = range(len(data_6))
pred_a = pd.DataFrame(classifier_pred)
pred_a.columns = ['肥胖度预测']
pred_a.index = range(len(pred_a))
pred_b = pd.concat([data_6, pred_a], axis=1)
print(pred_b)
a = pred_b.loc[(pred_b['肥胖度'] == 2) & (pred_b['肥胖度预测'] == 6)]
print(a[['家庭状况', '高热量食物', '两餐之间的食物消耗量', '每日水消耗量', '蔬菜食用频率', '卡路里消耗']])
# Class distribution for people WITH a family history of overweight.
b = Finl_data.loc[Finl_data['家庭状况'] == 1]
# BUG FIX: labels ordered by class code 1..7 (obesity levels before
# overweight levels, matching the encoding applied earlier); the original
# order mislabelled the bars.
b_Lable = ['体重不足', '正常体重', 'I级肥胖症', 'II型肥胖症', 'III型肥胖症', '超重水平I', '超重水平II']
b_1 = b['肥胖度'].value_counts()
b_1.sort_index(inplace=True)
plt.title('肥胖度类别数据分类')
plt.bar(x=range(len(b_1)), height=b_1, width=1, color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
# BUG FIX: label only the classes actually present in this subgroup
# instead of assuming all seven occur.
plt.xticks(range(len(b_1)), [b_Lable[int(i) - 1] for i in b_1.index])
plt.xlabel('肥胖度')
plt.ylabel('人数')
# BUG FIX: forward-slash path ('\家' is an invalid escape sequence).
plt.savefig('data/家中有肥胖人群.png')
plt.show()
###########
# Class distribution for people WITHOUT a family history of overweight.
###########
b_2_ = Finl_data.loc[Finl_data['家庭状况'] == 0]
b_2 = b_2_['肥胖度'].value_counts()
b_2.sort_index(inplace=True)
# BUG FIX: labels ordered by class code 1..7 (the encoding puts the
# obesity levels at 3..5 and overweight levels at 6..7), and only the
# classes that actually occur in this subgroup are labelled.
b_2_labels = ['体重不足', '正常体重', 'I级肥胖症', 'II型肥胖症', 'III型肥胖症', '超重水平I', '超重水平II']
plt.title('肥胖度类别数据分类')
plt.bar(x=range(len(b_2)), height=b_2, width=1, color=['r', 'g', 'b', 'c', 'm', 'y', 'k'])
plt.xticks(range(len(b_2)), [b_2_labels[int(i) - 1] for i in b_2.index])
plt.xlabel('肥胖度')
plt.ylabel('人数')
# BUG FIX: forward-slash path (invalid escape sequence in the original).
plt.savefig('data/家中无肥胖人群.png')
plt.show()
###############################
# KNN on the subgroup WITH a family history of overweight (家庭状况 == 1).
data_x_00 = Finl_data.loc[Finl_data["家庭状况"] == 1]
# BUG FIX: the original dropped '肥胖度' and then immediately overwrote
# that result by dropping only '家庭状况' from the raw subgroup — leaving
# the target inside the feature matrix (label leakage).  Drop both.
data_x_0 = data_x_00.drop(labels=['肥胖度', '家庭状况'], axis=1)
data_y_0 = data_x_00['肥胖度']
train_x_0, test_x_0, train_y_0, test_y_0 = train_test_split(
    data_x_0, data_y_0, train_size=0.8, random_state=6)
# BUG FIX: transform (not fit_transform) the test split with the scaler
# fitted on the training split.
train_x_scaler_0 = scaler.fit_transform(train_x_0)
test_x_scaler_0 = scaler.transform(test_x_0)
classifier.fit(train_x_scaler_0, train_y_0)
print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)
classifier_pred_0 = classifier.predict(test_x_scaler_0)
true_0 = np.sum(classifier_pred_0 == test_y_0)
print('预测准确的总数:', true_0)
print('准确率:', true_0 / test_y_0.shape[0])
from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告', '\n', classification_report(test_y_0, classifier_pred_0))
a_00 = confusion_matrix(test_y_0, classifier_pred_0, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a_00)
# Column-normalize (same values as the original per-column loop).
a_00 = a_00.astype(float)
a_00 /= a_00.sum(axis=0, keepdims=True)
ax = sb.heatmap(a_00, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
# BUG FIX: this is the WITH-family-history group; the original saved it
# as '家中无...', which also collided with (and was overwritten by) the
# next section's file of the same name.
plt.savefig('data/家中有肥胖人群混淆矩阵.png')
plt.show()
#####################################
# KNN on the subgroup WITHOUT a family history of overweight (家庭状况 == 0).
data_x_11 = Finl_data.loc[Finl_data["家庭状况"] == 0]
# BUG FIX: drop both the target and the (now constant) grouping column;
# the original's second drop overwrote the first, keeping '肥胖度' in X
# (label leakage).
data_x_1 = data_x_11.drop(labels=['肥胖度', '家庭状况'], axis=1)
data_y_1 = data_x_11['肥胖度']
train_x_1, test_x_1, train_y_1, test_y_1 = train_test_split(
    data_x_1, data_y_1, train_size=0.8, random_state=6)
# BUG FIX: no fitting on the test split.
train_x_scaler_1 = scaler.fit_transform(train_x_1)
test_x_scaler_1 = scaler.transform(test_x_1)
classifier.fit(train_x_scaler_1, train_y_1)
print(classifier.best_estimator_, '\n', '准确度:', classifier.best_score_, '\n', classifier.best_params_)
classifier_pred_1 = classifier.predict(test_x_scaler_1)
true_1 = np.sum(classifier_pred_1 == test_y_1)
print('预测准确的总数:', true_1)
print('准确率:', true_1 / test_y_1.shape[0])
from sklearn.metrics import classification_report
print('使用KNN预测肥胖度数据的分类报告', '\n', classification_report(test_y_1, classifier_pred_1))
a_11 = confusion_matrix(test_y_1, classifier_pred_1, labels=[1, 2, 3, 4, 5, 6, 7])
print('生成混淆矩阵:', '\n', a_11)
a_11 = a_11.astype(float)
# BUG FIX: the original loop ran range(1,7), leaving column 0
# un-normalized; normalize every column.
a_11 /= a_11.sum(axis=0, keepdims=True)
ax = sb.heatmap(a_11, linewidths=0.05, vmax=1, vmin=0, cmap="RdBu_r",
                annot=True, annot_kws={'size': 6, 'weight': 'bold'})
plt.savefig('data/家中无肥胖人群混淆矩阵.png')
plt.show()
#####################################交叉验证得分
# Baseline random forest: mean accuracy over 10-fold cross-validation on
# the full (min-max scaled) feature matrix.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
data_RanForest_x = Finl_data.drop(labels='肥胖度', axis=1)
data_RanForest_y = Finl_data['肥胖度']
data_RanForest_x_scaler = scaler.fit_transform(data_RanForest_x)
rfc = RandomForestClassifier(n_estimators=100)
rfc_score_1 = cross_val_score(rfc, data_RanForest_x_scaler, data_RanForest_y, cv=10).mean()
print('10折交叉验证得分', rfc_score_1)
#################################
# Sweep n_estimators from 1 to 200 and record the mean 10-fold CV score
# for each forest size.
rfc_score_n_estimators = []
for n_trees in range(1, 201):
    rfc = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1)
    score = cross_val_score(rfc, data_RanForest_x_scaler, data_RanForest_y, cv=10).mean()
    rfc_score_n_estimators.append(score)
score_max_1 = max(rfc_score_n_estimators)
# BUG FIX: list index i holds the score for n_estimators = i + 1; the
# original printed the raw index, under-reporting the tree count by one.
rfc_max_n_estimators = rfc_score_n_estimators.index(score_max_1) + 1
print('最好得分:', format(score_max_1), '\n',
      '子树数量:', rfc_max_n_estimators)
plt.figure(figsize=[30, 5])
plt.plot(range(1, 201), rfc_score_n_estimators, 'bo-')
plt.xticks(range(1, 201), rotation=90)
# BUG FIX: forward-slash path ('\选' is an invalid escape sequence).
plt.savefig('data/选择子树得分统计表.png')
plt.show()
#################################
################################
# Tune tree shape (max_features, max_depth) at the chosen forest size
# with a 10-fold grid search.
depth_feature_grid = [{
    'max_features': list(range(5, 12)),
    'max_depth': list(range(1, 20)),
}]
rfc = RandomForestClassifier(n_estimators=139, random_state=90)
rfcgs = GridSearchCV(rfc, depth_feature_grid, cv=10, verbose=1, n_jobs=-1)
rfcgs.fit(data_RanForest_x_scaler, data_RanForest_y)
print(rfcgs.best_estimator_, '\n', '准确度:', rfcgs.best_score_, '\n', rfcgs.best_params_)
################################
# Tune the split/leaf minimums, holding the depth and feature count found
# by the previous search fixed.
leaf_split_grid = [{
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2, 3],
}]
rfc = RandomForestClassifier(n_estimators=139, random_state=90, max_depth=13, max_features=6)
rfcgs = GridSearchCV(rfc, leaf_split_grid, cv=10, verbose=1, n_jobs=-1)
rfcgs.fit(data_RanForest_x_scaler, data_RanForest_y)
print(rfcgs.best_estimator_, '\n', '准确度:', rfcgs.best_score_, '\n', rfcgs.best_params_)
################
# Fit the final forest with out-of-bag scoring, export its first tree to
# Graphviz (swapping in a font that can render Chinese feature names),
# and report the OOB accuracy.
rfc = RandomForestClassifier(n_estimators=139, random_state=90, max_depth=13,
                             max_features=6, oob_score=True)
rfc = rfc.fit(data_RanForest_x_scaler, data_RanForest_y)
from sklearn.tree import export_graphviz
import graphviz
# Class names in encoding order 1..7 (matches the mapping applied above).
obesity_classes = ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
                   'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
                   'Overweight_Level_II']
dot_data = export_graphviz(rfc.estimators_[0], out_file='tree.dot',
                           feature_names=data_x.columns.tolist(),
                           class_names=obesity_classes,
                           special_characters=True, rounded=True,
                           proportion=False, filled=True, precision=2)
with open('tree.dot', encoding='utf-8') as f:
    dot_graph = f.read()
graph = graphviz.Source(dot_graph.replace('helvetica', 'FangSong'))
graph.render('tree')
print('使用袋外数据预测准确度:', rfc.oob_score_)