# 6.4: Implementing a Decision Tree by Hand
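
This section implements the ID3 decision tree from scratch: first entropy, conditional entropy, and information gain, then a recursive tree builder and a classifier. The three quantities the code computes are the standard ones:

$$
H(D) = -\sum_k p_k \log_2 p_k, \qquad
H(D \mid A) = \sum_v \frac{|D_v|}{|D|} H(D_v), \qquad
g(D, A) = H(D) - H(D \mid A)
$$

where $p_k$ is the fraction of samples in $D$ with label $k$ and $D_v$ is the subset of $D$ whose feature $A$ takes the value $v$. At each node, the feature with the largest gain $g(D, A)$ is chosen as the split.
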
```python
# encoding=utf8
import numpy as np

# Compute the entropy of a label array
def calcInfoEntropy(label):
    '''
    input:
        label(ndarray): sample labels
    output:
        result(float): entropy of the labels
    '''
    label_set = set(label)
    result = 0
    for l in label_set:
        count = 0
        for j in range(len(label)):
            if label[j] == l:
                count += 1
        # Probability of this label in the dataset
        p = count / len(label)
        # Accumulate -p * log2(p)
        result -= p * np.log2(p)
    return result
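
# Sanity check: a 50/50 binary label split carries exactly 1 bit of entropy,
# e.g. calcInfoEntropy(np.array([0, 0, 1, 1])) == 1.0, while a pure array
# such as np.array([1, 1, 1]) gives 0.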

# Compute one weighted term of the conditional entropy H(D|A)
def calcHDA(feature, label, index, value):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
        index(int): index of the feature column to use
        value(int): the feature value to examine in column index
    output:
        HDA(float): p(A=value) * H(D | A=value), the weighted entropy of the
                    subset whose column index equals value
    '''
    count = 0
    # sub_feature and sub_label hold the subset of the data whose
    # feature column index equals value
    sub_feature = []
    sub_label = []
    for i in range(len(feature)):
        if feature[i][index] == value:
            count += 1
            sub_feature.append(feature[i])
            sub_label.append(label[i])
    # Probability that column index takes this value
    pHA = count / len(feature)
    # Entropy of the matching subset
    e = calcInfoEntropy(sub_label)
    HDA = pHA * e
    return HDA
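
# Summing calcHDA over every distinct value of column index reconstructs the
# full conditional entropy H(D|A); calcInfoGain below does exactly that.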

# Compute the information gain of one feature column
def calcInfoGain(feature, label, index):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
        index(int): index of the feature column to evaluate; e.g. index=0
                    computes the information gain of the first feature
    output:
        InfoGain(float): information gain g(D, A) = H(D) - H(D|A)
    '''
    base_e = calcInfoEntropy(label)
    f = np.array(feature)
    # Distinct values appearing in the chosen feature column
    f_set = set(f[:, index])
    sum_HDA = 0
    # Conditional entropy: sum of the weighted subset entropies
    for value in f_set:
        sum_HDA += calcHDA(feature, label, index, value)
    # Information gain
    InfoGain = base_e - sum_HDA
    return InfoGain
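
# Example: when a column perfectly mirrors the labels, every subset it induces
# is pure, so H(D|A) = 0 and the gain equals the full entropy of the labels.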

# Pick the feature with the highest information gain
def getBestFeature(feature, label):
    '''
    input:
        feature(ndarray): sample features
        label(ndarray): sample labels
    output:
        best_feature(int): index of the feature with the highest information gain
    '''
    max_infogain = 0
    best_feature = 0
    for i in range(len(feature[0])):
        infogain = calcInfoGain(feature, label, i)
        if infogain > max_infogain:
            max_infogain = infogain
            best_feature = i
    return best_feature
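
# Ties break toward the earliest column: a later feature must strictly beat
# the best gain seen so far, and if every gain is 0 column 0 is returned.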

# Build the decision tree recursively (ID3)
def createTree(feature, label):
    '''
    input:
        feature(ndarray): training sample features
        label(ndarray): training sample labels
    output:
        tree(dict): decision tree model
    '''
    # All samples share one label, so there is no need to split further
    if len(set(label)) == 1:
        return label[0]
    # Only one feature left, or all samples have identical features:
    # return the majority label
    if len(feature[0]) == 1 or len(np.unique(feature, axis=0)) == 1:
        vote = {}
        for l in label:
            if l in vote:
                vote[l] += 1
            else:
                vote[l] = 1
        max_count = 0
        vote_label = None
        for k, v in vote.items():
            if v > max_count:
                max_count = v
                vote_label = k
        return vote_label
    # Index of the feature with the highest information gain
    best_feature = getBestFeature(feature, label)
    tree = {best_feature: {}}
    f = np.array(feature)
    # All values taken by best_feature
    f_set = set(f[:, best_feature])
    # Build the sub-dataset sub_feature, sub_label for each feature value
    for v in f_set:
        sub_feature = []
        sub_label = []
        for i in range(len(feature)):
            if feature[i][best_feature] == v:
                sub_feature.append(feature[i])
                sub_label.append(label[i])
        # Recurse on the subset
        tree[best_feature][v] = createTree(sub_feature, sub_label)
    return tree
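
# The model is a nest of plain dicts: each internal node maps a column index
# to {feature value: subtree} and each leaf is a label, e.g. (illustrative)
# {0: {0: 'no', 1: {2: {0: 'no', 1: 'yes'}}}}.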

# Classify with the decision tree
def dt_clf(train_feature, train_label, test_feature):
    '''
    input:
        train_feature(ndarray): training sample features
        train_label(ndarray): training sample labels
        test_feature(ndarray): test sample features
    output:
        predict(ndarray): predicted labels for the test samples
    '''
    # Build the decision tree
    tree = createTree(train_feature, train_label)
    result = []
    # Walk the tree with one sample's feature values
    def classify(tree, test_feature):
        # A leaf stores the predicted label directly
        if not isinstance(tree, dict):
            return tree
        # An internal node is {column index: {feature value: subtree}}
        t_index, t_value = list(tree.items())[0]
        # Follow the branch matching this sample's value in that column;
        # note that a feature value never seen in training raises KeyError
        f_value = test_feature[t_index]
        return classify(t_value[f_value], test_feature)
    for f in test_feature:
        result.append(classify(tree, f))
    predict = np.array(result)
    return predict
```
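
A minimal smoke test, assuming the functions above are defined in the same file. The four-row AND dataset below is made up for illustration; any discrete, integer-encoded features work the same way.

```python
import numpy as np

# Toy dataset: the label is "feature 0 AND feature 1", so a correct tree
# has to split on both columns.
train_feature = [[0, 0], [0, 1], [1, 0], [1, 1]]
train_label = [0, 0, 0, 1]
test_feature = [[1, 1], [0, 1]]

tree = createTree(train_feature, train_label)
print(tree)      # a nested {column index: {value: subtree}} dict with 0/1 leaves

predict = dt_clf(train_feature, train_label, test_feature)
print(predict)   # [1 0]
```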