一. ID3算法原理
ID3算法通过计算每个属性的信息增益,认为信息增益越大的属性越优,每次划分时选取信息增益最大的属性作为划分标准,重复这个过程,直到构成一棵决策树。

二. 关键概念
1. 信息熵
信息熵描述的是事件带给我们的平均惊讶程度。如果所有事件的概率均等,熵值大,平均惊讶程度高;如果某一事件的概率极高而其他事件概率极低,熵值便低,平均惊讶程度小。其计算公式如下:

2. 信息增益
信息增益描述某个事件发生前后信息熵的变化。 在ID3里面可以理解为引入特征之后信息熵的变化(这句话纯属个人理解,如果不对请指正)。其计算公式如下:
三. 测试数据
data.txt
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
label.txt
age prescript astigmatic tearRate
四. 实验代码
#encoding: utf-8
import math
from collections import Counter
def cal_shannon_ent(data):
    """Compute the Shannon entropy of the class labels in *data*.

    Args:
        data: list of feature vectors; the last element of each vector
            is the class label.

    Returns:
        Entropy of the label distribution in bits (base-2 log);
        0.0 for empty input.
    """
    data_num = len(data)
    # Counter replaces the hand-rolled dict-based frequency count.
    class_counts = Counter(feat_vec[-1] for feat_vec in data)
    shannon_ent = 0.0
    for count in class_counts.values():
        p = float(count) / data_num
        shannon_ent -= p * math.log(p, 2)
    return shannon_ent
def split_data(data, index, value):
    """Select the rows whose feature at *index* equals *value*.

    The matched feature column is dropped from every returned vector,
    so the result is ready for the next round of splitting.
    """
    return [vec[:index] + vec[index + 1:]
            for vec in data
            if vec[index] == value]
def get_best_feat(data):
    """Return the index of the feature with the largest information gain.

    Returns -1 when no feature produces a strictly positive gain.
    """
    base_ent = cal_shannon_ent(data)
    n_samples = len(data)
    best_gain, best_feat = 0.0, -1
    # Last column is the class label, so it is excluded from candidates.
    for feat in range(len(data[0]) - 1):
        cond_ent = 0.0
        for value in {vec[feat] for vec in data}:
            subset = split_data(data, feat, value)
            weight = float(len(subset)) / n_samples
            cond_ent += weight * cal_shannon_ent(subset)
        gain = base_ent - cond_ent
        if gain > best_gain:
            best_gain, best_feat = gain, feat
    return best_feat
def get_major_class(class_list):
    """Majority vote over *class_list*.

    Ties are broken in favour of whichever class is the first to reach
    the winning count while scanning the list; returns -1 for an empty
    list (both behaviours match the original implementation).
    """
    counts = {}
    best_count = -1
    major = -1
    for label in class_list:
        counts[label] = counts.get(label, 0) + 1
        # Strict '>' means an equal later count never displaces the leader.
        if counts[label] > best_count:
            best_count = counts[label]
            major = label
    return major
def create_tree_id3(data, label):
    """Recursively build an ID3 decision tree.

    Args:
        data: list of feature vectors; last element of each vector is
            the class label.
        label: feature names aligned with the feature columns. Unlike
            the original code (which did `del label[best_feat]`), the
            caller's list is NOT mutated.

    Returns:
        A class label for a leaf node, or a nested dict of the form
        {feature_name: {feature_value: subtree}}.
    """
    class_list = [vec[-1] for vec in data]
    # All samples share one class: pure leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # No features left to split on: fall back to majority vote.
    if len(data[0]) == 1:
        return get_major_class(class_list)
    best_feat = get_best_feat(data)
    # NOTE(review): if no feature has positive gain, best_feat is -1 and
    # the last column is used — preserved from the original; verify this
    # is the intended fallback.
    best_feat_label = label[best_feat]
    # Build the reduced label list without touching the caller's list.
    sub_label = label[:best_feat] + label[best_feat + 1:]
    ret_tree = {best_feat_label: {}}
    for value in set(vec[best_feat] for vec in data):
        ret_tree[best_feat_label][value] = create_tree_id3(
            split_data(data, best_feat, value), sub_label)
    return ret_tree
if __name__ == "__main__":
    # In-memory toy data set, kept for quick experiments:
    # data = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
    # label = ['no surfacing', 'flippers']
    #
    # `with` closes the files (the original leaked both handles), and
    # print(...) works under both Python 2.6+ and Python 3 for a single
    # argument (the original used the Python-2-only print statement).
    with open('data.txt') as fr:
        data = [line.strip().split('\t') for line in fr]
    with open('label.txt') as fr:
        label = [line.strip().split('\t') for line in fr][0]
    print(create_tree_id3(data, label))
五. 实验结果
{'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {'yes': {'prescript': {'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}, 'myope': 'hard'}}, 'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'hyper': 'soft', 'myope': 'no lenses'}}, 'young': 'soft'}}}}}}