1、使用决策树预测隐形眼镜类型,隐形眼镜数据集(lenses.csv)是非常著名的数据集,它包含很多患者眼部状况的观察
条件以及医生推荐的隐形眼镜类型。隐形眼镜类型包括硬材质、软材质以及不适合佩戴隐形眼镜。
要求:读取lenses.csv中的隐形眼镜数据集,构造决策树并画出决策树(采用CART算法构造决策树)。
预测['young','hyper','no','normal']适合戴哪种隐形眼镜。
from sklearn import tree
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
# Exercise 1: train a CART (Gini) decision tree on lenses.csv, render it with
# graphviz, and predict the lens type for the patient
# ['young', 'hyper', 'no', 'normal'].

featureList = []  # one {feature name: category string} dict per CSV row
labels = []  # value of the "class" column for each row, in file order

# csv.DictReader consumes the first line as the header, so iterating yields
# data rows only. The with-block guarantees the file handle is closed.
with open('lenses.csv', 'r', newline='') as lenses_file:
    for x in csv.DictReader(lenses_file):
        featureList.append({
            "age": x["age"],
            "prescript": x["prescript"],
            "astigmatic": x["astigmatic"],
            "tearRate": x["tearRate"],
        })
        labels.append(x["class"])

# The tree needs numeric input: DictVectorizer one-hot encodes every
# "feature=category" pair into its own 0/1 column.
vec = DictVectorizer()
featureData = vec.fit_transform(featureList).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = [str(name) for name in vec.get_feature_names_out()]
else:
    feature_names = list(vec.get_feature_names())
print('特征名称:', feature_names)

# CART: split on Gini impurity (not information entropy), depth capped at 3.
model = tree.DecisionTreeClassifier(criterion='gini', max_depth=3)
model.fit(featureData, labels)

# Draw the decision tree.
import graphviz
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    # Take the class order from the fitted model so the names in the picture
    # cannot drift from the labels actually present in the CSV.
    class_names=[str(name) for name in model.classes_],
    filled=True,
    rounded=True,
    special_characters=False,
)
graph = graphviz.Source(dot_data)
# In a notebook a bare expression displays the graph; when running as a plain
# script call graph.render(<filename>) instead to write it to disk.
graph

# Encode the query with the fitted vectorizer rather than a hand-written 0/1
# vector, so the columns always line up with the training data.
sample = {
    "age": "young",
    "prescript": "hyper",
    "astigmatic": "no",
    "tearRate": "normal",
}
prediction = model.predict(vec.transform([sample]).toarray())
print(prediction)
['soft']
2、读取play.csv文件的内容(outlook(天气)、TEMPERATURE(温度)、HUMIDITY(湿度)、WINDY(风)代表四个特征,
最后一列(PLAY)代表类别,即是否出去打球),根据play.csv的数据构造决策树并训练模型,画出决策树。
from sklearn import tree
import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
import numpy as np
# Exercise 2: train a CART (Gini) decision tree on play.csv and render it.
# outlook and WINDY are treated as categorical and one-hot encoded;
# TEMPERATURE and HUMIDITY are used as numeric columns.

featureList = []  # categorical features, one dict per CSV row
labels = []  # value of the "PLAY" column for each row, in file order
numeric_rows = []  # [TEMPERATURE, HUMIDITY] per row, converted to float

# csv.DictReader consumes the first line as the header, so iterating yields
# data rows only. The with-block guarantees the file handle is closed.
with open('play.csv', 'rt', newline='') as play_file:
    for x in csv.DictReader(play_file):
        featureList.append({"outlook": x["outlook"], "WINDY": x["WINDY"]})
        labels.append(x["PLAY"])
        numeric_rows.append([float(x['TEMPERATURE']), float(x['HUMIDITY'])])

# Build the numeric block from however many rows the file has instead of
# pre-allocating a fixed 14x2 array.
dataset0 = np.array(numeric_rows, dtype=float).reshape(-1, 2)
print('labels:', labels)
print(dataset0)

# The tree needs numeric input: DictVectorizer one-hot encodes every
# "feature=category" pair into its own 0/1 column.
vec = DictVectorizer()
featureData = vec.fit_transform(featureList).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = [str(name) for name in vec.get_feature_names_out()]
else:
    feature_names = list(vec.get_feature_names())
print('特征名称:', feature_names)

# Keep the full column-name list in our own variable; appending to the
# vectorizer's returned list would tamper with its internal state.
feature_names = feature_names + ['TEMPERATURE', 'HUMIDITY']
print(feature_names)

# Column-wise concatenation: one-hot columns first, numeric columns last,
# matching the order of feature_names.
x_data = np.append(featureData, dataset0, axis=1)
print(x_data)

# Encode the labels as integers. LabelEncoder returns the 1-D target a
# classifier expects and also copes with more than two classes.
lb = preprocessing.LabelEncoder()
y_data = lb.fit_transform(labels)
print(lb.classes_)

# CART: split on Gini impurity (not information entropy).
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(x_data, y_data)

# Draw the decision tree.
import graphviz
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=feature_names,
    class_names=[str(name) for name in lb.classes_],
    filled=True,
    rounded=True,
    special_characters=False,
)
graph = graphviz.Source(dot_data)
# In a notebook a bare expression displays the graph; when running as a plain
# script call graph.render(<filename>) instead to write it to disk.
graph