# CART decision tree implemented in Python 3
# https://www.cnblogs.com/ahu-lichang/p/7169026.html和http://blog.csdn.net/lanxu_yy/article/details/18747855 关于信息熵可以参考
import numpy as np # 快速操作结构数组的工具
import pandas as pd # 数据分析处理工具
from sklearn.tree import export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import sys
class jcs():
    """Train a scikit-learn decision tree on a small table and classify samples."""

    def getResult(self, trainData, labelName, testData):
        """Fit a decision tree on ``trainData`` and predict ``testData``.

        Args:
            trainData: 2-D table given as a list of rows. Every column except
                the last is a feature; the last column is the target class.
            labelName: Names of the feature columns, one per feature column.
            testData: Rows to classify. They must already be numeric codes:
                each value is the index of the original value in the sorted
                list of distinct training values of that column.

        Returns:
            The array of predicted class labels, one per row of ``testData``.
            The same array is also printed.

        Raises:
            ValueError: If ``labelName`` does not match the number of feature
                columns (raised by pandas when building the frame).
        """
        # NOTE: rows mixing str and int are coerced to a string array by
        # NumPy, so numeric features are encoded below as categories in
        # string sort order rather than used as numbers.
        data_mat = np.array(trainData)

        # Split on the last column instead of a hard-coded column count so
        # tables with any number of features are supported.
        feature_mat = data_mat[:, :-1]
        target = data_mat[:, -1]

        # One row per sample, one named column per feature.
        features = pd.DataFrame(data=feature_mat, columns=labelName)

        # scikit-learn trees only accept numbers: replace every distinct
        # value of a column by its integer class code.
        encoder = LabelEncoder()
        for col in features.columns:
            features[col] = encoder.fit_transform(features[col])

        # Build the decision tree.
        clf = tree.DecisionTreeClassifier()
        clf.fit(features, target)

        # Predict; the input has to use the same integer codes as above.
        result = clf.predict(testData)
        print(result)

        # To save the tree for visualisation, export it to Graphviz format:
        #     with open('dtc.dot', 'w') as file:
        #         export_graphviz(clf, feature_names=labelName, out_file=file)
        # and render it from the command line:
        #     dot -T jpg dtc.dot -o test.jpg
        return result
if __name__ == '__main__':
    # Each row describes one user: referring site, location, whether the FAQ
    # was read, number of pages viewed, and the chosen service (the target).
    trainData = [
        ['slashdot', 'USA', 'yes', 18, 'None'],
        ['google', 'France', 'yes', 23, 'Premium'],
        ['digg', 'USA', 'yes', 24, 'Basic'],
        ['kiwitobes', 'France', 'yes', 23, 'Basic'],
        ['google', 'UK', 'no', 21, 'Premium'],
        ['(direct)', 'New Zealand', 'no', 12, 'None'],
        ['(direct)', 'UK', 'no', 21, 'Basic'],
        ['google', 'USA', 'no', 24, 'Premium'],
        ['slashdot', 'France', 'yes', 19, 'None'],
        ['digg', 'USA', 'no', 18, 'None'],
        ['google', 'UK', 'no', 18, 'None'],
        ['kiwitobes', 'UK', 'no', 19, 'None'],
        ['digg', 'New Zealand', 'yes', 12, 'Basic'],
        ['slashdot', 'UK', 'no', 21, 'None'],
        ['google', 'UK', 'yes', 18, 'Basic'],
        ['kiwitobes', 'France', 'yes', 19, 'Basic'],
    ]
    labelName = ['src', 'address', 'FAQ', 'num']
    # The decision tree only accepts numbers, and the training data above is
    # converted to integer codes (values sorted in ASCII order), so the test
    # sample has to be written with those same codes.
    testData = [[4, 1, 1, 0]]
    # Use a distinct name for the instance so the class `jcs` is not shadowed.
    classifier = jcs()
    classifier.getResult(trainData, labelName, testData)
    # Output recorded by the original author: ['None']