澈-python3原生实现决策树

2020年03月27日 13:39:08
python3原生实现决策树

#代码转自https://blog.csdn.net/huminwudi/article/details/78522578
#决策树原理参考https://www.cnblogs.com/lovephysics/p/7231294.html
from math import log
import operator
import sys

def calcShannonEnt(dataSet):  # 计算数据的熵(entropy)
    numEntries=len(dataSet)  # 数据条数
    labelCounts={}
    for featVec in dataSet:
        currentLabel=featVec[-1] # 每行数据的最后一个字（类别）
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1  # 统计有多少个类以及每个类的数量
    shannonEnt=0
    for key in labelCounts:
        prob=float(labelCounts[key])/numEntries # 计算单个类的熵值
        shannonEnt-=prob*log(prob,2) # 累加每个类的熵值
    return shannonEnt

def createDataSet1():    # 创造示例数据
    dataSet = [['slashdot','USA','yes',18,'None'],
                ['google','France','yes',23,'Premium'],
                ['digg','USA','yes',24,'Basic'],
                ['kiwitobes','France','yes',23,'Basic'],
                ['google','UK','no',21,'Premium'],
                ['(direct)','New Zealand','no',12,'None'],
                ['(direct)','UK','no',21,'Basic'],
                ['google','USA','no',24,'Premium'],
                ['slashdot','France','yes',19,'None'],
                ['digg','USA','no',18,'None'],
                ['google','UK','no',18,'None'],
                ['kiwitobes','UK','no',19,'None'],
                ['digg','New Zealand','yes',12,'Basic'],
                ['slashdot','UK','no',21,'None'],
                ['google','UK','yes',18,'Basic'],
                ['kiwitobes','France','yes',19,'Basic']]
    labels = ['src', 'address', 'FAQ', 'num']  #两个特征
    return dataSet,labels

def splitDataSet(dataSet,axis,value): # 按某个特征分类后的数据
    retDataSet=[]
    for featVec in dataSet:
        if featVec[axis]==value:
            reducedFeatVec =featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):  # 选择最优的分类特征

    numFeatures = len(dataSet[0])-1
    baseEntropy = calcShannonEnt(dataSet)  # 原始的熵

    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet,i,value)
            prob =len(subDataSet)/float(len(dataSet))
            newEntropy +=prob*calcShannonEnt(subDataSet)  # 按特征分类后的熵
        infoGain = baseEntropy - newEntropy  # 原始熵与按特征分类后的熵的差值
        if (infoGain>bestInfoGain):   # 若按某特征划分后，熵值减少的最大，则次特征为最优分类特征
            bestInfoGain=infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):    #按分类后类别数量排序，比如：最后分类为2男1女，则判定为男；
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet,labels):

    classList=[example[-1] for example in dataSet]  # 类别：男或女

    if classList.count(classList[0])==len(classList):
        return classList[0]

    if len(dataSet[0])==1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet) #选择最优特征

    bestFeatLabel = labels[bestFeat]

    myTree = {bestFeatLabel:{}} #分类结果以字典形式保存
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet,bestFeat,value),subLabels)
    return myTree


if __name__=='__main__':
    dataSet, labels=createDataSet1()  # 创造示列数据
    print(createTree(dataSet, labels))  # 输出决策树模型结果
« 2024年4月 »
一	二	三	四	五	六	日
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30