基于ID3決策樹算法的實現(xiàn)(Python版)
實例如下:
# -*- coding:utf-8 -*-
from numpy import *
import numpy as np
import pandas as pd
from math import log
import operator
#計算數(shù)據(jù)集的香農(nóng)熵
def calcShannonEnt(dataSet):
numEntries=len(dataSet)
labelCounts={}
#給所有可能分類創(chuàng)建字典
for featVec in dataSet:
currentLabel=featVec[-1]
if currentLabel not in labelCounts.keys():
labelCounts[currentLabel]=0
labelCounts[currentLabel]+=1
shannonEnt=0.0
#以2為底數(shù)計算香農(nóng)熵
for key in labelCounts:
prob = float(labelCounts[key])/numEntries
shannonEnt-=prob*log(prob,2)
return shannonEnt
#對離散變量劃分?jǐn)?shù)據(jù)集,取出該特征取值為value的所有樣本
def splitDataSet(dataSet,axis,value):
retDataSet=[]
for featVec in dataSet:
if featVec[axis]==value:
reducedFeatVec=featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet
#對連續(xù)變量劃分?jǐn)?shù)據(jù)集,direction規(guī)定劃分的方向,
#決定是劃分出小于value的數(shù)據(jù)樣本還是大于value的數(shù)據(jù)樣本集
def splitContinuousDataSet(dataSet,axis,value,direction):
retDataSet=[]
for featVec in dataSet:
if direction==0:
if featVec[axis]>value:
reducedFeatVec=featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
else:
if featVec[axis]<=value:
reducedFeatVec=featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet
#選擇最好的數(shù)據(jù)集劃分方式
def chooseBestFeatureToSplit(dataSet,labels):
numFeatures=len(dataSet[0])-1
baseEntropy=calcShannonEnt(dataSet)
bestInfoGain=0.0
bestFeature=-1
bestSplitDict={}
for i in range(numFeatures):
featList=[example[i] for example in dataSet]
#對連續(xù)型特征進(jìn)行處理
if type(featList[0]).__name__=='float' or type(featList[0]).__name__=='int':
#產(chǎn)生n-1個候選劃分點
sortfeatList=sorted(featList)
splitList=[]
for j in range(len(sortfeatList)-1):
splitList.append((sortfeatList[j]+sortfeatList[j+1])/2.0)
bestSplitEntropy=10000
slen=len(splitList)
#求用第j個候選劃分點劃分時,得到的信息熵,并記錄最佳劃分點
for j in range(slen):
value=splitList[j]
newEntropy=0.0
subDataSet0=splitContinuousDataSet(dataSet,i,value,0)
subDataSet1=splitContinuousDataSet(dataSet,i,value,1)
prob0=len(subDataSet0)/float(len(dataSet))
newEntropy+=prob0*calcShannonEnt(subDataSet0)
prob1=len(subDataSet1)/float(len(dataSet))
newEntropy+=prob1*calcShannonEnt(subDataSet1)
if newEntropy<bestSplitEntropy:
bestSplitEntropy=newEntropy
bestSplit=j
#用字典記錄當(dāng)前特征的最佳劃分點
bestSplitDict[labels[i]]=splitList[bestSplit]
infoGain=baseEntropy-bestSplitEntropy
#對離散型特征進(jìn)行處理
else:
uniqueVals=set(featList)
newEntropy=0.0
#計算該特征下每種劃分的信息熵
for value in uniqueVals:
subDataSet=splitDataSet(dataSet,i,value)
prob=len(subDataSet)/float(len(dataSet))
newEntropy+=prob*calcShannonEnt(subDataSet)
infoGain=baseEntropy-newEntropy
if infoGain>bestInfoGain:
bestInfoGain=infoGain
bestFeature=i
#若當(dāng)前節(jié)點的最佳劃分特征為連續(xù)特征,則將其以之前記錄的劃分點為界進(jìn)行二值化處理
#即是否小于等于bestSplitValue
if type(dataSet[0][bestFeature]).__name__=='float' or type(dataSet[0][bestFeature]).__name__=='int':
bestSplitValue=bestSplitDict[labels[bestFeature]]
labels[bestFeature]=labels[bestFeature]+'<='+str(bestSplitValue)
for i in range(shape(dataSet)[0]):
if dataSet[i][bestFeature]<=bestSplitValue:
dataSet[i][bestFeature]=1
else:
dataSet[i][bestFeature]=0
return bestFeature
#特征若已經(jīng)劃分完,節(jié)點下的樣本還沒有統(tǒng)一取值,則需要進(jìn)行投票
def majorityCnt(classList):
classCount={}
for vote in classList:
if vote not in classCount.keys():
classCount[vote]=0
classCount[vote]+=1
return max(classCount)
#主程序,遞歸產(chǎn)生決策樹
def createTree(dataSet,labels,data_full,labels_full):
classList=[example[-1] for example in dataSet]
if classList.count(classList[0])==len(classList):
return classList[0]
if len(dataSet[0])==1:
return majorityCnt(classList)
bestFeat=chooseBestFeatureToSplit(dataSet,labels)
bestFeatLabel=labels[bestFeat]
myTree={bestFeatLabel:{}}
featValues=[example[bestFeat] for example in dataSet]
uniqueVals=set(featValues)
if type(dataSet[0][bestFeat]).__name__=='str':
currentlabel=labels_full.index(labels[bestFeat])
featValuesFull=[example[currentlabel] for example in data_full]
uniqueValsFull=set(featValuesFull)
del(labels[bestFeat])
#針對bestFeat的每個取值,劃分出一個子樹。
for value in uniqueVals:
subLabels=labels[:]
if type(dataSet[0][bestFeat]).__name__=='str':
uniqueValsFull.remove(value)
myTree[bestFeatLabel][value]=createTree(splitDataSet\
(dataSet,bestFeat,value),subLabels,data_full,labels_full)
if type(dataSet[0][bestFeat]).__name__=='str':
for value in uniqueValsFull:
myTree[bestFeatLabel][value]=majorityCnt(classList)
return myTree
import matplotlib.pyplot as plt
decisionNode=dict(boxstyle="sawtooth",fc="0.8")
leafNode=dict(boxstyle="round4",fc="0.8")
arrow_args=dict(arrowstyle="<-")
#計算樹的葉子節(jié)點數(shù)量
def getNumLeafs(myTree):
numLeafs=0
firstSides = list(myTree.keys())
firstStr=firstSides[0]
secondDict=myTree[firstStr]
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
numLeafs+=getNumLeafs(secondDict[key])
else: numLeafs+=1
return numLeafs
#計算樹的最大深度
def getTreeDepth(myTree):
maxDepth=0
firstSides = list(myTree.keys())
firstStr=firstSides[0]
secondDict=myTree[firstStr]
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
thisDepth=1+getTreeDepth(secondDict[key])
else: thisDepth=1
if thisDepth>maxDepth:
maxDepth=thisDepth
return maxDepth
#畫節(jié)點
def plotNode(nodeTxt,centerPt,parentPt,nodeType):
createPlot.ax1.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',\
xytext=centerPt,textcoords='axes fraction',va="center", ha="center",\
bbox=nodeType,arrowprops=arrow_args)
#畫箭頭上的文字
def plotMidText(cntrPt,parentPt,txtString):
lens=len(txtString)
xMid=(parentPt[0]+cntrPt[0])/2.0-lens*0.002
yMid=(parentPt[1]+cntrPt[1])/2.0
createPlot.ax1.text(xMid,yMid,txtString)
def plotTree(myTree,parentPt,nodeTxt):
numLeafs=getNumLeafs(myTree)
depth=getTreeDepth(myTree)
firstSides = list(myTree.keys())
firstStr=firstSides[0]
cntrPt=(plotTree.x0ff+(1.0+float(numLeafs))/2.0/plotTree.totalW,plotTree.y0ff)
plotMidText(cntrPt,parentPt,nodeTxt)
plotNode(firstStr,cntrPt,parentPt,decisionNode)
secondDict=myTree[firstStr]
plotTree.y0ff=plotTree.y0ff-1.0/plotTree.totalD
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
plotTree(secondDict[key],cntrPt,str(key))
else:
plotTree.x0ff=plotTree.x0ff+1.0/plotTree.totalW
plotNode(secondDict[key],(plotTree.x0ff,plotTree.y0ff),cntrPt,leafNode)
plotMidText((plotTree.x0ff,plotTree.y0ff),cntrPt,str(key))
plotTree.y0ff=plotTree.y0ff+1.0/plotTree.totalD
def createPlot(inTree):
fig=plt.figure(1,facecolor='white')
fig.clf()
axprops=dict(xticks=[],yticks=[])
createPlot.ax1=plt.subplot(111,frameon=False,**axprops)
plotTree.totalW=float(getNumLeafs(inTree))
plotTree.totalD=float(getTreeDepth(inTree))
plotTree.x0ff=-0.5/plotTree.totalW
plotTree.y0ff=1.0
plotTree(inTree,(0.5,1.0),'')
plt.show()
df=pd.read_csv('watermelon_4_3.csv')
data=df.values[:,1:].tolist()
data_full=data[:]
labels=df.columns.values[1:-1].tolist()
labels_full=labels[:]
myTree=createTree(data,labels,data_full,labels_full)
print(myTree)
createPlot(myTree)
最終結(jié)果如下:
{'texture': {'blur': 0, 'little_blur': {'touch': {'soft_stick': 1, 'hard_smooth': 0}}, 'distinct': {'density<=0.38149999999999995': {0: 1, 1: 0}}}}
得到的決策樹如下:

參考資料:
《機(jī)器學(xué)習(xí)實戰(zhàn)》
《機(jī)器學(xué)習(xí)》周志華著
以上這篇基于ID3決策樹算法的實現(xiàn)(Python版)就是小編分享給大家的全部內(nèi)容了,希望能給大家一個參考,也希望大家多多支持腳本之家。
相關(guān)文章
Python使用機(jī)器學(xué)習(xí)模型實現(xiàn)溫度預(yù)測詳解
使用?Python?可以使用機(jī)器學(xué)習(xí)模型進(jìn)行溫度預(yù)測。常用的模型有回歸分析、隨機(jī)森林等。本文就來和大家來了具體實現(xiàn)方法,希望對大家有所幫助2023-01-01
Python實現(xiàn)的序列化和反序列化二叉樹算法示例
這篇文章主要介紹了Python實現(xiàn)的序列化和反序列化二叉樹算法,結(jié)合實例形式分析了Python二叉樹的構(gòu)造、遍歷、序列化、反序列化等相關(guān)操作技巧,需要的朋友可以參考下2019-03-03
python基于OpenCV模塊實現(xiàn)視頻流數(shù)據(jù)切割為圖像幀數(shù)據(jù)(流程分析)
這篇文章主要介紹了python基于OpenCV模塊實現(xiàn)視頻流數(shù)據(jù)切割為圖像幀數(shù)據(jù),這里今天主要是實踐一下視頻流數(shù)據(jù)的預(yù)處理工作,需要的朋友可以參考下2022-05-05

