# ID3 builds its decision tree from information gain, so we first need to compute the information entropy
from math import log

def calInformationEntropy(dataSet):
    numData = len(dataSet)  # number of samples
    labelCounts = {}  # dict mapping each value of the last column (the class label, i.e. the target category) to the number of samples in that class
    for featVec in dataSet:  # walk the data set one row at a time
        currentLabel = featVec[-1]  # the last column of the row is the label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0  # initialize the entropy
    for key in labelCounts:
        prob = float(labelCounts[key]) / numData
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p) over the classes
    return shannonEnt
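As a quick sanity check, calInformationEntropy can be run on a small toy data set (the data and the expected value here are illustrative, not part of the original code):

toyData = [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no'], [0, 'no']]  # hypothetical rows of [feature, label]
# 2 of 5 samples are 'yes' and 3 of 5 are 'no', so the entropy is
# -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
print(calInformationEntropy(toyData))  # 0.9709505944546686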
# Split the data set on a given feature
def splitDataSet(dataSet, axis, value):  # axis is the column index of the feature to split on
    retDataSet = []
    for featVec in dataSet:  # collect every row whose feature in column axis equals value, dropping that column
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
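For instance, splitting the toy data set above on column 0 keeps only the matching rows and drops that column (again illustrative):

print(splitDataSet(toyData, 0, 1))  # [['yes'], ['yes']]
print(splitDataSet(toyData, 0, 0))  # [['no'], ['no'], ['no']]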
###### Choose the best feature for splitting the current data set
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # number of features in the current data set
    baseEntropy = calInformationEntropy(dataSet)  # entropy of the current data set
    bestInfoGain = 0.0; bestFeature = -1  # initialize the best information gain and the best feature
    for i in range(numFeatures):  # iterate over every feature
        featList = [example[i] for example in dataSet]  # all values of the current feature
        uniqueVals = set(featList)  # the distinct values of the current feature
        newEntropy = 0.0
        for value in uniqueVals:  # compute the entropy of each candidate split
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calInformationEntropy(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain of splitting on this feature
        if infoGain > bestInfoGain:  # keep only the feature with the highest information gain
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
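On a hypothetical two-feature data set, the function picks the column whose split gains the most information (the data below is an assumption for illustration):

fishData = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# splitting on column 0 gains about 0.420 bits versus about 0.171 bits for column 1
print(chooseBestFeatureToSplit(fishData))  # 0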
# Find the class name that occurs most often in the class list
import operator

def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
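A one-line check of the majority vote (hypothetical input):

print(majorityCnt(['yes', 'no', 'no']))  # 'no'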
#### Build the decision tree recursively
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # all values in the label column of the current data set
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # stop splitting when every sample shares one class and return that label
    if len(dataSet[0]) == 1:  # every feature has been used, yet the data set still mixes classes
        return majorityCnt(classList)  # no unique label can be returned, so return the most frequent class
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]  # the name of that feature
    myTree = {bestFeatLabel: {}}  # the best feature becomes the root of this subtree
    del(labels[bestFeat])  # remove the feature that has just been used
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the remaining feature names so the recursion does not mutate them
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
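Putting the pieces together on the hypothetical two-feature data set above, the tree comes back as nested dicts keyed by feature names and values:

labels = ['no surfacing', 'flippers']  # hypothetical names for the two feature columns
myTree = createTree(fishData, labels[:])  # pass a copy, since createTree deletes used labels
print(myTree)  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}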
Next, we implement the decision tree with sklearn:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer  # feature transformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
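Below is a minimal sketch of how these imports fit together; the records, targets, and split parameters are assumptions for illustration, not the original post's data:

# hypothetical dict-style records; DictVectorizer turns them into a numeric matrix
records = [
    {'no surfacing': 1, 'flippers': 1}, {'no surfacing': 1, 'flippers': 1},
    {'no surfacing': 1, 'flippers': 0}, {'no surfacing': 0, 'flippers': 1},
    {'no surfacing': 0, 'flippers': 1},
]
targets = ['yes', 'yes', 'no', 'no', 'no']

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(records)
X_train, X_test, y_train, y_test = train_test_split(X, targets, test_size=0.4, random_state=1)

clf = DecisionTreeClassifier(criterion='entropy')  # 'entropy' matches the ID3-style gain used above
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
print(tree.export_text(clf, feature_names=vec.feature_names_))  # text rendering of the fitted tree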