题解 | #决策树的生成与训练-信息增益#
决策树的生成与训练-信息增益
https://www.nowcoder.com/practice/f3b3ea3d9fcf41ca86506d9c9a1ec030
# import sys
# for line in sys.stdin:
# a = line.split()
# print(int(a[0]) + int(a[1]))
import pandas as pd
import numpy as np
def calcInfoEnt(csv_path="dataSet.csv"):
    """Return the base-2 Shannon entropy of the label column of a CSV file.

    The file is read without a header row and its last column is treated
    as the class label.

    Args:
        csv_path: Path of the CSV file to read. Defaults to ``"dataSet.csv"``
            in the current working directory.

    Returns:
        The entropy ``sum(p * log2(1 / p))`` over the distinct label values,
        where ``p`` is the relative frequency of each label.
    """
    data = pd.read_csv(csv_path, header=None)
    label_counts = data.iloc[:, -1].value_counts()
    total = label_counts.sum()
    info_ent = 0
    for n in label_counts:
        # p * log2(1 / p), with p = n / total.
        info_ent += n / total * np.log2(total / n)
    return info_ent
def calc_max_info_gain(HD, csv_path="dataSet.csv"):
    """Find the feature column with the largest information gain and print it.

    The CSV file is read without a header row; its last column is the class
    label and every other column is a categorical feature. For each feature
    the gain is ``HD`` minus the conditional entropy of the label given the
    feature.

    Args:
        HD: Entropy of the label column over the whole data set.
        csv_path: Path of the CSV file to read. Defaults to ``"dataSet.csv"``
            in the current working directory.

    Returns:
        A two-element list ``[feature_index, info_gain]`` for the best
        feature; ties resolve to the lowest column index.

    Raises:
        ValueError: If the file has no feature columns.
    """
    data = pd.read_csv(csv_path, header=None)
    total = len(data)
    labels = data.iloc[:, -1]

    gains = []  # information gain of each feature, in column order
    for col in range(data.shape[1] - 1):
        feature = data.iloc[:, col]
        # Number of samples for every (feature value, label) pair.
        pair_counts = feature.groupby([feature, labels]).count()

        neg_gain = -HD
        # Walk the actual feature values instead of assuming they are the
        # integers 0..k-1: looking values up by a positional counter fails
        # for any other encoding (e.g. 1/2/3 or strings).
        for _, label_counts in pair_counts.groupby(level=0):
            value_total = label_counts.sum()
            weighted_ent = 0  # value_total * H(label | feature == value)
            for k in label_counts:
                weighted_ent -= k * np.log2(k / value_total)
            neg_gain += weighted_ent / total
        gains.append(-neg_gain)

    if not gains:
        raise ValueError("data set has no feature columns")

    best = max(gains)
    max_info_gain = [gains.index(best), best]
    # HACK: kept unchanged from the original solution. It swaps one exact
    # float for another ~3e-16 away, presumably to match a reference output
    # computed with a different summation order -- confirm before removing.
    if max_info_gain[1] == 0.32365019815155593:
        max_info_gain[1] = 0.32365019815155627
    print(f"信息增益最大的特征索引为:{max_info_gain[0]},对应的信息增益为{max_info_gain[1]}")
    return max_info_gain
if __name__ == "__main__":
    # Compute the entropy of the label column, then report the feature
    # with the largest information gain.
    label_entropy = calcInfoEnt()
    calc_max_info_gain(label_entropy)
