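先说熵的定义:设数据集 $D$ 中第 $k$ 类样本所占的比例为 $p_k$($k = 1, \dots, K$),则 $D$ 的信息熵为

$$H(D) = -\sum_{k=1}^{K} p_k \log_2 p_k$$
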
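再看信息增益:
信息增益是一种用于特征选择的指标,用来衡量某个特征对数据集分类的贡献程度。它基于信息熵的概念,通过比较按该特征划分前后的信息熵之差来评估特征的重要性:$g(D, A) = H(D) - H(D \mid A)$,其中 $H(D \mid A) = \sum_{v} \frac{|D_v|}{|D|} H(D_v)$ 是按特征 $A$ 的各个取值 $v$ 划分数据集后得到的经验条件熵。信息熵衡量的是数据集的不确定性(混乱程度):信息熵越高,数据集的不确定性越大,纯度越低。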



上面的例子计算有误(GPT 数数出错),更正后的结果如下:

好了,下面给出用信息增益进行特征选择的 Python 代码:
# Import Counter (value counting) and NumPy
from collections import Counter
import numpy as np
# 导入对数计算模块log
from math import log
# 定义信息熵计算函数
def entropy(ele):
    """Return the Shannon entropy (base 2, in bits) of a collection of labels.

    Args:
        ele: Iterable of hashable class values, e.g. a list or a 1-D NumPy
            array of labels.

    Returns:
        The entropy as a non-negative float. An empty input or an input
        containing a single class yields ``0.0``.
    """
    # Function-scope import: ``log2`` is more accurate than ``log(p, 2)``,
    # which divides two rounded natural logarithms.
    from math import log2

    # Frequency of every distinct class value.
    counts = Counter(ele)
    total = sum(counts.values())
    if total == 0:
        return 0.0

    # Negate each term rather than the final sum so that a pure (single-class)
    # input produces 0.0 instead of -0.0.
    return sum(-(n / total) * log2(n / total) for n in counts.values())
# 定义基尼指数计算函数
def gini(nums):
    """Return the Gini impurity of a collection of class labels.

    Args:
        nums: Iterable of hashable class values, e.g. a list or a 1-D NumPy
            array of labels.

    Returns:
        ``sum(p * (1 - p))`` over the relative frequency ``p`` of each
        distinct value; ``0.0`` for an empty input.
    """
    # Counter counts every value in a single pass and, unlike ``list.count``,
    # works for any iterable (NumPy arrays, tuples, generators).
    counts = Counter(nums)
    total = sum(counts.values())
    if total == 0:
        return 0.0

    return sum((n / total) * (1 - n / total) for n in counts.values())
def information_gain(data, labels, feature):
    """Return the information gain obtained by splitting on one feature.

    The gain is ``H(D) - H(D|A)``: the entropy of ``labels`` minus the
    weighted average entropy of the label subsets produced by grouping the
    samples on each distinct value of column ``feature``.

    Args:
        data: 2-D array-like of samples; column ``feature`` holds the feature
            values used for the split.
        labels: 1-D array-like with one class label per row of ``data``.
        feature: Column index of the feature in ``data``.

    Returns:
        The information gain as a float.

    Raises:
        ValueError: If ``labels`` does not have one entry per row of ``data``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if len(labels) != data.shape[0]:
        raise ValueError("labels must contain one entry per row of data")

    # Empirical entropy H(D) of the whole label set.
    total_entropy = entropy(labels)

    feature_column = data[:, feature]
    n_samples = len(labels)

    # Empirical conditional entropy H(D|A): entropy of each subset's labels,
    # weighted by the fraction of samples that fall into the subset.
    cond_entropy = 0.0
    for value in np.unique(feature_column):
        # Pick labels through the row mask so the `labels` argument is
        # honoured; do not assume the last column of `data` holds the label.
        subset_labels = labels[feature_column == value]
        subset_weight = len(subset_labels) / n_samples
        cond_entropy += subset_weight * entropy(subset_labels)

    return total_entropy - cond_entropy
# Example data: 14 samples, one per row.
# The last column is used as the class label below; column 1 is the weather
# feature evaluated below.
# NOTE(review): column 0 appears to be a running sample id and the remaining
# middle columns further features — confirm against the data's origin.
data = np.array([[1, 'Sunny', 'Hot', 'High', 'Weak', 'No'],
[2, 'Sunny', 'Hot', 'High', 'Strong', 'No'],
[3, 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
[4, 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
[5, 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
[6, 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
[7, 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
[8, 'Sunny', 'Mild', 'High', 'Weak', 'No'],
[9, 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
[10, 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
[11, 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
[12, 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
[13, 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
[14, 'Rain', 'Mild', 'High', 'Strong', 'No']])
# Class labels are taken from the last column of the table.
labels = data[:, -1]
# Compute the information gain of the weather feature (column index 1).
feature_index = 1
info_gain = information_gain(data, labels, feature_index)
print("天气特征对于数据集分类的信息增益为:", info_gain)
下面是 GPT 给出的代码:
def entropy(data):
    """Return the base-2 Shannon entropy of the values contained in ``data``.

    Args:
        data: Array-like of class values; it is passed to ``np.unique`` to
            count how often each distinct value occurs.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value.
    """
    # Only the occurrence counts are needed; the distinct values are unused.
    freqs = np.unique(data, return_counts=True)[1]
    p = freqs / freqs.sum()
    return -np.sum(p * np.log2(p))
# Entropy of the last column of `students`, computed once at module level.
# NOTE(review): `students` is not defined in the visible part of this file;
# it has to be defined before this line runs — confirm where it comes from.
original_entropy = entropy(students[:, -1])
def conditional_entropy(data, column_idx):
    """Return the entropy of the last column conditioned on one feature column.

    Args:
        data: 2-D NumPy array; the last column is the one whose entropy is
            measured.
        column_idx: Index of the column whose distinct values define the
            row groups.

    Returns:
        The sum, over every distinct value of the chosen column, of the
        group's share of rows multiplied by the entropy of the group's
        last column.
    """
    column = data[:, column_idx]
    n_rows = data.shape[0]
    values, freqs = np.unique(column, return_counts=True)
    # `freq` is the number of rows in the group, so freq / n_rows is its weight.
    return sum(
        freq / n_rows * entropy(data[column == value][:, -1])
        for value, freq in zip(values, freqs)
    )
def info_gain(data, column_idx):
    """Return the information gain of splitting ``data`` on one column.

    Args:
        data: 2-D NumPy array whose last column holds the class labels.
        column_idx: Index of the feature column to split on.

    Returns:
        Entropy of the label column minus the conditional entropy of the
        labels given the chosen feature column.
    """
    # Derive the baseline entropy from the `data` argument itself rather than
    # from a value precomputed at module level for one specific dataset, so
    # the result is correct for whichever array is passed in.
    baseline_entropy = entropy(data[:, -1])
    return baseline_entropy - conditional_entropy(data, column_idx)
# Information gain of column 0 of `students`, which the message printed below
# calls the "glasses" feature.
# NOTE(review): `students` is not defined in the visible part of this file — confirm.
gain_for_glasses = info_gain(students, 0)
print("信息增益(眼镜特征):", gain_for_glasses)
输出:
天气特征对于数据集分类的信息增益为: 0.2467498197744391
眼镜对于数据集分类的信息增益为: 0.6099865470109874
信息增益(眼镜特征): 0.6099865470109874