# ID3-Decision-Tree.py
# forked from MukulCode/CodingClubIndia
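"""A from-scratch ID3 decision tree classifier built on NumPy and pandas.

The tree is stored as nested dictionaries. At each node the feature with the
highest information gain (entropy of the target minus the weighted entropy of
each candidate split) is chosen, and the algorithm recurses until a subset is
pure, empty, or has no features left to split on.
"""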
import numpy as np


class GadId3Classifier:
    def fit(self, input, output):
        data = input.copy()
        data[output.name] = output
        self.tree = self.decision_tree(data, data, input.columns, output.name)
    def predict(self, input):
        # convert input data into a dictionary of samples
        samples = input.to_dict(orient='records')
        predictions = []
        # make a prediction for every sample
        for sample in samples:
            predictions.append(self.make_prediction(sample, self.tree, 1.0))
        return predictions
    def entropy(self, attribute_column):
        # find unique values and their frequency counts for the given attribute
        values, counts = np.unique(attribute_column, return_counts=True)
        # calculate entropy for each unique value
        entropy_list = []
        for i in range(len(values)):
            probability = counts[i] / np.sum(counts)
            entropy_list.append(-probability * np.log2(probability))
        # calculate sum of individual entropy values
        total_entropy = np.sum(entropy_list)
        return total_entropy
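    # Worked example for entropy() above (illustrative values, not from the original
    # file): a target column like ["yes", "yes", "no"] has class probabilities 2/3
    # and 1/3, so its entropy is -(2/3)*log2(2/3) - (1/3)*log2(1/3) ≈ 0.918 bits.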
    def information_gain(self, data, feature_attribute_name, target_attribute_name):
        # find total entropy of given subset
        total_entropy = self.entropy(data[target_attribute_name])
        # find unique values and their frequency counts for the attribute to be split
        values, counts = np.unique(data[feature_attribute_name], return_counts=True)
        # calculate weighted entropy of subset
        weighted_entropy_list = []
        for i in range(len(values)):
            subset_probability = counts[i] / np.sum(counts)
            subset_entropy = self.entropy(data.where(data[feature_attribute_name] == values[i]).dropna()[target_attribute_name])
            weighted_entropy_list.append(subset_probability * subset_entropy)
        total_weighted_entropy = np.sum(weighted_entropy_list)
        # calculate information gain
        information_gain = total_entropy - total_weighted_entropy
        return information_gain
    def decision_tree(self, data, original_data, feature_attribute_names, target_attribute_name, parent_node_class=None):
        # base cases:
        # if subset is empty, i.e. no samples, return majority class of original data
        if len(data) == 0:
            majority_class_index = np.argmax(np.unique(original_data[target_attribute_name], return_counts=True)[1])
            return np.unique(original_data[target_attribute_name])[majority_class_index]
        # if data is pure, return its single remaining class
        unique_classes = np.unique(data[target_attribute_name])
        if len(unique_classes) <= 1:
            return unique_classes[0]
        # if data set contains no features to train with, return parent node class
        elif len(feature_attribute_names) == 0:
            return parent_node_class
        # if none of the above are true, construct a branch:
        else:
            # determine parent node class of current branch (majority class of this subset)
            majority_class_index = np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])
            parent_node_class = unique_classes[majority_class_index]
            # determine information gain values for each feature
            # choose feature which best splits the data, i.e. highest value
            ig_values = [self.information_gain(data, feature, target_attribute_name) for feature in feature_attribute_names]
            best_feature_index = np.argmax(ig_values)
            best_feature = feature_attribute_names[best_feature_index]
            # create tree structure, empty at first
            tree = {best_feature: {}}
            # remove best feature from available features, it will become the parent node
            feature_attribute_names = [i for i in feature_attribute_names if i != best_feature]
            # create nodes under parent node
            parent_attribute_values = np.unique(data[best_feature])
            for value in parent_attribute_values:
                sub_data = data.where(data[best_feature] == value).dropna()
                # call the algorithm recursively
                subtree = self.decision_tree(sub_data, original_data, feature_attribute_names, target_attribute_name, parent_node_class)
                # add subtree to original tree
                tree[best_feature][value] = subtree
            return tree
    def make_prediction(self, sample, tree, default=1):
        # map sample data to tree
        for attribute in list(sample.keys()):
            # check if feature exists in tree
            if attribute in list(tree.keys()):
                # an attribute value never seen during training falls back to the default
                try:
                    result = tree[attribute][sample[attribute]]
                except KeyError:
                    return default
                # if more attributes exist within result, recursively find best result
                if isinstance(result, dict):
                    return self.make_prediction(sample, result)
                else:
                    return result
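# Example usage (a minimal sketch, not part of the original file): the column names
# and toy weather-style records below are made up for illustration; any pandas
# DataFrame of categorical features plus a categorical target Series works the same way.
if __name__ == "__main__":
    import pandas as pd

    frame = pd.DataFrame({
        "outlook": ["sunny", "sunny", "overcast", "overcast", "rainy", "rainy"],
        "windy":   ["false", "true",  "false",    "true",     "false", "true"],
        "play":    ["yes",   "no",    "yes",      "yes",      "yes",   "no"],
    })
    features = frame.drop(columns=["play"])
    target = frame["play"]

    model = GadId3Classifier()
    model.fit(features, target)

    # inspect the learned nested-dictionary tree and predict on the training rows
    print(model.tree)
    print(model.predict(features))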