decisionTree.py
# -*- coding: utf-8 -*-
"""
Intro to Machine Learning
@author: Jonathan Dyer (jbdyer)
"""
import math
import csv
import sys
import numpy as np
################ Global Variables ################
MAX_DEPTH = 0
_attributes = {}
_train_labels = []
_test_labels = []
_unique_labels = []
################ Node Class ################
class Node:
def __init__(self, att_name, directions, left, right):
self.att_name = att_name # assign the splitting attribute
self.directions = directions # this helps test points navigate
self.left = left # left child (Node or Leaf)
self.right = right # right child (Node or Leaf)
def get_idx(self):
return _attributes[self.att_name]
class Leaf(Node):
    """A terminal Node that holds only a predicted label."""
    def __init__(self, prediction):
        self.prediction = prediction # the predicted label at this leaf
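# Illustrative sketch (not from the assignment spec; the feature name and
# labels here are made up): a depth-1 "stump" splitting on a hypothetical
# binary feature 'color' could be built as
#   stump = Node('color', {'red': 'left', 'blue': 'right'},
#                Leaf('yes'), Leaf('no'))
# where `directions` maps each feature value to the child holding its subtree.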
################ Helper Functions ################
def prob(val, data):
    """
    Calculates the simple proportion (empirical probability) of val in data.
    Parameters
    ----------
    val : anything
        A possible attribute value occurring in data
    data : numpy array
        Expected to be a vector of attribute values
    """
    return list(data).count(val) / len(data)
def entropy(Y):
    """
    Basic Shannon entropy calculation (base 2). Expects a vector.
    """
    total = 0
    for i in set(Y): # for each distinct value of Y
        p = prob(i, Y) # get probability of Y taking that value
        total += p * np.log2(p) # p > 0 is guaranteed since i occurs in Y
    return -1 * total
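# A quick sanity check (hand-worked, assuming a two-class label vector):
#   >>> entropy(np.array(['A', 'A', 'B', 'B'])) # p = 0.5 for each class
#   1.0
# and a pure vector (all one label) has entropy 0.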
def joint_entropy(X, Y):
    """
    Calculates the joint entropy H(X,Y) of two equal-length vectors of data.
    """
    list_of_probs = [] # to store all the probabilities we get
    X = np.array(X) # make sure we have numpy arrays
    Y = np.array(Y) # (they must be the same length)
    for x_val in set(X): # for each distinct value of X...
        for y_val in set(Y): # for each distinct value of Y...
            # get an array of booleans marking where the two values co-occur
            intersection = np.array((X == x_val) & (Y == y_val))
            # then count the co-occurrences
            co_occur = np.sum(intersection)
            joint_prob = co_occur / len(X) # divide to get a probability
            list_of_probs.append(joint_prob) # add to the list
    # turn it into a numpy array for quicker math, dropping zero entries,
    # which contribute nothing and would break log2
    list_of_probs = np.array([i for i in list_of_probs if i > 0])
    # multiply each probability by its log2
    updated_probs = np.array([m * np.log2(m) for m in list_of_probs])
    # finally sum them all up and negate
    return np.sum(updated_probs) * -1
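# For intuition (values hand-checked, not from the original file): if X and Y
# move in lockstep, e.g. X = [0, 0, 1, 1] and Y = ['A', 'A', 'B', 'B'], only
# the pairs (0, 'A') and (1, 'B') ever occur, each with probability 0.5, so
#   H(X,Y) = -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0.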
def mutual_info(Y, X):
    """
    Returns the mutual information of these two columns of values:
    I(X;Y) = H(X) + H(Y) - H(X,Y).
    """
    return entropy(Y) + entropy(X) - joint_entropy(X, Y)
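# Hand-worked examples (illustrative only): with X = [0, 0, 1, 1] and
# Y = ['A', 'A', 'B', 'B'], X determines Y exactly, so
#   I(X;Y) = H(Y) + H(X) - H(X,Y) = 1 + 1 - 1 = 1.0,
# whereas X = [0, 1, 0, 1] is independent of that Y and gives
#   I(X;Y) = 1 + 1 - 2 = 0.0.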
def get_majority(Y):
    """
    Assumes that the labels are in the last column of the data, or that
    the data is simply a vector of labels.
    Returns the label that takes the majority vote in the given data.
    """
    # check if this is a vector...
    if len(np.shape(Y)) < 2: # we have a vector of only labels
        labels = np.array(Y)
    else: # otherwise get the vector of labels
        labels = np.array(Y)[:,-1]
    # get counts of each label
    unique_labels, counts = np.unique(labels, return_counts=True)
    # now the index of the maximum-frequency label
    i = np.argmax(counts)
    # and finally return the most common label
    return unique_labels[i]
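# Small illustrative case: get_majority(['A', 'A', 'B']) returns 'A' (counts
# are [2, 1] and np.argmax picks the first maximum, so ties go to whichever
# label sorts first under np.unique).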
def get_error(predicted, actual):
    """Returns the misclassification rate between two label vectors."""
    mask = (predicted == actual)
    return 1 - (np.sum(mask) / len(mask))
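# e.g. get_error(np.array(['A', 'B']), np.array(['A', 'A'])) -> 0.5, since
# exactly one of the two predictions matches its true label.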
################ Recursive Learning Function ################
def learn(data, remaining_features):
    """Pretty-prints the label counts at the root, then starts the recursion."""
    counts_first = _train_labels.count(_unique_labels[0])
    counts_second = _train_labels.count(_unique_labels[1])
    print("[{} {} / {} {}]".format(counts_first, _unique_labels[0],
                                   counts_second, _unique_labels[1]))
    return dt_learn(data, remaining_features)
def dt_learn(data, remaining_features, current_level=0):
    """
    Returns a Node at the root of a learned Decision Tree (DT).
    Data should be a numpy ndarray with labels in the last column;
    every feature (and the label) is assumed to be binary.
    """
    # Check that we haven't exceeded max depth...
    if current_level == MAX_DEPTH:
        return Leaf(get_majority(data[:,-1])) # reached max depth, use majority
    # Check for remaining features...
    elif len(remaining_features) < 1: # if we can't split anymore, just
        return Leaf(get_majority(data[:,-1])) # use majority vote here
    # Otherwise we can maybe learn from this data!
    best_feature = None
    best_score = -1
    best_idx = -1
    feature_idx = -1
    for i, f in enumerate(remaining_features): # for each of the unused features
        true_idx = _attributes[f] # get the true column index for the feature
        current_feature_data = data[:, true_idx] # retrieve the vector of data
        current_labels = data[:, -1] # and the corresponding labels
        # Now get mutual info and see if it is higher than the previous max
        m_i = mutual_info(current_labels, current_feature_data)
        if m_i > best_score:
            best_feature = f
            best_score = m_i
            best_idx = true_idx
            feature_idx = i
    # Check for positive Mutual Information...
    if best_score <= 0.0:
        return Leaf(get_majority(data[:,-1])) # majority vote
    # Now that we've found the best feature to split on, partition the data:
    # - one subset where the chosen feature takes its first value
    # - one subset where it takes its second value
    best_feature_data = data[:, best_idx] # recover the column
    vals = sorted(set(best_feature_data)) # get the (two) unique values
    val_1, val_2 = vals[0], vals[1]
    # here we create a map to specify which way is which
    directions = {val_1: 'left', val_2: 'right'}
    partition_1 = data[best_feature_data == val_1]
    partition_2 = data[best_feature_data == val_2]
    remaining_features.pop(feature_idx) # remove the used feature
    # pretty-print the label counts in each partition
    part_1 = list(partition_1[:,-1])
    part_2 = list(partition_2[:,-1])
    counts_1_first = part_1.count(_unique_labels[0])
    counts_1_second = part_1.count(_unique_labels[1])
    counts_2_first = part_2.count(_unique_labels[0])
    counts_2_second = part_2.count(_unique_labels[1])
    tail_1 = "[{} {} / {} {}]".format(counts_1_first, _unique_labels[0], counts_1_second, _unique_labels[1])
    tail_2 = "[{} {} / {} {}]".format(counts_2_first, _unique_labels[0], counts_2_second, _unique_labels[1])
    head_1 = best_feature + " = " + val_1 + ": "
    head_2 = best_feature + " = " + val_2 + ": "
    # recurse on each partition, passing a copy of the feature list so the
    # two subtrees can't interfere with each other
    print("| " * (current_level+1) + head_1 + tail_1)
    left = dt_learn(partition_1, remaining_features[:], current_level+1)
    print("| " * (current_level+1) + head_2 + tail_2)
    right = dt_learn(partition_2, remaining_features[:], current_level+1)
    return Node(best_feature, directions, left, right)
################ Prediction functions ################
def predict(d_tree, data):
    """
    Assembles the predictions for every row of data by walking the tree,
    so the error-calculating functions can be called on the result.
    Assumes the top row of column headers is already removed.
    """
    labels = []
    data = np.array(data)
    for row in data:
        pred = dt_predict(d_tree, row)
        labels.append(pred)
    assert len(labels) == len(data)
    return labels
def dt_predict(d_tree, sample):
    """
    Recursively routes a single sample down the tree and returns its label.
    Assumes the sample is a bare data row (no feature-name header).
    """
    if isinstance(d_tree, Leaf):
        return d_tree.prediction
    else: # must be an internal node
        # get the splitting feature of the node
        f_name = d_tree.att_name
        # get the master column index of that feature
        f_index = _attributes[f_name]
        # get the value of the data point at that feature
        data_f = sample[f_index]
        # look up the branch from the Node (avoid shadowing the builtin dir())
        direction = d_tree.directions[data_f]
        # now recurse into the appropriate child
        if direction == 'left':
            return dt_predict(d_tree.left, sample)
        elif direction == 'right':
            return dt_predict(d_tree.right, sample)
    print("Unrecognized direction for sample:", sample)
    return None
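# Tracing the hypothetical stump sketched under the Node/Leaf classes above:
# with _attributes = {'color': 0}, dt_predict(stump, ['red']) reads sample[0],
# follows directions['red'] == 'left', and returns the left Leaf's 'yes'.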
################ I/O functions ################
def read_training_data(train_path):
with open(train_path, 'r') as f:
reader = csv.reader(f, delimiter=',')
train_data = list(reader)
# get the attributes of the data -- should be the first row
atts = train_data.pop(0)
atts.pop(-1) # remove the label as an attribute
global _attributes
_attributes = dict(zip(atts, range(len(atts))))
# get labels, which are the last value of every row
global _train_labels
_train_labels = [row[-1] for row in train_data]
global _unique_labels
_unique_labels = np.unique(_train_labels)
# return everything except the column headers
return train_data
def read_test_data(test_path):
with open(test_path, 'r') as f:
reader = csv.reader(f, delimiter=',')
test_data = list(reader)
    # strip the header row (attribute indices were already set from training)
    test_data.pop(0)
# get labels, which are the last value of every row
global _test_labels
_test_labels = [row[-1] for row in test_data]
# return everything except the column headers
return test_data
def write_labels(labels, path):
with open(path, 'w') as f_out:
for label in labels:
f_out.write(str(label) + "\n")
def write_metrics(path, train, test):
with open(path, 'w') as f_out:
tr = "error(train): " + str(train) + "\n"
te = "error(test): " + str(test) + "\n"
f_out.write(tr)
f_out.write(te)
################ Main driver function ################
def main():
train_in = sys.argv[1]
test_in = sys.argv[2]
global MAX_DEPTH
MAX_DEPTH = int(sys.argv[3])
train_out = sys.argv[4]
test_out = sys.argv[5]
metrics_out = sys.argv[6]
# read in data and get labels
train_data = np.array(read_training_data(train_in))
#******** training first!
learned_tree = learn(train_data.copy(), list(_attributes))
#********
# now we've trained, try out our tree on the training data
train_predictions = predict(learned_tree, train_data)
# write the predictions to file
write_labels(train_predictions, train_out)
#******** do the same for test data
test_data = np.array(read_test_data(test_in))
test_predictions = predict(learned_tree, test_data)
write_labels(test_predictions, test_out)
#******** get the metrics
train_error = get_error(np.array(train_predictions), np.array(_train_labels))
test_error = get_error(np.array(test_predictions), np.array(_test_labels))
write_metrics(metrics_out, train_error, test_error)
if __name__ == '__main__':
if len(sys.argv) == 7:
main()
else:
print("Wrong number of arguments!\n"
+ "Correct usage is 'python decisionTree.py <args... (6 args)>'")