-
Notifications
You must be signed in to change notification settings - Fork 6
/
DecisionTree.py
372 lines (281 loc) · 10.3 KB
/
DecisionTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
'''
Authors: Ashwani Kashyap, Anshul Pardhi
'''
import math
def unique_vals(rows, col):
"""Find the unique values for a column in a dataset."""
return set([row[col] for row in rows])
#######
# Demo:
# unique_vals(training_data, 0)
# unique_vals(training_data, 1)
#######
def class_counts(rows):
"""Counts the number of each type of example in a dataset."""
counts = {} # a dictionary of label -> count.
for row in rows:
# in our dataset format, the label is always the last column
label = row[-1]
if label not in counts:
counts[label] = 0
counts[label] += 1
return counts
#######
# Demo:
# class_counts(training_data)
#######
def max_label(dict):
max_count = 0
label = ""
for key, value in dict.items():
if dict[key] > max_count:
max_count = dict[key]
label = key
return label
def is_numeric(value):
"""Test if a value is numeric."""
return isinstance(value, int) or isinstance(value, float)
#######
# Demo:
# is_numeric(7)
# is_numeric("Red")
#######
class Question:
"""A Question is used to partition a dataset.
This class just records a 'column number' (e.g., 0 for Color) and a
'column value' (e.g., Green). The 'match' method is used to compare
the feature value in an example to the feature value stored in the
question. See the demo below.
"""
def __init__(self, column, value, header):
self.column = column
self.value = value
self.header = header
def match(self, example):
# Compare the feature value in an example to the
# feature value in this question.
val = example[self.column]
if is_numeric(val):
return val >= self.value
else:
return val == self.value
def __repr__(self):
# This is just a helper method to print
# the question in a readable format.
condition = "=="
if is_numeric(self.value):
condition = ">="
return "Is %s %s %s?" % (
self.header[self.column], condition, str(self.value))
def partition(rows, question):
"""Partitions a dataset.
For each row in the dataset, check if it matches the question. If
so, add it to 'true rows', otherwise, add it to 'false rows'.
"""
true_rows, false_rows = [], []
for row in rows:
if question.match(row):
true_rows.append(row)
else:
false_rows.append(row)
return true_rows, false_rows
def gini(rows):
"""Calculate the Gini Impurity for a list of rows.
There are a few different ways to do this, I thought this one was
the most concise. See:
https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
"""
counts = class_counts(rows)
impurity = 1
for lbl in counts:
prob_of_lbl = counts[lbl] / float(len(rows))
impurity -= prob_of_lbl**2
return impurity
## TODO: Step 3
def entropy(rows):
# compute the entropy.
entries = class_counts(rows)
avg_entropy = 0
size = float(len(rows))
for label in entries:
prob = entries[label] / size
avg_entropy = avg_entropy + (prob * math.log(prob, 2))
return -1*avg_entropy
def info_gain(left, right, current_uncertainty):
"""Information Gain.
The uncertainty of the starting node, minus the weighted impurity of
two child nodes.
"""
p = float(len(left)) / (len(left) + len(right))
## TODO: Step 3, Use Entropy in place of Gini
return current_uncertainty - p * entropy(left) - (1 - p) * entropy(right)
def find_best_split(rows, header):
"""Find the best question to ask by iterating over every feature / value
and calculating the information gain."""
best_gain = 0 # keep track of the best information gain
best_question = None # keep train of the feature / value that produced it
current_uncertainty = entropy(rows)
n_features = len(rows[0]) - 1 # number of columns
for col in range(n_features): # for each feature
values = set([row[col] for row in rows]) # unique values in the column
for val in values: # for each value
question = Question(col, val, header)
# try splitting the dataset
true_rows, false_rows = partition(rows, question)
# Skip this split if it doesn't divide the
# dataset.
if len(true_rows) == 0 or len(false_rows) == 0:
continue
# Calculate the information gain from this split
gain = info_gain(true_rows, false_rows, current_uncertainty)
# You actually can use '>' instead of '>=' here
# but I wanted the tree to look a certain way for our
# toy dataset.
if gain >= best_gain:
best_gain, best_question = gain, question
return best_gain, best_question
## TODO: Step 2
class Leaf:
"""A Leaf node classifies data.
This holds a dictionary of class (e.g., "Apple") -> number of times
it appears in the rows from the training data that reach this leaf.
"""
def __init__(self, rows, id, depth):
self.predictions = class_counts(rows)
self.predicted_label = max_label(self.predictions)
self.id = id
self.depth = depth
## TODO: Step 1
class Decision_Node:
"""A Decision Node asks a question.
This holds a reference to the question, and to the two child nodes.
"""
def __init__(self,
question,
true_branch,
false_branch,
depth,
id,
rows):
self.question = question
self.true_branch = true_branch
self.false_branch = false_branch
self.depth = depth
self.id = id
self.rows = rows
## TODO: Step 3
def build_tree(rows, header, depth=0, id=0):
"""Builds the tree.
Rules of recursion: 1) Believe that it works. 2) Start by checking
for the base case (no further information gain). 3) Prepare for
giant stack traces.
"""
# depth = 0
# Try partitioing the dataset on each of the unique attribute,
# calculate the information gain,
# and return the question that produces the highest gain.
gain, question = find_best_split(rows, header)
# Base case: no further info gain
# Since we can ask no further questions,
# we'll return a leaf.
if gain == 0:
return Leaf(rows, id, depth)
# If we reach here, we have found a useful feature / value
# to partition on.
# nodeLst.append(id)
true_rows, false_rows = partition(rows, question)
# Recursively build the true branch.
true_branch = build_tree(true_rows, header, depth + 1, 2 * id + 2)
# Recursively build the false branch.
false_branch = build_tree(false_rows, header, depth + 1, 2 * id + 1)
# Return a Question node.
# This records the best feature / value to ask at this point,
# as well as the branches to follow
# depending on on the answer.
return Decision_Node(question, true_branch, false_branch, depth, id, rows)
## TODO: Step 8 - already done for you
def prune_tree(node, prunedList):
"""Builds the tree.
Rules of recursion: 1) Believe that it works. 2) Start by checking
for the base case (no further information gain). 3) Prepare for
giant stack traces.
"""
# Base case: we've reached a leaf
if isinstance(node, Leaf):
return node
# If we reach a pruned node, make that node a leaf node and return. Since it becomes a leaf node, the nodes
# below it are automatically not considered
if int(node.id) in prunedList:
return Leaf(node.rows, node.id, node.depth)
# Call this function recursively on the true branch
node.true_branch = prune_tree(node.true_branch, prunedList)
# Call this function recursively on the false branch
node.false_branch = prune_tree(node.false_branch, prunedList)
return node
## TODO: Step 6
def classify(row, node):
"""See the 'rules of recursion' above."""
# Base case: we've reached a leaf
if isinstance(node, Leaf):
return node.predicted_label
# Decide whether to follow the true-branch or the false-branch.
# Compare the feature / value stored in the node,
# to the example we're considering.
if node.question.match(row):
return classify(row, node.true_branch)
else:
return classify(row, node.false_branch)
## TODO: Step 4
def print_tree(node, spacing=""):
"""World's most elegant tree printing function."""
# Base case: we've reached a leaf
if isinstance(node, Leaf):
print(spacing + "Leaf id: " + str(node.id) + " Predictions: " + str(node.predictions) + " Label Class: " + str(node.predicted_label))
return
# Print the question at this node
print(spacing + str(node.question) + " id: " + str(node.id) + " depth: " + str(node.depth))
# Call this function recursively on the true branch
print(spacing + '--> True:')
print_tree(node.true_branch, spacing + " ")
# Call this function recursively on the false branch
print(spacing + '--> False:')
print_tree(node.false_branch, spacing + " ")
def print_leaf(counts):
"""A nicer way to print the predictions at a leaf."""
total = sum(counts.values()) * 1.0
probs = {}
for lbl in counts.keys():
probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
return probs
## TODO: Step 5
def getLeafNodes(node, leafNodes =[]):
# Base case
if isinstance(node, Leaf):
leafNodes.append(node)
return
# Recursive right call for true values
getLeafNodes(node.true_branch, leafNodes)
# Recursive left call for false values
getLeafNodes(node.false_branch, leafNodes)
return leafNodes
def getInnerNodes(node, innerNodes =[]):
# Base case
if isinstance(node, Leaf):
return
innerNodes.append(node)
# Recursive right call for true values
getInnerNodes(node.true_branch, innerNodes)
# Recursive left call for false values
getInnerNodes(node.false_branch, innerNodes)
return innerNodes
## TODO: Step 6
def computeAccuracy(rows, node):
count = len(rows)
if count == 0:
return 0
accuracy = 0
for row in rows:
# last entry of the column is the actual label
if row[-1] == classify(row, node):
accuracy += 1
return round(accuracy/count, 2)