DecisionTree.py
import math

# return True if any element of the list satisfies the predicate `item`
def find(item, l):
    for i in l:
        if item(i):
            return True
    return False
# find the most common value of the target attribute
def majority(attributes, data, target):
    val_freq = {}
    # locate the target attribute's column
    index = attributes.index(target)
    # count how often each value appears in the target column
    for record in data:
        if record[index] in val_freq:
            val_freq[record[index]] += 1
        else:
            val_freq[record[index]] = 1
    max_count = 0
    major = ""
    for key, count in val_freq.items():
        if count > max_count:
            max_count = count
            major = key
    return major
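# Illustrative call (hypothetical data, not from the original file):
#   majority(["color"], [["red"], ["blue"], ["red"]], "color")  -> "red"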
# calculate the entropy of the given data set for the target attribute
def entropy(attributes, data, target_attr):
    val_freq = {}
    data_entropy = 0.0
    # find the index of the target attribute
    i = attributes.index(target_attr)
    # count the frequency of each value in the target column
    for entry in data:
        if entry[i] in val_freq:
            val_freq[entry[i]] += 1.0
        else:
            val_freq[entry[i]] = 1.0
    # sum -p * log2(p) over the value frequencies
    for freq in val_freq.values():
        data_entropy += (-freq / len(data)) * math.log(freq / len(data), 2)
    return data_entropy
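# Sanity check (hypothetical data): a 50/50 label split yields exactly 1 bit.
#   entropy(["outlook", "play"],
#           [["sunny", "yes"], ["rain", "yes"],
#            ["sunny", "no"],  ["rain", "no"]], "play")  -> 1.0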
def gain(attributes, data, attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result from splitting the data on the chosen attribute (attr).
    """
    val_freq = {}
    subset_entropy = 0.0
    # find the index of the attribute being evaluated
    i = attributes.index(attr)
    # count the frequency of each value of attr (not the target)
    for entry in data:
        if entry[i] in val_freq:
            val_freq[entry[i]] += 1.0
        else:
            val_freq[entry[i]] = 1.0
    # sum the entropy of each subset of records, weighted by the
    # probability of that value occurring in the training set
    for val in val_freq.keys():
        val_prob = val_freq[val] / sum(val_freq.values())
        data_subset = [entry for entry in data if entry[i] == val]
        subset_entropy += val_prob * entropy(attributes, data_subset, target_attr)
    # information gain = entropy of the whole set minus the weighted
    # entropy of the subsets produced by the split
    return entropy(attributes, data, target_attr) - subset_entropy
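# Worked example (hypothetical data): if "outlook" separates the labels
# perfectly, the split removes all entropy, so the gain equals the full
# entropy of the set (1.0 bit for a 50/50 split).
#   gain(["outlook", "play"],
#        [["sunny", "yes"], ["sunny", "yes"],
#         ["rain", "no"],   ["rain", "no"]], "outlook", "play")  -> 1.0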
# choose the attribute with the highest information gain,
# never the target attribute itself
def choose_attr(data, attributes, target):
    best = None
    max_gain = 0
    for attr in attributes:
        if attr != target:
            new_gain = gain(attributes, data, attr, target)
            if best is None or new_gain > max_gain:
                max_gain = new_gain
                best = attr
    return best
# get the distinct values appearing in the column of the given attribute
def get_values(data, attributes, attr):
    index = attributes.index(attr)
    values = []
    for entry in data:
        if entry[index] not in values:
            values.append(entry[index])
    return values
# build the subset of rows where `best` equals `val`,
# with the `best` column removed from each row
def get_samples(data, attributes, best, val):
    examples = []
    index = attributes.index(best)
    for entry in data:
        # keep only entries with the given value
        if entry[index] == val:
            # copy every field except the best attribute's column
            new_entry = [entry[i] for i in range(len(entry)) if i != index]
            examples.append(new_entry)
    return examples
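# Illustrative call (hypothetical data): rows with outlook == "sunny",
# with the outlook column stripped out.
#   get_samples([["sunny", "yes"], ["rain", "no"]],
#               ["outlook", "play"], "outlook", "sunny")  -> [["yes"]]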
def make_tree(data, attributes, target, recursion):
    """Return a new decision tree based on the examples given."""
    recursion += 1
    data = data[:]
    vals = [record[attributes.index(target)] for record in data]
    default = majority(attributes, data, target)
    # If the data set is empty or only the target attribute remains,
    # return the default (majority) value. The attributes list always
    # contains the target, hence the -1 when checking for emptiness.
    if not data or (len(attributes) - 1) <= 0:
        return default
    # If every record has the same classification, return it.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # choose the attribute that best classifies the data
        best = choose_attr(data, attributes, target)
        # the tree is a nested dict: {attribute: {value: subtree}}
        tree = {best: {}}
        # grow one subtree per observed value of the best attribute
        for val in get_values(data, attributes, best):
            # records matching this value, with the best column removed
            examples = get_samples(data, attributes, best, val)
            new_attr = attributes[:]
            new_attr.remove(best)
            subtree = make_tree(examples, new_attr, target, recursion)
            # hang the subtree under the best attribute's value
            tree[best][val] = subtree
        return tree
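
if __name__ == "__main__":
    # Minimal usage sketch. The toy weather data below is illustrative and
    # not part of the original file; rows follow the order of `attributes`,
    # with the target attribute "play" in the last column.
    attributes = ["outlook", "humidity", "play"]
    data = [
        ["sunny", "high", "no"],
        ["sunny", "normal", "yes"],
        ["rain", "high", "no"],
        ["rain", "normal", "yes"],
        ["overcast", "high", "yes"],
    ]
    tree = make_tree(data, attributes, "play", 0)
    # Expected shape: nested dicts keyed by attribute, then by value, with
    # class labels at the leaves, e.g.
    # {'humidity': {'high': {'outlook': {...}}, 'normal': 'yes'}}
    print(tree)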