Skip to content

Commit

Permalink
Merge pull request #32 from cheerfulstoic/master
Browse files Browse the repository at this point in the history
Performance Improvements
  • Loading branch information
igrigorik authored Apr 11, 2017
2 parents 1f5b596 + 13aed0b commit 846448a
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 35 deletions.
Binary file added lib/.DS_Store
Binary file not shown.
31 changes: 11 additions & 20 deletions lib/core_extensions/array.rb
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
class Array
def classification
collect(&:last)
end

# calculate information entropy
def entropy
return 0 if empty?
each_with_object(Hash.new(0)) do |i, result|
result[i] += 1
end.values.sum do |count|
percentage = count.to_f / length

info = {}
each do |i|
info[i] = !info[i] ? 1 : (info[i] + 1)
-percentage * Math.log2(percentage)
end

result(info, length)
end
end

private

def result(info, total)
final = 0
info.each do |_symbol, count|
next unless count > 0
percentage = count.to_f / total
final += -percentage * Math.log(percentage) / Math.log(2.0)
module ArrayClassification
refine Array do
def classification
collect(&:last)
end
final
end
end

2 changes: 1 addition & 1 deletion lib/decisiontree.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb'
require 'core_extensions/object'
require 'core_extensions/array'
require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb'
41 changes: 27 additions & 14 deletions lib/decisiontree/id3_tree.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
module DecisionTree
Node = Struct.new(:attribute, :threshold, :gain)

using ArrayClassification

class ID3Tree
def initialize(attributes, data, default, type)
@used = {}
Expand All @@ -28,7 +30,7 @@ def train(data = @data, attributes = @attributes, default = @default)
end

data2 = data2.map do |key, val|
key + [val.sort_by { |_k, v| v }.last.first]
key + [val.sort_by { |_, v| v }.last.first]
end

@tree = id3_train(data2, attributes, default)
Expand All @@ -41,9 +43,9 @@ def type(attribute)
# Builds the callable used to score a split on +attribute+.
#
# The attribute's kind is looked up through +type(attribute)+:
# * +:discrete+   -> a proc that forwards all of its arguments to +id3_discrete+
# * +:continuous+ -> a proc that forwards all of its arguments to +id3_continuous+
#
# @param attribute the attribute whose split is going to be evaluated
# @return [Proc, nil] the forwarding proc, or +nil+ when +type(attribute)+ is
#   neither +:discrete+ nor +:continuous+ (the +case+ has no +else+ branch)
def fitness_for(attribute)
  case type(attribute)
  when :discrete
    # Splat forwarding keeps the proc independent of the scorer's arity.
    proc { |*args| id3_discrete(*args) }
  when :continuous
    proc { |*args| id3_continuous(*args) }
  end
end

Expand All @@ -66,14 +68,13 @@ def id3_train(data, attributes, default, _used={})
@used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
tree, l = {best => {}}, ['>=', '<']

fitness = fitness_for(best.attribute)
case type(best.attribute)
when :continuous
partitioned_data = data.partition do |d|
d[attributes.index(best.attribute)] >= best.threshold
end
partitioned_data.each_with_index do |examples, i|
tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0))
end
when :discrete
values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
Expand All @@ -83,7 +84,7 @@ def id3_train(data, attributes, default, _used={})
end
end
partitions.each_with_index do |examples, i|
tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0), &fitness)
tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0))
end
end

Expand Down Expand Up @@ -116,11 +117,18 @@ def id3_continuous(data, attributes, attribute)

# ID3 for discrete label cases
#
# Scores a split of +data+ on a discrete +attribute+: the entropy of all
# labels minus the size-weighted entropy of the labels inside each group of
# rows that share the same value for that attribute.
#
# @param data [Array<Array>] rows; the last element of each row is its label
# @param attributes [Array] attribute names, positioned like the row columns
# @param attribute the attribute to evaluate; must be present in +attributes+
# @return [Array] two elements: the computed gain and the column index of
#   +attribute+ within +attributes+
def id3_discrete(data, attributes, attribute)
  # Resolve the column once instead of once per row.
  index = attributes.index(attribute)

  # Single pass over the data: rows bucketed by their value in that column.
  rows_by_value = data.group_by { |row| row[index] }

  # Visit the values in sorted order so the floating-point sum is accumulated
  # in a stable order regardless of row order.
  remainder = rows_by_value.keys.sort.sum do |val|
    labels = rows_by_value[val].map(&:last)

    (labels.size.to_f / data.size) * labels.entropy
  end

  [data.classification.entropy - remainder, index]
end

def predict(test)
Expand Down Expand Up @@ -320,6 +328,7 @@ def predict(test)

class Bagging
attr_accessor :classifiers

def initialize(attributes, data, default, type)
@classifiers = []
@type = type
Expand All @@ -329,10 +338,13 @@ def initialize(attributes, data, default, type)
end

def train(data = @data, attributes = @attributes, default = @default)
@classifiers = []
10.times { @classifiers << Ruleset.new(attributes, data, default, @type) }
@classifiers.each do |c|
c.train(data, attributes, default)
@classifiers = 10.times.map do |i|
Ruleset.new(attributes, data, default, @type)
end

@classifiers.each_with_index do |classifier, index|
puts "Processing classifier ##{index + 1}"
classifier.train(data, attributes, default)
end
end

Expand All @@ -348,3 +360,4 @@ def predict(test)
end
end
end

0 comments on commit 846448a

Please sign in to comment.