Skip to content

Performance Improvements #32

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 11, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added lib/.DS_Store
Binary file not shown.
31 changes: 11 additions & 20 deletions lib/core_extensions/array.rb
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
class Array
def classification
collect(&:last)
end

# calculate information entropy
def entropy
return 0 if empty?
each_with_object(Hash.new(0)) do |i, result|
result[i] += 1
end.values.sum do |count|
percentage = count.to_f / length

info = {}
each do |i|
info[i] = !info[i] ? 1 : (info[i] + 1)
-percentage * Math.log2(percentage)
end

result(info, length)
end
end

private

def result(info, total)
final = 0
info.each do |_symbol, count|
next unless count > 0
percentage = count.to_f / total
final += -percentage * Math.log(percentage) / Math.log(2.0)
module ArrayClassification
refine Array do
def classification
collect(&:last)
end
final
end
end

2 changes: 1 addition & 1 deletion lib/decisiontree.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb'
require 'core_extensions/object'
require 'core_extensions/array'
require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb'
41 changes: 27 additions & 14 deletions lib/decisiontree/id3_tree.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
module DecisionTree
Node = Struct.new(:attribute, :threshold, :gain)

using ArrayClassification

class ID3Tree
def initialize(attributes, data, default, type)
@used = {}
Expand All @@ -28,7 +30,7 @@ def train(data = @data, attributes = @attributes, default = @default)
end

data2 = data2.map do |key, val|
key + [val.sort_by { |_k, v| v }.last.first]
key + [val.sort_by { |_, v| v }.last.first]
end

@tree = id3_train(data2, attributes, default)
Expand All @@ -41,9 +43,9 @@ def type(attribute)
def fitness_for(attribute)
case type(attribute)
when :discrete
proc { |a, b, c| id3_discrete(a, b, c) }
proc { |*args| id3_discrete(*args) }
when :continuous
proc { |a, b, c| id3_continuous(a, b, c) }
proc { |*args| id3_continuous(*args) }
end
end

Expand All @@ -66,14 +68,13 @@ def id3_train(data, attributes, default, _used={})
@used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
tree, l = {best => {}}, ['>=', '<']

fitness = fitness_for(best.attribute)
case type(best.attribute)
when :continuous
partitioned_data = data.partition do |d|
d[attributes.index(best.attribute)] >= best.threshold
end
partitioned_data.each_with_index do |examples, i|
tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0))
end
when :discrete
values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
Expand All @@ -83,7 +84,7 @@ def id3_train(data, attributes, default, _used={})
end
end
partitions.each_with_index do |examples, i|
tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0), &fitness)
tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0))
end
end

Expand Down Expand Up @@ -116,11 +117,18 @@ def id3_continuous(data, attributes, attribute)

# ID3 for discrete label cases
def id3_discrete(data, attributes, attribute)
values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
remainder = partitions.collect { |p| (p.size.to_f / data.size) * p.classification.entropy }.inject(0) { |a, e| e += a }
index = attributes.index(attribute)

values = data.map { |row| row[index] }.uniq
remainder = values.sort.sum do |val|
classification = data.each_with_object([]) do |row, result|
result << row.last if row[index] == val
end

((classification.size.to_f / data.size) * classification.entropy)
end

[data.classification.entropy - remainder, attributes.index(attribute)]
[data.classification.entropy - remainder, index]
end

def predict(test)
Expand Down Expand Up @@ -320,6 +328,7 @@ def predict(test)

class Bagging
attr_accessor :classifiers

def initialize(attributes, data, default, type)
@classifiers = []
@type = type
Expand All @@ -329,10 +338,13 @@ def initialize(attributes, data, default, type)
end

def train(data = @data, attributes = @attributes, default = @default)
@classifiers = []
10.times { @classifiers << Ruleset.new(attributes, data, default, @type) }
@classifiers.each do |c|
c.train(data, attributes, default)
@classifiers = 10.times.map do |i|
Ruleset.new(attributes, data, default, @type)
end

@classifiers.each_with_index do |classifier, index|
puts "Processing classifier ##{index + 1}"
classifier.train(data, attributes, default)
end
end

Expand All @@ -348,3 +360,4 @@ def predict(test)
end
end
end