Skip to content

Commit aae5a13

Browse files
committed
Merge pull request jekyll#61 from mike-stewart/enable_stemmer
Option to Disable Stemming
2 parents d59a269 + 6bbff03 commit aae5a13

File tree

4 files changed

+37
-10
lines changed

4 files changed

+37
-10
lines changed

lib/classifier-reborn/bayes.rb

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ class Bayes
1717
# auto_categorize: false When true, enables ability to dynamically declare a category
1818
# enable_threshold: false When true, enables a threshold requirement for classification
1919
# threshold: 0.0 Default threshold, only used when enabled
20+
# enable_stemmer: true When false, disables word stemming
2021
def initialize(*args)
2122
@categories = {}
2223
options = { language: 'en',
2324
auto_categorize: false,
2425
enable_threshold: false,
25-
threshold: 0.0
26+
threshold: 0.0,
27+
enable_stemmer: true
2628
}
2729
args.flatten.each do |arg|
2830
if arg.is_a?(Hash)
@@ -40,6 +42,7 @@ def initialize(*args)
4042
@auto_categorize = options[:auto_categorize]
4143
@enable_threshold = options[:enable_threshold]
4244
@threshold = options[:threshold]
45+
@enable_stemmer = options[:enable_stemmer]
4346
end
4447

4548
# Provides a general training method for all categories specified in Bayes#new
@@ -61,7 +64,7 @@ def train(category, text)
6164
end
6265

6366
@category_counts[category] += 1
64-
Hasher.word_hash(text, @language).each do |word, count|
67+
Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
6568
@categories[category][word] += count
6669
@category_word_count[category] += count
6770
@total_words += count
@@ -78,7 +81,7 @@ def train(category, text)
7881
def untrain(category, text)
7982
category = CategoryNamer.prepare_name(category)
8083
@category_counts[category] -= 1
81-
Hasher.word_hash(text, @language).each do |word, count|
84+
Hasher.word_hash(text, @language, @enable_stemmer).each do |word, count|
8285
next if @total_words < 0
8386
orig = @categories[category][word] || 0
8487
@categories[category][word] -= count
@@ -98,7 +101,7 @@ def untrain(category, text)
98101
# The largest of these scores (the one closest to 0) is the one picked out by #classify
99102
def classifications(text)
100103
score = {}
101-
word_hash = Hasher.word_hash(text, @language)
104+
word_hash = Hasher.word_hash(text, @language, @enable_stemmer)
102105
training_count = @category_counts.values.reduce(:+).to_f
103106
@categories.each do |category, category_words|
104107
score[category.to_s] = 0
@@ -155,6 +158,16 @@ def threshold_disabled?
155158
!@enable_threshold
156159
end
157160

161+
# Is word stemming enabled?
162+
def stemmer_enabled?
163+
@enable_stemmer
164+
end
165+
166+
# Is word stemming disabled?
167+
def stemmer_disabled?
168+
!@enable_stemmer
169+
end
170+
158171
# Provides training and untraining methods for the categories specified in Bayes#new
159172
# For example:
160173
# b = ClassifierReborn::Bayes.new 'This', 'That', 'the_other'

lib/classifier-reborn/extensions/hasher.rb

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,26 @@ module Hasher
1313

1414
# Return a Hash of strings => ints. Each word in the string is stemmed (unless enable_stemmer is false),
1515
# interned, and indexes to its frequency in the document.
16-
def word_hash(str, language = 'en')
17-
cleaned_word_hash = clean_word_hash(str, language)
16+
def word_hash(str, language = 'en', enable_stemmer = true)
17+
cleaned_word_hash = clean_word_hash(str, language, enable_stemmer)
1818
symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
1919
cleaned_word_hash.merge(symbol_hash)
2020
end
2121

2222
# Return a word hash without extra punctuation or short symbols, just words (stemmed unless enable_stemmer is false)
23-
def clean_word_hash(str, language = 'en')
24-
word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language
23+
def clean_word_hash(str, language = 'en', enable_stemmer = true)
24+
word_hash_for_words str.gsub(/[^\p{WORD}\s]/, '').downcase.split, language, enable_stemmer
2525
end
2626

27-
def word_hash_for_words(words, language = 'en')
27+
def word_hash_for_words(words, language = 'en', enable_stemmer = true)
2828
d = Hash.new(0)
2929
words.each do |word|
30-
d[word.stem.intern] += 1 if word.length > 2 && !STOPWORDS[language].include?(word)
30+
next unless word.length > 2 && !STOPWORDS[language].include?(word)
31+
if enable_stemmer
32+
d[word.stem.intern] += 1
33+
else
34+
d[word.intern] += 1
35+
end
3136
end
3237
d
3338
end

test/bayes/bayesian_test.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ def test_training_with_utf8
1414
assert_nothing_raised { @classifier.train_interesting 'Água' }
1515
end
1616

17+
def test_stemming_enabled_by_default
18+
assert @classifier.stemmer_enabled?
19+
end
20+
1721
def test_bad_training
1822
assert_raise(StandardError) { @classifier.train_no_category 'words' }
1923
end

test/extensions/hasher_test.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ def test_clean_word_hash
1515
assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!")
1616
end
1717

18+
def test_clean_word_hash_without_stemming
19+
hash = { good: 1, words: 1, hope: 1, love: 1, them: 1, tests: 1 }
20+
assert_equal hash, Hasher.clean_word_hash("here are some good words of test's. I hope you love them!", 'en', false)
21+
end
22+
1823
def test_default_stopwords
1924
assert_not_empty Hasher::STOPWORDS['en']
2025
assert_not_empty Hasher::STOPWORDS['fr']

0 commit comments

Comments
 (0)