Skip to content

Commit

Permalink
changes on how to treat words encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
luisparravicini committed Feb 2, 2010
1 parent 4b60924 commit ac3785e
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 7 deletions.
8 changes: 5 additions & 3 deletions lib/classifier/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def initialize(options = {})
end

# Normalizes a category name: converts +val+ to a string, replaces underscores
# with spaces and capitalizes the result.
# NOTE(review): this listing is a rendered diff, so two versions of the body
# appear back to back. The `.intern` line looks like the pre-commit version and
# the plain-string line its replacement — confirm against the repository.
def prepare_category_name val
# Symbol-producing variant; as laid out here its result is discarded.
val.to_s.gsub("_"," ").capitalize.intern
# String-producing variant; being the last expression, this is the return value.
val.to_s.gsub("_"," ").capitalize
end

# Removes common punctuation symbols, returning a new string.
Expand All @@ -22,7 +22,7 @@ def without_punctuation str
end

# Return a Hash of strings => ints. Each word in the string is stemmed,
# interned, and indexes to its frequency in the document.
# and indexes to its frequency in the document.
def word_hash(str)
  # Word tokens: strip every character that is neither a word character nor
  # whitespace, then split on whitespace.
  word_tokens = str.gsub(/[^\w\s]/,"").split
  # Symbol tokens: blank out the word characters and split what remains.
  symbol_tokens = str.gsub(/[\w]/," ").split
  # Both token lists, words first, are handed to the shared counting routine.
  word_hash_for_words(word_tokens + symbol_tokens)
end
Expand Down Expand Up @@ -50,9 +50,11 @@ def stemmer
def word_hash_for_words(words)
d = Hash.new
skip_words = StopWords.for(@options[:language], @options[:lang_dir])
encoding_name = @options[:encoding].gsub(/_/, '-')
words.each do |word|
word = word.mb_chars.downcase.to_s if word =~ /[\w]+/
key = stemmer.stem(word).intern
key = stemmer.stem(word)
key.force_encoding(encoding_name)
if word =~ /[^\w]/ || ! skip_words.include?(word) && word.length > 2
d[key] ||= 0
d[key] += 1
Expand Down
4 changes: 2 additions & 2 deletions test/base_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ class HelpersTest < Test::Unit::TestCase

# Verifies that Classifier::Base#word_hash returns the expected token => count
# hash (words and punctuation marks) for a sample sentence.
# NOTE(review): this listing is a rendered diff, so `hash` is assigned twice.
# The symbol-keyed literal looks like the pre-commit expectation and the
# string-keyed literal its replacement — confirm against the repository.
def test_word_hash
c = Classifier::Base.new
# Symbol-keyed expectation; overwritten by the next assignment.
hash = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
# String-keyed expectation; this is the value actually compared below.
hash = {'good'=>1, "!"=>1, 'hope'=>1, "'"=>1, "."=>1, 'love'=>1, 'word'=>1, 'them'=>1, 'test'=>1}
assert_equal hash, c.word_hash("here are some good words of test's. I hope you love them!")
end


# Verifies that Classifier::Base#clean_word_hash returns the expected
# token => count hash for a sample sentence; the expectation contains no
# punctuation keys.
# NOTE(review): this listing is a rendered diff, so `hash` is assigned twice.
# The symbol-keyed literal looks like the pre-commit expectation and the
# string-keyed literal its replacement — confirm against the repository.
def test_clean_word_hash
c = Classifier::Base.new
# Symbol-keyed expectation; overwritten by the next assignment.
hash = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
# String-keyed expectation; this is the value actually compared below.
hash = {'good'=>1, 'word'=>1, 'hope'=>1, 'love'=>1, 'them'=>1, 'test'=>1}
assert_equal hash, c.clean_word_hash("here are some good words of test's. I hope you love them!")
end

Expand Down
4 changes: 2 additions & 2 deletions test/lsi/lsi_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,11 @@ def test_keyword_search
lsi.add_item @str4, "Cat"
lsi.add_item @str5, "Bird"

assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
assert_equal ['dog', 'text', 'deal'], lsi.highest_ranked_stems(@str1)
end

def test_summary
  # Join the five fixture strings into a single document and ask for a
  # two-part summary of it.
  expected_summary = "This text involves dogs too [...] This text also involves cats"
  fixtures = [@str1, @str2, @str3, @str4, @str5]
  combined_text = fixtures.join
  assert_equal expected_summary, combined_text.summary(2)
end

end
end

0 comments on commit ac3785e

Please sign in to comment.