Skip to content

Commit

Permalink
Add basic support for OpenNLP.
Browse files Browse the repository at this point in the history
  • Loading branch information
louismullie committed Jun 3, 2013
1 parent 038d62b commit 727a307
Show file tree
Hide file tree
Showing 11 changed files with 89 additions and 58 deletions.
2 changes: 1 addition & 1 deletion lib/treat/config/data/core.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'abw', 'doc', 'yaml', 'uea',
'lda', 'pdf', 'ptb', 'dot',
'ai', 'id3', 'svo', 'mlp',
'svm', 'srx'],
'svm', 'srx', 'nlp'],

encodings:
{language_to_code: {
Expand Down
2 changes: 1 addition & 1 deletion lib/treat/config/data/languages/english.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
processors: {
parsers: [:stanford],
segmenters: [:scalpel, :srx, :tactful, :punkt, :stanford],
tokenizers: [:ptb, :stanford, :punkt]
tokenizers: [:ptb, :stanford, :punkt, :open_nlp]
}
},
stop_words:
Expand Down
4 changes: 4 additions & 0 deletions lib/treat/config/data/libraries.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,9 @@
stanford: {
jar_path: nil,
model_path: nil
},
open_nlp: {
jar_path: nil,
model_path: nil
}
}
2 changes: 1 addition & 1 deletion lib/treat/helpers/string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def camel_case
if @@cc_cache[o_phrase]
return @@cc_cache[o_phrase]
end
if Treat.core.acronyms.include?(phrase)
if Treat.core.acronyms.include?(phrase.downcase)
phrase = phrase.upcase
else
phrase.gsub!(Regex) { |a| a.upcase }
Expand Down
48 changes: 48 additions & 0 deletions lib/treat/loaders/bind_it.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Generic loader for external NLP libraries that are configured
# through jar_path=, model_path=, use, log_file= and bind.
class Treat::Loaders::BindIt

  # Keep track of which libraries have been loaded already,
  # keyed by the library class passed to load.
  @@loaded = {}

  # Configure and bind a library for a given language.
  #
  # klass    - the library object to configure; it must respond
  #            to jar_path=, model_path=, use, log_file= and bind.
  # name     - the key of the library under Treat.libraries; also
  #            used as the default sub-folder name under
  #            Treat.paths.bin and Treat.paths.models.
  # language - the language handed to klass.use; defaults to
  #            Treat.core.language.default when nil.
  #
  # Returns nil if klass was already loaded, true otherwise.
  # Raises Treat::Exception if the JAR path or the model path
  # is not a directory.
  def self.load(klass, name, language = nil)

    return if @@loaded[klass]

    language ||= Treat.core.language.default

    # Explicitly configured paths take precedence over
    # the default locations derived from the library name.
    jar_path = Treat.libraries[name].jar_path ||
      Treat.paths.bin + "#{name}/"
    model_path = Treat.libraries[name].model_path ||
      Treat.paths.models + "#{name}/"

    unless File.directory?(jar_path)
      raise Treat::Exception, "Looking for #{klass} " +
        "library JAR files in #{jar_path}, but it is " +
        "not a directory. Please set the config option " +
        "Treat.libraries.#{name}.jar_path to a folder " +
        "containing the appropriate JAR files."
    end

    unless File.directory?(model_path)
      # The folder is expected to hold model files, not JARs.
      raise Treat::Exception, "Looking for #{klass} " +
        "library model files in #{model_path}, but it " +
        "is not a directory. Please set the config option " +
        "Treat.libraries.#{name}.model_path to a folder " +
        "containing the appropriate model files."
    end

    klass.jar_path = jar_path
    klass.model_path = model_path
    klass.use language

    # Discard the library's log output when Treat is silenced.
    if Treat.core.verbosity.silence
      klass.log_file = '/dev/null'
    end

    klass.bind

    @@loaded[klass] = true

  end

end
12 changes: 12 additions & 0 deletions lib/treat/loaders/open_nlp.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
require 'treat/loaders/bind_it'

# Loader for the OpenNLP package, built on top of the
# shared Treat::Loaders::BindIt loading logic.
class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt

  require 'open-nlp'

  class << self

    # Load the OpenNLP package for the given language
    # (optional), using the :open_nlp library settings.
    def load(language = nil)
      super(::OpenNLP, :open_nlp, language)
    end

  end

end
53 changes: 7 additions & 46 deletions lib/treat/loaders/stanford.rb
Original file line number Diff line number Diff line change
@@ -1,53 +1,14 @@
require 'treat/loaders/bind_it'

# A helper class to load the CoreNLP package.
class Treat::Loaders::Stanford
class Treat::Loaders::Stanford < Treat::Loaders::BindIt

# Keep track of whether its loaded or not.
@@loaded = false
require 'stanford-core-nlp'

# Load CoreNLP package for a given language.
def self.load(language = nil)

return if @@loaded

language ||= Treat.core.language.default

jar_path = Treat.libraries.stanford.jar_path ||
Treat.paths.bin + 'stanford/'
model_path = Treat.libraries.stanford.model_path ||
Treat.paths.models + 'stanford/'

if !File.directory?(jar_path)
raise Treat::Exception, "Looking for Stanford " +
"CoreNLP JAR files in #{jar_path}, but it is " +
"not a directory. Please set the config option " +
"Treat.libraries.stanford.jar_path to a folder " +
"containing the Stanford JAR files."
end

if !File.directory?(model_path)
raise Treat::Exception, "Looking for Stanford " +
"CoreNLP model files in #{model_path}, but it " +
"is not a directory. Please set the config option " +
"Treat.libraries.stanford.model_path to a folder " +
"containing the Stanford JAR files."
end

require 'stanford-core-nlp'

StanfordCoreNLP.jar_path = jar_path
StanfordCoreNLP.model_path = model_path
StanfordCoreNLP.use(language)

if Treat.core.verbosity.silence
StanfordCoreNLP.log_file = '/dev/null'
end

StanfordCoreNLP.bind

@@loaded = true

super(StanfordCoreNLP, :stanford, language)
end

def self.find_model(name, language)
language = language.intern
model_file = StanfordCoreNLP::Config::Models[name][language]
Expand All @@ -57,4 +18,4 @@ def self.find_model(name, language)
File.join(model_path, model_dir, model_file)
end

end
end
5 changes: 3 additions & 2 deletions lib/treat/proxies/proxy.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@ module Proxy
def method_missing(sym, *args, &block)
if [:do, :apply].include?(sym) ||
Treat::Workers.lookup(sym)
to_entity.send(sym, *args)
to_entity.send(sym, *args)
else
super(sym, *args, &block)
end
end

# Create an unknown type of entity by default.
def to_entity(builder = nil)
Treat::Entities::Unknown(self.to_s)
Treat::Entities::Unknown.new(self.to_s)
end
end

Expand Down
2 changes: 1 addition & 1 deletion lib/treat/workers/groupable.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def const_missing(const)
require file
if not self.const_defined?(const)
raise Treat::Exception,
"File #{file} does not define " +
"File #{file}.rb does not define " +
"#{self}::#{const}."
end
const_get(const)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# Maximum entropy tokenization supplied by OpenNLP.
class Treat::Workers::Processors::Tokenizers::Maxent
class Treat::Workers::Processors::Tokenizers::OpenNlp

require 'open-nlp'
OpenNLP.load
Treat::Loaders::OpenNLP.load

@@tokenizers = {}

# Maximum entropy tokenization.
def self.tokenize(entity, options = {})
Expand All @@ -20,8 +22,7 @@ def self.tokenize(entity, options = {})
tokens = tokenizer.tokenize(str).to_a

tokens.each do |token|
entity << Treat::Entities
::Token.from_string(chunk)
entity << Treat::Entities::Token.from_string(token)
end

end
Expand Down
8 changes: 6 additions & 2 deletions spec/helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,21 @@ module Treat::Specs
require 'rspec'

# Some configuration options for devel.
=begin

Treat.databases.mongo.db = 'treat_test'
Treat.libraries.stanford.model_path =
'/ruby/stanford-core-nlp-minimal/models/'
Treat.libraries.stanford.jar_path =
'/ruby/stanford-core-nlp-minimal/bin/'
Treat.libraries.open_nlp.jar_path =
'/ruby/open-nlp-english/bin/'
Treat.libraries.open_nlp.model_path =
'/ruby/open-nlp-english/models/'
Treat.libraries.punkt.model_path =
'/ruby/punkt/models/'
Treat.libraries.reuters.model_path =
'/ruby/reuters/models/'
=end

# Mimic the ./lib structure.
module Entities; end
module Workers; end
Expand Down

0 comments on commit 727a307

Please sign in to comment.