@@ -121,7 +121,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter {
 
       // found the control string that enforces sentence breaks
       // note that this token is NOT added to the sentences produced
-      else if(crt.word == SENTENCE_BREAK_CONTROL_STRING) {
+      else if (SentenceSplitter.useControlStrings && crt.word == SENTENCE_BREAK_CONTROL_STRING) {
        sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray)
        raw = new ArrayBuffer[String]()
        words = new ArrayBuffer[String]()
@@ -199,6 +199,8 @@ object SentenceSplitter {
   // Control string that enforces a sentence break
   // If you change this value, also change the SENTENCEBREAK token in OpenDomainLexer.g to the same value (and recompile the ANTLR grammar)
   val SENTENCE_BREAK_CONTROL_STRING = "[SB]"
+  // Set the following to true in order to split sentences on the string above.
+  var useControlStrings = false
 
   val EOS_FOLLOWEDBY_BULLET = """\.\d+$""".r
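Taken together, the two hunks make "[SB]" splitting opt-in: `RuleBasedSentenceSplitter` now breaks on the control string only when `SentenceSplitter.useControlStrings` is true, and the flag defaults to false so existing callers keep the old behavior. Below is a minimal sketch of how a caller might opt in; the `ControlStringDemo` wrapper and the sample text are illustrative, while the flag, the tokenizer constructor, and the `tokenize` call are the ones exercised by the updated test.

```scala
import org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, SentenceSplitter}

object ControlStringDemo extends App {
  // Opt in to splitting on the "[SB]" control string.
  // The flag defaults to false, so nothing changes for existing callers.
  SentenceSplitter.useControlStrings = true

  val tokenizer = new OpenDomainEnglishTokenizer(None)
  val sentences = tokenizer.tokenize("this is one sentence [SB] this is another")

  // Expect two sentences; the "[SB]" token itself is not added to either one.
  for ((sentence, i) <- sentences.zipWithIndex)
    println(s"Sentence #$i: " + sentence.words.mkString(" "))
}
```

Note that the flag is read at split time, not at construction time, so it can be toggled between `tokenize` calls on the same tokenizer instance.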
main/src/test/scala/org/clulab/processors/TestTokenizer.scala (18 changes: 11 additions & 7 deletions)
@@ -1,6 +1,6 @@
 package org.clulab.processors
 
-import org.clulab.processors.clu.tokenizer.OpenDomainEnglishTokenizer
+import org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, SentenceSplitter}
 import org.scalatest.{FlatSpec, Matchers}
 
 /**
@@ -225,18 +225,22 @@ class TestTokenizer extends FlatSpec with Matchers {

it should "recognize the control string for sentence breaks" in {
val orig = "this is one sentence [SB] this is another"
val sents = tok(orig)
sents.length should be (2)
val twoSents = tok(orig, true)
twoSents.length should be (2)
twoSents(0).words.length should be(4)
twoSents(1).words.length should be(3)

sents(0).words.length should be (4)
sents(1).words.length should be (3)
val oneSent = tok(orig, false)
oneSent.length should be (1)
oneSent(0).words.length should be (8)
}

def tok(s:String):Array[Sentence] = {
def tok(s:String, useControlStrings: Boolean = false):Array[Sentence] = synchronized {
SentenceSplitter.useControlStrings = useControlStrings
println(s"Tokenizing text: $s")
val t = new OpenDomainEnglishTokenizer(None)
val sents = t.tokenize(s)
for(i <- sents.indices) {
for (i <- sents.indices) {
println(s"\tSentence #$i: " + sents(i).words.mkString(" "))
}
sents
Expand Down
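One note on the reworked `tok` helper: `useControlStrings` is shared, mutable state on the `SentenceSplitter` companion object, so the helper sets the flag and tokenizes inside a single `synchronized` block. Without that, concurrently running test cases could interleave one call's flag write with another call's tokenization, making the one-sentence and two-sentence assertions above timing-dependent.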