@@ -121,7 +121,7 @@ abstract class RuleBasedSentenceSplitter extends SentenceSplitter {
 
       // found the control string that enforces sentence breaks
       // note that this token is NOT added to the sentences produced
-      else if(crt.word == SENTENCE_BREAK_CONTROL_STRING) {
+      else if (SentenceSplitter.useControlStrings && crt.word == SENTENCE_BREAK_CONTROL_STRING) {
        sentences += Sentence(raw.toArray, beginPositions.toArray, endPositions.toArray, words.toArray)
        raw = new ArrayBuffer[String]()
        words = new ArrayBuffer[String]()
@@ -199,6 +199,8 @@ object SentenceSplitter {
   // Control string that enforces a sentence break
   // If you change this value, also change the SENTENCEBREAK token in OpenDomainLexer.g to the same value (and recompile the ANTLR grammar)
   val SENTENCE_BREAK_CONTROL_STRING = "[SB]"
+  // Set the following to true in order to split sentences on the string above.
+  var useControlStrings = false
 
   val EOS_FOLLOWEDBY_BULLET = """\.\d+$""".r
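Taken together, the two hunks make "[SB]" splitting opt-in: `RuleBasedSentenceSplitter` now breaks on the control string only when `SentenceSplitter.useControlStrings` is true, and the flag defaults to false so existing callers keep the old behavior. Below is a minimal sketch of how a caller might opt in; the `ControlStringDemo` wrapper and the sample text are illustrative, while the flag, the tokenizer constructor, and the `tokenize` call are the ones exercised by the updated test.

```scala
import org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, SentenceSplitter}

object ControlStringDemo extends App {
  // Opt in to splitting on the "[SB]" control string.
  // The flag defaults to false, so nothing changes for existing callers.
  SentenceSplitter.useControlStrings = true

  val tokenizer = new OpenDomainEnglishTokenizer(None)
  val sentences = tokenizer.tokenize("this is one sentence [SB] this is another")

  // Expect two sentences; the "[SB]" token itself is not added to either one.
  for ((sentence, i) <- sentences.zipWithIndex)
    println(s"Sentence #$i: " + sentence.words.mkString(" "))
}
```

Note that the flag is read at split time, not at construction time, so it can be toggled between `tokenize` calls on the same tokenizer instance.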
main/src/test/scala/org/clulab/processors/TestTokenizer.scala (18 changes: 11 additions & 7 deletions)
@@ -1,6 +1,6 @@
 package org.clulab.processors
 
-import org.clulab.processors.clu.tokenizer.OpenDomainEnglishTokenizer
+import org.clulab.processors.clu.tokenizer.{OpenDomainEnglishTokenizer, SentenceSplitter}
 import org.scalatest.{FlatSpec, Matchers}
 
 /**
@@ -225,18 +225,22 @@ class TestTokenizer extends FlatSpec with Matchers {

it should "recognize the control string for sentence breaks" in {
val orig = "this is one sentence [SB] this is another"
val sents = tok(orig)
sents.length should be (2)
val twoSents = tok(orig, true)
twoSents.length should be (2)
twoSents(0).words.length should be(4)
twoSents(1).words.length should be(3)

sents(0).words.length should be (4)
sents(1).words.length should be (3)
val oneSent = tok(orig, false)
oneSent.length should be (1)
oneSent(0).words.length should be (8)
}

def tok(s:String):Array[Sentence] = {
def tok(s:String, useControlStrings: Boolean = false):Array[Sentence] = synchronized {
SentenceSplitter.useControlStrings = useControlStrings
println(s"Tokenizing text: $s")
val t = new OpenDomainEnglishTokenizer(None)
val sents = t.tokenize(s)
for(i <- sents.indices) {
for (i <- sents.indices) {
println(s"\tSentence #$i: " + sents(i).words.mkString(" "))
}
sents
Expand Down
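One note on the reworked `tok` helper: `useControlStrings` is shared, mutable state on the `SentenceSplitter` companion object, so the helper sets the flag and tokenizes inside a single `synchronized` block. Without that, concurrently running test cases could interleave one call's flag write with another call's tokenization, making the one-sentence and two-sentence assertions above timing-dependent.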