Merge the ssplit into the tokenize annotator

AngledLuffa · AngledLuffa · commit aa710222c907 · 2022-03-16T15:51:57.000-07:00
diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
 
     // if cleanxml is requested and tokenize is here,
     // make it part of tokenize rather than its own annotator
-    unifyCleanXML(this.properties);
+    unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+    // ssplit is always part of tokenize now
+    unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);
 
     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
    * In such a case, we remove the cleanxml from the annotators and set
    * the tokenize.cleanxml option instead
    */
-  static void unifyCleanXML(Properties properties) {
+  static void unifyTokenizeProperty(Properties properties, String property, String option) {
     String annotators = properties.getProperty("annotators", "");
     int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
-    int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+    int unwanted = annotators.indexOf(property);
 
-    if (clean >= 0 && tokenize >= 0) {
-      properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
-      int comma = annotators.indexOf(",", clean);
+    if (unwanted >= 0 && tokenize >= 0) {
+      if (option != null) {
+        properties.setProperty(option, "true");
+      }
+      int comma = annotators.indexOf(",", unwanted);
       if (comma >= 0) {
-        annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+        annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
       } else {
         comma = annotators.lastIndexOf(",");
         if (comma < 0) {
           throw new IllegalArgumentException("Unable to process annotators " + annotators);
         }
         annotators = annotators.substring(0, comma);
       }
-      logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true  Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+      if (option != null) {
+        logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+      } else {
+        logger.debug(property + " is now included as part of the tokenize annotator by default");
+      }
+      logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
       properties.setProperty("annotators", annotators);
     }
   }
diff --git a/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java b/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
   private final boolean useSegmenter;
   private final Annotator segmenterAnnotator;
   private final CleanXmlAnnotator cleanxmlAnnotator;
+  private final WordsToSentencesAnnotator ssplitAnnotator;
 
   /** run a custom post processor after the lexer **/
   private final List<CoreLabelProcessor> postProcessors;
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
     } else {
       this.cleanxmlAnnotator = null;
     }
+
+    this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
   }
 
   /**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
+    this.ssplitAnnotator.annotate(annotation);
   }
 
   @Override
diff --git a/test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java b/test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java
@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
     for (int i = 0; i < inputs.length; ++i) {
       Properties props = new Properties();
       props.setProperty("annotators", inputs[i]);
-      StanfordCoreNLP.unifyCleanXML(props);
+      StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
       assertEquals(expected[i], props.getProperty("annotators"));
       assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
     }

Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {`
`134`	`134`	`private final boolean useSegmenter;`
`135`	`135`	`private final Annotator segmenterAnnotator;`
`136`	`136`	`private final CleanXmlAnnotator cleanxmlAnnotator;`
	`137`	`+ private final WordsToSentencesAnnotator ssplitAnnotator;`
`137`	`138`
`138`	`139`	`/** run a custom post processor after the lexer **/`
`139`	`140`	`private final List<CoreLabelProcessor> postProcessors;`
`@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {`
`250`	`251`	`} else {`
`251`	`252`	`this.cleanxmlAnnotator = null;`
`252`	`253`	`}`
	`254`	`+`
	`255`	`+ this.ssplitAnnotator = new WordsToSentencesAnnotator(props);`
`253`	`256`	`}`
`254`	`257`
`255`	`258`	`/**`
`@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {`
`429`	`432`	`if (this.cleanxmlAnnotator != null) {`
`430`	`433`	`this.cleanxmlAnnotator.annotate(annotation);`
`431`	`434`	`}`
	`435`	`+ this.ssplitAnnotator.annotate(annotation);`
`432`	`436`	`}`
`433`	`437`
`434`	`438`	`@Override`
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {`
`105`	`105`	`for (int i = 0; i < inputs.length; ++i) {`
`106`	`106`	`Properties props = new Properties();`
`107`	`107`	`props.setProperty("annotators", inputs[i]);`
`108`		`- StanfordCoreNLP.unifyCleanXML(props);`
	`108`	`+ StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");`
`109`	`109`	`assertEquals(expected[i], props.getProperty("annotators"));`
`110`	`110`	`assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));`
`111`	`111`	`}`