Skip to content

Commit aa71022

Browse files
committed
Merge the ssplit into the tokenize annotator
1 parent 5f6df2d commit aa71022

File tree

3 files changed

+22
-9
lines changed

3 files changed

+22
-9
lines changed

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
257257

258258
// if cleanxml is requested and tokenize is here,
259259
// make it part of tokenize rather than its own annotator
260-
unifyCleanXML(this.properties);
260+
unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
261+
// ssplit is always part of tokenize now
262+
unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);
261263

262264
// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
263265
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
315317
* In such a case, we remove the cleanxml from the annotators and set
316318
* the tokenize.cleanxml option instead
317319
*/
318-
static void unifyCleanXML(Properties properties) {
320+
static void unifyTokenizeProperty(Properties properties, String property, String option) {
319321
String annotators = properties.getProperty("annotators", "");
320322
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
321-
int clean = annotators.indexOf(STANFORD_CLEAN_XML);
323+
int unwanted = annotators.indexOf(property);
322324

323-
if (clean >= 0 && tokenize >= 0) {
324-
properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
325-
int comma = annotators.indexOf(",", clean);
325+
if (unwanted >= 0 && tokenize >= 0) {
326+
if (option != null) {
327+
properties.setProperty(option, "true");
328+
}
329+
int comma = annotators.indexOf(",", unwanted);
326330
if (comma >= 0) {
327-
annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
331+
annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
328332
} else {
329333
comma = annotators.lastIndexOf(",");
330334
if (comma < 0) {
331335
throw new IllegalArgumentException("Unable to process annotators " + annotators);
332336
}
333337
annotators = annotators.substring(0, comma);
334338
}
335-
logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
339+
if (option != null) {
340+
logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
341+
} else {
342+
logger.debug(property + " is now included as part of the tokenize annotator by default");
343+
}
344+
logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
336345
properties.setProperty("annotators", annotators);
337346
}
338347
}

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
134134
private final boolean useSegmenter;
135135
private final Annotator segmenterAnnotator;
136136
private final CleanXmlAnnotator cleanxmlAnnotator;
137+
private final WordsToSentencesAnnotator ssplitAnnotator;
137138

138139
/** run a custom post processor after the lexer **/
139140
private final List<CoreLabelProcessor> postProcessors;
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
250251
} else {
251252
this.cleanxmlAnnotator = null;
252253
}
254+
255+
this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
253256
}
254257

255258
/**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
429432
if (this.cleanxmlAnnotator != null) {
430433
this.cleanxmlAnnotator.annotate(annotation);
431434
}
435+
this.ssplitAnnotator.annotate(annotation);
432436
}
433437

434438
@Override

test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
105105
for (int i = 0; i < inputs.length; ++i) {
106106
Properties props = new Properties();
107107
props.setProperty("annotators", inputs[i]);
108-
StanfordCoreNLP.unifyCleanXML(props);
108+
StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
109109
assertEquals(expected[i], props.getProperty("annotators"));
110110
assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
111111
}

0 commit comments

Comments
 (0)