apache · msfroh · Feb 4, 2025
diff --git a/...analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java b/...analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
@@ -18,7 +18,9 @@
 package org.apache.lucene.analysis.opennlp;
 
 import java.io.IOException;
+import java.util.Locale;
 import java.util.Map;
+import opennlp.tools.postag.POSTagFormat;
 import org.apache.lucene.analysis.TokenFilterFactory;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
@@ -46,11 +48,17 @@ public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements Resou
 
   public static final String POS_TAGGER_MODEL = "posTaggerModel";
 
+  public static final String POS_TAG_FORMAT = "posTagFormat"; // optional factory arg; the constructor defaults it to "CUSTOM"
+
   private final String posTaggerModelFile;
 
+  private final POSTagFormat posTaggerFormat; // parsed from the POS_TAG_FORMAT arg in the constructor; passed to getPOSTagger in create()
+
   public OpenNLPPOSFilterFactory(Map<String, String> args) {
     super(args);
     posTaggerModelFile = require(args, POS_TAGGER_MODEL);
+    String tagFormat = get(args, POS_TAG_FORMAT, "CUSTOM");
+    posTaggerFormat = POSTagFormat.valueOf(tagFormat.toUpperCase(Locale.ROOT));
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
@@ -64,7 +72,8 @@ public OpenNLPPOSFilterFactory() {
   @Override
   public OpenNLPPOSFilter create(TokenStream in) {
     try {
-      return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
+      return new OpenNLPPOSFilter(
+          in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile, posTaggerFormat));
     } catch (IOException e) {
       throw new IllegalArgumentException(e);
     }

diff --git a/...ne/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/...ne/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
@@ -30,7 +30,11 @@ public class NLPPOSTaggerOp {
   private final POSTagger tagger;
 
   public NLPPOSTaggerOp(POSModel model) {
-    tagger = new POSTaggerME(model, POSTagFormat.PENN);
+    this(model, POSTagFormat.CUSTOM); // default format is now CUSTOM (the removed line hard-coded PENN)
+  }
+
+  public NLPPOSTaggerOp(POSModel model, POSTagFormat format) { // overload letting the caller choose the tag format
+    tagger = new POSTaggerME(model, format); // format is handed to POSTaggerME unchanged
+  }
 
   public synchronized String[] getPOSTags(String[] words) {

diff --git a/...analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java b/...analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
@@ -26,6 +26,7 @@
 import opennlp.tools.lemmatizer.LemmatizerModel;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagFormat;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.tokenize.TokenizerModel;
 import org.apache.lucene.util.ResourceLoader;
@@ -87,8 +88,13 @@ public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader
   }
 
   public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
+    return getPOSTagger(modelName, POSTagFormat.CUSTOM); // delegates to the two-arg overload with CUSTOM as the default
+  }
+
+  public static NLPPOSTaggerOp getPOSTagger(String modelName, POSTagFormat posTagFormat)
+      throws IOException { // overload taking an explicit tag format for the returned NLPPOSTaggerOp
     POSModel model = posTaggerModels.get(modelName);
-    return new NLPPOSTaggerOp(model);
+    return new NLPPOSTaggerOp(model, posTagFormat); // NOTE(review): model comes from posTaggerModels.get with no null check here — confirm it is always loaded first
   }
 
   public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader)

diff --git a/...ne/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict b/...ne/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
@@ -10,3 +10,15 @@ did	VBD	do
 not	RB	not
 come	VB	come
 back	RB	back
+they	PROPN	they
+sent	VERB	send
+him	PRON	he
+running	VERB	run
+in	ADP	in
+the	DET	the
+evening	NOUN	evening
+he	PRON	he
+did	VERB	do
+not	ADV	not
+come	VERB	come
+back	ADV	back
diff --git a/...ennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/...ennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -38,6 +38,9 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
   private static final String[] SENTENCE_posTags = {
     "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."
   };
+  private static final String[] SENTENCE_posTags_UD = { // tags asserted for SENTENCE when the POS filter runs with posTagFormat "UD"
+    "PROPN", "VERB", "PRON", "VERB", "ADP", "DET", "NOUN", "PUNCT"
+  };
 
   private static final String SENTENCES =
       "They sent him running in the evening. He did not come back.";
@@ -51,6 +54,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
   private static final String[] SENTENCES_posTags = {
     "NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."
   };
+  private static final String[] SENTENCES_posTags_UD = { // tags asserted for SENTENCES when the POS filter runs with posTagFormat "UD"
+    "PROPN", "VERB", "PRON", "VERB", "ADP", "DET", "NOUN", "PUNCT", "PRON", "VERB", "ADV", "VERB",
+    "ADV", "PUNCT"
+  };
 
   private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
   private static final String[] SENTENCE_both_punc = {
@@ -89,6 +96,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
     "NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP",
     "VBD", "VBD", "RB", "VB", "RB", "."
   };
+  private static final String[] SENTENCES_keep_orig_posTags_UD = { // tags asserted by testKeywordAttributeAwarenessDictionaryOnlyUD (KeywordRepeat + RemoveDuplicates chain)
+    "PROPN", "PROPN", "VERB", "VERB", "PRON", "PRON", "VERB", "VERB", "ADP", "DET", "NOUN", "PUNCT",
+    "PRON", "PRON", "VERB", "VERB", "ADV", "VERB", "ADV", "PUNCT"
+  };
 
   private static final String[] SENTENCES_both_keep_orig_punc = {
     "Konstantin",
@@ -133,6 +144,19 @@ public void test1SentenceDictionaryOnly() throws Exception {
         analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags, null, null, true);
   }
 
+  public void test1SentenceDictionaryOnlyUD() throws Exception { // single-sentence dictionary lemmatization with the POS filter set to posTagFormat "UD"
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter( // NOTE(review): literal file names here and below; sibling UD tests pass posTaggerModelFile / lemmatizerDictFile — confirm they name the same files
+                "opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin", "posTagFormat", "UD")
+            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
+            .build();
+    assertAnalyzesTo( // checks SENTENCE against SENTENCE_dict_punc and SENTENCE_posTags_UD
+        analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags_UD, null, null, true);
+  }
+
   public void test2SentencesDictionaryOnly() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
@@ -145,6 +169,27 @@ public void test2SentencesDictionaryOnly() throws Exception {
         analyzer, SENTENCES, SENTENCES_dict_punc, null, null, SENTENCES_posTags, null, null, true);
   }
 
+  public void test2SentencesDictionaryOnlyUD() throws Exception { // two-sentence dictionary lemmatization with the POS filter set to posTagFormat "UD"
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter( // POS filter configured with the "UD" tag format
+                "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD")
+            .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+            .build();
+    assertAnalyzesTo( // same expected terms as test2SentencesDictionaryOnly (SENTENCES_dict_punc), but with the UD tag array
+        analyzer,
+        SENTENCES,
+        SENTENCES_dict_punc,
+        null,
+        null,
+        SENTENCES_posTags_UD,
+        null,
+        null,
+        true);
+  }
+
   public void test1SentenceMaxEntOnly() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
@@ -249,6 +294,29 @@ public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
         true);
   }
 
+  public void testKeywordAttributeAwarenessDictionaryOnlyUD() throws Exception { // keyword-repeat + dictionary lemmatizer chain with the POS filter set to posTagFormat "UD"
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter( // POS filter configured with the "UD" tag format
+                "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD")
+            .addTokenFilter(KeywordRepeatFilterFactory.class) // added before the lemmatizer in the chain
+            .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+            .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class) // last filter in the chain
+            .build();
+    assertAnalyzesTo( // checks SENTENCES against SENTENCES_dict_keep_orig_punc and SENTENCES_keep_orig_posTags_UD
+        analyzer,
+        SENTENCES,
+        SENTENCES_dict_keep_orig_punc,
+        null,
+        null,
+        SENTENCES_keep_orig_posTags_UD,
+        null,
+        null,
+        true);
+  }
+
   public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))

diff --git a/...ysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/...ysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -63,6 +63,11 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
     "NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."
   };
 
+  private static final String[] SENTENCES_posTags_UD = { // tags asserted for SENTENCES in testPOSUD, both directly and via toPayloads
+    "NOUN", "NOUN", "NUM", "VERB", "NUM", "NOUN", "PUNCT", "NOUN", "NOUN", "NUM", "PUNCT", "NUM",
+    "NOUN", "PUNCT"
+  };
+
   private static final String NO_BREAK = "No period";
   private static final String[] NO_BREAK_terms = {"No", "period"};
   private static final int[] NO_BREAK_startOffsets = {0, 3};
@@ -128,6 +133,46 @@ public void testPOS() throws Exception {
         toPayloads(SENTENCES_posTags));
   }
 
+  public void testPOSUD() throws Exception { // runs the POS filter with posTagFormat "UD" through two analyzer chains
+    CustomAnalyzer analyzer =
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter( // first chain: tokenizer + POS filter only
+                "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD")
+            .build();
+    assertAnalyzesTo( // passes SENTENCES_posTags_UD directly, alongside terms and offsets
+        analyzer,
+        SENTENCES,
+        SENTENCES_punc,
+        SENTENCES_startOffsets,
+        SENTENCES_endOffsets,
+        SENTENCES_posTags_UD,
+        null,
+        null,
+        true);
+
+    analyzer = // second chain: same POS filter followed by TypeAsPayloadTokenFilterFactory
+        CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+            .withTokenizer(
+                "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+            .addTokenFilter(
+                "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD")
+            .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
+            .build();
+    assertAnalyzesTo( // here the tag slot is null and the UD tags are passed as payloads instead
+        analyzer,
+        SENTENCES,
+        SENTENCES_punc,
+        SENTENCES_startOffsets,
+        SENTENCES_endOffsets,
+        null,
+        null,
+        null,
+        true,
+        toPayloads(SENTENCES_posTags_UD));
+  }
+
   public void testNoBreak() throws Exception {
     CustomAnalyzer analyzer =
         CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))