Skip to content

Add posTagFormat parameter for OpenNLPPOSFilter #14194

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
package org.apache.lucene.analysis.opennlp;

import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import opennlp.tools.postag.POSTagFormat;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
Expand Down Expand Up @@ -46,11 +48,17 @@ public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements Resou

/** Name of the factory argument that the constructor reads via {@code require(...)}. */
public static final String POS_TAGGER_MODEL = "posTaggerModel";

/**
 * Name of the factory argument selecting the POS tag format. The constructor reads it via
 * {@code get(...)} with the default {@code "CUSTOM"}.
 */
public static final String POS_TAG_FORMAT = "posTagFormat";

/** Value of the {@code posTaggerModel} argument, passed on when the tagger is obtained. */
private final String posTaggerModelFile;

/**
 * Tag format parsed by the constructor: the {@code posTagFormat} argument is upper-cased with
 * {@code Locale.ROOT} and resolved through {@code POSTagFormat.valueOf}.
 */
private final POSTagFormat posTaggerFormat;

public OpenNLPPOSFilterFactory(Map<String, String> args) {
super(args);
posTaggerModelFile = require(args, POS_TAGGER_MODEL);
String tagFormat = get(args, POS_TAG_FORMAT, "CUSTOM");
posTaggerFormat = POSTagFormat.valueOf(tagFormat.toUpperCase(Locale.ROOT));
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Expand All @@ -64,7 +72,8 @@ public OpenNLPPOSFilterFactory() {
@Override
public OpenNLPPOSFilter create(TokenStream in) {
try {
return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
return new OpenNLPPOSFilter(
in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile, posTaggerFormat));
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ public class NLPPOSTaggerOp {
private final POSTagger tagger;

/**
 * Creates a tagger op for {@code model} without an explicit tag format.
 *
 * <p>NOTE(review): this listing is a rendered diff whose +/- markers were lost. The hunk header
 * (-30,7 +30,11) accounts for exactly one removed line, which appears to be the direct
 * {@code POSTagFormat.PENN} assignment; the post-change body is presumably only the delegating
 * call with {@code POSTagFormat.CUSTOM}. Confirm against the actual file.
 */
public NLPPOSTaggerOp(POSModel model) {
// NOTE(review): appears to be the removed (pre-change) line of the diff.
tagger = new POSTaggerME(model, POSTagFormat.PENN);
// Delegates to the two-argument constructor with the CUSTOM tag format.
this(model, POSTagFormat.CUSTOM);
}

public NLPPOSTaggerOp(POSModel model, POSTagFormat format) {
tagger = new POSTaggerME(model, format);
}

public synchronized String[] getPOSTags(String[] words) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagFormat;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.lucene.util.ResourceLoader;
Expand Down Expand Up @@ -87,8 +88,13 @@ public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader
}

/**
 * Returns a POS tagger op for {@code modelName}, delegating to the two-argument overload with
 * {@link POSTagFormat#CUSTOM} as the tag format.
 *
 * @param modelName name of the POS model to look up
 * @return a new tagger op for that model
 * @throws IOException declared for callers; propagated from the delegate
 */
public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
  final POSTagFormat defaultFormat = POSTagFormat.CUSTOM;
  return getPOSTagger(modelName, defaultFormat);
}

/**
 * Looks up {@code modelName} in {@code posTaggerModels} and wraps the result in a new
 * {@link NLPPOSTaggerOp} that uses {@code posTagFormat}.
 *
 * <p>NOTE(review): this listing is a rendered diff whose +/- markers were lost. The hunk header
 * (-87,8 +88,13) accounts for exactly one removed line, which appears to be the one-argument
 * {@code new NLPPOSTaggerOp(model)} return. Confirm against the actual file.
 *
 * @param modelName key used for the {@code posTaggerModels} lookup
 * @param posTagFormat tag format passed to the {@link NLPPOSTaggerOp} constructor
 * @return a new tagger op for the looked-up model
 * @throws IOException declared by the signature
 */
public static NLPPOSTaggerOp getPOSTagger(String modelName, POSTagFormat posTagFormat)
throws IOException {
// presumably yields null when no model was registered under modelName — verify against loader
POSModel model = posTaggerModels.get(modelName);
// NOTE(review): appears to be the removed (pre-change) line of the diff.
return new NLPPOSTaggerOp(model);
return new NLPPOSTaggerOp(model, posTagFormat);
}

public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,15 @@ did VBD do
not RB not
come VB come
back RB back
they PROPN they
sent VERB send
him PRON he
running VERB run
in ADP in
the DET the
evening NOUN evening
he PRON he
did VERB do
not ADV not
come VERB come
back ADV back
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
private static final String[] SENTENCE_posTags = {
"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."
};
// Expected values passed to assertAnalyzesTo for SENTENCE when the opennlpPOS filter is
// configured with posTagFormat=UD (used by test1SentenceDictionaryOnlyUD); one entry per
// SENTENCE_posTags entry.
private static final String[] SENTENCE_posTags_UD = {
"PROPN", "VERB", "PRON", "VERB", "ADP", "DET", "NOUN", "PUNCT"
};

private static final String SENTENCES =
"They sent him running in the evening. He did not come back.";
Expand All @@ -51,6 +54,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
private static final String[] SENTENCES_posTags = {
"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."
};
// Expected values passed to assertAnalyzesTo for SENTENCES when the opennlpPOS filter is
// configured with posTagFormat=UD (used by test2SentencesDictionaryOnlyUD); one entry per
// SENTENCES_posTags entry.
private static final String[] SENTENCES_posTags_UD = {
"PROPN", "VERB", "PRON", "VERB", "ADP", "DET", "NOUN", "PUNCT", "PRON", "VERB", "ADV", "VERB",
"ADV", "PUNCT"
};

private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
private static final String[] SENTENCE_both_punc = {
Expand Down Expand Up @@ -89,6 +96,10 @@ public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase
"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP",
"VBD", "VBD", "RB", "VB", "RB", "."
};
// Expected values passed to assertAnalyzesTo for SENTENCES in
// testKeywordAttributeAwarenessDictionaryOnlyUD, where the chain adds KeywordRepeatFilterFactory
// and RemoveDuplicatesTokenFilterFactory and the opennlpPOS filter uses posTagFormat=UD.
private static final String[] SENTENCES_keep_orig_posTags_UD = {
"PROPN", "PROPN", "VERB", "VERB", "PRON", "PRON", "VERB", "VERB", "ADP", "DET", "NOUN", "PUNCT",
"PRON", "PRON", "VERB", "VERB", "ADV", "VERB", "ADV", "PUNCT"
};

private static final String[] SENTENCES_both_keep_orig_punc = {
"Konstantin",
Expand Down Expand Up @@ -133,6 +144,19 @@ public void test1SentenceDictionaryOnly() throws Exception {
analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags, null, null, true);
}

/**
 * Analyzes {@code SENTENCE} with the opennlp tokenizer, the opennlpPOS filter configured with
 * {@code posTagFormat=UD}, and the dictionary-based opennlplemmatizer filter, then checks the
 * result against {@code SENTENCE_dict_punc} and {@code SENTENCE_posTags_UD}.
 */
public void test1SentenceDictionaryOnlyUD() throws Exception {
  ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
  CustomAnalyzer.Builder builder = CustomAnalyzer.builder(loader);
  builder.withTokenizer(
      "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile);
  builder.addTokenFilter(
      "opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin", "posTagFormat", "UD");
  builder.addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict");
  CustomAnalyzer analyzer = builder.build();

  assertAnalyzesTo(
      analyzer, SENTENCE, SENTENCE_dict_punc, null, null, SENTENCE_posTags_UD, null, null, true);
}

public void test2SentencesDictionaryOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
Expand All @@ -145,6 +169,27 @@ public void test2SentencesDictionaryOnly() throws Exception {
analyzer, SENTENCES, SENTENCES_dict_punc, null, null, SENTENCES_posTags, null, null, true);
}

/**
 * Analyzes {@code SENTENCES} with the opennlp tokenizer, the opennlpPOS filter configured with
 * {@code posTagFormat=UD}, and the dictionary-based opennlplemmatizer filter, then checks the
 * result against {@code SENTENCES_dict_punc} and {@code SENTENCES_posTags_UD}.
 */
public void test2SentencesDictionaryOnlyUD() throws Exception {
  ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
  CustomAnalyzer.Builder builder = CustomAnalyzer.builder(loader);
  builder.withTokenizer(
      "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile);
  builder.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD");
  builder.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile);
  CustomAnalyzer analyzer = builder.build();

  assertAnalyzesTo(
      analyzer, SENTENCES, SENTENCES_dict_punc, null, null, SENTENCES_posTags_UD, null, null, true);
}

public void test1SentenceMaxEntOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
Expand Down Expand Up @@ -249,6 +294,29 @@ public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
true);
}

/**
 * Same chain as the dictionary-only UD test, with {@code KeywordRepeatFilterFactory} inserted
 * before the lemmatizer and {@code RemoveDuplicatesTokenFilterFactory} after it. Checks the
 * result for {@code SENTENCES} against {@code SENTENCES_dict_keep_orig_punc} and
 * {@code SENTENCES_keep_orig_posTags_UD}.
 */
public void testKeywordAttributeAwarenessDictionaryOnlyUD() throws Exception {
  ClasspathResourceLoader loader = new ClasspathResourceLoader(getClass());
  CustomAnalyzer.Builder builder = CustomAnalyzer.builder(loader);
  builder.withTokenizer(
      "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile);
  builder.addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD");
  builder.addTokenFilter(KeywordRepeatFilterFactory.class);
  builder.addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile);
  builder.addTokenFilter(RemoveDuplicatesTokenFilterFactory.class);
  CustomAnalyzer analyzer = builder.build();

  String[] expectedTerms = SENTENCES_dict_keep_orig_punc;
  String[] expectedTypes = SENTENCES_keep_orig_posTags_UD;
  assertAnalyzesTo(analyzer, SENTENCES, expectedTerms, null, null, expectedTypes, null, null, true);
}

public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."
};

// Expected values for SENTENCES when the opennlpPOS filter is configured with posTagFormat=UD.
// testPOSUD passes them to assertAnalyzesTo directly and, in its second analyzer, as payloads
// via toPayloads(...).
private static final String[] SENTENCES_posTags_UD = {
"NOUN", "NOUN", "NUM", "VERB", "NUM", "NOUN", "PUNCT", "NOUN", "NOUN", "NUM", "PUNCT", "NUM",
"NOUN", "PUNCT"
};

private static final String NO_BREAK = "No period";
private static final String[] NO_BREAK_terms = {"No", "period"};
private static final int[] NO_BREAK_startOffsets = {0, 3};
Expand Down Expand Up @@ -128,6 +133,46 @@ public void testPOS() throws Exception {
toPayloads(SENTENCES_posTags));
}

/**
 * Runs the opennlpPOS filter with {@code posTagFormat=UD} over {@code SENTENCES} in two
 * configurations: first checking {@code SENTENCES_posTags_UD} directly, then, with
 * {@code TypeAsPayloadTokenFilterFactory} appended, checking the same tags as payloads.
 */
public void testPOSUD() throws Exception {
  // First analyzer: tokenizer + UD POS filter only.
  CustomAnalyzer.Builder typeChain =
      CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()));
  typeChain.withTokenizer(
      "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile);
  typeChain.addTokenFilter(
      "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD");
  CustomAnalyzer typeAnalyzer = typeChain.build();
  assertAnalyzesTo(
      typeAnalyzer,
      SENTENCES,
      SENTENCES_punc,
      SENTENCES_startOffsets,
      SENTENCES_endOffsets,
      SENTENCES_posTags_UD,
      null,
      null,
      true);

  // Second analyzer: same chain plus the type-as-payload filter; tags are checked as payloads.
  CustomAnalyzer.Builder payloadChain =
      CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()));
  payloadChain.withTokenizer(
      "opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile);
  payloadChain.addTokenFilter(
      "opennlpPOS", "posTaggerModel", posTaggerModelFile, "posTagFormat", "UD");
  payloadChain.addTokenFilter(TypeAsPayloadTokenFilterFactory.class);
  CustomAnalyzer payloadAnalyzer = payloadChain.build();
  assertAnalyzesTo(
      payloadAnalyzer,
      SENTENCES,
      SENTENCES_punc,
      SENTENCES_startOffsets,
      SENTENCES_endOffsets,
      null,
      null,
      null,
      true,
      toPayloads(SENTENCES_posTags_UD));
}

public void testNoBreak() throws Exception {
CustomAnalyzer analyzer =
CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
Expand Down