forked from opensearch-project/OpenSearch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Factory to enable Lucene ConcatenateGraphFilter (opensearch-project#1278) (opensearch-project#2152)

Lucene has a ConcatenateGraphFilter that can concatenate tokens from a TokenStream to create a single token (or several tokens that have the same position if the input TokenStream is a graph). This change enables that ConcatenateGraphFilter by adding a Factory.

Signed-off-by: Mau Bach Quang <quangmaubach@gmail.com>
- Loading branch information
1 parent
8ae0db5
commit 0e95bb9
Showing
3 changed files
with
342 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
81 changes: 81 additions & 0 deletions
81
...mmon/src/main/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; | ||
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; | ||
import org.opensearch.LegacyESVersion; | ||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.env.Environment; | ||
import org.opensearch.index.IndexSettings; | ||
import org.opensearch.index.analysis.AbstractTokenFilterFactory; | ||
|
||
/** | ||
* Factory for {@link ConcatenateGraphFilter}. | ||
* Adopted from {@link org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilterFactory}, with some changes to | ||
* default values: token_separator is a "space", preserve_position_increments is false to avoid duplicated separators, | ||
* max_graph_expansions is 100 as the default value of 10_000 seems to be unnecessarily large and preserve_separator is false. | ||
* | ||
* <ul> | ||
* <li>preserve_separator: | ||
* For LegacyESVersion lesser than {@link LegacyESVersion#V_7_6_0} i.e. lucene versions lesser | ||
* than {@link org.apache.lucene.util.Version#LUCENE_8_4_0} | ||
* Whether {@link ConcatenateGraphFilter#SEP_LABEL} should separate the input tokens in the concatenated token. | ||
* </li> | ||
* <li>token_separator: | ||
* Separator to use for concatenation. Must be a String with a single character or empty. | ||
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_TOKEN_SEPARATOR} will be used. | ||
* If empty i.e. "", tokens will be concatenated without any separators. | ||
* </li> | ||
* <li>preserve_position_increments: | ||
* Whether to add an empty token for missing positions. | ||
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_PRESERVE_POSITION_INCREMENTS} will be used. | ||
* </li> | ||
* <li>max_graph_expansions: | ||
* If the tokenStream graph has more than this many possible paths through, then we'll throw | ||
* {@link TooComplexToDeterminizeException} to preserve the stability and memory of the | ||
* machine. | ||
* If not present, {@link ConcatenateGraphTokenFilterFactory#DEFAULT_MAX_GRAPH_EXPANSIONS} will be used. | ||
* </li> | ||
* </ul> | ||
* @see ConcatenateGraphFilter | ||
*/ | ||
public class ConcatenateGraphTokenFilterFactory extends AbstractTokenFilterFactory { | ||
public static final String DEFAULT_TOKEN_SEPARATOR = " "; | ||
public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = 100; | ||
public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = false; | ||
|
||
private final Character tokenSeparator; | ||
private final int maxGraphExpansions; | ||
private final boolean preservePositionIncrements; | ||
|
||
ConcatenateGraphTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { | ||
super(indexSettings, name, settings); | ||
|
||
if (indexSettings.getIndexVersionCreated().onOrAfter(LegacyESVersion.V_7_6_0)) { // i.e. Lucene 8.4.0 | ||
String separator = settings.get("token_separator", DEFAULT_TOKEN_SEPARATOR); | ||
if (separator.length() > 1) { | ||
throw new IllegalArgumentException("token_separator must be either empty or a single character"); | ||
} | ||
tokenSeparator = separator.length() == 0 ? null : separator.charAt(0); // null means no separator while concatenating | ||
} else { | ||
boolean preserveSep = settings.getAsBoolean("preserve_separator", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP); | ||
tokenSeparator = preserveSep ? ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR : null; | ||
} | ||
|
||
maxGraphExpansions = settings.getAsInt("max_graph_expansions", DEFAULT_MAX_GRAPH_EXPANSIONS); | ||
preservePositionIncrements = settings.getAsBoolean("preserve_position_increments", DEFAULT_PRESERVE_POSITION_INCREMENTS); | ||
} | ||
|
||
@Override | ||
public TokenStream create(TokenStream tokenStream) { | ||
return new ConcatenateGraphFilter(tokenStream, tokenSeparator, preservePositionIncrements, maxGraphExpansions); | ||
} | ||
} |
260 changes: 260 additions & 0 deletions
260
...src/test/java/org/opensearch/analysis/common/ConcatenateGraphTokenFilterFactoryTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,260 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.CannedTokenStream; | ||
import org.apache.lucene.analysis.Token; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.core.WhitespaceTokenizer; | ||
import org.apache.lucene.analysis.miscellaneous.ConcatenateGraphFilter; | ||
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; | ||
import org.opensearch.LegacyESVersion; | ||
import org.opensearch.cluster.metadata.IndexMetadata; | ||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.env.Environment; | ||
import org.opensearch.index.analysis.AnalysisTestsHelper; | ||
import org.opensearch.index.analysis.NamedAnalyzer; | ||
import org.opensearch.index.analysis.TokenFilterFactory; | ||
import org.opensearch.test.OpenSearchTestCase; | ||
import org.opensearch.test.OpenSearchTokenStreamTestCase; | ||
import org.opensearch.test.VersionUtils; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
|
||
public class ConcatenateGraphTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase { | ||
public void testSimpleTokenizerAndConcatenate() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("concatenate_graph"); | ||
String source = "PowerShot Is AweSome"; | ||
Tokenizer tokenizer = new WhitespaceTokenizer(); | ||
tokenizer.setReader(new StringReader(source)); | ||
|
||
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot Is AweSome" }); | ||
} | ||
|
||
public void testTokenizerCustomizedSeparator() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
String source = "PowerShot Is AweSome"; | ||
Tokenizer tokenizer = new WhitespaceTokenizer(); | ||
tokenizer.setReader(new StringReader(source)); | ||
|
||
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShot+Is+AweSome" }); | ||
} | ||
|
||
public void testOldLuceneVersionSeparator() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put( | ||
IndexMetadata.SETTING_VERSION_CREATED, | ||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2) | ||
) | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
String source = "PowerShot Is AweSome"; | ||
Tokenizer tokenizer = new WhitespaceTokenizer(); | ||
tokenizer.setReader(new StringReader(source)); | ||
|
||
// earlier Lucene version will only use Lucene's default separator | ||
assertTokenStreamContents( | ||
tokenFilter.create(tokenizer), | ||
new String[] { | ||
"PowerShot" | ||
+ ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR | ||
+ "Is" | ||
+ ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR | ||
+ "AweSome" } | ||
); | ||
} | ||
|
||
public void testOldLuceneVersionNoSeparator() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put( | ||
IndexMetadata.SETTING_VERSION_CREATED, | ||
VersionUtils.randomVersionBetween(random(), LegacyESVersion.V_7_0_0, LegacyESVersion.V_7_5_2) | ||
) | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") // this will be ignored | ||
.put("index.analysis.filter.my_concatenate_graph.preserve_separator", "false") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
String source = "PowerShot Is AweSome"; | ||
Tokenizer tokenizer = new WhitespaceTokenizer(); | ||
tokenizer.setReader(new StringReader(source)); | ||
|
||
// earlier Lucene version will not add separator if preserve_separator is false | ||
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" }); | ||
} | ||
|
||
public void testTokenizerEmptySeparator() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
String source = "PowerShot Is AweSome"; | ||
Tokenizer tokenizer = new WhitespaceTokenizer(); | ||
tokenizer.setReader(new StringReader(source)); | ||
|
||
assertTokenStreamContents(tokenFilter.create(tokenizer), new String[] { "PowerShotIsAweSome" }); | ||
} | ||
|
||
public void testPreservePositionIncrementsDefault() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
|
||
CannedTokenStream cannedTokenStream = new CannedTokenStream( | ||
new Token("a", 1, 0, 1), | ||
new Token("b", 2, 2, 3), // there is a gap, posInc is 2 | ||
new Token("d", 1, 4, 5) | ||
); | ||
|
||
// the gap between a and b is not preserved | ||
assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a+b+d" }); | ||
} | ||
|
||
public void testPreservePositionIncrementsTrue() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "+") | ||
.put("index.analysis.filter.my_concatenate_graph.preserve_position_increments", "true") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_concatenate_graph"); | ||
|
||
CannedTokenStream cannedTokenStream = new CannedTokenStream( | ||
new Token("a", 1, 0, 1), | ||
new Token("b", 2, 2, 3), // there is a gap, posInc is 2 | ||
new Token("d", 1, 4, 5) | ||
); | ||
|
||
// the gap between a and b is preserved | ||
assertTokenStreamContents(tokenFilter.create(cannedTokenStream), new String[] { "a++b+d" }); | ||
} | ||
|
||
public void testGraph() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph") | ||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.analyzer.my_analyzer.type", "custom") | ||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace") | ||
.put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
String source = "PowerShot Is AweSome"; | ||
|
||
// Expected output from Whitespace Tokenizer is: "PowerShot" --> "Is" --> "Awe" --> "Some" | ||
// Expected output from word_delimiter_graph is a graph: | ||
// <start> ---> "Power" --> "Shot" ---> "Is" ---> "Awe" ---> "Some" --- <end> | ||
// | | | | | ||
// --> "PowerShot" -------- --> "AweSome" --------- | ||
// and this filter will traverse through all possible paths to produce concatenated tokens | ||
String[] expected = new String[] { | ||
"Power Shot Is Awe Some", | ||
"Power Shot Is AweSome", | ||
"PowerShot Is Awe Some", | ||
"PowerShot Is AweSome" }; | ||
|
||
// all tokens will be in the same position | ||
int[] expectedPosIncrements = new int[] { 1, 0, 0, 0 }; | ||
int[] expectedPosLengths = new int[] { 1, 1, 1, 1 }; | ||
|
||
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_analyzer"); | ||
assertAnalyzesToPositions(analyzer, source, expected, expectedPosIncrements, expectedPosLengths); | ||
} | ||
|
||
public void testInvalidSeparator() { | ||
expectThrows( | ||
IllegalArgumentException.class, | ||
() -> AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.token_separator", "11") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
) | ||
); | ||
} | ||
|
||
/** | ||
* Similar to the {@link #testGraph()} case, there will be 4 paths generated by word_delimiter_graph. | ||
* By setting max_graph_expansions to 3, we expect an exception. | ||
*/ | ||
public void testMaxGraphExpansion() throws IOException { | ||
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( | ||
Settings.builder() | ||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) | ||
.put("index.analysis.filter.my_word_delimiter.type", "word_delimiter_graph") | ||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") | ||
.put("index.analysis.filter.my_concatenate_graph.type", "concatenate_graph") | ||
.put("index.analysis.filter.my_concatenate_graph.max_graph_expansions", "3") | ||
.put("index.analysis.analyzer.my_analyzer.type", "custom") | ||
.put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace") | ||
.put("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter, my_concatenate_graph") | ||
.build(), | ||
new CommonAnalysisPlugin() | ||
); | ||
|
||
String source = "PowerShot Is AweSome"; | ||
|
||
TokenStream tokenStream = analysis.indexAnalyzers.get("my_analyzer").tokenStream("dummy", source); | ||
|
||
tokenStream.reset(); | ||
|
||
expectThrows(TooComplexToDeterminizeException.class, tokenStream::incrementToken); | ||
} | ||
} |