Skip to content

Commit

Permalink
[Backport] Fix error handling while reading analyzer mapping rules
Browse files Browse the repository at this point in the history
Add new parseWordList method that takes a parser as a parameter. It reads custom rules from settings or a file, parses and handles errors. Make error messages less verbose for rules files outside config directory.

Signed-off-by: Rabi Panda <adnapibar@gmail.com>
  • Loading branch information
adnapibar committed Nov 8, 2022
1 parent 29ca125 commit f70ae15
Show file tree
Hide file tree
Showing 26 changed files with 394 additions and 155 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,6 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
- Fix message "No OpenSearchException found" when detailed_error disabled by return meaningful messages ([#4708](https://github.com/opensearch-project/OpenSearch/pull/4708))
- Add fix for auto expand replica validation ([#4994](https://github.com/opensearch-project/OpenSearch/pull/4994))
- Fix build failures on the Windows platform ([#4924](https://github.com/opensearch-project/OpenSearch/issues/4924))

- Fix error handling while reading analyzer mapping rules ([6d20423](https://github.com/opensearch-project/OpenSearch/commit/6d20423f5920745463b1abc5f1daf6a786c41aa0))
### Security
- CVE-2022-25857 org.yaml:snakeyaml DOS vulnerability ([#4341](https://github.com/opensearch-project/OpenSearch/pull/4341))
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@

package org.opensearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.Analysis;
import org.xml.sax.InputSource;

import java.io.InputStream;
Expand All @@ -61,13 +63,15 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
}

Path hyphenationPatternsFile = env.configFile().resolve(hyphenationPatternsPath);
Path hyphenationPatternsFile = Analysis.resolveAnalyzerPath(env, hyphenationPatternsPath);

try {
InputStream in = Files.newInputStream(hyphenationPatternsFile);
hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
} catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
LogManager.getLogger(HyphenationCompoundWordTokenFilterFactory.class)
.error("Exception while reading hyphenation_patterns_path ", e);
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractCharFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.NormalizingCharFilterFactory;

import java.io.Reader;
Expand All @@ -53,13 +54,13 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory implemen
MappingCharFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name);

List<String> rules = Analysis.getWordList(env, settings, "mappings");
List<MappingRule<String, String>> rules = Analysis.parseWordList(env, settings, "mappings", this::parse);
if (rules == null) {
throw new IllegalArgumentException("mapping requires either `mappings` or `mappings_path` to be configured");
}

NormalizeCharMap.Builder normMapBuilder = new NormalizeCharMap.Builder();
parseRules(rules, normMapBuilder);
rules.forEach(rule -> normMapBuilder.add(rule.getLeft(), rule.getRight()));
normMap = normMapBuilder.build();
}

Expand All @@ -71,18 +72,13 @@ public Reader create(Reader tokenStream) {
// source => target
private static Pattern rulePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

/**
* parses a list of MappingCharFilter style rules into a normalize char map
*/
private void parseRules(List<String> rules, NormalizeCharMap.Builder map) {
for (String rule : rules) {
Matcher m = rulePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
String rhs = parseString(m.group(2).trim());
if (lhs == null || rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal mapping.");
map.add(lhs, rhs);
}
private MappingRule<String, String> parse(String rule) {
Matcher m = rulePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid mapping rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
String rhs = parseString(m.group(2).trim());
if (lhs == null || rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal mapping.");
return new MappingRule<>(lhs, rhs);
}

char[] out = new char[256];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,31 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class StemmerOverrideTokenFilterFactory extends AbstractTokenFilterFactory {

private static final String MAPPING_SEPARATOR = "=>";
private final StemmerOverrideMap overrideMap;

StemmerOverrideTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
super(indexSettings, name, settings);

List<String> rules = Analysis.getWordList(env, settings, "rules");
List<MappingRule<List<String>, String>> rules = Analysis.parseWordList(env, settings, "rules", this::parse);
if (rules == null) {
throw new IllegalArgumentException("stemmer override filter requires either `rules` or `rules_path` to be configured");
}

StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
parseRules(rules, builder, "=>");
for (MappingRule<List<String>, String> rule : rules) {
for (String key : rule.getLeft()) {
builder.add(key, rule.getRight());
}
}
overrideMap = builder.build();

}
Expand All @@ -67,27 +74,26 @@ public TokenStream create(TokenStream tokenStream) {
return new StemmerOverrideFilter(tokenStream, overrideMap);
}

static void parseRules(List<String> rules, StemmerOverrideFilter.Builder builder, String mappingSep) {
for (String rule : rules) {
String[] sides = rule.split(mappingSep, -1);
if (sides.length != 2) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
private MappingRule<List<String>, String> parse(String rule) {
String[] sides = rule.split(MAPPING_SEPARATOR, -1);
if (sides.length != 2) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}

String[] keys = sides[0].split(",", -1);
String override = sides[1].trim();
if (override.isEmpty() || override.indexOf(',') != -1) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
String[] keys = sides[0].split(",", -1);
String override = sides[1].trim();
if (override.isEmpty() || override.indexOf(',') != -1) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}

for (String key : keys) {
String trimmedKey = key.trim();
if (trimmedKey.isEmpty()) {
throw new RuntimeException("Invalid Keyword override Rule:" + rule);
}
builder.add(trimmedKey, override);
List<String> trimmedKeys = new ArrayList<>();
for (String key : keys) {
String trimmedKey = key.trim();
if (trimmedKey.isEmpty()) {
throw new RuntimeException("Invalid keyword override rule: " + rule);
}
trimmedKeys.add(trimmedKey);
}
return new MappingRule<>(trimmedKeys, override);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

package org.opensearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
Expand Down Expand Up @@ -155,14 +156,15 @@ SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
}
return parser.build();
} catch (Exception e) {
throw new IllegalArgumentException("failed to build synonyms", e);
LogManager.getLogger(SynonymTokenFilterFactory.class).error("Failed to build synonyms: ", e);
throw new IllegalArgumentException("Failed to build synonyms");
}
}

Reader getRulesFromSettings(Environment env) {
Reader rulesReader;
if (settings.getAsList("synonyms", null) != null) {
List<String> rulesList = Analysis.getWordList(env, settings, "synonyms");
List<String> rulesList = Analysis.parseWordList(env, settings, "synonyms", s -> s);
StringBuilder sb = new StringBuilder();
for (String line : rulesList) {
sb.append(line).append(System.lineSeparator());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.TokenFilterFactory;

import java.util.List;
Expand Down Expand Up @@ -73,7 +74,12 @@ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environ
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
env,
settings,
"type_table",
WordDelimiterTokenFilterFactory::parse
);
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.MappingRule;
import org.opensearch.index.analysis.TokenFilterFactory;

import java.util.Collection;
Expand Down Expand Up @@ -76,7 +77,12 @@ public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment
// . => DIGIT
// \u002C => DIGIT
// \u200D => ALPHANUM
List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
List<MappingRule<Character, Byte>> charTypeTableValues = Analysis.parseWordList(
env,
settings,
"type_table",
WordDelimiterTokenFilterFactory::parse
);
if (charTypeTableValues == null) {
this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
} else {
Expand Down Expand Up @@ -127,19 +133,23 @@ public int getFlag(int flag, Settings settings, String key, boolean defaultValue
// source => type
private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");

static MappingRule<Character, Byte> parse(String rule) {
Matcher m = typePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid mapping rule: [" + rule + "]");
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Only a single character is allowed.");
if (rhs == null) throw new RuntimeException("Invalid mapping rule: [" + rule + "]. Illegal type.");
return new MappingRule<>(lhs.charAt(0), rhs);
}

/**
* parses a list of MappingCharFilter style rules into a custom byte[] type table
*/
static byte[] parseTypes(Collection<String> rules) {
static byte[] parseTypes(Collection<MappingRule<Character, Byte>> rules) {
SortedMap<Character, Byte> typeMap = new TreeMap<>();
for (String rule : rules) {
Matcher m = typePattern.matcher(rule);
if (!m.find()) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]");
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
for (MappingRule<Character, Byte> rule : rules) {
typeMap.put(rule.getLeft(), rule.getRight());
}

// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,24 @@ public void testStemEnglishPossessive() throws IOException {
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}

private void createTokenFilterFactoryWithTypeTable(String[] rules) throws IOException {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.putList("index.analysis.filter.my_word_delimiter.type_table", rules)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin()
);
analysis.tokenFilter.get("my_word_delimiter");
}

public void testTypeTableParsingError() {
String[] rules = { "# This is a comment", "$ => DIGIT", "\\u200D => ALPHANUM", "abc => ALPHA" };
RuntimeException ex = expectThrows(RuntimeException.class, () -> createTokenFilterFactoryWithTypeTable(rules));
assertEquals("Line [4]: Invalid mapping rule: [abc => ALPHA]. Only a single character is allowed.", ex.getMessage());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.analysis.common;

import org.apache.lucene.analysis.CharFilter;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.analysis.AnalysisTestsHelper;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.test.OpenSearchTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

public class MappingCharFilterFactoryTests extends OpenSearchTestCase {
public static CharFilterFactory create(String... rules) throws IOException {
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
.put("index.analysis.analyzer.my_analyzer.char_filter", "my_mappings_char_filter")
.put("index.analysis.char_filter.my_mappings_char_filter.type", "mapping")
.putList("index.analysis.char_filter.my_mappings_char_filter.mappings", rules)
.build(),
new CommonAnalysisPlugin()
);

return analysis.charFilter.get("my_mappings_char_filter");
}

public void testRulesOk() throws IOException {
MappingCharFilterFactory mappingCharFilterFactory = (MappingCharFilterFactory) create(
"# This is a comment",
":) => _happy_",
":( => _sad_"
);
CharFilter inputReader = (CharFilter) mappingCharFilterFactory.create(new StringReader("I'm so :)"));
char[] tempBuff = new char[14];
StringBuilder output = new StringBuilder();
while (true) {
int length = inputReader.read(tempBuff);
if (length == -1) break;
output.append(tempBuff, 0, length);
}
assertEquals("I'm so _happy_", output.toString());
}

public void testRuleError() {
for (String rule : Arrays.asList(
"", // empty
"a", // no arrow
"a:>b" // invalid delimiter
)) {
RuntimeException ex = expectThrows(RuntimeException.class, () -> create(rule));
assertEquals("Line [1]: Invalid mapping rule : [" + rule + "]", ex.getMessage());
}
}

public void testRulePartError() {
RuntimeException ex = expectThrows(RuntimeException.class, () -> create("# This is a comment", ":) => _happy_", "a:b"));
assertEquals("Line [3]: Invalid mapping rule : [a:b]", ex.getMessage());
}
}
Loading

0 comments on commit f70ae15

Please sign in to comment.