Add support for inlined user dictionary in the Kuromoji plugin #45489

Merged 3 commits on Aug 20, 2019

docs/plugins/analysis-kuromoji.asciidoc (33 additions, 0 deletions)
@@ -98,6 +98,39 @@ dictionary to `$ES_HOME/config/userdict_ja.txt`:
東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
-----------------------

--

You can also inline the rules directly in the tokenizer definition using
the `user_dictionary_rules` option:

[source,js]
--------------------------------------------------
PUT kuromoji_sample
{
"settings": {
"index": {
"analysis": {
"tokenizer": {
"kuromoji_user_dict": {
"type": "kuromoji_tokenizer",
"mode": "extended",
"user_dictionary_rules": ["東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"]
}
},
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "kuromoji_user_dict"
}
}
}
}
}
}
--------------------------------------------------
// CONSOLE
--

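For context, rules passed via `user_dictionary_rules` end up in Lucene's Kuromoji user dictionary, just as the file-based option does. The standalone sketch below (illustrative only, not part of this change; the class name is made up) feeds the same CSV rule to Lucene's `UserDictionary` and `JapaneseTokenizer` directly:

[source,java]
--------------------------------------------------
import java.io.StringReader;

import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class InlineUserDictionarySketch {
    public static void main(String[] args) throws Exception {
        // One rule per line, same CSV format as userdict_ja.txt above.
        String rules = "東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞\n";
        UserDictionary userDict = UserDictionary.open(new StringReader(rules));

        // "search" is the default mode of the kuromoji_tokenizer.
        try (JapaneseTokenizer tokenizer =
                 new JapaneseTokenizer(userDict, false, JapaneseTokenizer.Mode.SEARCH)) {
            tokenizer.setReader(new StringReader("東京スカイツリー"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                System.out.println(term.toString()); // 東京, then スカイツリー
            }
            tokenizer.end();
        }
    }
}
--------------------------------------------------

The custom rule makes the tokenizer split the compound 東京スカイツリー into 東京 and スカイツリー, which is also what the `kuromoji_user_dict` tokenizer defined above produces.
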
`nbest_cost`/`nbest_examples`::
+
--

KuromojiTokenizerFactory.java
@@ -23,17 +23,22 @@
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {

private static final String USER_DICT_OPTION = "user_dictionary";
private static final String USER_DICT_PATH_OPTION = "user_dictionary";
private static final String USER_DICT_RULES_OPTION = "user_dictionary_rules";
private static final String NBEST_COST = "nbest_cost";
private static final String NBEST_EXAMPLES = "nbest_examples";

@@ -54,17 +59,33 @@ public KuromojiTokenizerFactory(IndexSettings indexSettings, Environment env, St
}

public static UserDictionary getUserDictionary(Environment env, Settings settings) {
if (settings.get(USER_DICT_PATH_OPTION) != null && settings.get(USER_DICT_RULES_OPTION) != null) {
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
" with [" + USER_DICT_RULES_OPTION + "]");
}
try {
final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
if (reader == null) {
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, false);
if (ruleList == null || ruleList.isEmpty()) {
return null;
} else {
try {
return UserDictionary.open(reader);
} finally {
reader.close();
}
Set<String> dup = new HashSet<>();
int lineNum = 0;
for (String line : ruleList) {
// ignore comments
if (line.startsWith("#") == false) {
String[] values = CSVUtil.parse(line);
if (dup.add(values[0]) == false) {
throw new IllegalArgumentException("Found duplicate term [" + values[0] + "] in user dictionary " +
"at line [" + lineNum + "]");
}
}
++ lineNum;
}
StringBuilder sb = new StringBuilder();
for (String line : ruleList) {
sb.append(line).append(System.lineSeparator());
}
return UserDictionary.open(new StringReader(sb.toString()));
} catch (IOException e) {
throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
}

KuromojiAnalysisTests.java
@@ -19,6 +19,7 @@

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
@@ -39,6 +40,8 @@
import java.nio.file.Files;
import java.nio.file.Path;

import static org.apache.lucene.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.instanceOf;
@@ -307,4 +310,55 @@ public void testNumberFilterFactory() throws Exception {
tokenizer.setReader(new StringReader(source));
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}

public void testKuromojiAnalyzerUserDict() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
.build();
TestAnalysis analysis = createTestAnalysis(settings);
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
try (TokenStream stream = analyzer.tokenStream("", "制限スピード")) {
assertTokenStreamContents(stream, new String[]{"制限スピード"});
}

try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
assertTokenStreamContents(stream, new String[]{"c++", "world"});
}
}

public void testKuromojiAnalyzerInvalidUserDictOption() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules", "c++,c++,w,w")
.build();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("It is not allowed to use [user_dictionary] in conjunction " +
"with [user_dictionary_rules]"));
}

public void testKuromojiAnalyzerDuplicateUserDictRule() throws Exception {
Settings settings = Settings.builder()
.put("index.analysis.analyzer.my_analyzer.type", "kuromoji")
.putList("index.analysis.analyzer.my_analyzer.user_dictionary_rules",
"c++,c++,w,w", "#comment", "制限スピード,制限スピード,セイゲンスピード,テスト名詞", "制限スピード,制限スピード,セイゲンスピード,テスト名詞")
.build();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> createTestAnalysis(settings));
assertThat(exc.getMessage(), containsString("[制限スピード] in user dictionary at line [3]"));
}

private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Path home = createTempDir();
Path config = home.resolve("config");
Files.createDirectory(config);
Files.copy(dict, config.resolve("user_dict.txt"));
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), home)
.put(analysisSettings)
.build();
return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisKuromojiPlugin());
}
}

NoriTokenizerFactory.java
@@ -51,7 +51,7 @@ public static UserDictionary getUserDictionary(Environment env, Settings setting
throw new IllegalArgumentException("It is not allowed to use [" + USER_DICT_PATH_OPTION + "] in conjunction" +
" with [" + USER_DICT_RULES_OPTION + "]");
}
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION);
List<String> ruleList = Analysis.getWordList(env, settings, USER_DICT_PATH_OPTION, USER_DICT_RULES_OPTION, true);
StringBuilder sb = new StringBuilder();
if (ruleList == null || ruleList.isEmpty()) {
return null;

Analysis.java
@@ -215,7 +215,7 @@ public static CharArraySet getWordSet(Environment env, Settings settings, String
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPrefix) {
return getWordList(env, settings, settingPrefix + "_path", settingPrefix);
return getWordList(env, settings, settingPrefix + "_path", settingPrefix, true);
}

/**
@@ -225,7 +225,8 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
* @throws IllegalArgumentException
* If the word list cannot be found at either key.
*/
public static List<String> getWordList(Environment env, Settings settings, String settingPath, String settingList) {
public static List<String> getWordList(Environment env, Settings settings,
String settingPath, String settingList, boolean removeComments) {
String wordListPath = settings.get(settingPath, null);

if (wordListPath == null) {
@@ -240,7 +241,7 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
final Path path = env.configFile().resolve(wordListPath);

try {
return loadWordList(path, "#");
return loadWordList(path, removeComments);
} catch (CharacterCodingException ex) {
String message = String.format(Locale.ROOT,
"Unsupported character encoding detected while reading %s: %s - files must be UTF-8 encoded",
@@ -252,15 +253,15 @@ public static List<String> getWordList(Environment env, Settings settings, Strin
}
}

private static List<String> loadWordList(Path path, String comment) throws IOException {
private static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
final List<String> result = new ArrayList<>();
try (BufferedReader br = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
String word;
while ((word = br.readLine()) != null) {
if (!Strings.hasText(word)) {
if (Strings.hasText(word) == false) {
continue;
}
if (!word.startsWith(comment)) {
if (removeComments == false || word.startsWith("#") == false) {
result.add(word.trim());
}
}
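
To make the new `removeComments` flag concrete: with `true` (the prefix-based overload and the Nori caller) `#`-prefixed lines are dropped while the list is loaded, while with `false` (the Kuromoji caller above) they are kept so the caller can handle them itself — the Kuromoji factory skips them during its duplicate check and Lucene's `UserDictionary` strips them when parsing. The sketch below mirrors that behaviour in isolation (illustrative only, not the real `Analysis` class; names are made up):

[source,java]
--------------------------------------------------
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class WordListCommentSketch {

    // Local stand-in for the (private) loadWordList shown above.
    static List<String> loadWordList(Path path, boolean removeComments) throws IOException {
        List<String> result = new ArrayList<>();
        for (String word : Files.readAllLines(path, StandardCharsets.UTF_8)) {
            if (word.trim().isEmpty()) {
                continue; // blank lines are always skipped
            }
            if (removeComments == false || word.startsWith("#") == false) {
                result.add(word.trim());
            }
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        Path dict = Files.createTempFile("userdict", ".txt");
        Files.write(dict, Arrays.asList(
            "# custom nouns",
            "東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞"), StandardCharsets.UTF_8);

        System.out.println(loadWordList(dict, true));  // comment line removed
        System.out.println(loadWordList(dict, false)); // comment line kept for the caller
    }
}
--------------------------------------------------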