Skip to content

Commit e569a17

Browse files
Mpdreamz authored and russcam committed
Add support for inlined user dictionary in the Kuromoji plugin (#4138)
Addresses elastic/elasticsearch#45489 (cherry picked from commit c458da9)
1 parent 96563a6 commit e569a17

File tree

2 files changed

+24
-3
lines changed

2 files changed

+24
-3
lines changed

src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
using System.Runtime.Serialization;
1+
using System.Collections.Generic;
2+
using System.Runtime.Serialization;
23
using Elasticsearch.Net.Utf8Json;
34

45
namespace Nest
@@ -44,6 +45,11 @@ public interface IKuromojiTokenizer : ITokenizer
4445
/// </summary>
4546
[DataMember(Name ="user_dictionary")]
4647
string UserDictionary { get; set; }
48+
49+
/// <summary> User dictionary rules specified inline, as an alternative to <see cref="UserDictionary"/>. </summary>
50+
[DataMember(Name ="user_dictionary_rules")]
51+
IEnumerable<string> UserDictionaryRules { get; set; }
52+
4753
}
4854

4955
/// <inheritdoc />
@@ -65,6 +71,9 @@ public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer
6571

6672
/// <inheritdoc />
6773
public string UserDictionary { get; set; }
74+
75+
/// <inheritdoc />
76+
public IEnumerable<string> UserDictionaryRules { get; set; }
6877
}
6978

7079
/// <inheritdoc />
@@ -78,6 +87,7 @@ public class KuromojiTokenizerDescriptor
7887
int? IKuromojiTokenizer.NBestCost { get; set; }
7988
string IKuromojiTokenizer.NBestExamples { get; set; }
8089
string IKuromojiTokenizer.UserDictionary { get; set; }
90+
IEnumerable<string> IKuromojiTokenizer.UserDictionaryRules { get; set; }
8191

8292
/// <inheritdoc />
8393
// Fluent setter for the tokenization mode: passes the value to Assign (declared outside this view) with a callback that stores it in the Mode property.
public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) => Assign(mode, (tokenizer, value) => tokenizer.Mode = value);
@@ -93,5 +103,11 @@ public class KuromojiTokenizerDescriptor
93103

94104
/// <inheritdoc />
95105
// Fluent setter for the n-best cost: passes the value to Assign (declared outside this view) with a callback that stores it in the NBestCost property.
public KuromojiTokenizerDescriptor NBestCost(int? cost) => Assign(cost, (tokenizer, value) => tokenizer.NBestCost = value);
106+
107+
/// <inheritdoc />
108+
// Fluent setter for the inline user dictionary rules (serialized as "user_dictionary_rules").
// The callback assigns its own value parameter instead of capturing the outer `rules`,
// matching the Mode/NBestCost setters and keeping the lambda non-capturing.
public KuromojiTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
109+
110+
/// <inheritdoc />
111+
// Convenience overload taking the inline user dictionary rules as a params array.
// The callback assigns its own value parameter instead of capturing the outer `rules`,
// matching the Mode/NBestCost setters and keeping the lambda non-capturing.
public KuromojiTokenizerDescriptor UserDictionaryRules(params string[] rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
96112
}
97113
}

src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,23 +114,27 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
114114
public override string Name => "icu";
115115
}
116116

117+
[SkipVersion("<7.4.0", "not all options available before this version")]
117118
public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
118119
{
119120
private const string Example = "/箱根山-箱根/成田空港-成田/";
121+
private const string Inline = "東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞";
120122

121123
public override FuncTokenizer Fluent => (n, t) => t.Kuromoji(n, e => e
122124
.Mode(KuromojiTokenizationMode.Extended)
123125
.DiscardPunctuation()
124126
.NBestExamples(Example)
125127
.NBestCost(1000)
128+
.UserDictionaryRules(Inline)
126129
);
127130

128131
public override ITokenizer Initializer => new KuromojiTokenizer
129132
{
130133
Mode = KuromojiTokenizationMode.Extended,
131134
DiscardPunctuation = true,
132135
NBestExamples = Example,
133-
NBestCost = 1000
136+
NBestCost = 1000,
137+
UserDictionaryRules = new [] { Inline }
134138
};
135139

136140
public override object Json => new
@@ -139,7 +143,8 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
139143
mode = "extended",
140144
nbest_cost = 1000,
141145
nbest_examples = Example,
142-
type = "kuromoji_tokenizer"
146+
type = "kuromoji_tokenizer",
147+
user_dictionary_rules = new [] { Inline }
143148
};
144149

145150
public override string Name => "kuro";

0 commit comments

Comments
 (0)