Skip to content

Commit f0b326d

Browse files
committed
Add discard_compound_token to kuromoji tokenizer
Relates: elastic/elasticsearch#57421
1 parent e1ef3cd commit f0b326d

File tree

2 files changed

+50
-8
lines changed

2 files changed

+50
-8
lines changed

src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@ public interface IKuromojiTokenizer : ITokenizer
2121
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
2222
bool? DiscardPunctuation { get; set; }
2323

24+
/// <summary>
25+
/// Whether original compound tokens should be discarded from the output with
26+
/// <see cref="KuromojiTokenizationMode.Search"/> <see cref="Mode"/>. Defaults to `false`.
27+
/// </summary>
28+
[DataMember(Name ="discard_compound_token")]
29+
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
30+
bool? DiscardCompoundToken { get; set; }
31+
2432
/// <summary>
2533
/// The tokenization mode determines how the tokenizer handles compound and unknown words.
2634
/// </summary>
@@ -64,6 +72,9 @@ public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer
6472
/// <inheritdoc />
6573
public bool? DiscardPunctuation { get; set; }
6674

75+
/// <inheritdoc />
76+
public bool? DiscardCompoundToken { get; set; }
77+
6778
/// <inheritdoc />
6879
public KuromojiTokenizationMode? Mode { get; set; }
6980

@@ -86,32 +97,35 @@ public class KuromojiTokenizerDescriptor
8697
{
8798
protected override string Type => "kuromoji_tokenizer";
8899
bool? IKuromojiTokenizer.DiscardPunctuation { get; set; }
89-
100+
bool? IKuromojiTokenizer.DiscardCompoundToken { get; set; }
90101
KuromojiTokenizationMode? IKuromojiTokenizer.Mode { get; set; }
91102
int? IKuromojiTokenizer.NBestCost { get; set; }
92103
string IKuromojiTokenizer.NBestExamples { get; set; }
93104
string IKuromojiTokenizer.UserDictionary { get; set; }
94105
IEnumerable<string> IKuromojiTokenizer.UserDictionaryRules { get; set; }
95106

96-
/// <inheritdoc />
107+
/// <inheritdoc cref="IKuromojiTokenizer.Mode" />
97108
public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) => Assign(mode, (a, v) => a.Mode = v);
98109

99-
/// <inheritdoc />
110+
/// <inheritdoc cref="IKuromojiTokenizer.DiscardPunctuation" />
100111
public KuromojiTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(discard, (a, v) => a.DiscardPunctuation = v);
101112

102-
/// <inheritdoc />
113+
/// <inheritdoc cref="IKuromojiTokenizer.DiscardCompoundToken" />
114+
public KuromojiTokenizerDescriptor DiscardCompoundToken(bool? discard = true) => Assign(discard, (a, v) => a.DiscardCompoundToken = v);
115+
116+
/// <inheritdoc cref="IKuromojiTokenizer.UserDictionary" />
103117
public KuromojiTokenizerDescriptor UserDictionary(string userDictionary) => Assign(userDictionary, (a, v) => a.UserDictionary = v);
104118

105-
/// <inheritdoc />
119+
/// <inheritdoc cref="IKuromojiTokenizer.NBestExamples" />
106120
public KuromojiTokenizerDescriptor NBestExamples(string examples) => Assign(examples, (a, v) => a.NBestExamples = v);
107121

108-
/// <inheritdoc />
122+
/// <inheritdoc cref="IKuromojiTokenizer.NBestCost" />
109123
public KuromojiTokenizerDescriptor NBestCost(int? cost) => Assign(cost, (a, v) => a.NBestCost = v);
110124

111-
/// <inheritdoc />
125+
/// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
112126
public KuromojiTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
113127

114-
/// <inheritdoc />
128+
/// <inheritdoc cref="IKuromojiTokenizer.UserDictionaryRules" />
115129
public KuromojiTokenizerDescriptor UserDictionaryRules(params string[] rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
116130
}
117131
}

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,34 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
214214
public override string Name => "kuro";
215215
}
216216

217+
/// <summary>
/// Tokenizer assertion for a kuromoji tokenizer configured with
/// <see cref="KuromojiTokenizationMode.Search" /> mode and <c>discard_compound_token</c> enabled.
/// Declares the same tokenizer through the fluent descriptor and the object initializer,
/// alongside the JSON it is expected to correspond to.
/// </summary>
[SkipVersion("<7.9.0", "discard_compound_token introduced in 7.9.0")]
public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<KuromojiDiscardCompoundTokenTests>
{
	public override FuncTokenizer Fluent => (n, t) => t
		.Kuromoji(n, e => e
			.Mode(KuromojiTokenizationMode.Search)
			.DiscardCompoundToken()
		);

	public override ITokenizer Initializer => new KuromojiTokenizer
	{
		Mode = KuromojiTokenizationMode.Search,
		DiscardCompoundToken = true,
	};

	public override object Json => new
	{
		discard_compound_token = true,
		mode = "search",
		type = "kuromoji_tokenizer",
	};

	public override string Name => "kuro_discard_compound_token";
}
244+
217245
public class UaxTests : TokenizerAssertionBase<UaxTests>
218246
{
219247
public override FuncTokenizer Fluent => (n, t) => t.UaxEmailUrl(n, e => e

0 commit comments

Comments
 (0)