Skip to content

Commit 8406956

Browse files
Add discard_punctuation to nori tokenizer (#4591) (#4619)
Add discard_punctuation to the nori tokenizer. Co-authored-by: Stuart Cam <stuart.cam@elastic.co>
1 parent 808969b commit 8406956

File tree

3 files changed

+30
-0
lines changed

3 files changed

+30
-0
lines changed

src/Nest/Analysis/Analyzers/NoriAnalyzer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ namespace Nest
88
/// <para> - nori_tokenizer</para>
99
/// <para> - nori_part_of_speech token filter</para>
1010
/// <para> - nori_readingform token filter</para>
11+
/// <para> - nori_number token filter</para>
1112
/// <para> - lowercase token filter</para>
1213
/// </summary>
1314
public interface INoriAnalyzer : IAnalyzer

src/Nest/Analysis/Tokenizers/NoriTokenizer.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ public interface INoriTokenizer : ITokenizer
3030
[DataMember(Name = "decompound_mode")]
3131
NoriDecompoundMode? DecompoundMode { get; set; }
3232

33+
/// <summary>
34+
/// Whether punctuation should be discarded from the output. Defaults to `true`.
35+
/// </summary>
36+
[DataMember(Name = "discard_punctuation")]
37+
bool? DiscardPunctuation { get; set; }
38+
3339
/// <summary>
3440
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be
3541
/// appended to
@@ -57,6 +63,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer
5763
/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
5864
public NoriDecompoundMode? DecompoundMode { get; set; }
5965

66+
/// <inheritdoc cref="INoriTokenizer.DiscardPunctuation" />
67+
public bool? DiscardPunctuation { get; set; }
68+
6069
/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
6170
public string UserDictionary { get; set; }
6271

@@ -73,6 +82,7 @@ public class NoriTokenizerDescriptor
7382
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
7483
string INoriTokenizer.UserDictionary { get; set; }
7584
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
85+
bool? INoriTokenizer.DiscardPunctuation { get; set; }
7686

7787
/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
7888
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(mode, (a, v) => a.DecompoundMode = v);
@@ -85,5 +95,8 @@ public class NoriTokenizerDescriptor
8595

8696
/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
8797
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
98+
99+
/// <inheritdoc cref="INoriTokenizer.DiscardPunctuation" />
100+
public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(discard, (a, v) => a.DiscardPunctuation = v);
88101
}
89102
}

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,5 +332,21 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
332332

333333
public override string Name => "char_group";
334334
}
335+
336+
[SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
337+
public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
338+
{
339+
public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
340+
.DiscardPunctuation()
341+
);
342+
343+
public override ITokenizer Initializer => new NoriTokenizer
344+
{
345+
DiscardPunctuation = true
346+
};
347+
348+
public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
349+
public override string Name => "nori";
350+
}
335351
}
336352
}

0 commit comments

Comments
 (0)