Skip to content

Commit d02cb29

Browse files
committed
Add prebuilt ICU Analyzer (#3635)
Relates: #3615, elastic/elasticsearch#34958
1 parent 8ace22d commit d02cb29

17 files changed

+142
-21
lines changed

src/Nest/Analysis/Analyzers/AnalyzerFormatter.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ public IAnalyzer Deserialize(ref JsonReader reader, IJsonFormatterResolver forma
5656
return Deserialize<KuromojiAnalyzer>(ref segmentReader, formatterResolver);
5757
case "nori":
5858
return Deserialize<NoriAnalyzer>(ref segmentReader, formatterResolver);
59+
case "icu_analyzer":
60+
return Deserialize<IcuAnalyzer>(ref segmentReader, formatterResolver);
5961
default:
6062
if (tokenizerPresent)
6163
return Deserialize<CustomAnalyzer>(ref segmentReader, formatterResolver);
@@ -104,6 +106,9 @@ public void Serialize(ref JsonWriter writer, IAnalyzer value, IJsonFormatterReso
104106
case "nori":
105107
Serialize<INoriAnalyzer>(ref writer, value, formatterResolver);
106108
break;
109+
case "icu_analyzer":
110+
Serialize<IIcuAnalyzer>(ref writer, value, formatterResolver);
111+
break;
107112
case "custom":
108113
Serialize<ICustomAnalyzer>(ref writer, value, formatterResolver);
109114
break;

src/Nest/Analysis/Analyzers/Analyzers.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,9 @@ public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor
108108
/// <inheritdoc cref="INoriAnalyzer" />
109109
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector) =>
110110
Assign(name, selector?.Invoke(new NoriAnalyzerDescriptor()));
111+
112+
/// <inheritdoc cref="IIcuAnalyzer" />
113+
public AnalyzersDescriptor Icu(string name, Func<IcuAnalyzerDescriptor, IIcuAnalyzer> selector) =>
114+
Assign(name, selector?.Invoke(new IcuAnalyzerDescriptor()));
111115
}
112116
}

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationAlternate.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ namespace Nest
77
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable,
88
/// which boils down to ignoring punctuation and whitespace.
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
[StringEnum]
1114
public enum IcuCollationAlternate
1215
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationCaseFirst.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
namespace Nest
55
{
66
/// <summary>
7-
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
8-
/// Which boils down to ignoring punctuation and whitespace.
7+
/// Controls which case is sorted first when case is not ignored for
8+
/// strength tertiary. The default depends on the collation.
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
[StringEnum]
1114
public enum IcuCollationCaseFirst
1215
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationDecomposition.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ namespace Nest
1212
/// great many of the world’s languages do not require text normalization, most locales
1313
/// set no as the default decomposition mode.
1414
/// </summary>
15+
/// <remarks>
16+
/// Requires analysis-icu plugin to be installed
17+
/// </remarks>
1518
[StringEnum]
1619
public enum IcuCollationDecomposition
1720
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationStrength.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ namespace Nest
88
/// difference considered significant during comparison.
99
/// See also: http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html
1010
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1114
[StringEnum]
1215
public enum IcuCollationStrength
1316
{

src/Nest/Analysis/Plugins/Icu/IcuAnalyzer.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
using System.Runtime.Serialization;
2+
3+
namespace Nest
4+
{
5+
/// <summary>
6+
/// An ICU analyzer that performs basic normalization, tokenization and character folding,
7+
/// using the <see cref="IIcuNormalizationCharFilter" /> char filter,
8+
/// <see cref="IIcuTokenizer" /> and <see cref="IIcuNormalizationTokenFilter" /> token filter.
9+
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
13+
public interface IIcuAnalyzer : IAnalyzer
14+
{
15+
/// <summary>
16+
/// Normalization method. Default is <see cref="IcuNormalizationType.CompatibilityCaseFold" />
17+
/// </summary>
18+
[DataMember(Name = "method")]
19+
IcuNormalizationType? Method { get; set; }
20+
21+
/// <summary>
22+
/// Normalization mode. Default is <see cref="IcuNormalizationMode.Compose" />
23+
/// </summary>
24+
[DataMember(Name = "mode")]
25+
IcuNormalizationMode? Mode { get; set; }
26+
}
27+
28+
/// <inheritdoc cref="IIcuAnalyzer" />
29+
public class IcuAnalyzer : AnalyzerBase, IIcuAnalyzer
30+
{
31+
public IcuAnalyzer() : base("icu_analyzer") { }
32+
33+
/// <inheritdoc />
34+
public IcuNormalizationType? Method { get; set; }
35+
36+
/// <inheritdoc />
37+
public IcuNormalizationMode? Mode { get; set; }
38+
}
39+
40+
/// <inheritdoc cref="IIcuAnalyzer" />
41+
public class IcuAnalyzerDescriptor : AnalyzerDescriptorBase<IcuAnalyzerDescriptor, IIcuAnalyzer>, IIcuAnalyzer
42+
{
43+
protected override string Type => "icu_analyzer";
44+
45+
IcuNormalizationType? IIcuAnalyzer.Method { get; set; }
46+
IcuNormalizationMode? IIcuAnalyzer.Mode { get; set; }
47+
48+
/// <inheritdoc cref="IIcuAnalyzer.Method" />
49+
public IcuAnalyzerDescriptor Method(IcuNormalizationType? method) => Assign(method, (a, v) => a.Method = v);
50+
51+
/// <inheritdoc cref="IIcuAnalyzer.Mode" />
52+
public IcuAnalyzerDescriptor Mode(IcuNormalizationMode? mode) => Assign(mode, (a, v) => a.Mode = v);
53+
}
54+
}

src/Nest/Analysis/Plugins/Icu/IcuCollationTokenFilter.cs

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ namespace Nest
88
/// defaults to using the DUCET collation, which is a best-effort attempt at language-neutral sorting.
99
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
1010
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1114
public interface IIcuCollationTokenFilter : ITokenFilter
1215
{
1316
/// <summary>
@@ -79,7 +82,7 @@ public interface IIcuCollationTokenFilter : ITokenFilter
7982
string Variant { get; set; }
8083
}
8184

82-
/// <inheritdoc />
85+
/// <inheritdoc cref="IIcuCollationTokenFilter" />
8386
public class IcuCollationTokenFilter : TokenFilterBase, IIcuCollationTokenFilter
8487
{
8588
public IcuCollationTokenFilter() : base("icu_collation") { }
@@ -118,7 +121,7 @@ public IcuCollationTokenFilter() : base("icu_collation") { }
118121
public string Variant { get; set; }
119122
}
120123

121-
/// <inheritdoc />
124+
/// <inheritdoc cref="IIcuCollationTokenFilter" />
122125
public class IcuCollationTokenFilterDescriptor
123126
: TokenFilterDescriptorBase<IcuCollationTokenFilterDescriptor, IIcuCollationTokenFilter>, IIcuCollationTokenFilter
124127
{
@@ -136,38 +139,38 @@ public class IcuCollationTokenFilterDescriptor
136139
string IIcuCollationTokenFilter.VariableTop { get; set; }
137140
string IIcuCollationTokenFilter.Variant { get; set; }
138141

139-
/// <inheritdoc />
142+
/// <inheritdoc cref="IIcuCollationTokenFilter.Language" />
140143
public IcuCollationTokenFilterDescriptor Language(string language) => Assign(language, (a, v) => a.Language = v);
141144

142-
/// <inheritdoc />
145+
/// <inheritdoc cref="IIcuCollationTokenFilter.Country" />
143146
public IcuCollationTokenFilterDescriptor Country(string country) => Assign(country, (a, v) => a.Country = v);
144147

145-
/// <inheritdoc />
148+
/// <inheritdoc cref="IIcuCollationTokenFilter.Variant" />
146149
public IcuCollationTokenFilterDescriptor Variant(string variant) => Assign(variant, (a, v) => a.Variant = v);
147150

148-
/// <inheritdoc />
151+
/// <inheritdoc cref="IIcuCollationTokenFilter.Strength" />
149152
public IcuCollationTokenFilterDescriptor Strength(IcuCollationStrength? strength) => Assign(strength, (a, v) => a.Strength = v);
150153

151-
/// <inheritdoc />
154+
/// <inheritdoc cref="IIcuCollationTokenFilter.Decomposition" />
152155
public IcuCollationTokenFilterDescriptor Decomposition(IcuCollationDecomposition? decomposition) =>
153156
Assign(decomposition, (a, v) => a.Decomposition = v);
154157

155-
/// <inheritdoc />
158+
/// <inheritdoc cref="IIcuCollationTokenFilter.Alternate" />
156159
public IcuCollationTokenFilterDescriptor Alternate(IcuCollationAlternate? alternate) => Assign(alternate, (a, v) => a.Alternate = v);
157160

158-
/// <inheritdoc />
161+
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseFirst" />
159162
public IcuCollationTokenFilterDescriptor CaseFirst(IcuCollationCaseFirst? caseFirst) => Assign(caseFirst, (a, v) => a.CaseFirst = v);
160163

161-
/// <inheritdoc />
164+
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseLevel" />
162165
public IcuCollationTokenFilterDescriptor CaseLevel(bool? caseLevel = true) => Assign(caseLevel, (a, v) => a.CaseLevel = v);
163166

164-
/// <inheritdoc />
167+
/// <inheritdoc cref="IIcuCollationTokenFilter.Numeric" />
165168
public IcuCollationTokenFilterDescriptor Numeric(bool? numeric = true) => Assign(numeric, (a, v) => a.Numeric = v);
166169

167-
/// <inheritdoc />
170+
/// <inheritdoc cref="IIcuCollationTokenFilter.HiraganaQuaternaryMode" />
168171
public IcuCollationTokenFilterDescriptor HiraganaQuaternaryMode(bool? mode = true) => Assign(mode, (a, v) => a.HiraganaQuaternaryMode = v);
169172

170-
/// <inheritdoc />
173+
/// <inheritdoc cref="IIcuCollationTokenFilter.VariableTop" />
171174
public IcuCollationTokenFilterDescriptor VariableTop(string variableTop) => Assign(variableTop, (a, v) => a.VariableTop = v);
172175
}
173176
}

src/Nest/Analysis/Plugins/Icu/IcuFoldingTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids.
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuFoldingTokenFilter : ITokenFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuNormalizationCharFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuNormalizationCharFilter : ICharFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuNormalizationTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuNormalizationTokenFilter : ITokenFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuTokenizer.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ namespace Nest
77
/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
88
/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
99
/// text into syllables.
10-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
1110
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1214
public interface IIcuTokenizer : ITokenizer
1315
{
1416
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuTransformTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ namespace Nest
55
/// <summary>
66
/// Transforms are used to process Unicode text in many different ways, such as case mapping,
77
/// normalization, transliteration and bidirectional text handling.
8-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
98
/// </summary>
9+
/// <remarks>
10+
/// Requires analysis-icu plugin to be installed
11+
/// </remarks>
1012
public interface IIcuTransformTokenFilter : ITokenFilter
1113
{
1214
/// <summary>

src/Nest/Analysis/Plugins/Icu/Normalization/IcuNormalizationMode.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ namespace Nest
66
/// <summary>
77
/// Normalization mode https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
88
/// </summary>
9+
/// <remarks>
10+
/// Requires analysis-icu plugin to be installed
11+
/// </remarks>
912
[StringEnum]
1013
public enum IcuNormalizationMode
1114
{

src/Nest/Analysis/Plugins/Icu/Normalization/IcuNormalizationType.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ namespace Nest
66
/// <summary>
77
/// Normalization forms https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
88
/// </summary>
9+
/// <remarks>
10+
/// Requires analysis-icu plugin to be installed
11+
/// </remarks>
912
[StringEnum]
1013
public enum IcuNormalizationType
1114
{

src/Nest/Analysis/Plugins/Icu/Transform/IcuNormalizationType.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ namespace Nest
66
/// <summary>
77
/// Forward (default) for LTR and reverse for RTL
88
/// </summary>
9+
/// <remarks>
10+
/// Requires analysis-icu plugin to be installed
11+
/// </remarks>
912
[StringEnum]
1013
public enum IcuTransformDirection
1114
{

src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ public class SimpleTests : AnalyzerAssertionBase<SimpleTests>
6969
public override string Name => "mySimple";
7070
}
7171

72-
public class LanguageTests : AnalyzerAssertionBase<SimpleTests>
72+
public class LanguageTests : AnalyzerAssertionBase<LanguageTests>
7373
{
7474
public override FuncTokenizer Fluent => (n, an) => an
7575
.Language("myLanguage", a => a.Language(Language.Dutch));
@@ -216,5 +216,29 @@ public class NoriTests : AnalyzerAssertionBase<NoriTests>
216216

217217
public override string Name => "nori";
218218
}
219+
220+
[SkipVersion("<6.6.0", "introduced in 6.6.0")]
221+
public class IcuTests : AnalyzerAssertionBase<IcuTests>
222+
{
223+
public override FuncTokenizer Fluent => (n, t) => t.Icu(n, e => e
224+
.Method(IcuNormalizationType.Canonical)
225+
.Mode(IcuNormalizationMode.Decompose)
226+
);
227+
228+
public override IAnalyzer Initializer => new IcuAnalyzer
229+
{
230+
Method = IcuNormalizationType.Canonical,
231+
Mode = IcuNormalizationMode.Decompose
232+
};
233+
234+
public override object Json => new
235+
{
236+
type = "icu_analyzer",
237+
method = "nfc",
238+
mode = "decompose"
239+
};
240+
241+
public override string Name => "icu_analyzer";
242+
}
219243
}
220244
}

0 commit comments

Comments
 (0)