Skip to content

Commit 68dd822

Browse files
committed
Add prebuilt ICU Analyzer
Relates: #3615, elastic/elasticsearch#34958
1 parent 605fcd0 commit 68dd822

17 files changed

+139
-22
lines changed

src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
3131
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
3232
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
3333
case "nori": return o.ToObject<NoriAnalyzer>(ElasticContractResolver.Empty);
34+
case "icu_analyzer": return o.ToObject<IcuAnalyzer>(ElasticContractResolver.Empty);
3435
default:
3536
if (o.Property("tokenizer") != null)
3637
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);

src/Nest/Analysis/Analyzers/Analyzers.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,9 @@ public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor
108108
/// <inheritdoc cref="INoriAnalyzer" />
109109
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector) =>
110110
Assign(name, selector?.Invoke(new NoriAnalyzerDescriptor()));
111+
112+
/// <inheritdoc cref="IIcuAnalyzer" />
113+
public AnalyzersDescriptor Icu(string name, Func<IcuAnalyzerDescriptor, IIcuAnalyzer> selector) =>
114+
Assign(name, selector?.Invoke(new IcuAnalyzerDescriptor()));
111115
}
112116
}

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationAlternate.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ namespace Nest
88
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
99
/// This boils down to ignoring punctuation and whitespace.
1010
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1114
[JsonConverter(typeof(StringEnumConverter))]
1215
public enum IcuCollationAlternate
1316
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationCaseFirst.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
namespace Nest
66
{
77
/// <summary>
8-
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
9-
/// Which boils down to ignoring punctuation and whitespace.
8+
/// Controls which case is sorted first when case is not ignored for
9+
/// strength tertiary. The default depends on the collation.
1010
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1114
[JsonConverter(typeof(StringEnumConverter))]
1215
public enum IcuCollationCaseFirst
1316
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationDecomposition.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ namespace Nest
1313
/// great many of the world’s languages do not require text normalization, most locales
1414
/// set no as the default decomposition mode.
1515
/// </summary>
16+
/// <remarks>
17+
/// Requires analysis-icu plugin to be installed
18+
/// </remarks>
1619
[JsonConverter(typeof(StringEnumConverter))]
1720
public enum IcuCollationDecomposition
1821
{

src/Nest/Analysis/Plugins/Icu/Collation/IcuCollationStrength.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ namespace Nest
99
/// difference considered significant during comparison.
1010
/// See also: http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html
1111
/// </summary>
12+
/// <remarks>
13+
/// Requires analysis-icu plugin to be installed
14+
/// </remarks>
1215
[JsonConverter(typeof(StringEnumConverter))]
1316
public enum IcuCollationStrength
1417
{
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
using Newtonsoft.Json;
2+
3+
namespace Nest
4+
{
5+
/// <summary>
6+
/// An ICU analyzer that performs basic normalization, tokenization and character folding,
7+
/// using the <see cref="IIcuNormalizationCharFilter" /> char filter,
8+
/// <see cref="IIcuTokenizer" /> and <see cref="IIcuNormalizationTokenFilter" /> token filter.
9+
/// </summary>
10+
/// <remarks>
11+
/// Requires Elasticsearch 6.6.0+ with the analysis-icu plugin installed
12+
/// </remarks>
13+
public interface IIcuAnalyzer : IAnalyzer
14+
{
15+
/// <summary>
16+
/// Normalization method. Default is <see cref="IcuNormalizationType.CompatibilityCaseFold" />
17+
/// </summary>
18+
[JsonProperty("method")]
19+
IcuNormalizationType? Method { get; set; }
20+
21+
/// <summary>
22+
/// Normalization mode. Default is <see cref="IcuNormalizationMode.Compose" />
23+
/// </summary>
24+
[JsonProperty("mode")]
25+
IcuNormalizationMode? Mode { get; set; }
26+
}
27+
28+
/// <inheritdoc cref="IIcuAnalyzer" />
29+
public class IcuAnalyzer : AnalyzerBase, IIcuAnalyzer
30+
{
31+
public IcuAnalyzer() : base("icu_analyzer") { }
32+
33+
/// <inheritdoc />
34+
public IcuNormalizationType? Method { get; set; }
35+
36+
/// <inheritdoc />
37+
public IcuNormalizationMode? Mode { get; set; }
38+
}
39+
40+
/// <inheritdoc cref="IIcuAnalyzer" />
41+
public class IcuAnalyzerDescriptor : AnalyzerDescriptorBase<IcuAnalyzerDescriptor, IIcuAnalyzer>, IIcuAnalyzer
42+
{
43+
protected override string Type => "icu_analyzer";
44+
45+
IcuNormalizationType? IIcuAnalyzer.Method { get; set; }
46+
IcuNormalizationMode? IIcuAnalyzer.Mode { get; set; }
47+
48+
/// <inheritdoc cref="IIcuAnalyzer.Method"/>
49+
public IcuAnalyzerDescriptor Method(IcuNormalizationType? method) => Assign(a => a.Method = method);
50+
51+
/// <inheritdoc cref="IIcuAnalyzer.Mode"/>
52+
public IcuAnalyzerDescriptor Mode(IcuNormalizationMode? mode) => Assign(a => a.Mode = mode);
53+
}
54+
}

src/Nest/Analysis/Plugins/Icu/IcuCollationTokenFilter.cs

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ namespace Nest
77
/// defaults to using the DUCET collation, which is a best-effort attempt at language-neutral sorting.
88
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
public interface IIcuCollationTokenFilter : ITokenFilter
1114
{
1215
/// <summary>
@@ -75,7 +78,7 @@ public interface IIcuCollationTokenFilter : ITokenFilter
7578
string Variant { get; set; }
7679
}
7780

78-
/// <inheritdoc />
81+
/// <inheritdoc cref="IIcuCollationTokenFilter" />
7982
public class IcuCollationTokenFilter : TokenFilterBase, IIcuCollationTokenFilter
8083
{
8184
public IcuCollationTokenFilter() : base("icu_collation") { }
@@ -114,7 +117,7 @@ public IcuCollationTokenFilter() : base("icu_collation") { }
114117
public string Variant { get; set; }
115118
}
116119

117-
/// <inheritdoc />
120+
/// <inheritdoc cref="IIcuCollationTokenFilter" />
118121
public class IcuCollationTokenFilterDescriptor
119122
: TokenFilterDescriptorBase<IcuCollationTokenFilterDescriptor, IIcuCollationTokenFilter>, IIcuCollationTokenFilter
120123
{
@@ -132,38 +135,38 @@ public class IcuCollationTokenFilterDescriptor
132135
string IIcuCollationTokenFilter.VariableTop { get; set; }
133136
string IIcuCollationTokenFilter.Variant { get; set; }
134137

135-
/// <inheritdoc />
138+
/// <inheritdoc cref="IIcuCollationTokenFilter.Language" />
136139
public IcuCollationTokenFilterDescriptor Language(string language) => Assign(a => a.Language = language);
137140

138-
/// <inheritdoc />
141+
/// <inheritdoc cref="IIcuCollationTokenFilter.Country" />
139142
public IcuCollationTokenFilterDescriptor Country(string country) => Assign(a => a.Country = country);
140143

141-
/// <inheritdoc />
144+
/// <inheritdoc cref="IIcuCollationTokenFilter.Variant" />
142145
public IcuCollationTokenFilterDescriptor Variant(string variant) => Assign(a => a.Variant = variant);
143146

144-
/// <inheritdoc />
147+
/// <inheritdoc cref="IIcuCollationTokenFilter.Strength" />
145148
public IcuCollationTokenFilterDescriptor Strength(IcuCollationStrength? strength) => Assign(a => a.Strength = strength);
146149

147-
/// <inheritdoc />
150+
/// <inheritdoc cref="IIcuCollationTokenFilter.Decomposition" />
148151
public IcuCollationTokenFilterDescriptor Decomposition(IcuCollationDecomposition? decomposition) =>
149152
Assign(a => a.Decomposition = decomposition);
150153

151-
/// <inheritdoc />
154+
/// <inheritdoc cref="IIcuCollationTokenFilter.Alternate" />
152155
public IcuCollationTokenFilterDescriptor Alternate(IcuCollationAlternate? alternate) => Assign(a => a.Alternate = alternate);
153156

154-
/// <inheritdoc />
157+
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseFirst" />
155158
public IcuCollationTokenFilterDescriptor CaseFirst(IcuCollationCaseFirst? caseFirst) => Assign(a => a.CaseFirst = caseFirst);
156159

157-
/// <inheritdoc />
160+
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseLevel" />
158161
public IcuCollationTokenFilterDescriptor CaseLevel(bool? caseLevel = true) => Assign(a => a.CaseLevel = caseLevel);
159162

160-
/// <inheritdoc />
163+
/// <inheritdoc cref="IIcuCollationTokenFilter.Numeric" />
161164
public IcuCollationTokenFilterDescriptor Numeric(bool? numeric = true) => Assign(a => a.Numeric = numeric);
162165

163-
/// <inheritdoc />
166+
/// <inheritdoc cref="IIcuCollationTokenFilter.HiraganaQuaternaryMode" />
164167
public IcuCollationTokenFilterDescriptor HiraganaQuaternaryMode(bool? mode = true) => Assign(a => a.HiraganaQuaternaryMode = mode);
165168

166-
/// <inheritdoc />
169+
/// <inheritdoc cref="IIcuCollationTokenFilter.VariableTop" />
167170
public IcuCollationTokenFilterDescriptor VariableTop(string variableTop) => Assign(a => a.VariableTop = variableTop);
168171
}
169172
}

src/Nest/Analysis/Plugins/Icu/IcuFoldingTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids.
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuFoldingTokenFilter : ITokenFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuNormalizationCharFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuNormalizationCharFilter : ICharFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuNormalizationTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ namespace Nest
44
{
55
/// <summary>
66
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
7-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
87
/// </summary>
8+
/// <remarks>
9+
/// Requires analysis-icu plugin to be installed
10+
/// </remarks>
911
public interface IIcuNormalizationTokenFilter : ITokenFilter
1012
{
1113
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuTokenizer.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@ namespace Nest
77
/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
88
/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
99
/// text into syllables.
10-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
1110
/// </summary>
11+
/// <remarks>
12+
/// Requires analysis-icu plugin to be installed
13+
/// </remarks>
1214
public interface IIcuTokenizer : ITokenizer
1315
{
1416
/// <summary>

src/Nest/Analysis/Plugins/Icu/IcuTransformTokenFilter.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ namespace Nest
55
/// <summary>
66
/// Transforms are used to process Unicode text in many different ways, such as case mapping,
77
/// normalization, transliteration and bidirectional text handling.
8-
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
98
/// </summary>
9+
/// <remarks>
10+
/// Requires analysis-icu plugin to be installed
11+
/// </remarks>
1012
public interface IIcuTransformTokenFilter : ITokenFilter
1113
{
1214
/// <summary>

src/Nest/Analysis/Plugins/Icu/Normalization/IcuNormalizationMode.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ namespace Nest
77
/// <summary>
88
/// Normalization mode https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
[JsonConverter(typeof(StringEnumConverter))]
1114
public enum IcuNormalizationMode
1215
{

src/Nest/Analysis/Plugins/Icu/Normalization/IcuNormalizationType.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ namespace Nest
77
/// <summary>
88
/// Normalization forms https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
[JsonConverter(typeof(StringEnumConverter))]
1114
public enum IcuNormalizationType
1215
{

src/Nest/Analysis/Plugins/Icu/Transform/IcuNormalizationType.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ namespace Nest
77
/// <summary>
88
/// Forward (default) for LTR and reverse for RTL
99
/// </summary>
10+
/// <remarks>
11+
/// Requires analysis-icu plugin to be installed
12+
/// </remarks>
1013
[JsonConverter(typeof(StringEnumConverter))]
1114
public enum IcuTransformDirection
1215
{

src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,14 @@ public class SimpleTests : AnalyzerAssertionBase<SimpleTests>
6969
public override string Name => "mySimple";
7070
}
7171

72-
public class LanguageTests : AnalyzerAssertionBase<SimpleTests>
72+
public class LanguageTests : AnalyzerAssertionBase<LanguageTests>
7373
{
7474
public override FuncTokenizer Fluent => (n, an) => an
7575
.Language("myLanguage", a => a.Language(Language.Dutch));
7676

7777
public override IAnalyzer Initializer => new LanguageAnalyzer { Language = Language.Dutch };
7878

79-
public override object Json => new { type = "dutch" };
79+
public override object Json => new { type = "Dutch" };
8080
public override string Name => "myLanguage";
8181
}
8282

@@ -216,5 +216,29 @@ public class NoriTests : AnalyzerAssertionBase<NoriTests>
216216

217217
public override string Name => "nori";
218218
}
219+
220+
[SkipVersion("<6.6.0", "introduced in 6.6.0")]
221+
public class IcuTests : AnalyzerAssertionBase<IcuTests>
222+
{
223+
public override FuncTokenizer Fluent => (n, t) => t.Icu(n, e => e
224+
.Method(IcuNormalizationType.Canonical)
225+
.Mode(IcuNormalizationMode.Decompose)
226+
);
227+
228+
public override IAnalyzer Initializer => new IcuAnalyzer
229+
{
230+
Method = IcuNormalizationType.Canonical,
231+
Mode = IcuNormalizationMode.Decompose
232+
};
233+
234+
public override object Json => new
235+
{
236+
type = "icu_analyzer",
237+
method = "nfc",
238+
mode = "decompose"
239+
};
240+
241+
public override string Name => "icu_analyzer";
242+
}
219243
}
220244
}

0 commit comments

Comments
 (0)