Skip to content

Add prebuilt ICU Analyzer #3635

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
case "nori": return o.ToObject<NoriAnalyzer>(ElasticContractResolver.Empty);
case "icu_analyzer": return o.ToObject<IcuAnalyzer>(ElasticContractResolver.Empty);
default:
if (o.Property("tokenizer") != null)
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);
Expand Down
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Analyzers/Analyzers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,9 @@ public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor
/// <inheritdoc cref="INoriAnalyzer" />
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector) =>
Assign(name, selector?.Invoke(new NoriAnalyzerDescriptor()));

/// <inheritdoc cref="IIcuAnalyzer" />
public AnalyzersDescriptor Icu(string name, Func<IcuAnalyzerDescriptor, IIcuAnalyzer> selector)
{
	// A null selector produces a null analyzer, which is handed to Assign unchanged.
	var analyzer = selector?.Invoke(new IcuAnalyzerDescriptor());
	return Assign(name, analyzer);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ namespace Nest
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
/// Which boils down to ignoring punctuation and whitespace.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationAlternate
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
namespace Nest
{
/// <summary>
/// Sets the alternate handling for strength quaternary to be either shifted or non-ignorable.
/// Which boils down to ignoring punctuation and whitespace.
/// Controls which case is sorted first when case is not ignored for
/// strength tertiary. The default depends on the collation.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationCaseFirst
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ namespace Nest
/// great many of the world’s languages do not require text normalization, most locales
/// set no as the default decomposition mode.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationDecomposition
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ namespace Nest
/// difference considered significant during comparison.
/// See also: http://icu-project.org/apiref/icu4j/com/ibm/icu/text/Collator.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuCollationStrength
{
Expand Down
54 changes: 54 additions & 0 deletions src/Nest/Analysis/Plugins/Icu/IcuAnalyzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using Newtonsoft.Json;

namespace Nest
{
// NOTE(review): the summary mentions character folding but links the normalization token filter;
// the Elasticsearch plugin docs appear to name the icu_folding token filter here — confirm and adjust the cref.
/// <summary>
/// An ICU analyzer that performs basic normalization, tokenization and character folding,
/// using the <see cref="IIcuNormalizationCharFilter" /> char filter,
/// <see cref="IIcuTokenizer" /> and <see cref="IIcuNormalizationTokenFilter" /> token filter.
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed and Elasticsearch 6.6.0+
/// </remarks>
public interface IIcuAnalyzer : IAnalyzer
{
/// <summary>
/// Normalization method, serialized as <c>method</c>.
/// Default is <see cref="IcuNormalizationType.CompatibilityCaseFold" />
/// </summary>
[JsonProperty("method")]
IcuNormalizationType? Method { get; set; }

/// <summary>
/// Normalization mode, serialized as <c>mode</c>.
/// Default is <see cref="IcuNormalizationMode.Compose" />
/// </summary>
[JsonProperty("mode")]
IcuNormalizationMode? Mode { get; set; }
}

/// <inheritdoc cref="IIcuAnalyzer" />
public class IcuAnalyzer : AnalyzerBase, IIcuAnalyzer
{
/// <summary>
/// Creates a new ICU analyzer, passing the <c>icu_analyzer</c> type name to the base class.
/// </summary>
public IcuAnalyzer() : base("icu_analyzer") { }

/// <inheritdoc />
public IcuNormalizationType? Method { get; set; }

/// <inheritdoc />
public IcuNormalizationMode? Mode { get; set; }
}

/// <inheritdoc cref="IIcuAnalyzer" />
public class IcuAnalyzerDescriptor : AnalyzerDescriptorBase<IcuAnalyzerDescriptor, IIcuAnalyzer>, IIcuAnalyzer
{
	/// <summary>
	/// The analyzer type name reported by this descriptor.
	/// </summary>
	protected override string Type
	{
		get { return "icu_analyzer"; }
	}

	// Explicit interface implementations hold the values set through the fluent methods below.
	IcuNormalizationType? IIcuAnalyzer.Method { get; set; }
	IcuNormalizationMode? IIcuAnalyzer.Mode { get; set; }

	/// <inheritdoc cref="IIcuAnalyzer.Method"/>
	public IcuAnalyzerDescriptor Method(IcuNormalizationType? method)
	{
		return Assign(a => a.Method = method);
	}

	/// <inheritdoc cref="IIcuAnalyzer.Mode"/>
	public IcuAnalyzerDescriptor Mode(IcuNormalizationMode? mode)
	{
		return Assign(a => a.Mode = mode);
	}
}
}
29 changes: 16 additions & 13 deletions src/Nest/Analysis/Plugins/Icu/IcuCollationTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// defaults to using the DUCET collation, which is a best-effort attempt at language-neutral sorting.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuCollationTokenFilter : ITokenFilter
{
/// <summary>
Expand Down Expand Up @@ -75,7 +78,7 @@ public interface IIcuCollationTokenFilter : ITokenFilter
string Variant { get; set; }
}

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter" />
public class IcuCollationTokenFilter : TokenFilterBase, IIcuCollationTokenFilter
{
public IcuCollationTokenFilter() : base("icu_collation") { }
Expand Down Expand Up @@ -114,7 +117,7 @@ public IcuCollationTokenFilter() : base("icu_collation") { }
public string Variant { get; set; }
}

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter" />
public class IcuCollationTokenFilterDescriptor
: TokenFilterDescriptorBase<IcuCollationTokenFilterDescriptor, IIcuCollationTokenFilter>, IIcuCollationTokenFilter
{
Expand All @@ -132,38 +135,38 @@ public class IcuCollationTokenFilterDescriptor
string IIcuCollationTokenFilter.VariableTop { get; set; }
string IIcuCollationTokenFilter.Variant { get; set; }

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Language" />
public IcuCollationTokenFilterDescriptor Language(string language) => Assign(language, (a, v) => a.Language = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Country" />
public IcuCollationTokenFilterDescriptor Country(string country) => Assign(country, (a, v) => a.Country = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Variant" />
public IcuCollationTokenFilterDescriptor Variant(string variant) => Assign(variant, (a, v) => a.Variant = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Strength" />
public IcuCollationTokenFilterDescriptor Strength(IcuCollationStrength? strength) => Assign(strength, (a, v) => a.Strength = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Decomposition" />
public IcuCollationTokenFilterDescriptor Decomposition(IcuCollationDecomposition? decomposition) =>
Assign(decomposition, (a, v) => a.Decomposition = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Alternate" />
public IcuCollationTokenFilterDescriptor Alternate(IcuCollationAlternate? alternate) => Assign(alternate, (a, v) => a.Alternate = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseFirst" />
public IcuCollationTokenFilterDescriptor CaseFirst(IcuCollationCaseFirst? caseFirst) => Assign(caseFirst, (a, v) => a.CaseFirst = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.CaseLevel" />
public IcuCollationTokenFilterDescriptor CaseLevel(bool? caseLevel = true) => Assign(caseLevel, (a, v) => a.CaseLevel = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.Numeric" />
public IcuCollationTokenFilterDescriptor Numeric(bool? numeric = true) => Assign(numeric, (a, v) => a.Numeric = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.HiraganaQuaternaryMode" />
public IcuCollationTokenFilterDescriptor HiraganaQuaternaryMode(bool? mode = true) => Assign(mode, (a, v) => a.HiraganaQuaternaryMode = v);

/// <inheritdoc />
/// <inheritdoc cref="IIcuCollationTokenFilter.VariableTop" />
public IcuCollationTokenFilterDescriptor VariableTop(string variableTop) => Assign(variableTop, (a, v) => a.VariableTop = v);
}
}
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuFoldingTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Case folding of Unicode characters based on UTR#30, like the ASCII-folding token filter on steroids.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuFoldingTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuNormalizationCharFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuNormalizationCharFilter : ICharFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuNormalizationTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ namespace Nest
{
/// <summary>
/// Normalizes as defined here: http://userguide.icu-project.org/transforms/normalization
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuNormalizationTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ namespace Nest
/// like the standard tokenizer, but adds better support for some Asian languages by using a dictionary-based approach
/// to identify words in Thai, Lao, Chinese, Japanese, and Korean, and using custom rules to break Myanmar and Khmer
/// text into syllables.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuTokenizer : ITokenizer
{
/// <summary>
Expand Down
4 changes: 3 additions & 1 deletion src/Nest/Analysis/Plugins/Icu/IcuTransformTokenFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ namespace Nest
/// <summary>
/// Transforms are used to process Unicode text in many different ways, such as case mapping,
/// normalization, transliteration and bidirectional text handling.
/// Part of the `analysis-icu` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu.html
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
public interface IIcuTransformTokenFilter : ITokenFilter
{
/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Normalization mode https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuNormalizationMode
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Normalization forms https://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuNormalizationType
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ namespace Nest
/// <summary>
/// Forward (default) for LTR and reverse for RTL
/// </summary>
/// <remarks>
/// Requires analysis-icu plugin to be installed
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum IcuTransformDirection
{
Expand Down
26 changes: 25 additions & 1 deletion src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public class SimpleTests : AnalyzerAssertionBase<SimpleTests>
public override string Name => "mySimple";
}

public class LanguageTests : AnalyzerAssertionBase<SimpleTests>
public class LanguageTests : AnalyzerAssertionBase<LanguageTests>
{
public override FuncTokenizer Fluent => (n, an) => an
.Language("myLanguage", a => a.Language(Language.Dutch));
Expand Down Expand Up @@ -216,5 +216,29 @@ public class NoriTests : AnalyzerAssertionBase<NoriTests>

public override string Name => "nori";
}

/// <summary>
/// Supplies the fluent, object-initializer and expected JSON forms of an <c>icu_analyzer</c>
/// to <see cref="AnalyzerAssertionBase{TAssertion}" />. All three describe the same analyzer:
/// canonical normalization in decompose mode.
/// </summary>
[SkipVersion("<6.6.0", "introduced in 6.6.0")]
public class IcuTests : AnalyzerAssertionBase<IcuTests>
{
// Fluent form: configures the analyzer through IcuAnalyzerDescriptor.
public override FuncTokenizer Fluent => (n, t) => t.Icu(n, e => e
.Method(IcuNormalizationType.Canonical)
.Mode(IcuNormalizationMode.Decompose)
);

// Object-initializer form with the same settings as the fluent form.
public override IAnalyzer Initializer => new IcuAnalyzer
{
Method = IcuNormalizationType.Canonical,
Mode = IcuNormalizationMode.Decompose
};

// Expected JSON; the "nfc" value corresponds to IcuNormalizationType.Canonical used above.
public override object Json => new
{
type = "icu_analyzer",
method = "nfc",
mode = "decompose"
};

public override string Name => "icu_analyzer";
}
}
}