Skip to content

Commit 957caf0

Browse files
codebrainrusscam
authored and committed
Implement adjust_offsets on word delimiter graph token filter (#3934)
This commit implements adjust_offsets on word delimiter graph token filter (cherry picked from commit e410ac9)
1 parent a7313e4 commit 957caf0

File tree

2 files changed

+17
-0
lines changed

2 files changed

+17
-0
lines changed

src/Nest/Analysis/TokenFilters/WordDelimiterGraph/WordDelimiterGraphTokenFilter.cs

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,13 @@ namespace Nest
1010
/// </summary>
1111
public interface IWordDelimiterGraphTokenFilter : ITokenFilter
1212
{
13+
/// <summary>
14+
/// By default, the filter tries to output subtokens with adjusted offsets to reflect their actual position in the token stream. However, when used in combination with other filters that alter the length or starting position of tokens without changing their offsets (e.g. <see cref="TrimTokenFilter"/>) this can cause tokens with illegal offsets to be emitted. Setting <see cref="AdjustOffsets"/> to false will stop <see cref="WordDelimiterGraphTokenFilter"/> from adjusting these internal offsets.
15+
/// </summary>
16+
[DataMember(Name ="adjust_offsets")]
17+
[JsonFormatter(typeof(NullableStringBooleanFormatter))]
18+
bool? AdjustOffsets { get; set; }
19+
1320
/// <summary>
1421
/// If true causes all subword parts to be catenated: "wi-fi-4000" ⇒ "wifi4000". Defaults to false.
1522
/// </summary>
@@ -104,6 +111,9 @@ public class WordDelimiterGraphTokenFilter : TokenFilterBase, IWordDelimiterGrap
104111
{
105112
public WordDelimiterGraphTokenFilter() : base("word_delimiter_graph") { }
106113

114+
/// <inheritdoc />
115+
public bool? AdjustOffsets { get; set; }
116+
107117
/// <inheritdoc />
108118
public bool? CatenateAll { get; set; }
109119

@@ -149,6 +159,7 @@ public class WordDelimiterGraphTokenFilterDescriptor
149159
: TokenFilterDescriptorBase<WordDelimiterGraphTokenFilterDescriptor, IWordDelimiterGraphTokenFilter>, IWordDelimiterGraphTokenFilter
150160
{
151161
protected override string Type => "word_delimiter_graph";
162+
bool? IWordDelimiterGraphTokenFilter.AdjustOffsets { get; set; }
152163
bool? IWordDelimiterGraphTokenFilter.CatenateAll { get; set; }
153164
bool? IWordDelimiterGraphTokenFilter.CatenateNumbers { get; set; }
154165
bool? IWordDelimiterGraphTokenFilter.CatenateWords { get; set; }
@@ -179,6 +190,9 @@ public WordDelimiterGraphTokenFilterDescriptor GenerateNumberParts(bool? generat
179190
public WordDelimiterGraphTokenFilterDescriptor CatenateNumbers(bool? catenateNumbers = true) =>
180191
Assign(catenateNumbers, (a, v) => a.CatenateNumbers = v);
181192

193+
/// <inheritdoc />
194+
public WordDelimiterGraphTokenFilterDescriptor AdjustOffsets(bool? adjustOffsets = true) => Assign(adjustOffsets, (a, v) => a.AdjustOffsets = v);
195+
182196
/// <inheritdoc />
183197
public WordDelimiterGraphTokenFilterDescriptor CatenateAll(bool? catenateAll = true) => Assign(catenateAll, (a, v) => a.CatenateAll = v);
184198

src/Tests/Tests/Analysis/TokenFilters/TokenFilterTests.cs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -833,6 +833,7 @@ public class WordDelimiterGraphTests : TokenFilterAssertionBase<WordDelimiterGra
833833
{
834834
public override FuncTokenFilters Fluent => (n, tf) => tf
835835
.WordDelimiterGraph(n, t => t
836+
.AdjustOffsets()
836837
.CatenateAll()
837838
.CatenateNumbers()
838839
.CatenateWords()
@@ -848,6 +849,7 @@ public class WordDelimiterGraphTests : TokenFilterAssertionBase<WordDelimiterGra
848849
public override ITokenFilter Initializer =>
849850
new WordDelimiterGraphTokenFilter
850851
{
852+
AdjustOffsets = true,
851853
CatenateAll = true,
852854
CatenateNumbers = true,
853855
CatenateWords = true,
@@ -863,6 +865,7 @@ public class WordDelimiterGraphTests : TokenFilterAssertionBase<WordDelimiterGra
863865
public override object Json => new
864866
{
865867
type = "word_delimiter_graph",
868+
adjust_offsets = true,
866869
generate_word_parts = true,
867870
generate_number_parts = true,
868871
catenate_words = true,

0 commit comments

Comments (0)