Skip to content

Implement all queries on doc-values only keyword fields #83404

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/83404.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 83404
summary: Implement all queries on doc-values only keyword fields
area: Mapping
type: enhancement
issues: []
2 changes: 1 addition & 1 deletion docs/reference/mapping/params/doc-values.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ with the __notable exception of `text` and `annotated_text` fields__.

<<number,Numeric types>>, <<date,date types>>, the <<boolean,boolean type>>,
<<ip,ip type>>, <<geo-point,geo_point type>> and the <<keyword,keyword type>>
can also be queried using term or range-based queries
can also be queried
when they are not <<mapping-index,indexed>> but only have doc values enabled.
Query performance on doc values is much slower than on index structures, but
offers an interesting tradeoff between disk usage and query performance for
Expand Down
3 changes: 1 addition & 2 deletions docs/reference/mapping/types/keyword.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ The following parameters are accepted by `keyword` fields:

Should the field be quickly searchable? Accepts `true` (default) and
`false`. `keyword` fields that only have <<doc-values,`doc_values`>>
enabled can still be queried using term or range-based queries,
albeit slower.
enabled can still be queried, albeit slower.

<<index-options,`index_options`>>::

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,60 @@ setup:
body: { query: { range: { keyword: { gte: "key1" } } } }
- length: { hits.hits: 2 }

---
"Test fuzzy query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { fuzzy: { keyword: { value: "kay1", fuzziness: 1 } } } }
- length: { hits.hits: 1 }

---
"Test prefix query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { prefix: { keyword: { value: "key" } } } }
- length: { hits.hits: 2 }

---
"Test case insensitive term query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { term: { keyword: { value: "KeY1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

---
"Test wildcard query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { wildcard: { keyword: { value: "k*1" } } } }
- length: { hits.hits: 1 }

---
"Test case insensitive wildcard query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { wildcard: { keyword: { value: "K*1", case_insensitive: true } } } }
- length: { hits.hits: 1 }

---
"Test regexp query on keyword field where only doc values are enabled":

- do:
search:
index: test
body: { query: { regexp: { keyword: { value: "k.*1" } } } }
- length: { hits.hits: 1 }

---
"Test match query on boolean field where only doc values are enabled":

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.FieldData;
Expand All @@ -44,9 +46,15 @@
import org.elasticsearch.script.ScriptCompiler;
import org.elasticsearch.script.StringFieldScript;
import org.elasticsearch.script.field.KeywordDocValuesField;
import org.elasticsearch.script.field.SortedSetDocValuesStringFieldScript;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.lookup.FieldValues;
import org.elasticsearch.search.lookup.SearchLookup;
import org.elasticsearch.search.runtime.StringScriptFieldFuzzyQuery;
import org.elasticsearch.search.runtime.StringScriptFieldPrefixQuery;
import org.elasticsearch.search.runtime.StringScriptFieldRegexpQuery;
import org.elasticsearch.search.runtime.StringScriptFieldTermQuery;
import org.elasticsearch.search.runtime.StringScriptFieldWildcardQuery;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
Expand Down Expand Up @@ -388,6 +396,68 @@ public Query rangeQuery(
}
}

/**
 * Builds a fuzzy query against this keyword field.
 * <p>
 * After {@code failIfNotIndexedNorDocValuesFallback} has accepted the field, an indexed field
 * delegates to the parent implementation. A field that is not indexed is instead answered by a
 * {@link StringScriptFieldFuzzyQuery} whose values are read from the field's sorted-set doc values.
 * <p>
 * Note that {@code maxExpansions} is only forwarded on the indexed path; the doc-values based
 * query built here does not take it.
 *
 * @param value          the term to match approximately
 * @param fuzziness      converted to a maximum edit distance via {@link Fuzziness#asDistance(String)}
 * @param prefixLength   forwarded unchanged to the underlying query
 * @param maxExpansions  forwarded to the parent implementation only
 * @param transpositions forwarded unchanged to the underlying query
 * @param context        the search execution context; also supplies the lookup used to read doc values
 * @return the query to execute for this field
 */
@Override
public Query fuzzyQuery(
    Object value,
    Fuzziness fuzziness,
    int prefixLength,
    int maxExpansions,
    boolean transpositions,
    SearchExecutionContext context
) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // No inverted index to consult: match against the values stored in doc values instead.
        return StringScriptFieldFuzzyQuery.build(
            new Script(""),
            leafContext -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), leafContext),
            name(),
            indexedValueForSearch(value).utf8ToString(),
            fuzziness.asDistance(BytesRefs.toString(value)),
            prefixLength,
            transpositions
        );
    }
    return super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
}

/**
 * Builds a prefix query against this keyword field.
 * <p>
 * An indexed field delegates to the parent implementation. A field that is not indexed is
 * answered by a {@link StringScriptFieldPrefixQuery} that reads the field's sorted-set doc values.
 * The rewrite {@code method} is only used on the indexed path.
 *
 * @param value           the prefix to look for; passed through {@code indexedValueForSearch} on the doc-values path
 * @param method          rewrite method, forwarded to the parent implementation only
 * @param caseInsensitive whether matching should ignore case
 * @param context         the search execution context; also supplies the lookup used to read doc values
 * @return the query to execute for this field
 */
@Override
public Query prefixQuery(
    String value,
    MultiTermQuery.RewriteMethod method,
    boolean caseInsensitive,
    SearchExecutionContext context
) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed()) {
        return super.prefixQuery(value, method, caseInsensitive, context);
    }
    // Doc-values fallback: evaluate the prefix against each document's stored values.
    return new StringScriptFieldPrefixQuery(
        new Script(""),
        leafContext -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), leafContext),
        name(),
        indexedValueForSearch(value).utf8ToString(),
        caseInsensitive
    );
}

/**
 * Builds a case-insensitive term query against this keyword field.
 * <p>
 * An indexed field delegates to the parent implementation. Otherwise a
 * {@link StringScriptFieldTermQuery} is created over the field's sorted-set doc values with its
 * case-insensitive flag set to {@code true}.
 *
 * @param value   the term to match
 * @param context the search execution context; also supplies the lookup used to read doc values
 * @return the query to execute for this field
 */
@Override
public Query termQueryCaseInsensitive(Object value, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // Doc-values fallback; the trailing 'true' requests case-insensitive matching.
        return new StringScriptFieldTermQuery(
            new Script(""),
            leafContext -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), leafContext),
            name(),
            indexedValueForSearch(value).utf8ToString(),
            true
        );
    }
    return super.termQueryCaseInsensitive(value, context);
}

@Override
public TermsEnum getTerms(boolean caseInsensitive, String string, SearchExecutionContext queryShardContext, String searchAfter)
throws IOException {
Expand Down Expand Up @@ -521,7 +591,72 @@ public Query wildcardQuery(
boolean caseInsensitive,
SearchExecutionContext context
) {
return super.wildcardQuery(value, method, caseInsensitive, true, context);
failIfNotIndexedNorDocValuesFallback(context);
if (isIndexed()) {
return super.wildcardQuery(value, method, caseInsensitive, true, context);
} else {
if (getTextSearchInfo().getSearchAnalyzer() != null) {
value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
} else {
value = indexedValueForSearch(value).utf8ToString();
}
return new StringScriptFieldWildcardQuery(
new Script(""),
ctx -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), ctx),
name(),
value,
caseInsensitive
);
}
}

/**
 * Builds a wildcard query against this keyword field, normalizing the pattern first when the
 * field has a search analyzer.
 * <p>
 * An indexed field delegates to the parent implementation. Otherwise the pattern is prepared
 * here (via {@code normalizeWildcardPattern} if a search analyzer is configured, else via
 * {@code indexedValueForSearch}) and wrapped in a case-sensitive
 * {@link StringScriptFieldWildcardQuery} over the field's sorted-set doc values.
 *
 * @param value   the wildcard pattern
 * @param method  rewrite method, forwarded to the parent implementation only
 * @param context the search execution context; also supplies the lookup used to read doc values
 * @return the query to execute for this field
 */
@Override
public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed()) {
        return super.normalizedWildcardQuery(value, method, context);
    }
    final String pattern = getTextSearchInfo().getSearchAnalyzer() != null
        ? normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer())
        : indexedValueForSearch(value).utf8ToString();
    // 'false': this entry point always matches case-sensitively on the doc-values path.
    return new StringScriptFieldWildcardQuery(
        new Script(""),
        leafContext -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), leafContext),
        name(),
        pattern,
        false
    );
}

/**
 * Builds a regular-expression query against this keyword field.
 * <p>
 * An indexed field delegates to the parent implementation. A field that is not indexed is
 * answered by a {@link StringScriptFieldRegexpQuery} over the field's sorted-set doc values;
 * on that path non-zero {@code matchFlags} are rejected and the rewrite {@code method} is unused.
 *
 * @param value                 the regular expression
 * @param syntaxFlags           forwarded unchanged to the underlying query
 * @param matchFlags            must be {@code 0} when the field is not indexed
 * @param maxDeterminizedStates forwarded unchanged to the underlying query
 * @param method                rewrite method, forwarded to the parent implementation only
 * @param context               the search execution context; also supplies the lookup used to read doc values
 * @return the query to execute for this field
 * @throws IllegalArgumentException if the field is not indexed and {@code matchFlags} is non-zero
 */
@Override
public Query regexpQuery(
    String value,
    int syntaxFlags,
    int matchFlags,
    int maxDeterminizedStates,
    MultiTermQuery.RewriteMethod method,
    SearchExecutionContext context
) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed()) {
        return super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
    }
    if (matchFlags != 0) {
        throw new IllegalArgumentException("Match flags not yet implemented [" + matchFlags + "]");
    }
    // Doc-values fallback: run the expression against each document's stored values.
    return new StringScriptFieldRegexpQuery(
        new Script(""),
        leafContext -> new SortedSetDocValuesStringFieldScript(name(), context.lookup(), leafContext),
        name(),
        indexedValueForSearch(value).utf8ToString(),
        syntaxFlags,
        matchFlags,
        maxDeterminizedStates
    );
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.script.field;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.script.StringFieldScript;
import org.elasticsearch.search.lookup.SearchLookup;

import java.io.IOException;
import java.util.Map;

/**
 * A {@link StringFieldScript} that does not run any user script: for each document it emits the
 * string form of every value found in the named field's {@link SortedSetDocValues}.
 */
public class SortedSetDocValuesStringFieldScript extends StringFieldScript {
    /** Doc values of the target field for the segment this script instance was created for. */
    private final SortedSetDocValues sortedSetDocValues;

    /** Result of the last {@link #setDocument(int)} positioning; {@code true} if that document has values. */
    boolean hasValue = false;

    /**
     * @param fieldName    name of the field whose doc values are read
     * @param searchLookup passed through to the parent script
     * @param ctx          segment context whose reader supplies the doc values
     * @throws IllegalStateException if the doc values cannot be loaded
     */
    public SortedSetDocValuesStringFieldScript(String fieldName, SearchLookup searchLookup, LeafReaderContext ctx) {
        super(fieldName, Map.of(), searchLookup, ctx);
        try {
            sortedSetDocValues = DocValues.getSortedSet(ctx.reader(), fieldName);
        } catch (IOException ioe) {
            throw new IllegalStateException("Cannot load doc values", ioe);
        }
    }

    /**
     * Positions the doc values on {@code docID} and records whether that document has any value.
     *
     * @throws IllegalStateException if the doc values cannot be read
     */
    @Override
    public void setDocument(int docID) {
        try {
            hasValue = sortedSetDocValues.advanceExact(docID);
        } catch (IOException ioe) {
            throw new IllegalStateException("Cannot load doc values", ioe);
        }
    }

    /**
     * Emits every value of the current document, decoded as UTF-8; emits nothing when the
     * document has no value.
     *
     * @throws IllegalStateException if the doc values cannot be read
     */
    @Override
    public void execute() {
        if (hasValue == false) {
            return;
        }
        try {
            for (
                long ordinal = sortedSetDocValues.nextOrd();
                ordinal != SortedSetDocValues.NO_MORE_ORDS;
                ordinal = sortedSetDocValues.nextOrd()
            ) {
                final BytesRef term = sortedSetDocValues.lookupOrd(ordinal);
                emit(term.utf8ToString());
            }
        } catch (IOException ioe) {
            throw new IllegalStateException("Cannot load doc values", ioe);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,12 @@ public void testRegexpQuery() {
MappedFieldType ft = new KeywordFieldType("field");
assertEquals(new RegexpQuery(new Term("field", "foo.*")), ft.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT));

MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> unsearchable.regexpQuery("foo.*", 0, 0, 10, null, MOCK_CONTEXT)
);
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());

ElasticsearchException ee = expectThrows(
ElasticsearchException.class,
Expand All @@ -188,12 +188,12 @@ public void testFuzzyQuery() {
ft.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT)
);

MappedFieldType unsearchable = new KeywordFieldType("field", false, true, Collections.emptyMap());
MappedFieldType unsearchable = new KeywordFieldType("field", false, false, Collections.emptyMap());
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> unsearchable.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true, MOCK_CONTEXT)
);
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
assertEquals("Cannot search on field [field] since it is not indexed nor has doc values.", e.getMessage());

ElasticsearchException ee = expectThrows(
ElasticsearchException.class,
Expand Down