Skip to content

Commit d1c8216

Browse files
authored
Allow field types to optimize phrase prefix queries (#37575)
This change adds a way to customize how phrase prefix queries are created on field types. The match phrase prefix query is exposed in field types in order to allow optimizations based on the options set on the field. For instance, the text field uses the configured prefix field (if available) to build a span near query that mixes the original field and the prefix field on the last position. This change also contains a small refactoring of the match/multi_match query that simplifies the interactions between the builders. Closes #31921
1 parent 876094e commit d1c8216

21 files changed

+1060
-781
lines changed

plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java

Lines changed: 8 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -30,17 +30,17 @@
3030
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
3131
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
3232
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
33-
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
3433
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
3534
import org.apache.lucene.document.Field;
3635
import org.apache.lucene.index.IndexOptions;
3736
import org.apache.lucene.index.IndexableField;
3837
import org.apache.lucene.index.Term;
39-
import org.apache.lucene.search.MultiPhraseQuery;
4038
import org.apache.lucene.search.NormsFieldExistsQuery;
41-
import org.apache.lucene.search.PhraseQuery;
39+
import org.apache.lucene.search.PrefixQuery;
4240
import org.apache.lucene.search.Query;
4341
import org.apache.lucene.search.TermQuery;
42+
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
43+
import org.apache.lucene.search.spans.SpanQuery;
4444
import org.elasticsearch.ElasticsearchParseException;
4545
import org.elasticsearch.common.settings.Settings;
4646
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -657,63 +657,12 @@ public Query existsQuery(QueryShardContext context) {
657657
}
658658

659659
@Override
660-
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
661-
PhraseQuery.Builder builder = new PhraseQuery.Builder();
662-
builder.setSlop(slop);
663-
664-
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
665-
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
666-
int position = -1;
667-
668-
stream.reset();
669-
while (stream.incrementToken()) {
670-
if (enablePosIncrements) {
671-
position += posIncrAtt.getPositionIncrement();
672-
}
673-
else {
674-
position += 1;
675-
}
676-
builder.add(new Term(field, termAtt.getBytesRef()), position);
677-
}
678-
679-
return builder.build();
660+
public SpanQuery spanPrefixQuery(String value, SpanMultiTermQueryWrapper.SpanRewriteMethod method, QueryShardContext context) {
661+
SpanMultiTermQueryWrapper<?> spanMulti =
662+
new SpanMultiTermQueryWrapper<>(new PrefixQuery(new Term(name(), indexedValueForSearch(value))));
663+
spanMulti.setRewriteMethod(method);
664+
return spanMulti;
680665
}
681-
682-
@Override
683-
public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
684-
685-
MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
686-
mpqb.setSlop(slop);
687-
688-
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
689-
690-
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
691-
int position = -1;
692-
693-
List<Term> multiTerms = new ArrayList<>();
694-
stream.reset();
695-
while (stream.incrementToken()) {
696-
int positionIncrement = posIncrAtt.getPositionIncrement();
697-
698-
if (positionIncrement > 0 && multiTerms.size() > 0) {
699-
if (enablePositionIncrements) {
700-
mpqb.add(multiTerms.toArray(new Term[0]), position);
701-
} else {
702-
mpqb.add(multiTerms.toArray(new Term[0]));
703-
}
704-
multiTerms.clear();
705-
}
706-
position += positionIncrement;
707-
multiTerms.add(new Term(field, termAtt.getBytesRef()));
708-
}
709-
710-
if (enablePositionIncrements) {
711-
mpqb.add(multiTerms.toArray(new Term[0]), position);
712-
} else {
713-
mpqb.add(multiTerms.toArray(new Term[0]));
714-
}
715-
return mpqb.build();
716-
}
717666
}
718667

719668
private int positionIncrementGap;

server/src/main/java/org/apache/lucene/queries/ExtendedCommonTermsQuery.java

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,8 @@
1919

2020
package org.apache.lucene.queries;
2121

22-
import org.apache.lucene.index.Term;
23-
import org.apache.lucene.index.TermContext;
2422
import org.apache.lucene.search.BooleanClause.Occur;
25-
import org.apache.lucene.search.Query;
2623
import org.elasticsearch.common.lucene.search.Queries;
27-
import org.elasticsearch.index.mapper.MappedFieldType;
2824

2925
/**
3026
* Extended version of {@link CommonTermsQuery} that allows to pass in a
@@ -33,11 +29,8 @@
3329
*/
3430
public class ExtendedCommonTermsQuery extends CommonTermsQuery {
3531

36-
private final MappedFieldType fieldType;
37-
38-
public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency, MappedFieldType fieldType) {
32+
public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) {
3933
super(highFreqOccur, lowFreqOccur, maxTermFrequency);
40-
this.fieldType = fieldType;
4134
}
4235

4336
private String lowFreqMinNumShouldMatchSpec;
@@ -80,16 +73,4 @@ public float getMaxTermFrequency() {
8073
return this.maxTermFrequency;
8174
}
8275

83-
@Override
84-
protected Query newTermQuery(Term term, TermContext context) {
85-
if (fieldType == null) {
86-
return super.newTermQuery(term, context);
87-
}
88-
final Query query = fieldType.queryStringTermQuery(term);
89-
if (query == null) {
90-
return super.newTermQuery(term, context);
91-
} else {
92-
return query;
93-
}
94-
}
9576
}

server/src/main/java/org/elasticsearch/common/lucene/search/MultiPhrasePrefixQuery.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,21 @@
3939
import java.util.Iterator;
4040
import java.util.List;
4141
import java.util.ListIterator;
42+
import java.util.Objects;
4243

4344
public class MultiPhrasePrefixQuery extends Query {
4445

45-
private String field;
46+
private final String field;
4647
private ArrayList<Term[]> termArrays = new ArrayList<>();
4748
private ArrayList<Integer> positions = new ArrayList<>();
4849
private int maxExpansions = Integer.MAX_VALUE;
4950

5051
private int slop = 0;
5152

53+
/**
 * Creates a query with no terms, bound to the given field.
 *
 * NOTE(review): {@code add(Term[], int)} compares each term's field against this value with
 * {@code !=} (a reference comparison) - confirm callers build their terms from the same
 * {@code String} instance that is passed here.
 *
 * @param field the name of the field every added term must belong to
 * @throws NullPointerException if {@code field} is {@code null}
 */
public MultiPhrasePrefixQuery(String field) {
    this.field = Objects.requireNonNull(field, "field must not be null");
}
56+
5257
/**
5358
* Sets the phrase slop for this query.
5459
*
@@ -102,9 +107,6 @@ public void add(Term[] terms) {
102107
* @see org.apache.lucene.search.PhraseQuery.Builder#add(Term, int)
103108
*/
104109
public void add(Term[] terms, int position) {
105-
if (termArrays.size() == 0)
106-
field = terms[0].field();
107-
108110
for (int i = 0; i < terms.length; i++) {
109111
if (terms[i].field() != field) {
110112
throw new IllegalArgumentException(
@@ -212,7 +214,7 @@ private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final
212214
@Override
213215
public final String toString(String f) {
214216
StringBuilder buffer = new StringBuilder();
215-
if (field == null || !field.equals(f)) {
217+
if (field.equals(f) == false) {
216218
buffer.append(field);
217219
buffer.append(":");
218220
}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
* Licensed to Elasticsearch under one or more contributor
3+
* license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright
5+
* ownership. Elasticsearch licenses this file to you under
6+
* the Apache License, Version 2.0 (the "License"); you may
7+
* not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.elasticsearch.common.lucene.search;
21+
22+
import org.apache.lucene.index.IndexReader;
23+
import org.apache.lucene.index.IndexReaderContext;
24+
import org.apache.lucene.index.LeafReaderContext;
25+
import org.apache.lucene.index.Term;
26+
import org.apache.lucene.index.Terms;
27+
import org.apache.lucene.index.TermsEnum;
28+
import org.apache.lucene.queries.SpanMatchNoDocsQuery;
29+
import org.apache.lucene.search.BooleanQuery;
30+
import org.apache.lucene.search.MultiTermQuery;
31+
import org.apache.lucene.search.Query;
32+
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
33+
import org.apache.lucene.search.spans.SpanOrQuery;
34+
import org.apache.lucene.search.spans.SpanQuery;
35+
import org.apache.lucene.search.spans.SpanTermQuery;
36+
import org.apache.lucene.util.BytesRef;
37+
38+
import java.io.IOException;
39+
import java.util.Collection;
40+
import java.util.HashSet;
41+
import java.util.Set;
42+
43+
/**
44+
* A span rewrite method that extracts the first <code>maxExpansions</code> terms
45+
* that match the {@link MultiTermQuery} in the terms dictionary.
46+
* The rewrite throws an error if more than <code>maxExpansions</code> terms are found and <code>hardLimit</code>
47+
* is set.
48+
*/
49+
public class SpanBooleanQueryRewriteWithMaxClause extends SpanMultiTermQueryWrapper.SpanRewriteMethod {
50+
private final int maxExpansions;
51+
private final boolean hardLimit;
52+
53+
public SpanBooleanQueryRewriteWithMaxClause() {
54+
this(BooleanQuery.getMaxClauseCount(), true);
55+
}
56+
57+
public SpanBooleanQueryRewriteWithMaxClause(int maxExpansions, boolean hardLimit) {
58+
this.maxExpansions = maxExpansions;
59+
this.hardLimit = hardLimit;
60+
}
61+
62+
public int getMaxExpansions() {
63+
return maxExpansions;
64+
}
65+
66+
public boolean isHardLimit() {
67+
return hardLimit;
68+
}
69+
70+
@Override
71+
public SpanQuery rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
72+
final MultiTermQuery.RewriteMethod delegate = new MultiTermQuery.RewriteMethod() {
73+
@Override
74+
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
75+
Collection<SpanQuery> queries = collectTerms(reader, query);
76+
if (queries.size() == 0) {
77+
return new SpanMatchNoDocsQuery(query.getField(), "no expansion found for " + query.toString());
78+
} else if (queries.size() == 1) {
79+
return queries.iterator().next();
80+
} else {
81+
return new SpanOrQuery(queries.toArray(new SpanQuery[0]));
82+
}
83+
}
84+
85+
private Collection<SpanQuery> collectTerms(IndexReader reader, MultiTermQuery query) throws IOException {
86+
Set<SpanQuery> queries = new HashSet<>();
87+
IndexReaderContext topReaderContext = reader.getContext();
88+
for (LeafReaderContext context : topReaderContext.leaves()) {
89+
final Terms terms = context.reader().terms(query.getField());
90+
if (terms == null) {
91+
// field does not exist
92+
continue;
93+
}
94+
95+
final TermsEnum termsEnum = getTermsEnum(query, terms, null);
96+
assert termsEnum != null;
97+
98+
if (termsEnum == TermsEnum.EMPTY)
99+
continue;
100+
101+
BytesRef bytes;
102+
while ((bytes = termsEnum.next()) != null) {
103+
if (queries.size() >= maxExpansions) {
104+
if (hardLimit) {
105+
throw new RuntimeException("[" + query.toString() + " ] " +
106+
"exceeds maxClauseCount [ Boolean maxClauseCount is set to " + BooleanQuery.getMaxClauseCount() + "]");
107+
} else {
108+
return queries;
109+
}
110+
}
111+
queries.add(new SpanTermQuery(new Term(query.getField(), bytes)));
112+
}
113+
}
114+
return queries;
115+
}
116+
};
117+
return (SpanQuery) delegate.rewrite(reader, query);
118+
}
119+
}

server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
import org.apache.lucene.search.Query;
3535
import org.apache.lucene.search.TermInSetQuery;
3636
import org.apache.lucene.search.TermQuery;
37+
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
38+
import org.apache.lucene.search.spans.SpanQuery;
3739
import org.apache.lucene.util.BytesRef;
3840
import org.elasticsearch.ElasticsearchParseException;
3941
import org.elasticsearch.common.Nullable;
@@ -399,14 +401,24 @@ public Query nullValueQuery() {
399401

400402
public abstract Query existsQuery(QueryShardContext context);
401403

402-
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
404+
public Query phraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
403405
throw new IllegalArgumentException("Attempted to build a phrase query with multiple terms against non-text field [" + name + "]");
404406
}
405407

406-
public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
408+
public Query multiPhraseQuery(TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
407409
throw new IllegalArgumentException("Attempted to build a phrase query with multiple terms against non-text field [" + name + "]");
408410
}
409411

412+
public Query phrasePrefixQuery(TokenStream stream, int slop, int maxExpansions) throws IOException {
413+
throw new IllegalArgumentException("Attempted to build a phrase prefix query with multiple terms against non-text field [" + name +
414+
"]");
415+
}
416+
417+
public SpanQuery spanPrefixQuery(String value, SpanMultiTermQueryWrapper.SpanRewriteMethod method, QueryShardContext context) {
418+
throw new IllegalArgumentException("Can only use span prefix queries on text fields - not on [" + name
419+
+ "] which is of type [" + typeName() + "]");
420+
}
421+
410422
/**
411423
* An enum used to describe the relation between the range of terms in a
412424
* shard when compared with a query range

0 commit comments

Comments
 (0)