Skip to content

Commit 7b49beb

Browse files
authored
Fix threshold frequency computation in Suggesters (#34312)
The `term` and `phrase` suggesters have different options to filter candidates based on their frequencies. The `popular` mode for instance filters candidate terms that occur in less docs than the original term. However when we compute this threshold we use the total term frequency of a term instead of the document frequency. This is not inline with the actual filtering which is always based on the document frequency. This change fixes this discrepancy and clarifies the meaning of the different frequencies in use in the suggesters. It also ensures that the threshold doesn't overflow the maximum allowed value (Integer.MAX_VALUE). Closes #34282
1 parent c1c447a commit 7b49beb

File tree

14 files changed

+260
-114
lines changed

14 files changed

+260
-114
lines changed

buildSrc/src/main/resources/checkstyle_suppressions.xml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -372,9 +372,6 @@
372372
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]ContextMapping.java" checks="LineLength" />
373373
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoContextMapping.java" checks="LineLength" />
374374
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoQueryContext.java" checks="LineLength" />
375-
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]CandidateScorer.java" checks="LineLength" />
376-
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellChecker.java" checks="LineLength" />
377-
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]WordScorer.java" checks="LineLength" />
378375
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]RestoreService.java" checks="LineLength" />
379376
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardFailure.java" checks="LineLength" />
380377
<suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardsService.java" checks="LineLength" />
@@ -564,7 +561,6 @@
564561
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]CorruptedTranslogIT.java" checks="LineLength" />
565562
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]IndexStoreTests.java" checks="LineLength" />
566563
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]StoreTests.java" checks="LineLength" />
567-
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]suggest[/\\]stats[/\\]SuggestStatsIT.java" checks="LineLength" />
568564
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]translog[/\\]TranslogTests.java" checks="LineLength" />
569565
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexing[/\\]IndexActionIT.java" checks="LineLength" />
570566
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexlifecycle[/\\]IndexLifecycleActionIT.java" checks="LineLength" />
@@ -644,7 +640,6 @@
644640
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]ContextCompletionSuggestSearchIT.java" checks="LineLength" />
645641
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]CategoryContextMappingTests.java" checks="LineLength" />
646642
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]GeoContextMappingTests.java" checks="LineLength" />
647-
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellCheckerTests.java" checks="LineLength" />
648643
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]similarity[/\\]SimilarityIT.java" checks="LineLength" />
649644
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]AbstractSnapshotIntegTestCase.java" checks="LineLength" />
650645
<suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]DedicatedClusterSnapshotRestoreIT.java" checks="LineLength" />

docs/reference/migration/migrate_7_0/search.asciidoc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,13 @@ removed.
7878
* `levenstein` - replaced by `levenshtein`
7979
* `jarowinkler` - replaced by `jaro_winkler`
8080

81+
[float]
82+
==== `popular` mode for Suggesters
83+
84+
The `popular` mode for Suggesters (`term` and `phrase`) now uses the doc frequency
85+
(instead of the sum of the doc frequency) of the input terms to compute the frequency
86+
threshold for candidate suggestions.
87+
8188
[float]
8289
==== Limiting the number of terms that can be used in a Terms Query request
8390

server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
*/
1919
package org.elasticsearch.search.suggest.phrase;
2020

21+
import org.apache.lucene.codecs.TermStats;
2122
import org.apache.lucene.util.BytesRef;
2223
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
2324
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
@@ -29,22 +30,22 @@ public abstract class CandidateGenerator {
2930

3031
public abstract boolean isKnownWord(BytesRef term) throws IOException;
3132

32-
public abstract long frequency(BytesRef term) throws IOException;
33+
public abstract TermStats termStats(BytesRef term) throws IOException;
3334

3435
public CandidateSet drawCandidates(BytesRef term) throws IOException {
3536
CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term, true));
3637
return drawCandidates(set);
3738
}
3839

3940
public Candidate createCandidate(BytesRef term, boolean userInput) throws IOException {
40-
return createCandidate(term, frequency(term), 1.0, userInput);
41+
return createCandidate(term, termStats(term), 1.0, userInput);
4142
}
42-
public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException {
43-
return createCandidate(term, frequency, channelScore, false);
43+
public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore) throws IOException {
44+
return createCandidate(term, termStats, channelScore, false);
4445
}
4546

46-
public abstract Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException;
47+
public abstract Candidate createCandidate(BytesRef term, TermStats termStats,
48+
double channelScore, boolean userInput) throws IOException;
4749

4850
public abstract CandidateSet drawCandidates(CandidateSet set) throws IOException;
49-
5051
}

server/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,21 +77,24 @@ public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord,
7777
} else {
7878
if (numMissspellingsLeft > 0) {
7979
path[ord] = current.originalTerm;
80-
findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
80+
findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore,
81+
pathScore + scorer.score(path, candidates, ord, gramSize));
8182
for (int i = 0; i < current.candidates.length; i++) {
8283
path[ord] = current.candidates[i];
83-
findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
84+
findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore,
85+
pathScore + scorer.score(path, candidates, ord, gramSize));
8486
}
8587
} else {
8688
path[ord] = current.originalTerm;
87-
findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
89+
findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore,
90+
pathScore + scorer.score(path, candidates, ord, gramSize));
8891
}
8992
}
9093

9194
}
9295

93-
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
94-
throws IOException {
96+
private void updateTop(CandidateSet[] candidates, Candidate[] path,
97+
PriorityQueue<Correction> corrections, double cutoffScore, double score) throws IOException {
9598
score = Math.exp(score);
9699
assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates);
97100
if (score > cutoffScore) {

server/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
2424
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
2525
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
26+
import org.apache.lucene.codecs.TermStats;
2627
import org.apache.lucene.index.IndexReader;
2728
import org.apache.lucene.index.MultiFields;
2829
import org.apache.lucene.index.Term;
@@ -48,6 +49,7 @@
4849

4950
import static java.lang.Math.log10;
5051
import static java.lang.Math.max;
52+
import static java.lang.Math.min;
5153
import static java.lang.Math.round;
5254

5355
public final class DirectCandidateGenerator extends CandidateGenerator {
@@ -57,20 +59,20 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
5759
private final SuggestMode suggestMode;
5860
private final TermsEnum termsEnum;
5961
private final IndexReader reader;
60-
private final long dictSize;
62+
private final long sumTotalTermFreq;
6163
private static final double LOG_BASE = 5;
6264
private final long frequencyPlateau;
6365
private final Analyzer preFilter;
6466
private final Analyzer postFilter;
6567
private final double nonErrorLikelihood;
66-
private final boolean useTotalTermFrequency;
6768
private final CharsRefBuilder spare = new CharsRefBuilder();
6869
private final BytesRefBuilder byteSpare = new BytesRefBuilder();
6970
private final int numCandidates;
7071

7172
public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
7273
double nonErrorLikelihood, int numCandidates) throws IOException {
73-
this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field));
74+
this(spellchecker, field, suggestMode, reader, nonErrorLikelihood,
75+
numCandidates, null, null, MultiFields.getTerms(reader, field));
7476
}
7577

7678
public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
@@ -83,14 +85,12 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S
8385
this.numCandidates = numCandidates;
8486
this.suggestMode = suggestMode;
8587
this.reader = reader;
86-
final long dictSize = terms.getSumTotalTermFreq();
87-
this.useTotalTermFrequency = dictSize != -1;
88-
this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize;
88+
this.sumTotalTermFreq = terms.getSumTotalTermFreq() == -1 ? reader.maxDoc() : terms.getSumTotalTermFreq();
8989
this.preFilter = preFilter;
9090
this.postFilter = postFilter;
9191
this.nonErrorLikelihood = nonErrorLikelihood;
9292
float thresholdFrequency = spellchecker.getThresholdFrequency();
93-
this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency);
93+
this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int) (reader.maxDoc() * thresholdFrequency);
9494
termsEnum = terms.iterator();
9595
}
9696

@@ -99,24 +99,29 @@ public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, S
9999
*/
100100
@Override
101101
public boolean isKnownWord(BytesRef term) throws IOException {
102-
return frequency(term) > 0;
102+
return termStats(term).docFreq > 0;
103103
}
104104

105105
/* (non-Javadoc)
106106
* @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef)
107107
*/
108108
@Override
109-
public long frequency(BytesRef term) throws IOException {
109+
public TermStats termStats(BytesRef term) throws IOException {
110110
term = preFilter(term, spare, byteSpare);
111-
return internalFrequency(term);
111+
return internalTermStats(term);
112112
}
113113

114114

115-
public long internalFrequency(BytesRef term) throws IOException {
115+
public TermStats internalTermStats(BytesRef term) throws IOException {
116116
if (termsEnum.seekExact(term)) {
117-
return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq();
117+
return new TermStats(termsEnum.docFreq(),
118+
/**
119+
* We use the {@link TermsEnum#docFreq()} for fields that don't
120+
* record the {@link TermsEnum#totalTermFreq()}.
121+
*/
122+
termsEnum.totalTermFreq() == -1 ? termsEnum.docFreq() : termsEnum.totalTermFreq());
118123
}
119-
return 0;
124+
return new TermStats(0, 0);
120125
}
121126

122127
public String getField() {
@@ -127,15 +132,28 @@ public String getField() {
127132
public CandidateSet drawCandidates(CandidateSet set) throws IOException {
128133
Candidate original = set.originalTerm;
129134
BytesRef term = preFilter(original.term, spare, byteSpare);
130-
final long frequency = original.frequency;
131-
spellchecker.setThresholdFrequency(this.suggestMode == SuggestMode.SUGGEST_ALWAYS ? 0 : thresholdFrequency(frequency, dictSize));
135+
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
136+
/**
137+
* We use the {@link TermStats#docFreq} to compute the frequency threshold
138+
* because that's what {@link DirectSpellChecker#suggestSimilar} expects
139+
* when filtering terms.
140+
*/
141+
int threshold = thresholdTermFrequency(original.termStats.docFreq);
142+
if (threshold == Integer.MAX_VALUE) {
143+
// the threshold is the max possible frequency so we can skip the search
144+
return set;
145+
}
146+
spellchecker.setThresholdFrequency(threshold);
147+
}
148+
132149
SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode);
133150
List<Candidate> candidates = new ArrayList<>(suggestSimilar.length);
134151
for (int i = 0; i < suggestSimilar.length; i++) {
135152
SuggestWord suggestWord = suggestSimilar[i];
136153
BytesRef candidate = new BytesRef(suggestWord.string);
137-
postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score,
138-
score(suggestWord.freq, suggestWord.score, dictSize), false), spare, byteSpare, candidates);
154+
TermStats termStats = internalTermStats(candidate);
155+
postFilter(new Candidate(candidate, termStats,
156+
suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates);
139157
}
140158
set.addCandidates(candidates);
141159
return set;
@@ -171,28 +189,30 @@ public void nextToken() throws IOException {
171189
BytesRef term = result.toBytesRef();
172190
// We should not use frequency(term) here because it will analyze the term again
173191
// If preFilter and postFilter are the same analyzer it would fail.
174-
long freq = internalFrequency(term);
175-
candidates.add(new Candidate(result.toBytesRef(), freq, candidate.stringDistance,
176-
score(candidate.frequency, candidate.stringDistance, dictSize), false));
192+
TermStats termStats = internalTermStats(term);
193+
candidates.add(new Candidate(result.toBytesRef(), termStats, candidate.stringDistance,
194+
score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
177195
} else {
178-
candidates.add(new Candidate(result.toBytesRef(), candidate.frequency, nonErrorLikelihood,
179-
score(candidate.frequency, candidate.stringDistance, dictSize), false));
196+
candidates.add(new Candidate(result.toBytesRef(), candidate.termStats, nonErrorLikelihood,
197+
score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
180198
}
181199
}
182200
}, spare);
183201
}
184202
}
185203

186-
private double score(long frequency, double errorScore, long dictionarySize) {
187-
return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1));
204+
private double score(TermStats termStats, double errorScore, long dictionarySize) {
205+
return errorScore * (((double)termStats.totalTermFreq + 1) / ((double)dictionarySize +1));
188206
}
189207

190-
protected long thresholdFrequency(long termFrequency, long dictionarySize) {
191-
if (termFrequency > 0) {
192-
return max(0, round(termFrequency * (log10(termFrequency - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1));
208+
// package protected for test
209+
int thresholdTermFrequency(int docFreq) {
210+
if (docFreq > 0) {
211+
return (int) min(
212+
max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)), Integer.MAX_VALUE
213+
);
193214
}
194215
return 0;
195-
196216
}
197217

198218
public abstract static class TokenConsumer {
@@ -249,12 +269,12 @@ public static class Candidate implements Comparable<Candidate> {
249269
public static final Candidate[] EMPTY = new Candidate[0];
250270
public final BytesRef term;
251271
public final double stringDistance;
252-
public final long frequency;
272+
public final TermStats termStats;
253273
public final double score;
254274
public final boolean userInput;
255275

256-
public Candidate(BytesRef term, long frequency, double stringDistance, double score, boolean userInput) {
257-
this.frequency = frequency;
276+
public Candidate(BytesRef term, TermStats termStats, double stringDistance, double score, boolean userInput) {
277+
this.termStats = termStats;
258278
this.term = term;
259279
this.stringDistance = stringDistance;
260280
this.score = score;
@@ -266,7 +286,7 @@ public String toString() {
266286
return "Candidate [term=" + term.utf8ToString()
267287
+ ", stringDistance=" + stringDistance
268288
+ ", score=" + score
269-
+ ", frequency=" + frequency
289+
+ ", termStats=" + termStats
270290
+ (userInput ? ", userInput" : "") + "]";
271291
}
272292

@@ -305,8 +325,8 @@ public int compareTo(Candidate other) {
305325
}
306326

307327
@Override
308-
public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException {
309-
return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
328+
public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException {
329+
return new Candidate(term, termStats, channelScore, score(termStats, channelScore, sumTotalTermFreq), userInput);
310330
}
311331

312332
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)

server/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ protected double scoreUnigram(Candidate word) throws IOException {
4646
@Override
4747
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
4848
join(separator, spare, w_1.term, word.term);
49-
return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms);
49+
return (alpha + frequency(spare.get())) / (w_1.termStats.totalTermFreq + alpha * numTerms);
5050
}
5151

5252
@Override

server/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
6060
if (count < 1) {
6161
return unigramLambda * scoreUnigram(word);
6262
}
63-
return bigramLambda * (count / (0.5d + w_1.frequency)) + unigramLambda * scoreUnigram(word);
63+
return bigramLambda * (count / (0.5d + w_1.termStats.totalTermFreq)) + unigramLambda * scoreUnigram(word);
6464
}
6565

6666
@Override

0 commit comments

Comments
 (0)