LUCENE-8848 LUCENE-7757 LUCENE-8492: UnifiedHighlighter.hasUnrecognizedQuery

The UH now detects when parts of the query are not understood by it.
When found, it highlights more safely and reliably.
This fixes compatibility with the complex-phrase and surround query parsers.
dsmiley committed Jun 21, 2019
1 parent 9137a0b commit 54cc701
Showing 10 changed files with 218 additions and 44 deletions.
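
For context, a minimal sketch of the scenario being fixed (not part of this commit; the field name, query string, and setup are illustrative): highlighting a ComplexPhraseQueryParser query, whose rewritten span leaves the UnifiedHighlighter previously could not extract terms or automata from.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

static String[] highlightComplexPhrase(IndexSearcher searcher, Analyzer analyzer) throws Exception {
  // A phrase containing a wildcard; the parser rewrites it into span queries.
  Query query = new ComplexPhraseQueryParser("body", analyzer).parse("\"jo* smith\"");
  TopDocs topDocs = searcher.search(query, 10);
  // With this change the UH notices the query parts it can't recognize and highlights
  // safely (best with the WEIGHT_MATCHES flag) instead of producing no snippets.
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
  return highlighter.highlight("body", query, topDocs);
}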
5 changes: 5 additions & 0 deletions lucene/CHANGES.txt
@@ -109,6 +109,11 @@ Improvements
* LUCENE-8845: Allow Intervals.prefix() and Intervals.wildcard() to specify
their maximum allowed expansions (Alan Woodward)

* LUCENE-8848 LUCENE-7757 LUCENE-8492: The UnifiedHighlighter now detects when parts of the query are not understood by
it, in which case it avoids optimizations that could result in no highlights or slow highlighting. This generally works
best in WEIGHT_MATCHES mode. Consequently, queries produced by ComplexPhraseQueryParser and the surround QueryParser
will now highlight correctly. (David Smiley)

Optimizations

* LUCENE-8796: Use exponential search instead of binary search in

MemoryIndexOffsetStrategy.java
@@ -57,6 +57,11 @@ public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
*/
private static CharacterRunAutomaton buildCombinedAutomaton(UHComponents components) {
// We don't know enough about the query to do this confidently
if (components.getTerms() == null || components.getAutomata() == null) {
return null;
}

List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
if (components.getTerms().length > 0) {
allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(components.getTerms()))));
@@ -93,7 +98,9 @@ public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
TokenStream tokenStream = tokenStream(content);

// Filter the tokenStream to applicable terms
-    tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
+    if (preMemIndexFilterAutomaton != null) {
+      tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
+    }
memoryIndex.reset();
memoryIndex.addField(getField(), tokenStream);//note: calls tokenStream.reset() & close()

MultiTermHighlighting.java
@@ -53,6 +53,14 @@ static CharacterRunAutomaton[] extractAutomata(Query query, Predicate<String> fi
return collector.runAutomata.toArray(new CharacterRunAutomaton[0]);
}

/**
* Indicates if the leaf query (from {@link QueryVisitor#visitLeaf(Query)}) is a type of query that
* we can extract automata from.
*/
public static boolean canExtractAutomataFromLeafQuery(Query query) {
return query instanceof AutomatonQuery || query instanceof FuzzyQuery;
}

private static class AutomataCollector extends QueryVisitor {

List<CharacterRunAutomaton> runAutomata = new ArrayList<>();
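
As a rough illustration of the new predicate (a sketch, not part of this commit; it must live in the same package since MultiTermHighlighting is package-private, and the example queries are arbitrary):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.WildcardQuery;

static void demoCanExtractAutomata() {
  // WildcardQuery extends AutomatonQuery, so its automaton is extractable:
  assert MultiTermHighlighting.canExtractAutomataFromLeafQuery(new WildcardQuery(new Term("body", "fo*")));
  // FuzzyQuery is special-cased above because it is not an AutomatonQuery:
  assert MultiTermHighlighting.canExtractAutomataFromLeafQuery(new FuzzyQuery(new Term("body", "foo")));
  // A leaf type the highlighter cannot extract anything from:
  assert !MultiTermHighlighting.canExtractAutomataFromLeafQuery(new DocValuesFieldExistsQuery("body"));
}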

NoOpOffsetStrategy.java
@@ -34,7 +34,7 @@ public class NoOpOffsetStrategy extends FieldOffsetStrategy {
public static final NoOpOffsetStrategy INSTANCE = new NoOpOffsetStrategy();

private NoOpOffsetStrategy() {
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], Collections.emptySet()));
super(new UHComponents("_ignored_", (s) -> false, new MatchNoDocsQuery(), new BytesRef[0], PhraseHelper.NONE, new CharacterRunAutomaton[0], false, Collections.emptySet()));
}

@Override

TokenStreamOffsetStrategy.java
@@ -34,21 +34,15 @@
*/
public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {

-  private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
+  private final CharacterRunAutomaton[] combinedAutomata;

public TokenStreamOffsetStrategy(UHComponents components, Analyzer indexAnalyzer) {
-    super(new UHComponents(
-        components.getField(),
-        components.getFieldMatcher(),
-        components.getQuery(),
-        ZERO_LEN_BYTES_REF_ARRAY,
-        components.getPhraseHelper(),
-        convertTermsToAutomata(components.getTerms(), components.getAutomata()),
-        components.getHighlightFlags()),
-        indexAnalyzer);
+    super(components, indexAnalyzer);
     assert components.getPhraseHelper().hasPositionSensitivity() == false;
+    combinedAutomata = convertTermsToAutomata(components.getTerms(), components.getAutomata());
}

//TODO this is inefficient; instead build a union automaton just for the terms part.
private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
for (int i = 0; i < terms.length; i++) {
@@ -67,7 +61,7 @@ public String toString() {

@Override
public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException {
-    return new TokenStreamOffsetsEnum(tokenStream(content), components.getAutomata());
+    return new TokenStreamOffsetsEnum(tokenStream(content), combinedAutomata);
}

private static class TokenStreamOffsetsEnum extends OffsetsEnum {

UHComponents.java
@@ -36,17 +36,19 @@ public class UHComponents {
private final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
private final PhraseHelper phraseHelper; // Query: position-sensitive information
private final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
private final boolean hasUnrecognizedQueryPart; // Query: if part of the query (other than the extracted terms / automata) is a leaf we don't know
private final Set<UnifiedHighlighter.HighlightFlag> highlightFlags;

public UHComponents(String field, Predicate<String> fieldMatcher, Query query,
BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
-                      Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
+                      boolean hasUnrecognizedQueryPart, Set<UnifiedHighlighter.HighlightFlag> highlightFlags) {
this.field = field;
this.fieldMatcher = fieldMatcher;
this.query = query;
this.terms = terms;
this.phraseHelper = phraseHelper;
this.automata = automata;
this.hasUnrecognizedQueryPart = hasUnrecognizedQueryPart;
this.highlightFlags = highlightFlags;
}

@@ -74,6 +76,10 @@ public CharacterRunAutomaton[] getAutomata() {
return automata;
}

public boolean hasUnrecognizedQueryPart() {
return hasUnrecognizedQueryPart;
}

public Set<UnifiedHighlighter.HighlightFlag> getHighlightFlags() {
return highlightFlags;
}

UnifiedHighlighter.java
@@ -51,6 +51,8 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
@@ -750,13 +752,8 @@ public Object highlightWithoutSearcher(String field, Query query, String content
}

protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
-    Predicate<String> fieldMatcher = getFieldMatcher(field);
-    BytesRef[] terms = filterExtractedTerms(fieldMatcher, allTerms);
-    Set<HighlightFlag> highlightFlags = getFlags(field);
-    PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
-    CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
-    OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
-    UHComponents components = new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, highlightFlags);
+    UHComponents components = getHighlightComponents(field, query, allTerms);
+    OffsetSource offsetSource = getOptimizedOffsetSource(components);
return new FieldHighlighter(field,
getOffsetStrategy(offsetSource, components),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
@@ -766,6 +763,41 @@ protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Te
getFormatter(field));
}

protected UHComponents getHighlightComponents(String field, Query query, Set<Term> allTerms) {
Predicate<String> fieldMatcher = getFieldMatcher(field);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
boolean queryHasUnrecognizedPart = hasUnrecognizedQuery(fieldMatcher, query);
BytesRef[] terms = null;
CharacterRunAutomaton[] automata = null;
if (!highlightFlags.contains(HighlightFlag.WEIGHT_MATCHES) || !queryHasUnrecognizedPart) {
terms = filterExtractedTerms(fieldMatcher, allTerms);
automata = getAutomata(field, query, highlightFlags);
} // otherwise don't need to extract
return new UHComponents(field, fieldMatcher, query, terms, phraseHelper, automata, queryHasUnrecognizedPart, highlightFlags);
}

protected boolean hasUnrecognizedQuery(Predicate<String> fieldMatcher, Query query) {
boolean[] hasUnknownLeaf = new boolean[1];
query.visit(new QueryVisitor() {
@Override
public boolean acceptField(String field) {
// checking hasUnknownLeaf is a trick to exit early
return hasUnknownLeaf[0] == false && fieldMatcher.test(field);
}

@Override
public void visitLeaf(Query query) {
if (MultiTermHighlighting.canExtractAutomataFromLeafQuery(query) == false) {
if (!(query instanceof MatchAllDocsQuery || query instanceof MatchNoDocsQuery)) {
hasUnknownLeaf[0] = true;
}
}
}
});
return hasUnknownLeaf[0];
}

protected static BytesRef[] filterExtractedTerms(Predicate<String> fieldMatcher, Set<Term> queryTerms) {
// Strip off the redundant field and sort the remaining terms
SortedSet<BytesRef> filteredTerms = new TreeSet<>();
@@ -819,26 +851,26 @@ protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<Hig
: ZERO_LEN_AUTOMATA_ARRAY;
}

-  protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
-    OffsetSource offsetSource = getOffsetSource(field);
+  protected OffsetSource getOptimizedOffsetSource(UHComponents components) {
+    OffsetSource offsetSource = getOffsetSource(components.getField());

-    if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
+    // null automata means unknown, so assume a possibility
+    boolean mtqOrRewrite = components.getAutomata() == null || components.getAutomata().length > 0
+        || components.getPhraseHelper().willRewrite() || components.hasUnrecognizedQueryPart();
+
+    // null terms means unknown, so assume something to highlight
+    if (mtqOrRewrite == false && components.getTerms() != null && components.getTerms().length == 0) {
return OffsetSource.NONE_NEEDED; //nothing to highlight
}

switch (offsetSource) {
case POSTINGS:
-        if (phraseHelper.willRewrite()) {
-          // We can't choose the postings offset source when there is "rewriting" in the strict phrase
-          // processing (rare but possible). Postings requires knowing all the terms (except wildcards)
-          // up front.
-          return OffsetSource.ANALYSIS;
-        } else if (automata.length > 0) {
+        if (mtqOrRewrite) { // may need to scan through all terms for the highlighted document efficiently
return OffsetSource.ANALYSIS;
}
break;
case POSTINGS_WITH_TERM_VECTORS:
-        if (!phraseHelper.willRewrite() && automata.length == 0) {
+        if (mtqOrRewrite == false) {
return OffsetSource.POSTINGS; //We don't need term vectors
}
break;
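
To summarize the new detection, a sketch (not from this commit; hasUnrecognizedQuery is protected, so it is probed here from a trivial subclass, and the expected outcomes assume the QueryVisitor dispatch of this Lucene version):

import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

class ProbingHighlighter extends UnifiedHighlighter {
  ProbingHighlighter(IndexSearcher searcher, Analyzer analyzer) {
    super(searcher, analyzer);
  }

  void demo() {
    Predicate<String> anyField = f -> true;
    // TermQuery reports its term via QueryVisitor.consumeTerms, never visitLeaf: recognized.
    assert hasUnrecognizedQuery(anyField, new TermQuery(new Term("body", "one"))) == false;
    // WildcardQuery's leaf is an AutomatonQuery: recognized, handled as an automaton.
    assert hasUnrecognizedQuery(anyField, new WildcardQuery(new Term("body", "on*"))) == false;
    // A leaf yielding neither terms nor automata: unrecognized, so the UH skips the
    // offset-source optimizations above and relies on Weight.matches instead.
    assert hasUnrecognizedQuery(anyField, new DocValuesFieldExistsQuery("body"));
  }
}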

TestUnifiedHighlighter.java
@@ -48,10 +48,13 @@
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -1357,4 +1360,104 @@ public void testNestedBooleanQueryAccuracy() throws IOException {

ir.close();
}

public void testNotReanalyzed() throws Exception {
if (fieldType == UHTestHelper.reanalysisType) {
  return; // we're testing the *other* cases (return before opening the writer so it isn't leaked)
}

RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);

Field body = new Field("body", "", fieldType);
Document doc = new Document();
doc.add(body);

body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
iw.addDocument(doc);

IndexReader ir = iw.getReader();
iw.close();

IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
throw new AssertionError("shouldn't be called");
}
});
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits.value);
String[] snippets = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length);
assertEquals("Just a test <b>highlighting</b> from postings. ", snippets[0]);

ir.close();
}

public void testUnknownQueryWithWeightMatches() throws IOException {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);

Field body = new Field("body", "", fieldType);
Document doc = new Document();
doc.add(body);

body.setStringValue("Test a one sentence document.");
iw.addDocument(doc);

IndexReader ir = iw.getReader();
iw.close();

IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighter = randomUnifiedHighlighter(searcher, indexAnalyzer,
EnumSet.of(HighlightFlag.WEIGHT_MATCHES), null);
Query query = new BooleanQuery.Builder()
// simple term query body:one
.add(new TermQuery(new Term(body.name(), "one")), BooleanClause.Occur.MUST)
// a custom query, a leaf, that matches body:sentence
// Note this isn't even an MTQ. What matters is that Weight.matches works.
.add(new Query() {
@Override
public String toString(String field) {
return "bogus";
}

@Override
public Query rewrite(IndexReader reader) {
return this;
}

// we don't visit terms, and we don't expose an automaton. Thus this appears as some unknown leaf.
@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(body.name())) {
visitor.visitLeaf(this);
}
}

@Override
public boolean equals(Object obj) {
return this == obj;
}

@Override
public int hashCode() {
return System.identityHashCode(this);
}

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
//TODO maybe should loop through index terms to show we can see other terms
return new TermQuery(new Term(body.name(), "sentence")).createWeight(searcher, scoreMode, boost);
}
}, BooleanClause.Occur.MUST)
.build();
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits.value);
String[] snippets = highlighter.highlight("body", query, topDocs);
assertEquals(1, snippets.length);
assertEquals("Test a <b>one</b> <b>sentence</b> document.", snippets[0]);

ir.close();
}
}