Skip to content

Semantic_text match_all with Highlighter #128702

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/128702.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 128702
summary: Fix missing highlighting in `match_all` queries for `semantic_text` fields
area: Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public class InferenceFeatures implements FeatureSpecification {
private static final NodeFeature TEST_RULE_RETRIEVER_WITH_INDICES_THAT_DONT_RETURN_RANK_DOCS = new NodeFeature(
"test_rule_retriever.with_indices_that_dont_return_rank_docs"
);
private static final NodeFeature SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER = new NodeFeature("semantic_text.match_all_highlighter");

@Override
public Set<NodeFeature> getTestFeatures() {
Expand All @@ -57,7 +58,8 @@ public Set<NodeFeature> getTestFeatures() {
SemanticTextFieldMapper.SEMANTIC_TEXT_BIT_VECTOR_SUPPORT,
SemanticTextFieldMapper.SEMANTIC_TEXT_HANDLE_EMPTY_INPUT,
TEST_RULE_RETRIEVER_WITH_INDICES_THAT_DONT_RETURN_RANK_DOCS,
SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG
SEMANTIC_TEXT_SUPPORT_CHUNKING_CONFIG,
SEMANTIC_TEXT_MATCH_ALL_HIGHLIGHTER
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnByteVectorQuery;
import org.apache.lucene.search.KnnFloatVectorQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
Expand Down Expand Up @@ -267,6 +268,8 @@ public void visitLeaf(Query query) {
queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null));
} else if (query instanceof KnnByteVectorQuery knnQuery) {
queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null));
} else if (query instanceof MatchAllDocsQuery) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😍 Nice optimization from the first solution!

queries.add(new MatchAllDocsQuery());
}
}
});
Expand All @@ -293,6 +296,13 @@ public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) {
}
return this;
}

/**
 * Handles a leaf query reached by this visitor.
 *
 * <p>When the leaf is a {@link MatchAllDocsQuery}, a fresh {@link MatchAllDocsQuery} is appended to
 * {@code queries}; every other leaf type is left untouched by this method.
 *
 * @param query the leaf query being visited
 */
@Override
public void visitLeaf(Query query) {
    // Guard clause: only match-all leaves contribute a query here.
    if ((query instanceof MatchAllDocsQuery) == false) {
        return;
    }
    queries.add(new MatchAllDocsQuery());
}
});
return queries;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,133 @@ setup:
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

---
"Highlighting with match_all query":
- requires:
cluster_features: "semantic_text.match_all_highlighter"
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.

- do:
search:
index: test-sparse-index
body:
query:
match_all: {}
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

- do:
search:
index: test-dense-index
body:
query:
match_all: {}
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

---
"Highlighting with match_all and multi chunks with empty input":
- requires:
cluster_features: "semantic_text.match_all_highlighter"
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.

- do:
indices.create:
index: test-index-sparse
body:
settings:
index.mapping.semantic_text.use_legacy_format: false
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: sparse-inference-id
text_field:
type: text

- do:
index:
index: test-index-sparse
id: doc_1
body:
semantic_text_field: [ "some test data", " ", "now with chunks" ]
text_field: "some test data"
refresh: true

- do:
search:
index: test-index-sparse
body:
query:
match_all: {}
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

- do:
indices.create:
index: test-index-dense
body:
settings:
index.mapping.semantic_text.use_legacy_format: false
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: dense-inference-id
text_field:
type: text

- do:
index:
index: test-index-dense
id: doc_1
body:
semantic_text_field: [ "some test data", " ", "now with chunks" ]
text_field: "some test data"
refresh: true

- do:
search:
index: test-index-dense
body:
query:
match_all: {}
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,150 @@ setup:
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

---
"Highlighting with match_all query":
- requires:
cluster_features: "semantic_text.match_all_highlighter"
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.

- do:
index:
index: test-sparse-index
id: doc_1
body:
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
refresh: true

- do:
search:
index: test-sparse-index
body:
query:
match_all: {}
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

- do:
index:
index: test-dense-index
id: doc_1
body:
body: [ "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!" ]
refresh: true

- do:
search:
index: test-dense-index
body:
query:
match_all: {}
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

---
"Highlighting with match_all and multi chunks with empty input":
- requires:
cluster_features: "semantic_text.match_all_highlighter"
reason: semantic text field supports match_all query with semantic highlighter, effective from 8.19 and 9.1.0.

- do:
indices.create:
index: test-index-sparse
body:
settings:
index.mapping.semantic_text.use_legacy_format: true
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: sparse-inference-id
text_field:
type: text

- do:
index:
index: test-index-sparse
id: doc_1
body:
semantic_text_field: [ "some test data", " ", "now with chunks" ]
text_field: "some test data"
refresh: true

- do:
search:
index: test-index-sparse
body:
query:
match_all: {}
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

- do:
indices.create:
index: test-index-dense
body:
settings:
index.mapping.semantic_text.use_legacy_format: true
mappings:
properties:
semantic_text_field:
type: semantic_text
inference_id: dense-inference-id
text_field:
type: text

- do:
index:
index: test-index-dense
id: doc_1
body:
semantic_text_field: [ "some test data", " ", "now with chunks" ]
text_field: "some test data"
refresh: true

- do:
search:
index: test-index-dense
body:
query:
match_all: {}
highlight:
fields:
semantic_text_field:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.semantic_text_field: 2 }
- match: { hits.hits.0.highlight.semantic_text_field.0: "some test data" }
- match: { hits.hits.0.highlight.semantic_text_field.1: "now with chunks" }

Loading