From e72f7c2f22de1d4887b30571fb65f8b3bb408d6b Mon Sep 17 00:00:00 2001 From: Aditi Ahuja <48997495+metonymic-smokey@users.noreply.github.com> Date: Wed, 13 Nov 2024 22:55:32 +0530 Subject: [PATCH] MB-62230 - Pre-filtering Optimisation (#2098) This PR - 1. Avoids creating document matches for the pre-filter phase, when IDs suffice. 2. Re-uses document matches by adding them to the doc match pool after each hit. --------- Co-authored-by: Abhinav Dangeti --- go.mod | 2 +- go.sum | 4 ++-- search/collector/eligible.go | 40 ++++++++++++++---------------------- search/collector/heap.go | 5 ----- search/collector/list.go | 5 ----- search/collector/slice.go | 9 +++----- search/collector/topn.go | 4 ---- search_knn.go | 7 ++----- 8 files changed, 23 insertions(+), 53 deletions(-) diff --git a/go.mod b/go.mod index 4079b4d47..19e826daa 100644 --- a/go.mod +++ b/go.mod @@ -24,7 +24,7 @@ require ( github.com/blevesearch/zapx/v13 v13.3.10 github.com/blevesearch/zapx/v14 v14.3.10 github.com/blevesearch/zapx/v15 v15.3.16 - github.com/blevesearch/zapx/v16 v16.1.8-0.20241104164502-f19d5f0cdbcb + github.com/blevesearch/zapx/v16 v16.1.8 github.com/couchbase/moss v0.2.0 github.com/golang/protobuf v1.3.2 github.com/spf13/cobra v1.7.0 diff --git a/go.sum b/go.sum index 7202431a3..bf94edfae 100644 --- a/go.sum +++ b/go.sum @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= github.com/blevesearch/zapx/v15 v15.3.16 h1:Ct3rv7FUJPfPk99TI/OofdC+Kpb4IdyfdMH48sb+FmE= github.com/blevesearch/zapx/v15 v15.3.16/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= -github.com/blevesearch/zapx/v16 v16.1.8-0.20241104164502-f19d5f0cdbcb h1:+LkKIOe8vnyxmHLI8iOa8vpv9h46qYait5znwcl7Utg= -github.com/blevesearch/zapx/v16 v16.1.8-0.20241104164502-f19d5f0cdbcb/go.mod h1:JqQlOqlRVaYDkpLIl3JnKql8u4zKTNlVEa3nLsi0Gn8= +github.com/blevesearch/zapx/v16 v16.1.8 h1:Bxzpw6YQpFs7UjoCV1+RvDw6fmAT2GZxldwX8b3wVBM= +github.com/blevesearch/zapx/v16 v16.1.8/go.mod h1:JqQlOqlRVaYDkpLIl3JnKql8u4zKTNlVEa3nLsi0Gn8= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/search/collector/eligible.go b/search/collector/eligible.go index 3f78f7049..5590290b0 100644 --- a/search/collector/eligible.go +++ b/search/collector/eligible.go @@ -29,7 +29,7 @@ type EligibleCollector struct { took time.Duration results search.DocumentMatchCollection - store collectorStore + ids []index.IndexInternalID } func NewEligibleCollector(size int) *EligibleCollector { @@ -38,13 +38,9 @@ func NewEligibleCollector(size int) *EligibleCollector { func newEligibleCollector(size int) *EligibleCollector { // No sort order & skip always 0 since this is only to filter eligible docs. - ec := &EligibleCollector{size: size} - - // comparator is a dummy here - ec.store = getOptimalCollectorStore(size, 0, func(i, j *search.DocumentMatch) int { - return 0 - }) - + ec := &EligibleCollector{size: size, + ids: make([]index.IndexInternalID, 0, size), + } return ec } @@ -55,8 +51,13 @@ func makeEligibleDocumentMatchHandler(ctx *search.SearchContext) (search.Documen return nil } - // No elements removed from the store here. - _ = ec.store.Add(d) + copyOfID := make([]byte, len(d.IndexInternalID)) + copy(copyOfID, d.IndexInternalID) + ec.ids = append(ec.ids, copyOfID) + + // recycle the DocumentMatch + ctx.DocumentMatchPool.Put(d) + return nil }, nil } @@ -122,26 +123,15 @@ func (ec *EligibleCollector) Collect(ctx context.Context, searcher search.Search // compute search duration ec.took = time.Since(startTime) - // finalize actual results - err = ec.finalizeResults(reader) - if err != nil { - return err - } return nil } -func (ec *EligibleCollector) finalizeResults(r index.IndexReader) error { - var err error - ec.results, err = ec.store.Final(0, func(doc *search.DocumentMatch) error { - // Adding the results to the store without any modifications since we don't - // require the external ID of the filtered hits. - return nil - }) - return err +func (ec *EligibleCollector) Results() search.DocumentMatchCollection { + return nil } -func (ec *EligibleCollector) Results() search.DocumentMatchCollection { - return ec.results +func (ec *EligibleCollector) IDs() []index.IndexInternalID { + return ec.ids } func (ec *EligibleCollector) Total() uint64 { diff --git a/search/collector/heap.go b/search/collector/heap.go index ab068b084..cd662bcf9 100644 --- a/search/collector/heap.go +++ b/search/collector/heap.go @@ -34,11 +34,6 @@ func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap { return rv } -func (c *collectStoreHeap) Add(doc *search.DocumentMatch) *search.DocumentMatch { - c.add(doc) - return nil -} - func (c *collectStoreHeap) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { c.add(doc) diff --git a/search/collector/list.go b/search/collector/list.go index b8b645199..f73505e7d 100644 --- a/search/collector/list.go +++ b/search/collector/list.go @@ -34,11 +34,6 @@ func newStoreList(capacity int, compare collectorCompare) *collectStoreList { return rv } -func (c *collectStoreList) Add(doc *search.DocumentMatch, size int) *search.DocumentMatch { - c.results.PushBack(doc) - return nil -} - func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { c.add(doc) if c.len() > size { diff --git a/search/collector/slice.go b/search/collector/slice.go index 03b212b0f..6120921cb 100644 --- a/search/collector/slice.go +++ b/search/collector/slice.go @@ -14,7 +14,9 @@ package collector -import "github.com/blevesearch/bleve/v2/search" +import ( + "github.com/blevesearch/bleve/v2/search" +) type collectStoreSlice struct { slice search.DocumentMatchCollection @@ -29,11 +31,6 @@ func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice { return rv } -func (c *collectStoreSlice) Add(doc *search.DocumentMatch) *search.DocumentMatch { - c.slice = append(c.slice, doc) - return nil -} - func (c *collectStoreSlice) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { c.add(doc) diff --git a/search/collector/topn.go b/search/collector/topn.go index 4971e23ae..fc338f54e 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -33,10 +33,6 @@ func init() { } type collectorStore interface { - // Adds a doc to the store without considering size. - // Returns nil if the doc was added successfully. - Add(doc *search.DocumentMatch) *search.DocumentMatch - // Add the document, and if the new store size exceeds the provided size // the last element is removed and returned. If the size has not been // exceeded, nil is returned. diff --git a/search_knn.go b/search_knn.go index 120ed04a0..309b36593 100644 --- a/search_knn.go +++ b/search_knn.go @@ -404,12 +404,9 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea if err != nil { return nil, err } - filterHits := filterColl.Results() + filterHits := filterColl.IDs() if len(filterHits) > 0 { - filterHitsMap[idx] = make([]index.IndexInternalID, len(filterHits)) - for i, docMatch := range filterHits { - filterHitsMap[idx][i] = docMatch.IndexInternalID - } + filterHitsMap[idx] = filterHits } // set requiresFiltering regardless of whether there're filtered hits or // not to later decide whether to consider the knnQuery or not