From ea7f589e6e9973aa0ac88f768a50fd42f57ed14c Mon Sep 17 00:00:00 2001 From: Aditi Ahuja <48997495+metonymic-smokey@users.noreply.github.com> Date: Thu, 19 Sep 2024 22:13:37 +0530 Subject: [PATCH] MB-62230 - Avoiding unnecessary computations for un-filtered kNN. (#2076) Currently, there are a bunch of additional computations done on every kNN query that are necessary only for pre-filtered kNN. This PR adds checks to perform these only when at least one kNN query, out of one or more in the search request, is a filtered kNN query. Also added some commentary. --------- Co-authored-by: Abhinav Dangeti --- go.mod | 4 +-- go.sum | 8 +++--- index/scorch/optimize_knn.go | 43 ++++++++++++++++++------------- index/scorch/snapshot_index_vr.go | 2 +- search_knn.go | 1 + 5 files changed, 33 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index e1d30aadb..d140e35de 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/bits-and-blooms/bitset v1.12.0 github.com/blevesearch/bleve_index_api v1.1.12 github.com/blevesearch/geo v0.1.20 - github.com/blevesearch/go-faiss v1.0.22-0.20240918182005-f19c1d446e92 + github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/goleveldb v1.0.1 @@ -24,7 +24,7 @@ require ( github.com/blevesearch/zapx/v13 v13.3.10 github.com/blevesearch/zapx/v14 v14.3.10 github.com/blevesearch/zapx/v15 v15.3.13 - github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe + github.com/blevesearch/zapx/v16 v16.1.6-0.20240919163431-f2ee7670abd9 github.com/couchbase/moss v0.2.0 github.com/golang/protobuf v1.3.2 github.com/spf13/cobra v1.7.0 diff --git a/go.sum b/go.sum index 137f0da0c..52dd73afd 100644 --- a/go.sum +++ b/go.sum @@ -6,8 +6,8 @@ github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+ github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM= github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w= -github.com/blevesearch/go-faiss v1.0.22-0.20240918182005-f19c1d446e92 h1:pDbDTN8dgycpdp9eCzrNp9e6Z4C+UQhCUAZbaarQ6Bs= -github.com/blevesearch/go-faiss v1.0.22-0.20240918182005-f19c1d446e92/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a h1:mSUfDoOPOLt0OABjiyQq/kQxOzAJmsgIjlAWUPfUDfc= +github.com/blevesearch/go-faiss v1.0.22-0.20240919162919-05a9ee21155a/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA= github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= @@ -43,8 +43,8 @@ github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz7 github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= github.com/blevesearch/zapx/v15 v15.3.13 h1:6EkfaZiPlAxqXz0neniq35my6S48QI94W/wyhnpDHHQ= github.com/blevesearch/zapx/v15 v15.3.13/go.mod h1:Turk/TNRKj9es7ZpKK95PS7f6D44Y7fAFy8F4LXQtGg= -github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe h1:S1rCvhrU2HqDrRtogYgM52rT5px7o2zFIB3Yo+JPFOU= -github.com/blevesearch/zapx/v16 v16.1.6-0.20240909182401-e148470cefbe/go.mod h1:x9Kg015zbkSXxmE7F+0qeGxpeHJBwkDuxosrrDxYltU= +github.com/blevesearch/zapx/v16 v16.1.6-0.20240919163431-f2ee7670abd9 h1:pSaAZuB/gu5cNhSXrpI6s6xyN3ysVdG+RMqEbHEDx+o= +github.com/blevesearch/zapx/v16 v16.1.6-0.20240919163431-f2ee7670abd9/go.mod h1:R6fi71sVKI+HnzchzfkomFQ5HvMvn3CWTmLBuuUqoTQ= github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps= github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k= github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= diff --git a/index/scorch/optimize_knn.go b/index/scorch/optimize_knn.go index 8be394a12..7f0f0907b 100644 --- a/index/scorch/optimize_knn.go +++ b/index/scorch/optimize_knn.go @@ -23,6 +23,7 @@ import ( "sync" "sync/atomic" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/v2/search" index "github.com/blevesearch/bleve_index_api" segment_api "github.com/blevesearch/scorch_segment_api/v2" @@ -64,7 +65,10 @@ func (o *OptimizeVR) Finish() error { var errorsM sync.Mutex var errors []error - snapshotGlobalDocNums := o.snapshot.globalDocNums() + var snapshotGlobalDocNums map[int]*roaring.Bitmap + if o.requiresFiltering { + snapshotGlobalDocNums = o.snapshot.globalDocNums() + } defer o.invokeSearcherEndCallback() @@ -94,28 +98,31 @@ func (o *OptimizeVR) Finish() error { vectorIndexSize := vecIndex.Size() origSeg.cachedMeta.updateMeta(field, vectorIndexSize) for _, vr := range vrs { - eligibleVectorInternalIDs := vr.getEligibleDocIDs() - if snapshotGlobalDocNums != nil { - // Only the eligible documents belonging to this segment - // will get filtered out. - // There is no way to determine which doc belongs to which segment - eligibleVectorInternalIDs.And(snapshotGlobalDocNums[index]) - } - - eligibleLocalDocNums := make([]uint64, - eligibleVectorInternalIDs.Stats().Cardinality) - // get the (segment-)local document numbers - for i, docNum := range eligibleVectorInternalIDs.ToArray() { - localDocNum := o.snapshot.localDocNumFromGlobal(index, - uint64(docNum)) - eligibleLocalDocNums[i] = localDocNum - } - var pl segment_api.VecPostingsList var err error + // for each VR, populate postings list and iterators // by passing the obtained vector index and getting similar vectors. + + // Only applies to filtered kNN. if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 { + eligibleVectorInternalIDs := vr.getEligibleDocIDs() + if snapshotGlobalDocNums != nil { + // Only the eligible documents belonging to this segment + // will get filtered out. + // There is no way to determine which doc belongs to which segment + eligibleVectorInternalIDs.And(snapshotGlobalDocNums[index]) + } + + eligibleLocalDocNums := make([]uint64, + eligibleVectorInternalIDs.Stats().Cardinality) + // get the (segment-)local document numbers + for i, docNum := range eligibleVectorInternalIDs.ToArray() { + localDocNum := o.snapshot.localDocNumFromGlobal(index, + uint64(docNum)) + eligibleLocalDocNums[i] = localDocNum + } + pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k, eligibleLocalDocNums, vr.searchParams) } else { diff --git a/index/scorch/snapshot_index_vr.go b/index/scorch/snapshot_index_vr.go index 05b5167fd..9d32fd6aa 100644 --- a/index/scorch/snapshot_index_vr.go +++ b/index/scorch/snapshot_index_vr.go @@ -54,7 +54,7 @@ type IndexSnapshotVectorReader struct { searchParams json.RawMessage // The following fields are only applicable for vector readers which will - // process kNN queries. + // process pre-filtered kNN queries. eligibleDocIDs []index.IndexInternalID } diff --git a/search_knn.go b/search_knn.go index 025c7ab1b..279f35df1 100644 --- a/search_knn.go +++ b/search_knn.go @@ -376,6 +376,7 @@ func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, rea } if _, ok := filterQ.(*query.MatchAllQuery); ok { + // Equivalent to not having a filter query. requiresFiltering[idx] = false continue }