-
Notifications
You must be signed in to change notification settings - Fork 735
Prefixed vector index: bug fixes #16376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,9 +21,6 @@ | |
| namespace NKikimr { | ||
| namespace NSchemeShard { | ||
|
|
||
| // TODO(mbkkt) get table rows count (but even better to have unique prefixes count) | ||
| static constexpr ui64 TableSize = 1'000; | ||
|
|
||
| static constexpr const char* Name(TIndexBuildInfo::EState state) noexcept { | ||
| switch (state) { | ||
| case TIndexBuildInfo::EState::Invalid: | ||
|
|
@@ -681,7 +678,8 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil | |
| ev->Record.SetNeedsRounds(3); // TODO(mbkkt) should be configurable | ||
|
|
||
| const auto shardIndex = buildInfo.Shards.at(shardIdx).Index; | ||
| ev->Record.SetChild(buildInfo.KMeans.ChildBegin + (1 + TableSize) * shardIndex); | ||
| // about 2 * TableSize see comment in PrefixIndexDone | ||
| ev->Record.SetChild(buildInfo.KMeans.ChildBegin + (2 * buildInfo.KMeans.TableSize) * shardIndex); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Не очень понимаю что тут происходит Вот мы хотим построить индекс, запускаем на каждом шарде prefix kmeans Запускаем последовательно и TableSize это число строк в шардах слева? Что значит
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Комментарий в другом месте подробный
У нас построение выглядит так. |
||
|
|
||
| ev->Record.SetPostingName(path.Dive(buildInfo.KMeans.WriteTo()).PathString()); | ||
| path.Rise().Dive(NTableIndex::NTableVectorKmeansTreeIndex::LevelTable); | ||
|
|
@@ -931,7 +929,11 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil | |
| db.Table<Schema::KMeansTreeProgress>().Key(buildInfo.Id).Update( | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::Level>(buildInfo.KMeans.Level), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::State>(buildInfo.KMeans.State), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::Parent>(buildInfo.KMeans.Parent) | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::Parent>(buildInfo.KMeans.Parent), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::ParentBegin>(buildInfo.KMeans.ParentBegin), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::Child>(buildInfo.KMeans.Child), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::ChildBegin>(buildInfo.KMeans.ChildBegin), | ||
| NIceDb::TUpdate<Schema::KMeansTreeProgress::TableSize>(buildInfo.KMeans.TableSize) | ||
| ); | ||
| } | ||
|
|
||
|
|
@@ -944,7 +946,9 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil | |
| const ui64 doneShards = buildInfo.DoneShards.size(); | ||
|
|
||
| ClearDoneShards(txc, buildInfo); | ||
| Y_ABORT_UNLESS(buildInfo.KMeans.PrefixTableDone(TableSize, doneShards)); | ||
| // it's approximate but upper bound, so it's ok | ||
| buildInfo.KMeans.TableSize = std::max<ui64>(1, buildInfo.Processed.GetUploadRows()); | ||
| buildInfo.KMeans.PrefixIndexDone(doneShards); | ||
| PersistKMeansState(txc, buildInfo); | ||
| NIceDb::TNiceDb db{txc.DB}; | ||
| Self->PersistBuildIndexUploadReset(db, buildInfo); | ||
|
|
@@ -1316,14 +1320,14 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil | |
| auto tableColumns = NTableIndex::ExtractInfo(table); // skip dropped columns | ||
| TSerializedTableRange shardRange = InfiniteRange(tableColumns.Keys.size()); | ||
| static constexpr std::string_view LogPrefix = ""; | ||
| LOG_D("infinite range " << buildInfo.KMeans.RangeToDebugStr(shardRange)); | ||
| LOG_D("infinite range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1)); | ||
|
|
||
| buildInfo.Cluster2Shards.clear(); | ||
| for (const auto& x: table->GetPartitions()) { | ||
| Y_ABORT_UNLESS(Self->ShardInfos.contains(x.ShardIdx)); | ||
| TSerializedCellVec bound{x.EndOfRange}; | ||
| shardRange.To = bound; | ||
| LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange)); | ||
| LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1)); | ||
| buildInfo.AddParent(shardRange, x.ShardIdx); | ||
| auto [it, emplaced] = buildInfo.Shards.emplace(x.ShardIdx, TIndexBuildInfo::TShardStatus{std::move(shardRange), "", buildInfo.Shards.size()}); | ||
| Y_ASSERT(emplaced); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3113,6 +3113,9 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { | |
| NTableIndex::TClusterId ChildBegin = 1; // included | ||
| NTableIndex::TClusterId Child = ChildBegin; | ||
|
|
||
| ui64 TableSize = 0; | ||
|
|
||
|
|
||
| ui64 ParentEnd() const noexcept { // included | ||
| return ChildBegin - 1; | ||
| } | ||
|
|
@@ -3173,25 +3176,28 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { | |
| return true; | ||
| } | ||
|
|
||
| bool PrefixTableDone(ui64 tableSize, ui64 shards) { | ||
| if (!NeedsAnotherLevel()) { | ||
| return false; | ||
| } | ||
| void PrefixIndexDone(ui64 shards) { | ||
| Y_ABORT_UNLESS(NeedsAnotherLevel()); | ||
| State = MultiLocal; | ||
| NextLevel((1 + tableSize) * shards); | ||
| // There's two worst cases, but in both one shard contains TableSize rows | ||
| // 1. all rows have unique prefix (*), in such case we need 1 id for each row (parent, id in prefix table) | ||
| // 2. all unique prefixes have size K, so we have TableSize/K parents + TableSize childs | ||
| // * it doesn't work now, because now prefix should have at least K embeddings, but it's bug | ||
| NextLevel((2 * TableSize) * shards); | ||
| Parent = ParentEnd(); | ||
| return true; | ||
| } | ||
|
|
||
| void Set(ui32 level, NTableIndex::TClusterId parent, ui32 state) { | ||
| // TODO(mbkkt) make it without cycles | ||
| while (Level < level) { | ||
| NextLevel(); | ||
| } | ||
| while (Parent < parent) { | ||
| NextParent(); | ||
| } | ||
| void Set(ui32 level, | ||
| NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent, | ||
| NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child, | ||
| ui32 state, ui64 tableSize) { | ||
| Level = level; | ||
| ParentBegin = parentBegin; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Могу только догадываться что эти числа используются для нумерации кластеров, но явно не хватает какого-то комментария с тем картинкой как они используются и что значат
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ну они не в первый раз возникли) циклы выглядят так parentEnd ~ childBegin (там функции есть, по ним видно) Бтв |
||
| Parent = parent; | ||
| ChildBegin = childBegin; | ||
| Child = child; | ||
| State = static_cast<EState>(state); | ||
| TableSize = tableSize; | ||
| } | ||
|
|
||
| NKikimrTxDataShard::TEvLocalKMeansRequest::EState GetUpload() const { | ||
|
|
@@ -3251,7 +3257,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { | |
| return {parentFrom, parentTo}; | ||
| } | ||
|
|
||
| TString RangeToDebugStr(const TSerializedTableRange& range) const { | ||
| TString RangeToDebugStr(const TSerializedTableRange& range, ui32 rootLevel) const { | ||
| auto toStr = [&](const TSerializedCellVec& v) -> TString { | ||
| const auto cells = v.GetCells(); | ||
| if (cells.empty()) { | ||
|
|
@@ -3261,8 +3267,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { | |
| return "-inf"; | ||
| } | ||
| auto str = TStringBuilder{} << "{ count: " << cells.size(); | ||
| if (Parent != 0) { | ||
| Y_ASSERT(Level != 0); | ||
| if (Level > rootLevel) { | ||
| str << ", parent: " << cells[0].AsValue<NTableIndex::TClusterId>(); | ||
| if (cells.size() != 1 && cells[1].IsNull()) { | ||
| str << ", pk: null"; | ||
|
|
@@ -3652,7 +3657,8 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> { | |
|
|
||
| TSerializedTableRange bound{range}; | ||
| LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX, | ||
| "AddShardStatus id# " << Id << " shard " << shardIdx << " range " << KMeans.RangeToDebugStr(bound)); | ||
| "AddShardStatus id# " << Id << " shard " << shardIdx << | ||
| " range " << KMeans.RangeToDebugStr(bound, IsBuildPrefixedVectorIndex() ? 2 : 1)); | ||
| AddParent(bound, shardIdx); | ||
| Shards.emplace( | ||
| shardIdx, TIndexBuildInfo::TShardStatus(std::move(bound), std::move(lastKeyAck), Shards.size())); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Это чтобы номера не пересекались нужно? Выглядит не очень надёжно :(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Да все так.
Ну не очень очевидно как по другому сделать.