Skip to content

Commit 0610a4d

Browse files
authored
Prefixed vector index: bug fixes (#16376)
1 parent f0a4b08 commit 0610a4d

File tree

5 files changed

+78
-29
lines changed

5 files changed

+78
-29
lines changed

ydb/core/tx/schemeshard/schemeshard__init.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4566,8 +4566,12 @@ struct TSchemeShard::TTxInit : public TTransactionBase<TSchemeShard> {
45664566
auto& buildInfo = *buildInfoPtr->Get();
45674567
buildInfo.KMeans.Set(
45684568
rowset.GetValue<Schema::KMeansTreeProgress::Level>(),
4569+
rowset.GetValue<Schema::KMeansTreeProgress::ParentBegin>(),
45694570
rowset.GetValue<Schema::KMeansTreeProgress::Parent>(),
4570-
rowset.GetValue<Schema::KMeansTreeProgress::State>()
4571+
rowset.GetValue<Schema::KMeansTreeProgress::ChildBegin>(),
4572+
rowset.GetValue<Schema::KMeansTreeProgress::Child>(),
4573+
rowset.GetValue<Schema::KMeansTreeProgress::State>(),
4574+
rowset.GetValue<Schema::KMeansTreeProgress::TableSize>()
45714575
);
45724576
buildInfo.Sample.Rows.reserve(buildInfo.KMeans.K * 2);
45734577

ydb/core/tx/schemeshard/schemeshard_build_index__progress.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@
2121
namespace NKikimr {
2222
namespace NSchemeShard {
2323

24-
// TODO(mbkkt) get table rows count (but even better to have unique prefixes count)
25-
static constexpr ui64 TableSize = 1'000;
26-
2724
static constexpr const char* Name(TIndexBuildInfo::EState state) noexcept {
2825
switch (state) {
2926
case TIndexBuildInfo::EState::Invalid:
@@ -681,7 +678,8 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil
681678
ev->Record.SetNeedsRounds(3); // TODO(mbkkt) should be configurable
682679

683680
const auto shardIndex = buildInfo.Shards.at(shardIdx).Index;
684-
ev->Record.SetChild(buildInfo.KMeans.ChildBegin + (1 + TableSize) * shardIndex);
681+
// for why 2 * TableSize is used, see the comment in PrefixIndexDone
682+
ev->Record.SetChild(buildInfo.KMeans.ChildBegin + (2 * buildInfo.KMeans.TableSize) * shardIndex);
685683

686684
ev->Record.SetPostingName(path.Dive(buildInfo.KMeans.WriteTo()).PathString());
687685
path.Rise().Dive(NTableIndex::NTableVectorKmeansTreeIndex::LevelTable);
@@ -931,7 +929,11 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil
931929
db.Table<Schema::KMeansTreeProgress>().Key(buildInfo.Id).Update(
932930
NIceDb::TUpdate<Schema::KMeansTreeProgress::Level>(buildInfo.KMeans.Level),
933931
NIceDb::TUpdate<Schema::KMeansTreeProgress::State>(buildInfo.KMeans.State),
934-
NIceDb::TUpdate<Schema::KMeansTreeProgress::Parent>(buildInfo.KMeans.Parent)
932+
NIceDb::TUpdate<Schema::KMeansTreeProgress::Parent>(buildInfo.KMeans.Parent),
933+
NIceDb::TUpdate<Schema::KMeansTreeProgress::ParentBegin>(buildInfo.KMeans.ParentBegin),
934+
NIceDb::TUpdate<Schema::KMeansTreeProgress::Child>(buildInfo.KMeans.Child),
935+
NIceDb::TUpdate<Schema::KMeansTreeProgress::ChildBegin>(buildInfo.KMeans.ChildBegin),
936+
NIceDb::TUpdate<Schema::KMeansTreeProgress::TableSize>(buildInfo.KMeans.TableSize)
935937
);
936938
}
937939

@@ -944,7 +946,9 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil
944946
const ui64 doneShards = buildInfo.DoneShards.size();
945947

946948
ClearDoneShards(txc, buildInfo);
947-
Y_ABORT_UNLESS(buildInfo.KMeans.PrefixTableDone(TableSize, doneShards));
949+
// it's approximate, but it is an upper bound, so it's ok
950+
buildInfo.KMeans.TableSize = std::max<ui64>(1, buildInfo.Processed.GetUploadRows());
951+
buildInfo.KMeans.PrefixIndexDone(doneShards);
948952
PersistKMeansState(txc, buildInfo);
949953
NIceDb::TNiceDb db{txc.DB};
950954
Self->PersistBuildIndexUploadReset(db, buildInfo);
@@ -1316,14 +1320,14 @@ struct TSchemeShard::TIndexBuilder::TTxProgress: public TSchemeShard::TIndexBuil
13161320
auto tableColumns = NTableIndex::ExtractInfo(table); // skip dropped columns
13171321
TSerializedTableRange shardRange = InfiniteRange(tableColumns.Keys.size());
13181322
static constexpr std::string_view LogPrefix = "";
1319-
LOG_D("infinite range " << buildInfo.KMeans.RangeToDebugStr(shardRange));
1323+
LOG_D("infinite range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1));
13201324

13211325
buildInfo.Cluster2Shards.clear();
13221326
for (const auto& x: table->GetPartitions()) {
13231327
Y_ABORT_UNLESS(Self->ShardInfos.contains(x.ShardIdx));
13241328
TSerializedCellVec bound{x.EndOfRange};
13251329
shardRange.To = bound;
1326-
LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange));
1330+
LOG_D("shard " << x.ShardIdx << " range " << buildInfo.KMeans.RangeToDebugStr(shardRange, buildInfo.IsBuildPrefixedVectorIndex() ? 2 : 1));
13271331
buildInfo.AddParent(shardRange, x.ShardIdx);
13281332
auto [it, emplaced] = buildInfo.Shards.emplace(x.ShardIdx, TIndexBuildInfo::TShardStatus{std::move(shardRange), "", buildInfo.Shards.size()});
13291333
Y_ASSERT(emplaced);

ydb/core/tx/schemeshard/schemeshard_info_types.h

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3121,6 +3121,9 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
31213121
NTableIndex::TClusterId ChildBegin = 1; // included
31223122
NTableIndex::TClusterId Child = ChildBegin;
31233123

3124+
ui64 TableSize = 0;
3125+
3126+
31243127
ui64 ParentEnd() const noexcept { // included
31253128
return ChildBegin - 1;
31263129
}
@@ -3181,25 +3184,28 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
31813184
return true;
31823185
}
31833186

3184-
bool PrefixTableDone(ui64 tableSize, ui64 shards) {
3185-
if (!NeedsAnotherLevel()) {
3186-
return false;
3187-
}
3187+
void PrefixIndexDone(ui64 shards) {
3188+
Y_ABORT_UNLESS(NeedsAnotherLevel());
31883189
State = MultiLocal;
3189-
NextLevel((1 + tableSize) * shards);
3190+
// There are two worst cases; in both of them a single shard contains TableSize rows
3191+
// 1. all rows have a unique prefix (*); in that case we need 1 id for each row (parent, id in prefix table)
3192+
// 2. all unique prefixes have size K, so we have TableSize/K parents + TableSize children
3193+
// * this doesn't work at the moment, because a prefix currently must have at least K embeddings, but that's a bug
3194+
NextLevel((2 * TableSize) * shards);
31903195
Parent = ParentEnd();
3191-
return true;
31923196
}
31933197

3194-
void Set(ui32 level, NTableIndex::TClusterId parent, ui32 state) {
3195-
// TODO(mbkkt) make it without cycles
3196-
while (Level < level) {
3197-
NextLevel();
3198-
}
3199-
while (Parent < parent) {
3200-
NextParent();
3201-
}
3198+
void Set(ui32 level,
3199+
NTableIndex::TClusterId parentBegin, NTableIndex::TClusterId parent,
3200+
NTableIndex::TClusterId childBegin, NTableIndex::TClusterId child,
3201+
ui32 state, ui64 tableSize) {
3202+
Level = level;
3203+
ParentBegin = parentBegin;
3204+
Parent = parent;
3205+
ChildBegin = childBegin;
3206+
Child = child;
32023207
State = static_cast<EState>(state);
3208+
TableSize = tableSize;
32033209
}
32043210

32053211
NKikimrTxDataShard::TEvLocalKMeansRequest::EState GetUpload() const {
@@ -3259,7 +3265,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
32593265
return {parentFrom, parentTo};
32603266
}
32613267

3262-
TString RangeToDebugStr(const TSerializedTableRange& range) const {
3268+
TString RangeToDebugStr(const TSerializedTableRange& range, ui32 rootLevel) const {
32633269
auto toStr = [&](const TSerializedCellVec& v) -> TString {
32643270
const auto cells = v.GetCells();
32653271
if (cells.empty()) {
@@ -3269,8 +3275,7 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
32693275
return "-inf";
32703276
}
32713277
auto str = TStringBuilder{} << "{ count: " << cells.size();
3272-
if (Parent != 0) {
3273-
Y_ASSERT(Level != 0);
3278+
if (Level > rootLevel) {
32743279
str << ", parent: " << cells[0].AsValue<NTableIndex::TClusterId>();
32753280
if (cells.size() != 1 && cells[1].IsNull()) {
32763281
str << ", pk: null";
@@ -3660,7 +3665,8 @@ struct TIndexBuildInfo: public TSimpleRefCount<TIndexBuildInfo> {
36603665

36613666
TSerializedTableRange bound{range};
36623667
LOG_DEBUG_S(TlsActivationContext->AsActorContext(), NKikimrServices::BUILD_INDEX,
3663-
"AddShardStatus id# " << Id << " shard " << shardIdx << " range " << KMeans.RangeToDebugStr(bound));
3668+
"AddShardStatus id# " << Id << " shard " << shardIdx <<
3669+
" range " << KMeans.RangeToDebugStr(bound, IsBuildPrefixedVectorIndex() ? 2 : 1));
36643670
AddParent(bound, shardIdx);
36653671
Shards.emplace(
36663672
shardIdx, TIndexBuildInfo::TShardStatus(std::move(bound), std::move(lastKeyAck), Shards.size()));

ydb/core/tx/schemeshard/schemeshard_schema.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1929,13 +1929,24 @@ struct Schema : NIceDb::Schema {
19291929
struct Level : Column<2, NScheme::NTypeIds::Uint32> {};
19301930
struct State : Column<3, NScheme::NTypeIds::Uint32> {};
19311931
struct Parent : Column<4, ClusterIdTypeId> {};
1932+
struct ParentBegin : Column<5, ClusterIdTypeId> {};
1933+
struct Child : Column<6, ClusterIdTypeId> {};
1934+
struct ChildBegin : Column<7, ClusterIdTypeId> {};
1935+
struct TableSize : Column<8, NScheme::NTypeIds::Uint64> {};
1936+
// TableSize is required for the prefixed kmeans tree
1937+
// But it can also be filled and used by other kmeans trees for choosing "auto" settings
1938+
// Also, "auto" settings will need K to be saved
19321939

19331940
using TKey = TableKey<Id>;
19341941
using TColumns = TableColumns<
19351942
Id,
19361943
Level,
19371944
State,
1938-
Parent
1945+
Parent,
1946+
ParentBegin,
1947+
Child,
1948+
ChildBegin,
1949+
TableSize
19391950
>;
19401951
};
19411952

ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_schemeshard_/flat_schemeshard.schema

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8172,6 +8172,26 @@
81728172
"ColumnId": 4,
81738173
"ColumnName": "Parent",
81748174
"ColumnType": "Uint64"
8175+
},
8176+
{
8177+
"ColumnId": 5,
8178+
"ColumnName": "ParentBegin",
8179+
"ColumnType": "Uint64"
8180+
},
8181+
{
8182+
"ColumnId": 6,
8183+
"ColumnName": "Child",
8184+
"ColumnType": "Uint64"
8185+
},
8186+
{
8187+
"ColumnId": 7,
8188+
"ColumnName": "ChildBegin",
8189+
"ColumnType": "Uint64"
8190+
},
8191+
{
8192+
"ColumnId": 8,
8193+
"ColumnName": "TableSize",
8194+
"ColumnType": "Uint64"
81758195
}
81768196
],
81778197
"ColumnsDropped": [],
@@ -8181,7 +8201,11 @@
81818201
1,
81828202
2,
81838203
3,
8184-
4
8204+
4,
8205+
5,
8206+
6,
8207+
7,
8208+
8
81858209
],
81868210
"RoomID": 0,
81878211
"Codec": 0,

0 commit comments

Comments
 (0)