Skip to content

make storage balancer automatically triggered KIKIMR-20672 #1018

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion ydb/core/mind/hive/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ struct THiveSharedSettings {
TDuration GetStoragePoolFreshPeriod() const {
return TDuration::MilliSeconds(CurrentConfig.GetStoragePoolFreshPeriod());
}

double GetMinGroupUsageToBalance() const {
return CurrentConfig.GetMinGroupUsageToBalance();
}
};

struct TDrainSettings {
Expand All @@ -276,7 +280,7 @@ struct TBalancerSettings {

struct TStorageBalancerSettings {
ui64 NumReassigns;
ui64 MaxInFlight;
ui64 MaxInFlight = 1;
TString StoragePool;
};

Expand Down
6 changes: 6 additions & 0 deletions ydb/core/mind/hive/hive_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ struct TEvPrivate {
EvLogTabletMoves,
EvStartStorageBalancer,
EvRestartCancelled,
EvProcessStorageBalancer,
EvStorageBalancerOut,
EvEnd
};

Expand Down Expand Up @@ -104,6 +106,10 @@ struct TEvPrivate {

TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {}
};

struct TEvProcessStorageBalancer : TEventLocal<TEvProcessStorageBalancer, EvProcessStorageBalancer> {};

struct TEvStorageBalancerOut : TEventLocal<TEvStorageBalancerOut, EvStorageBalancerOut> {};
};

} // NHive
Expand Down
65 changes: 54 additions & 11 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,9 @@ void THive::Handle(TEvBlobStorage::TEvControllerSelectGroupsResult::TPtr& ev) {
Execute(CreateUpdateTabletGroups(tabletId));
}
}
if (tablets.empty()) {
ProcessStorageBalancer();
}
} else {
BLOG_ERROR("THive::Handle TEvControllerSelectGroupsResult: obsolete BSC response");
}
Expand Down Expand Up @@ -2051,12 +2054,12 @@ void THive::Handle(TEvHive::TEvRequestHiveStorageStats::TPtr& ev) {
auto& pbGroup = *pbPool.AddGroups();
pbGroup.SetGroupID(id);
pbGroup.SetAcquiredUnits(group.Units.size());
pbGroup.SetAcquiredIOPS(group.AcquiredIOPS);
pbGroup.SetAcquiredThroughput(group.AcquiredThroughput);
pbGroup.SetAcquiredSize(group.AcquiredSize);
pbGroup.SetMaximumIOPS(group.MaximumIOPS);
pbGroup.SetMaximumThroughput(group.MaximumThroughput);
pbGroup.SetMaximumSize(group.MaximumSize);
pbGroup.SetAcquiredIOPS(group.AcquiredResources.IOPS);
pbGroup.SetAcquiredThroughput(group.AcquiredResources.Throughput);
pbGroup.SetAcquiredSize(group.AcquiredResources.Size);
pbGroup.SetMaximumIOPS(group.MaximumResources.IOPS);
pbGroup.SetMaximumThroughput(group.MaximumResources.Throughput);
pbGroup.SetMaximumSize(group.MaximumResources.Size);
pbGroup.SetAllocatedSize(group.GroupParameters.GetAllocatedSize());
pbGroup.SetAvailableSize(group.GroupParameters.GetAvailableSize());
}
Expand Down Expand Up @@ -2171,11 +2174,18 @@ TResourceRawValues THive::GetDefaultResourceInitialMaximumValues() {

void THive::ProcessTabletBalancer() {
if (!ProcessTabletBalancerScheduled && !ProcessTabletBalancerPostponed && BootQueue.BootQueue.empty()) {
Schedule(GetBalancerCooldown(), new TEvPrivate::TEvProcessTabletBalancer());
Schedule(GetBalancerCooldown(LastBalancerTrigger), new TEvPrivate::TEvProcessTabletBalancer());
ProcessTabletBalancerScheduled = true;
}
}

void THive::ProcessStorageBalancer() {
if (!ProcessStorageBalancerScheduled && BootQueue.BootQueue.empty()) {
Schedule(GetBalancerCooldown(EBalancerType::Storage), new TEvPrivate::TEvProcessStorageBalancer());
ProcessStorageBalancerScheduled = true;
}
}

THive::THiveStats THive::GetStats() const {
THiveStats stats = {};
stats.Values.reserve(Nodes.size());
Expand Down Expand Up @@ -2203,7 +2213,6 @@ THive::THiveStats THive::GetStats() const {
maxValues = piecewise_max(maxValues, stats.Values[i].ResourceNormValues);
}


auto minValuesToBalance = GetMinNodeUsageToBalance();
maxValues = piecewise_max(maxValues, minValuesToBalance);
minValues = piecewise_max(minValues, minValuesToBalance);
Expand Down Expand Up @@ -2359,6 +2368,38 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
Send(SelfId(), new TEvPrivate::TEvBalancerOut());
}

void THive::Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr&) {
ProcessStorageBalancerScheduled = false;
if (StoragePools.empty()) {
return;
}
using TPoolStat = std::pair<TStoragePoolInfo::TStats, const TStoragePoolInfo&>;
std::vector<TPoolStat> poolStats;
poolStats.reserve(StoragePools.size());
for (const auto& [name, pool] : StoragePools) {
poolStats.emplace_back(pool.GetStats(), pool);
}
auto& [stats, pool] = *std::max_element(poolStats.begin(), poolStats.end(), [](const TPoolStat& lhs, const TPoolStat& rhs) {
return lhs.first.Scatter < rhs.first.Scatter;
});
if (stats.Scatter > GetMinStorageScatterToBalance()) {
BLOG_D("Storage Scatter = " << stats.Scatter << " in pool " << pool.Name << ", starting StorageBalancer");
ui64 numReassigns = 1;
auto it = pool.Groups.find(stats.MaxUsageGroupId);
if (it != pool.Groups.end()) {
// We want a ballpark estimate of how many reassigns it would take to balance the pool
// Using the number of units in the most loaded group ensures we won't reassign the whole pool on a whim,
// while also giving the balancer some room to work.
// Note that the balancer is not actually required to do that many reassigns, but will never do more
numReassigns = it->second.Units.size();
}
StartHiveStorageBalancer({
.NumReassigns = numReassigns,
.StoragePool = pool.Name
});
}
}

void THive::UpdateTotalResourceValues(
const TNodeInfo* node,
const TTabletInfo* tablet,
Expand Down Expand Up @@ -2660,8 +2701,8 @@ void THive::UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNice
}
}

TDuration THive::GetBalancerCooldown() const {
switch(LastBalancerTrigger) {
TDuration THive::GetBalancerCooldown(EBalancerType balancerType) const {
switch(balancerType) {
case EBalancerType::Scatter:
case EBalancerType::ScatterCounter:
case EBalancerType::ScatterCPU:
Expand Down Expand Up @@ -2787,7 +2828,7 @@ void THive::RequestPoolsInformation() {
}
SendToBSControllerPipe(ev.Release());
}
Schedule(TDuration::Minutes(10), new TEvPrivate::TEvRefreshStorageInfo());
Schedule(GetStorageInfoRefreshFrequency(), new TEvPrivate::TEvRefreshStorageInfo());
}

ui32 THive::GetEventPriority(IEventHandle* ev) {
Expand Down Expand Up @@ -2880,6 +2921,7 @@ void THive::ProcessEvent(std::unique_ptr<IEventHandle> event) {
hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle);
hFunc(TEvPrivate::TEvLogTabletMoves, Handle);
hFunc(TEvPrivate::TEvStartStorageBalancer, Handle);
hFunc(TEvPrivate::TEvProcessStorageBalancer, Handle);
hFunc(TEvHive::TEvUpdateDomain, Handle);
}
}
Expand Down Expand Up @@ -2980,6 +3022,7 @@ STFUNC(THive::StateWork) {
fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent);
fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent);
fFunc(TEvHive::TEvUpdateDomain::EventType, EnqueueIncomingEvent);
fFunc(TEvPrivate::TEvProcessStorageBalancer::EventType, EnqueueIncomingEvent);
hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle);
default:
if (!HandleDefaultEvents(ev, SelfId())) {
Expand Down
13 changes: 12 additions & 1 deletion ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
bool ProcessTabletBalancerPostponed = false;
bool ProcessPendingOperationsScheduled = false;
bool LogTabletMovesScheduled = false;
bool ProcessStorageBalancerScheduled = false;
TResourceRawValues TotalRawResourceValues = {};
TResourceNormalizedValues TotalNormalizedResourceValues = {};
TInstant LastResourceChangeReaction;
Expand Down Expand Up @@ -556,6 +557,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev);
void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev);
void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev);
void Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr& ev);
void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev);
void Handle(TEvHive::TEvUpdateDomain::TPtr& ev);

Expand Down Expand Up @@ -653,6 +655,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
void PostponeProcessBootQueue(TDuration after);
void ProcessPendingOperations();
void ProcessTabletBalancer();
void ProcessStorageBalancer();
const TVector<i64>& GetTabletTypeAllowedMetricIds(TTabletTypes::EType type) const;
static const TVector<i64>& GetDefaultAllowedMetricIdsForType(TTabletTypes::EType type);
static bool IsValidMetrics(const NKikimrTabletBase::TMetrics& metrics);
Expand Down Expand Up @@ -681,7 +684,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
void StopTablet(const TActorId& local, TFullTabletId tabletId);
void ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects);
void UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects);
TDuration GetBalancerCooldown() const;
TDuration GetBalancerCooldown(EBalancerType balancerType) const;
void UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff);
ui64 GetObjectImbalance(TFullObjectId object);

Expand Down Expand Up @@ -914,6 +917,14 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
return CurrentConfig.GetMaxChannelHistorySize();
}

TDuration GetStorageInfoRefreshFrequency() const {
return TDuration::MilliSeconds(CurrentConfig.GetStorageInfoRefreshFrequency());
}

double GetMinStorageScatterToBalance() const {
return CurrentConfig.GetMinStorageScatterToBalance();
}

static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);
Expand Down
39 changes: 16 additions & 23 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2785,6 +2785,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
TTestBasicRuntime runtime(1, false);
Setup(runtime, true, 2, [](TAppPrepare& app) {
app.HiveConfig.SetMinPeriodBetweenReassign(0);
app.HiveConfig.SetStorageInfoRefreshFrequency(200);
});
const ui64 hiveTablet = MakeDefaultHiveID(0);
const ui64 testerTablet = MakeDefaultHiveID(1);
Expand Down Expand Up @@ -2830,41 +2831,33 @@ Y_UNIT_TEST_SUITE(THiveTest) {
}

// If assured space is not set, usage is always set to 1
auto groupMetricsExchange = MakeHolder<TEvBlobStorage::TEvControllerGroupMetricsExchange>();
for (const auto& [group, tablets] : groupToTablets) {
NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics();
auto updateDiskStatus = MakeHolder<TEvBlobStorage::TEvControllerUpdateDiskStatus>();

metrics->SetGroupId(group);
metrics->MutableGroupParameters()->SetGroupID(group);
metrics->MutableGroupParameters()->SetStoragePoolName("def1");
metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000);
}
for (ui32 groupId = 0x80000000; groupId < 0x8000000a; ++groupId) {
NKikimrBlobStorage::TVDiskMetrics* vdiskMetrics = updateDiskStatus->Record.AddVDisksMetrics();

vdiskMetrics->MutableVDiskId()->SetGroupID(groupId);
vdiskMetrics->MutableVDiskId()->SetGroupGeneration(1);
vdiskMetrics->MutableVDiskId()->SetRing(0);
vdiskMetrics->MutableVDiskId()->SetDomain(0);
vdiskMetrics->MutableVDiskId()->SetVDisk(0);
vdiskMetrics->SetAvailableSize(30'000'000);

runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries());
{
TDispatchOptions options;
options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange));
runtime.DispatchEvents(options);
}

runtime.SendToPipe(MakeBSControllerID(0), sender, updateDiskStatus.Release(), 0, GetPipeConfigWithRetries());

TChannelsBindings channels = BINDED_CHANNELS;
for (auto& bind : channels) {
bind.SetSize(200'000'000);
}
channels[0].SetSize(500'000'000);
for (auto tablet : {tabletA, tabletB}) {
TAutoPtr<TEvHive::TEvCreateTablet> updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels));
SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true);
}
runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({
.NumReassigns = 100,
.MaxInFlight = 1,
.StoragePool = "def1",
}));

{
TDispatchOptions options;
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4
runtime.DispatchEvents(options, TDuration::Seconds(10));
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvStorageBalancerOut);
runtime.DispatchEvents(options, TDuration::Minutes(1));
}

UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB));
Expand Down
9 changes: 7 additions & 2 deletions ydb/core/mind/hive/leader_tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,14 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe
break;
}
case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: {
return storagePool->FindFreeAllocationUnit([&params](const TStorageGroupInfo& newGroup) -> bool {
auto channel = GetChannel(channelId);
auto filter = [&params](const TStorageGroupInfo& newGroup) -> bool {
return newGroup.IsMatchesParameters(*params);
});
};
auto calculateUsageWithTablet = [&channel](const TStorageGroupInfo* newGroup) -> double {
return newGroup->GetUsageForChannel(channel);
};
return storagePool->FindFreeAllocationUnit(filter, calculateUsageWithTablet);
break;
}
case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: {
Expand Down
4 changes: 4 additions & 0 deletions ydb/core/mind/hive/leader_tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ struct TLeaderTabletInfo : TTabletInfo {
return ChannelInfo->GetSize();
}
}

bool operator==(const TChannel& other) const {
return TabletId == other.TabletId && ChannelId == other.ChannelId;
}
};

TTabletId Id;
Expand Down
22 changes: 11 additions & 11 deletions ydb/core/mind/hive/monitoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3933,8 +3933,8 @@ class TTxMonEvent_Storage : public TTransactionBase<THive> {
using TKindMap = THashMap<TUnitKind, ui32, TUnitKindHash>;

void GetUnitKinds(const TStorageGroupInfo& group, TKindMap& kinds) {
for (const auto& [tablet, channel] : group.Units) {
const auto& boundChannels(tablet->BoundChannels[channel]);
for (const auto& channel : group.Units) {
const auto& boundChannels(*channel.ChannelInfo);
kinds[TUnitKind{boundChannels.GetIOPS(), boundChannels.GetThroughput(), boundChannels.GetSize()}]++;
}
}
Expand Down Expand Up @@ -3970,11 +3970,11 @@ class TTxMonEvent_Storage : public TTransactionBase<THive> {
out << "<td style='text-align:right'>" << kind.ToString() << "</td>";
out << "<td style='text-align:right'>" << units << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", kind.IOPS * units) << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", prStorageGroup.second.MaximumIOPS) << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", prStorageGroup.second.MaximumResources.IOPS) << "</td>";
out << "<td style='text-align:right'>" << kind.Size * units << "</td>";
out << "<td style='text-align:right'>" << prStorageGroup.second.MaximumSize << "</td>";
out << "<td style='text-align:right'>" << prStorageGroup.second.MaximumResources.Size << "</td>";
out << "<td style='text-align:right'>" << kind.Throughput * units << "</td>";
out << "<td style='text-align:right'>" << prStorageGroup.second.MaximumThroughput << "</td>";
out << "<td style='text-align:right'>" << prStorageGroup.second.MaximumResources.Throughput << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", prStorageGroup.second.StoragePool.GetOvercommit()) << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", prStorageGroup.second.GetUsage()) << "</td>";
out << "</tr>";
Expand All @@ -3985,12 +3985,12 @@ class TTxMonEvent_Storage : public TTransactionBase<THive> {
out << "<td>" << prStoragePool.second.Name << "</td>";
out << "<td style='text-align:right'>" << group.Id << "</td>";
out << "<td style='text-align:right'>" << group.Units.size() << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", group.AcquiredIOPS) << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", group.MaximumIOPS) << "</td>";
out << "<td style='text-align:right'>" << group.AcquiredSize << "</td>";
out << "<td style='text-align:right'>" << group.MaximumSize << "</td>";
out << "<td style='text-align:right'>" << group.AcquiredThroughput << "</td>";
out << "<td style='text-align:right'>" << group.MaximumThroughput << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", group.AcquiredResources.IOPS) << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", group.MaximumResources.IOPS) << "</td>";
out << "<td style='text-align:right'>" << group.AcquiredResources.Size << "</td>";
out << "<td style='text-align:right'>" << group.MaximumResources.Size << "</td>";
out << "<td style='text-align:right'>" << group.AcquiredResources.Throughput << "</td>";
out << "<td style='text-align:right'>" << group.MaximumResources.Throughput << "</td>";
out << "<td style='text-align:right'>" << group.GroupParameters.GetAllocatedSize() << "</td>";
out << "<td style='text-align:right'>" << group.GroupParameters.GetAvailableSize() << "</td>";
out << "<td style='text-align:right'>" << Sprintf("%.2f", group.StoragePool.GetOvercommit()) << "</td>";
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/storage_balancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class THiveStorageBalancer : public NActors::TActorBootstrapped<THiveStorageBala
Stats.LastRunMovements = Reassigns;
Stats.IsRunningNow = false;
Hive->RemoveSubActor(this);
Send(Hive->SelfId(), new TEvPrivate::TEvStorageBalancerOut());
return IActor::PassAway();
}

Expand Down
Loading