Skip to content

Commit 136beb6

Browse files
authored
add checks of shards / paths quota limits (#5074)
1 parent 994ffaf commit 136beb6

File tree

3 files changed

+107
-1
lines changed

3 files changed

+107
-1
lines changed

ydb/core/health_check/health_check.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
143143
OverloadState,
144144
SyncState,
145145
Uptime,
146+
QuotaUsage,
146147
};
147148

148149
enum ETimeoutTag {
@@ -241,6 +242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
241242
ui64 StorageQuota = 0;
242243
ui64 StorageUsage = 0;
243244
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
245+
TString Path;
244246
};
245247

246248
struct TGroupState {
@@ -1060,6 +1062,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
10601062
if (ev->Get()->GetRecord().status() == NKikimrScheme::StatusSuccess) {
10611063
TString path = ev->Get()->GetRecord().path();
10621064
TDatabaseState& state(DatabaseState[path]);
1065+
state.Path = path;
10631066
for (const auto& storagePool : ev->Get()->GetRecord().pathdescription().domaindescription().storagepools()) {
10641067
TString storagePoolName = storagePool.name();
10651068
state.StoragePoolNames.emplace(storagePoolName);
@@ -1447,7 +1450,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14471450
}
14481451
}
14491452

1450-
void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
1453+
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
14511454
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());
14521455

14531456
TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1494,6 +1497,39 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
14941497
computeNodeStatus.set_overall(context.GetOverallStatus());
14951498
}
14961499

1500+
// Reports how close a database is to its schemeshard paths / shards quotas.
//
// Looks up the describe result stored in DescribeByPath under databaseState.Path
// and, for each of the two limits that is greater than zero:
//   * writes the usage ratio (inside / limit) into computeStatus;
//   * reports a status on `context`, tagged ETags::QuotaUsage for non-green:
//       RED    - at most one unit left before the limit (or already over it),
//       ORANGE - usage >= 0.99,
//       YELLOW - usage >= 0.90,
//       GREEN  - otherwise.
// If there is no describe result for the path, nothing is written or reported.
// `context` is taken by value, matching the other Fill* helpers in this class.
void FillComputeDatabaseStatus(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
1501+
auto itDescribe = DescribeByPath.find(databaseState.Path);
1502+
if (itDescribe != DescribeByPath.end()) {
1503+
const auto& domain(itDescribe->second->GetRecord().GetPathDescription().GetDomainDescription());
1504+
// A limit of 0 skips the check entirely (presumably "no limit configured" - confirm against schemeshard).
if (domain.GetPathsLimit() > 0) {
1505+
// Ratio in [0, 1] under normal conditions; may exceed 1 if PathsInside is above the limit.
float usage = (float)domain.GetPathsInside() / domain.GetPathsLimit();
1506+
computeStatus.set_paths_quota_usage(usage);
1507+
// Signed difference: stays negative (instead of wrapping, if the fields are unsigned)
// when PathsInside exceeds PathsLimit, so the over-limit case is also RED.
if (static_cast<i64>(domain.GetPathsLimit()) - static_cast<i64>(domain.GetPathsInside()) <= 1) {
1508+
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Paths quota exhausted", ETags::QuotaUsage);
1509+
} else if (usage >= 0.99) {
1510+
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Paths quota usage is over than 99%", ETags::QuotaUsage);
1511+
} else if (usage >= 0.90) {
1512+
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Paths quota usage is over than 90%", ETags::QuotaUsage);
1513+
} else {
1514+
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
1515+
}
1516+
}
1517+
// Same thresholds as for paths, applied to the shards quota.
if (domain.GetShardsLimit() > 0) {
1518+
float usage = (float)domain.GetShardsInside() / domain.GetShardsLimit();
1519+
computeStatus.set_shards_quota_usage(usage);
1520+
// Signed difference, see the paths check: over-limit and "one shard left" are both RED.
if (static_cast<i64>(domain.GetShardsLimit()) - static_cast<i64>(domain.GetShardsInside()) <= 1) {
1521+
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Shards quota exhausted", ETags::QuotaUsage);
1522+
} else if (usage >= 0.99) {
1523+
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Shards quota usage is over than 99%", ETags::QuotaUsage);
1524+
} else if (usage >= 0.90) {
1525+
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Shards quota usage is over than 90%", ETags::QuotaUsage);
1526+
} else {
1527+
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
1528+
}
1529+
}
1530+
}
1531+
}
1532+
14971533
void FillCompute(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
14981534
TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
14991535
if (databaseState.ResourcePathId
@@ -1520,8 +1556,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
15201556
auto& computeNode = *computeStatus.add_nodes();
15211557
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
15221558
}
1559+
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
15231560
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
15241561
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
1562+
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
15251563
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
15261564
computeNodeIds->push_back(0); // for tablets without node
15271565
for (TNodeId nodeId : *computeNodeIds) {

ydb/core/health_check/health_check_ut.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
7777
} else {
7878
domain->mutable_databasequotas()->set_data_size_hard_quota(quota);
7979
}
80+
domain->SetShardsLimit(quota);
81+
domain->SetShardsInside(size);
8082
}
8183

8284
void AddGroupsInControllerSelectGroupsResult(TEvBlobStorage::TEvControllerSelectGroupsResult::TPtr* ev, int groupCount) {
@@ -495,6 +497,50 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
495497
UNIT_ASSERT_VALUES_EQUAL(storageIssuesCount, storageIssuesNumber);
496498
}
497499

500+
// Test helper: runs a health-check self-check against a 2-node test server whose
// describe-scheme results are rewritten by ChangeDescribeSchemeResult(x, usage, quota),
// then asserts how many COMPUTE_QUOTA issues with the expected status were reported.
//
//   usage               - value passed to ChangeDescribeSchemeResult as the used amount
//                         (that helper also feeds it into ShardsInside)
//   quota               - value passed as the limit (also fed into ShardsLimit)
//   storageIssuesNumber - expected number of matching COMPUTE_QUOTA issue_log entries
//   status              - status a COMPUTE_QUOTA entry must have to be counted
void ShardsQuotaTest(ui64 usage, ui64 quota, ui64 storageIssuesNumber, Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREEN) {
501+
TPortManager tp;
502+
ui16 port = tp.GetPort(2134);
503+
ui16 grpcPort = tp.GetPort(2135);
504+
auto settings = TServerSettings(port)
505+
.SetNodeCount(2)
506+
.SetUseRealThreads(false)
507+
.SetDomainName("Root");
508+
TServer server(settings);
509+
server.EnableGRpc(grpcPort);
510+
TClient client(settings);
511+
TTestActorRuntime& runtime = *server.GetRuntime();
512+
513+
TActorId sender = runtime.AllocateEdgeActor();
514+
TAutoPtr<IEventHandle> handle;
515+
516+
// Intercept every describe-scheme result and patch in the requested usage/quota
// before the event is processed normally.
auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
517+
switch (ev->GetTypeRewrite()) {
518+
case TEvSchemeShard::EvDescribeSchemeResult: {
519+
auto *x = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
520+
ChangeDescribeSchemeResult(x, usage, quota);
521+
break;
522+
}
523+
}
524+
525+
return TTestActorRuntime::EEventAction::PROCESS;
526+
};
527+
runtime.SetObserverFunc(observerFunc);
528+
529+
auto *request = new NHealthCheck::TEvSelfCheckRequest;
530+
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
531+
NHealthCheck::TEvSelfCheckResult* result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle);
532+
533+
// NOTE(review): despite its name this counts COMPUTE_QUOTA issues, not storage issues.
int storageIssuesCount = 0;
534+
for (const auto& issue_log : result->Result.Getissue_log()) {
535+
Ctest << issue_log.ShortDebugString() << Endl;
536+
// Count only COMPUTE_QUOTA entries that have no child reasons and carry the expected status.
if (issue_log.type() == "COMPUTE_QUOTA" && issue_log.reason_size() == 0 && issue_log.status() == status) {
537+
storageIssuesCount++;
538+
}
539+
}
540+
541+
UNIT_ASSERT_VALUES_EQUAL(storageIssuesCount, storageIssuesNumber);
542+
}
543+
498544
Y_UNIT_TEST(OneIssueListing) {
499545
ListingTest(1, 1);
500546
}
@@ -1765,5 +1811,25 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
17651811
Y_UNIT_TEST(AfterHiveSyncPeriodReportsTabletsState) {
17661812
HiveSyncTest(false);
17671813
}
1814+
1815+
// 999 of 1000 shards used: only one shard left, so one RED COMPUTE_QUOTA issue is expected.
Y_UNIT_TEST(ShardsLimit999) {
1816+
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
1817+
}
1818+
1819+
// 995 of 1000 shards used: usage 0.995 >= 0.99, so one ORANGE COMPUTE_QUOTA issue is expected.
Y_UNIT_TEST(ShardsLimit995) {
1820+
ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
1821+
}
1822+
1823+
// 905 of 1000 shards used: usage 0.905 >= 0.90, so one YELLOW COMPUTE_QUOTA issue is expected.
Y_UNIT_TEST(ShardsLimit905) {
1824+
ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
1825+
}
1826+
1827+
// Below the 90% threshold: no COMPUTE_QUOTA issue is expected in the issue log.
// NOTE(review): the test name says 800 but the value used is 805.
Y_UNIT_TEST(ShardsLimit800) {
1828+
ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
1829+
}
1830+
1831+
// Limit of 0: the quota check is skipped, so no COMPUTE_QUOTA issue is expected.
Y_UNIT_TEST(ShardsNoLimit) {
1832+
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
1833+
}
17681834
}
17691835
}

ydb/public/api/protos/ydb_monitoring.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ message ComputeStatus {
118118
StatusFlag.Status overall = 1;
119119
repeated ComputeNodeStatus nodes = 2;
120120
repeated ComputeTabletStatus tablets = 3;
121+
// Ratio of paths inside the database to its paths limit (PathsInside / PathsLimit).
// Not set by the health check when the paths limit is 0.
float paths_quota_usage = 4;
122+
// Ratio of shards inside the database to its shards limit (ShardsInside / ShardsLimit).
// Not set by the health check when the shards limit is 0.
float shards_quota_usage = 5;
121123
}
122124

123125
message LocationNode {

0 commit comments

Comments
 (0)