Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 22 additions & 11 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1403,10 +1403,13 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
FilterDomainKey[TSubDomainKey(domainInfo->DomainKey.OwnerId, domainInfo->DomainKey.LocalPathId)] = path;

TTabletId hiveId = domainInfo->Params.GetHive();
if (hiveId && NeedToAskHive(hiveId)) {
if (hiveId) {
DatabaseState[path].HiveId = hiveId;
AskHive(path, hiveId);
if (NeedToAskHive(hiveId)) {
AskHive(path, hiveId);
}
} else if (RootHiveId && NeedToAskHive(RootHiveId)) {
DatabaseState[DomainPath].HiveId = RootHiveId;
AskHive(DomainPath, RootHiveId);
}

Expand Down Expand Up @@ -1501,22 +1504,30 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {

void AggregateHiveInfo() {
TNodeTabletState::TTabletStateSettings settings;
for (const auto& [hiveId, hiveResponse] : HiveInfo) {
for (auto& [dbPath, dbState] : DatabaseState) {
const auto& hiveResponse = HiveInfo[dbState.HiveId];
if (hiveResponse.IsOk()) {
settings.AliveBarrier = TInstant::MilliSeconds(hiveResponse->Record.GetResponseTimestamp()) - TDuration::Minutes(5);
for (const NKikimrHive::TTabletInfo& hiveTablet : hiveResponse->Record.GetTablets()) {
TSubDomainKey tenantId = TSubDomainKey(hiveTablet.GetObjectDomain());
auto itDomain = FilterDomainKey.find(tenantId);
TDatabaseState* database = nullptr;
if (itDomain == FilterDomainKey.end()) {
continue;
}
auto itDatabase = DatabaseState.find(itDomain->second);
if (itDatabase == DatabaseState.end()) {
continue;
if (!FilterDatabase || FilterDatabase == dbPath) {
database = &dbState;
} else {
continue;
}
} else {
auto itDatabase = DatabaseState.find(itDomain->second);
if (itDatabase != DatabaseState.end()) {
database = &itDatabase->second;
} else {
continue;
}
}
TDatabaseState& database = itDatabase->second;
auto tabletId = std::make_pair(hiveTablet.GetTabletID(), hiveTablet.GetFollowerID());
database.MergedTabletState.emplace(tabletId, &hiveTablet);
database->MergedTabletState.emplace(tabletId, &hiveTablet);
TNodeId nodeId = hiveTablet.GetNodeID();
switch (hiveTablet.GetVolatileState()) {
case NKikimrHive::ETabletVolatileState::TABLET_VOLATILE_STATE_STARTING:
Expand All @@ -1526,7 +1537,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
nodeId = 0;
break;
}
database.MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
database->MergedNodeTabletState[nodeId].AddTablet(hiveTablet, settings);
}
}
}
Expand Down
129 changes: 117 additions & 12 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(issueVdiscCount, issueVdiscNumber);
}

// Reports whether the self-check result contains at least one issue_log
// entry whose level is 4 and whose type is exactly "TABLET".
bool HasTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
    bool found = false;
    for (const auto& entry : result.issue_log()) {
        // Skip anything that is not a level-4 TABLET record.
        if (entry.level() != 4 || entry.type() != "TABLET") {
            continue;
        }
        found = true;
        break;
    }
    return found;
}

void ListingTest(int const groupNumber, int const vdiscPerGroupNumber, bool const isMergeRecords = false) {
auto result = RequestHc(groupNumber, vdiscPerGroupNumber, isMergeRecords);
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
Expand Down Expand Up @@ -864,6 +873,15 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
}

// Appends one extra tablet entry to the hive info response carried by `ev`.
// The entry has tablet id 1, an object domain built from SERVERLESS_DOMAIN_KEY
// (OwnerId -> SchemeShard, LocalPathId -> PathId) and RestartsPerPeriod = 500.
void AddBadServerlessTablet(TEvHive::TEvResponseHiveInfo::TPtr* ev) {
    auto* badTablet = (*ev)->Get()->Record.MutableTablets()->Add();
    badTablet->SetTabletID(1);

    auto* objectDomain = badTablet->MutableObjectDomain();
    objectDomain->SetSchemeShard(SERVERLESS_DOMAIN_KEY.OwnerId);
    objectDomain->SetPathId(SERVERLESS_DOMAIN_KEY.LocalPathId);

    badTablet->SetRestartsPerPeriod(500);
}

Y_UNIT_TEST(SpecificServerless) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
Expand Down Expand Up @@ -1163,6 +1181,102 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT(!databaseFoundInResult);
}

// Starts a one-static / one-dynamic node server, rewrites the responses the
// health check receives (tenants, scheme, hive, storage) through an observer,
// injects an extra tablet via AddBadServerlessTablet into every hive info
// response, and expects the self-check result to contain a TABLET issue.
Y_UNIT_TEST(ServerlessBadTablets) {
    TPortManager tp;
    ui16 port = tp.GetPort(2134);
    ui16 grpcPort = tp.GetPort(2135);

    auto settings = TServerSettings(port)
        .SetNodeCount(1)
        .SetDynamicNodeCount(1)
        .SetUseRealThreads(false)
        .SetDomainName("Root");
    TServer server(settings);
    server.EnableGRpc(grpcPort);
    TClient client(settings);
    TTestActorRuntime& runtime = *server.GetRuntime();

    // Derive the static/dynamic node id ranges from the nodes the server actually created.
    auto& nameservice = runtime.GetAppData().DynamicNameserviceConfig;
    nameservice->MaxStaticNodeId = runtime.GetNodeId(server.StaticNodes() - 1);
    nameservice->MinDynamicNodeId = runtime.GetNodeId(server.StaticNodes());
    nameservice->MaxDynamicNodeId = runtime.GetNodeId(server.StaticNodes() + server.DynamicNodes() - 1);

    ui32 sharedDynNodeId = runtime.GetNodeId(1);

    // The first tenant status response is rewritten for "/Root/shared",
    // every later one for "/Root/serverless".
    bool awaitingFirstTenantStatus = true;
    auto observer = [&](TAutoPtr<IEventHandle>& ev) {
        switch (ev->GetTypeRewrite()) {
            case NConsole::TEvConsole::EvListTenantsResponse: {
                auto* listTenants = reinterpret_cast<NConsole::TEvConsole::TEvListTenantsResponse::TPtr*>(&ev);
                AddPathsToListTenantsResponse(listTenants, { "/Root/serverless", "/Root/shared" });
                break;
            }
            case NConsole::TEvConsole::EvGetTenantStatusResponse: {
                auto* tenantStatus = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
                if (awaitingFirstTenantStatus) {
                    awaitingFirstTenantStatus = false;
                    ChangeGetTenantStatusResponse(tenantStatus, "/Root/shared");
                } else {
                    ChangeGetTenantStatusResponse(tenantStatus, "/Root/serverless");
                }
                break;
            }
            case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
                auto* navigate = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
                ChangeNavigateKeyResultServerless(navigate, NKikimrSubDomains::EServerlessComputeResourcesModeShared, runtime);
                break;
            }
            case TEvHive::EvResponseHiveNodeStats: {
                auto* nodeStats = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
                ChangeResponseHiveNodeStats(nodeStats, sharedDynNodeId);
                break;
            }
            case TEvHive::EvResponseHiveInfo: {
                // Inject the extra serverless-domain tablet into the hive reply.
                auto* hiveInfo = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
                AddBadServerlessTablet(hiveInfo);
                break;
            }
            case TEvSchemeShard::EvDescribeSchemeResult: {
                auto* describe = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
                ChangeDescribeSchemeResultServerless(describe);
                break;
            }
            case TEvBlobStorage::EvControllerConfigResponse: {
                auto* bscConfig = reinterpret_cast<TEvBlobStorage::TEvControllerConfigResponse::TPtr*>(&ev);
                AddGroupVSlotInControllerConfigResponseWithStaticGroup(bscConfig, NKikimrBlobStorage::TGroupStatus::FULL, TVDisks(1));
                break;
            }
            case NSysView::TEvSysView::EvGetVSlotsResponse: {
                auto* vslots = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
                AddVSlotsToSysViewResponse(vslots, 1, TVDisks(1));
                break;
            }
            case NSysView::TEvSysView::EvGetGroupsResponse: {
                auto* groups = reinterpret_cast<NSysView::TEvSysView::TEvGetGroupsResponse::TPtr*>(&ev);
                AddGroupsToSysViewResponse(groups);
                break;
            }
            case NSysView::TEvSysView::EvGetStoragePoolsResponse: {
                auto* storagePools = reinterpret_cast<NSysView::TEvSysView::TEvGetStoragePoolsResponse::TPtr*>(&ev);
                AddStoragePoolsToSysViewResponse(storagePools);
                break;
            }
        }

        // Every event, rewritten or not, continues through normal processing.
        return TTestActorRuntime::EEventAction::PROCESS;
    };
    runtime.SetObserverFunc(observer);

    TActorId sender = runtime.AllocateEdgeActor();
    TAutoPtr<IEventHandle> handle;

    // Issue a verbose self-check request and wait for its result on the edge actor.
    auto* request = new NHealthCheck::TEvSelfCheckRequest;
    request->Request.set_return_verbose_status(true);
    runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
    const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
    Ctest << result.ShortDebugString();

    UNIT_ASSERT(HasTabletIssue(result));
}

Y_UNIT_TEST(DontIgnoreServerlessWithExclusiveNodesWhenNotSpecific) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
Expand Down Expand Up @@ -1858,15 +1972,6 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

// Returns true as soon as an issue_log entry with level 4 and type "TABLET"
// is found in the self-check result; returns false if none exists.
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
    for (const auto& record : result.issue_log()) {
        const bool isTabletIssue = record.level() == 4 && record.type() == "TABLET";
        if (isTabletIssue) {
            return true;
        }
    }
    return false;
}

Y_UNIT_TEST(TestTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
Expand Down Expand Up @@ -1894,7 +1999,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
UNIT_ASSERT(HasTabletIssue(result));
}

Y_UNIT_TEST(TestBootingTabletIsNotDead) {
Expand Down Expand Up @@ -1925,7 +2030,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(!HasDeadTabletIssue(result));
UNIT_ASSERT(!HasTabletIssue(result));
}

Y_UNIT_TEST(TestReBootingTabletIsDead) {
Expand Down Expand Up @@ -1959,7 +2064,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
UNIT_ASSERT(HasTabletIssue(result));
}
}
}
Loading