Skip to content

do not trigger dead tablet issue during creation of a lot of tablets #10395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
681 changes: 351 additions & 330 deletions ydb/core/health_check/health_check.cpp

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ydb/core/testlib/test_client.h>
#include <ydb/public/lib/deprecated/kicli/kicli.h>

#include <ydb/core/mind/hive/hive_events.h>
#include <ydb/core/node_whiteboard/node_whiteboard.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/tx/schemeshard/schemeshard.h>
Expand Down Expand Up @@ -1936,5 +1937,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
for (const auto& issue_log : result.issue_log()) {
if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
return true;
}
}
return false;
}

Y_UNIT_TEST(TestTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
TActorId sender = runtime->AllocateEdgeActor();

server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1);
server.DestroyDynamicLocalService(2);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}

Y_UNIT_TEST(TestBootingTabletIsNotDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
TActorId sender = runtime->AllocateEdgeActor();

auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });

server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1, false);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(!HasDeadTabletIssue(result));
}

Y_UNIT_TEST(TestReBootingTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(2)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
TActorId sender = runtime->AllocateEdgeActor();


server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1, true);
server.SetupDynamicLocalService(3, "Root");
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
server.DestroyDynamicLocalService(2);
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}
}
}
5 changes: 5 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
if (tablet == nullptr) {
continue;
}
tablet->InWaitQueue = false;
if (tablet->IsAlive()) {
BLOG_D("tablet " << record.TabletId << " already alive, skipping");
continue;
Expand All @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
UpdateTabletFollowersNumber(leader, db, sideEffects);
}
BootQueue.AddToWaitQueue(record); // waiting for new node
tablet->InWaitQueue = true;
continue;
}
}
Expand Down Expand Up @@ -1850,6 +1852,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
}
if (info->InWaitQueue) {
tabletInfo.SetInWaitQueue(true);
}
if (req.GetReturnChannelHistory()) {
for (const auto& channel : info->TabletStorageInfo->Channels) {
auto& tabletChannel = *tabletInfo.AddTabletChannels();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ struct TTabletInfo {
TInstant PostponedStart;
EBalancerPolicy BalancerPolicy;
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
bool InWaitQueue = false;

TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ message TTabletInfo {
optional uint32 RestartsPerPeriod = 22;
optional uint64 LastAliveTimestamp = 23;
optional EBalancerPolicy BalancerPolicy = 24;
optional bool InWaitQueue = 25;
}

message TEvSeizeTabletsReply {
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/testlib/test_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ namespace Tests {
app.AddDomain(domain.Release());
}

TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN) {
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN, bool wait) {
auto getChannelBind = [](const TString& storagePool) {
TChannelBind bind;
bind.SetStoragePoolName(storagePool);
Expand Down Expand Up @@ -507,7 +507,7 @@ namespace Tests {
UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId,
createTabletReply->Record.GetOwner() << " != " << tabletId);
ui64 id = createTabletReply->Record.GetTabletID();
while (true) {
while (wait) {
auto tabletCreationResult =
Runtime->GrabEdgeEventRethrow<TEvHive::TEvTabletCreationResult>(handle);
UNIT_ASSERT(tabletCreationResult);
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/testlib/test_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ namespace Tests {
}
}
void StartDummyTablets();
TVector<ui64> StartPQTablets(ui32 pqTabletsN);
TVector<ui64> StartPQTablets(ui32 pqTabletsN, bool wait = true);
TTestActorRuntime* GetRuntime() const;
const TServerSettings& GetSettings() const;
const NScheme::TTypeRegistry* GetTypeRegistry();
Expand Down
Loading