Skip to content

do not trigger dead tablet issue during creation of a lot of tablets #10235

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,21 +182,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
int Count = 1;
TStackVec<TString> Identifiers;

TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
Type = info.tablettype();
Leader = info.followerid() == 0;
static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) {
State = ETabletState::Stopped;
} else if (info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING
&& info.has_lastalivetimestamp()
&& (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier)
&& info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
State = ETabletState::Dead;
} else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) {
State = ETabletState::RestartsTooOften;
} else {
State = ETabletState::Good;
return ETabletState::Stopped;
}
ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good;
if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) {
return state;
}
if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
return state;
}
if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) {
// Tablet is not alive for a long time
// We should report it as dead unless it's just waiting to be created
if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) {
return state;
}
return ETabletState::Dead;
}
return state;

}

TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings)
: Type(info.tablettype())
, State(GetState(info, settings))
, Leader(info.followerid() == 0)
{
}

bool operator ==(const TNodeTabletStateCount& o) const {
Expand Down
105 changes: 105 additions & 0 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ydb/core/testlib/test_client.h>
#include <ydb/public/lib/deprecated/kicli/kicli.h>

#include <ydb/core/mind/hive/hive_events.h>
#include <ydb/core/node_whiteboard/node_whiteboard.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/tx/schemeshard/schemeshard.h>
Expand Down Expand Up @@ -1837,5 +1838,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
// Runs ShardsQuotaTest with the arguments (105, 0, 0) and GREEN as the status flag.
// NOTE(review): judging by the test name, the zeros presumably mean "no shard limit
// configured" and GREEN is the expected status - confirm against ShardsQuotaTest.
Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

// Returns true when the self-check result contains at least one issue-log
// entry with level 4 and type "TABLET"; false otherwise.
// NOTE(review): level 4 is presumably the depth at which tablet issues are
// nested in the issue tree - confirm against the health-check issue layout.
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
    for (const auto& entry : result.issue_log()) {
        if (entry.level() != 4) {
            continue;
        }
        if (entry.type() == "TABLET") {
            return true;
        }
    }
    return false;
}

// Starts one PQ tablet, tears down the dynamic local service it was set up
// with, advances the clock by five minutes and expects the health check to
// report a dead-tablet issue.
Y_UNIT_TEST(TestTabletIsDead) {
    TPortManager portManager;
    const ui16 serverPort = portManager.GetPort(2134);
    const ui16 grpcPort = portManager.GetPort(2135);
    auto serverSettings = TServerSettings(serverPort)
        .SetNodeCount(2)
        .SetDynamicNodeCount(1)
        .SetUseRealThreads(false)
        .SetDomainName("Root");
    TServer server(serverSettings);
    server.EnableGRpc(grpcPort);

    TClient client(serverSettings);

    TTestActorRuntime* runtime = server.GetRuntime();
    const TActorId edgeActor = runtime->AllocateEdgeActor();

    // NOTE(review): 2 is presumably the index of the dynamic node - confirm.
    server.SetupDynamicLocalService(2, "Root");
    server.StartPQTablets(1);
    server.DestroyDynamicLocalService(2);
    runtime->AdvanceCurrentTime(TDuration::Minutes(5));

    // Ask the health check service for a self-check and capture the reply on the edge actor.
    TAutoPtr<IEventHandle> replyHandle;
    const auto healthCheckId = NHealthCheck::MakeHealthCheckID();
    runtime->Send(new IEventHandle(healthCheckId, edgeActor, new NHealthCheck::TEvSelfCheckRequest(), 0));
    auto reply = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(replyHandle);
    auto result = reply->Result;
    Cerr << result.ShortDebugString();

    UNIT_ASSERT(HasDeadTabletIssue(result));
}

// Creates one PQ tablet while TEvProcessBootQueue events are intercepted,
// advances the clock by five minutes and expects the health check NOT to
// report a dead-tablet issue.
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
    TPortManager portManager;
    const ui16 serverPort = portManager.GetPort(2134);
    const ui16 grpcPort = portManager.GetPort(2135);
    auto serverSettings = TServerSettings(serverPort)
        .SetNodeCount(2)
        .SetDynamicNodeCount(1)
        .SetUseRealThreads(false)
        .SetDomainName("Root");
    TServer server(serverSettings);
    server.EnableGRpc(grpcPort);

    TClient client(serverSettings);

    TTestActorRuntime* runtime = server.GetRuntime();
    const TActorId edgeActor = runtime->AllocateEdgeActor();

    // The observer resets every TEvProcessBootQueue event it sees.
    // NOTE(review): presumably this drops the event so Hive never processes its
    // boot queue and the tablet stays in its initial booting state - confirm.
    // The returned holder is kept in a local for the rest of the test.
    auto bootQueueBlocker = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });

    server.SetupDynamicLocalService(2, "Root");
    // The second argument is the `wait` flag: false skips waiting for the tablet creation result.
    server.StartPQTablets(1, false);
    runtime->AdvanceCurrentTime(TDuration::Minutes(5));

    // Ask the health check service for a self-check and capture the reply on the edge actor.
    TAutoPtr<IEventHandle> replyHandle;
    const auto healthCheckId = NHealthCheck::MakeHealthCheckID();
    runtime->Send(new IEventHandle(healthCheckId, edgeActor, new NHealthCheck::TEvSelfCheckRequest(), 0));
    auto reply = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(replyHandle);
    auto result = reply->Result;
    Cerr << result.ShortDebugString();

    UNIT_ASSERT(!HasDeadTabletIssue(result));
}

// Starts one PQ tablet and waits for it, then intercepts TEvProcessBootQueue
// events, destroys the first dynamic local service, advances the clock by
// five minutes and expects the health check to report a dead-tablet issue.
Y_UNIT_TEST(TestReBootingTabletIsDead) {
    TPortManager portManager;
    const ui16 serverPort = portManager.GetPort(2134);
    const ui16 grpcPort = portManager.GetPort(2135);
    auto serverSettings = TServerSettings(serverPort)
        .SetNodeCount(2)
        .SetDynamicNodeCount(2)
        .SetUseRealThreads(false)
        .SetDomainName("Root");
    TServer server(serverSettings);
    server.EnableGRpc(grpcPort);

    TClient client(serverSettings);

    TTestActorRuntime* runtime = server.GetRuntime();
    runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
    const TActorId edgeActor = runtime->AllocateEdgeActor();

    server.SetupDynamicLocalService(2, "Root");
    // wait == true: block until the tablet creation result arrives.
    server.StartPQTablets(1, true);
    server.SetupDynamicLocalService(3, "Root");
    // The observer is installed only after the tablet has been started once;
    // from here on every TEvProcessBootQueue event is reset.
    // NOTE(review): presumably this prevents the tablet from being re-booted
    // on the remaining local service after the first one is destroyed - confirm.
    auto bootQueueBlocker = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
    server.DestroyDynamicLocalService(2);
    runtime->AdvanceCurrentTime(TDuration::Minutes(5));

    // Ask the health check service for a self-check and capture the reply on the edge actor.
    TAutoPtr<IEventHandle> replyHandle;
    const auto healthCheckId = NHealthCheck::MakeHealthCheckID();
    runtime->Send(new IEventHandle(healthCheckId, edgeActor, new NHealthCheck::TEvSelfCheckRequest(), 0));
    auto reply = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(replyHandle);
    auto result = reply->Result;
    Cerr << result.ShortDebugString();

    UNIT_ASSERT(HasDeadTabletIssue(result));
}
}
}
5 changes: 5 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects)
if (tablet == nullptr) {
continue;
}
tablet->InWaitQueue = false;
if (tablet->IsAlive()) {
BLOG_D("tablet " << record.TabletId << " already alive, skipping");
continue;
Expand All @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects)
}
tablet->ActorsToNotifyOnRestart.clear();
BootQueue.AddToWaitQueue(record); // waiting for new node
tablet->InWaitQueue = true;
continue;
}
}
Expand Down Expand Up @@ -1878,6 +1880,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
}
if (info->InWaitQueue) {
tabletInfo.SetInWaitQueue(true);
}
if (req.GetReturnChannelHistory()) {
for (const auto& channel : info->TabletStorageInfo->Channels) {
auto& tabletChannel = *tabletInfo.AddTabletChannels();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ struct TTabletInfo {
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
TInstant BootTime;
TNodeFilter NodeFilter;
bool InWaitQueue = false;

TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ message TTabletInfo {
optional uint32 RestartsPerPeriod = 22;
optional uint64 LastAliveTimestamp = 23;
optional EBalancerPolicy BalancerPolicy = 24;
optional bool InWaitQueue = 25;
}

message TEvSeizeTabletsReply {
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/testlib/test_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ namespace Tests {
app.AddDomain(domain.Release());
}

TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN) {
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN, bool wait) {
auto getChannelBind = [](const TString& storagePool) {
TChannelBind bind;
bind.SetStoragePoolName(storagePool);
Expand Down Expand Up @@ -556,7 +556,7 @@ namespace Tests {
UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId,
createTabletReply->Record.GetOwner() << " != " << tabletId);
ui64 id = createTabletReply->Record.GetTabletID();
while (true) {
while (wait) {
auto tabletCreationResult =
Runtime->GrabEdgeEventRethrow<TEvHive::TEvTabletCreationResult>(handle);
UNIT_ASSERT(tabletCreationResult);
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/testlib/test_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ namespace Tests {
}
}
void StartDummyTablets();
TVector<ui64> StartPQTablets(ui32 pqTabletsN);
TVector<ui64> StartPQTablets(ui32 pqTabletsN, bool wait = true);
TTestActorRuntime* GetRuntime() const;
const TServerSettings& GetSettings() const;
const NScheme::TTypeRegistry* GetTypeRegistry();
Expand Down
Loading