Skip to content

Commit d467973

Browse files
authored
observability for tablet starts (#6584)
1 parent e7d4e27 commit d467973

File tree

6 files changed

+42
-0
lines changed

6 files changed

+42
-0
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,6 +1688,14 @@ void THive::UpdateCounterNodesConnected(i64 nodesConnectedDiff) {
16881688
}
16891689
}
16901690

1691+
// Adjusts the "TabletsStarting" simple counter by a signed delta.
//
// tabletsStartingDiff: value added to the counter's current reading. Visible
// callers pass +1 once a boot request has been issued and -1 once the tablet
// reports a successful start.
//
// The call is a no-op while TabletCounters is null.
// NOTE(review): nothing here guards against the sum dropping below zero;
// confirm increments and decrements are always paired by the callers.
void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& startingGauge = TabletCounters->Simple()[NHive::COUNTER_TABLETS_STARTING];
    startingGauge.Set(startingGauge.Get() + tabletsStartingDiff);
}
1698+
16911699
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
16921700
TabletMoveHistory.PushBack(moveInfo);
16931701
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);

ydb/core/mind/hive/hive_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
649649
void UpdateCounterBootQueueSize(ui64 bootQueueSize);
650650
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
651651
void UpdateCounterNodesConnected(i64 nodesConnectedDiff);
652+
// Adds tabletsStartingDiff (may be negative) to the TabletsStarting simple counter.
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
652653
void RecordTabletMove(const TTabletMoveInfo& info);
653654
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
654655
void ProcessBootQueue();

ydb/core/mind/hive/tablet_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ struct TTabletInfo {
162162
TInstant PostponedStart;
163163
EBalancerPolicy BalancerPolicy;
164164
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
165+
TInstant BootTime;
165166

166167
TTabletInfo(ETabletRole role, THive& hive);
167168
TTabletInfo(const TTabletInfo&) = delete;

ydb/core/mind/hive/tx__start_tablet.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
1010
ui64 Cookie;
1111
bool External;
1212
TSideEffects SideEffects;
13+
bool Success;
1314

1415
public:
1516
TTxStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external, THive *hive)
@@ -23,10 +24,12 @@ class TTxStartTablet : public TTransactionBase<THive> {
2324
TTxType GetTxType() const override { return NHive::TXTYPE_START_TABLET; }
2425

2526
bool Execute(TTransactionContext& txc, const TActorContext&) override {
27+
Success = false;
2628
SideEffects.Reset(Self->SelfId());
2729
BLOG_D("THive::TTxStartTablet::Execute Tablet " << TabletId);
2830
TTabletInfo* tablet = Self->FindTablet(TabletId);
2931
if (tablet != nullptr) {
32+
tablet->BootTime = TActivationContext::Now();
3033
// finish fast-move operation
3134
if (tablet->LastNodeId != 0 && tablet->LastNodeId != Local.NodeId()) {
3235
TNodeInfo* lastNode = Self->FindNode(tablet->LastNodeId);
@@ -65,6 +68,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
6568
new TEvLocal::TEvBootTablet(*leader.TabletStorageInfo, promotableFollowerId, leader.KnownGeneration),
6669
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
6770
Cookie);
71+
Success = true;
6872
return true;
6973
} else {
7074
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << leader.ToString() << ") - wrong state or node");
@@ -79,6 +83,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
7983
new TEvLocal::TEvBootTablet(*follower.LeaderTablet.TabletStorageInfo, follower.Id),
8084
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
8185
Cookie);
86+
Success = true;
8287
return true;
8388
} else {
8489
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << follower.ToString() << ") - wrong state or node");
@@ -108,6 +113,9 @@ class TTxStartTablet : public TTransactionBase<THive> {
108113
void Complete(const TActorContext& ctx) override {
109114
BLOG_D("THive::TTxStartTablet::Complete Tablet " << TabletId << " SideEffects: " << SideEffects);
110115
SideEffects.Complete(ctx);
116+
if (Success) {
117+
Self->UpdateCounterTabletsStarting(+1);
118+
}
111119
}
112120
};
113121

ydb/core/mind/hive/tx__update_tablet_status.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,14 @@ class TTxUpdateTabletStatus : public TTransactionBase<THive> {
8080
if (Status == TEvLocal::TEvTabletStatus::StatusOk) {
8181
tablet->Statistics.AddRestartTimestamp(now.MilliSeconds());
8282
tablet->ActualizeTabletStatistics(now);
83+
if (tablet->BootTime != TInstant()) {
84+
TDuration startTime = now - tablet->BootTime;
85+
if (startTime > TDuration::Seconds(30)) {
86+
BLOG_W("Tablet " << tablet->GetFullTabletId() << " was starting for " << startTime.Seconds() << " seconds");
87+
}
88+
Self->TabletCounters->Percentile()[NHive::COUNTER_TABLETS_START_TIME].IncrementFor(startTime.MilliSeconds());
89+
Self->UpdateCounterTabletsStarting(-1);
90+
}
8391
TNodeInfo* node = Self->FindNode(Local.NodeId());
8492
if (node == nullptr) {
8593
// event from IC about disconnection of the node could overtake events from the node itself because of Pipe Server

ydb/core/protos/counters_hive.proto

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enum ESimpleCounters {
2929
COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}];
3030
COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}];
3131
COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}];
32+
COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}];
3233
}
3334

3435
enum ECumulativeCounters {
@@ -75,6 +76,21 @@ enum EPercentileCounters {
7576
Ranges: { Value: 95 Name: "95%" },
7677
Ranges: { Value: 100 Name: "100%" },
7778
}];
79+
80+
COUNTER_TABLETS_START_TIME = 2 [(CounterOpts) = {
81+
Name: "TabletsStartTimeMs",
82+
Ranges: { Value: 1 }
83+
Ranges: { Value: 5 }
84+
Ranges: { Value: 10 }
85+
Ranges: { Value: 50 }
86+
Ranges: { Value: 100 }
87+
Ranges: { Value: 500 }
88+
Ranges: { Value: 1000 }
89+
Ranges: { Value: 5000 }
90+
Ranges: { Value: 10000 }
91+
Ranges: { Value: 30000 }
92+
Ranges: { Value: 60000 }
93+
}];
7894
}
7995

8096
enum ETxTypes {

0 commit comments

Comments
 (0)