Skip to content

Commit b51a904

Browse files
ivanmorozov333zverevgeny
authored andcommitted
additional signals for commit timings control (ydb-platform#8729)
1 parent f86dce7 commit b51a904

File tree

7 files changed

+87
-36
lines changed

7 files changed

+87
-36
lines changed

ydb/core/tx/columnshard/columnshard__progress_tx.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class TColumnShard::TTxProgressTx: public TTransactionBase<TColumnShard> {
1414
const ui32 TabletTxNo;
1515
std::optional<NOlap::TSnapshot> LastCompletedTx;
1616
std::optional<TTxController::TPlanQueueItem> PlannedQueueItem;
17+
std::optional<TMonotonic> StartExecution;
1718

1819
public:
1920
TTxProgressTx(TColumnShard* self)
@@ -54,6 +55,7 @@ class TColumnShard::TTxProgressTx: public TTransactionBase<TColumnShard> {
5455
} else {
5556
Self->ProgressTxController->PopFirstPlannedTx();
5657
}
58+
StartExecution = TMonotonic::Now();
5759

5860
LastCompletedTx = NOlap::TSnapshot(step, txId);
5961
if (LastCompletedTx > Self->LastCompletedTx) {
@@ -84,11 +86,17 @@ class TColumnShard::TTxProgressTx: public TTransactionBase<TColumnShard> {
8486
Self->RescheduleWaitingReads();
8587
}
8688
if (PlannedQueueItem) {
89+
AFL_VERIFY(TxOperator);
90+
Self->GetProgressTxController().GetCounters().OnTxProgressLag(
91+
TxOperator->GetOpType(), TMonotonic::Now() - TMonotonic::MilliSeconds(PlannedQueueItem->Step));
8792
Self->GetProgressTxController().ProgressOnComplete(*PlannedQueueItem);
8893
}
8994
if (LastCompletedTx) {
9095
Self->LastCompletedTx = std::max(*LastCompletedTx, Self->LastCompletedTx);
9196
}
97+
if (StartExecution) {
98+
Self->GetProgressTxController().GetCounters().OnTxProgressDuration(TxOperator->GetOpType(), TMonotonic::Now() - *StartExecution);
99+
}
92100
Self->SetupIndexation();
93101
}
94102
};

ydb/core/tx/columnshard/counters/columnshard.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ TCSCounters::TCSCounters()
5353
HistogramSuccessWriteMiddle6PutBlobsDurationMs = TBase::GetHistogram("SuccessWriteMiddle6PutBlobsDurationMs", NMonitoring::ExponentialHistogram(18, 2, 5));
5454
HistogramFailedWritePutBlobsDurationMs = TBase::GetHistogram("FailedWritePutBlobsDurationMs", NMonitoring::ExponentialHistogram(18, 2, 5));
5555
HistogramWriteTxCompleteDurationMs = TBase::GetHistogram("WriteTxCompleteDurationMs", NMonitoring::ExponentialHistogram(18, 2, 5));
56+
5657
WritePutBlobsCount = TBase::GetValue("WritePutBlobs");
5758
WriteRequests = TBase::GetValue("WriteRequests");
5859

ydb/core/tx/columnshard/counters/columnshard.h

-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ class TCSCounters: public TCommonCountersOwner {
7272
NMonitoring::TDynamicCounters::TCounterPtr WriteRequests;
7373
THashMap<EWriteFailReason, NMonitoring::TDynamicCounters::TCounterPtr> FailedWriteRequests;
7474
NMonitoring::TDynamicCounters::TCounterPtr SuccessWriteRequests;
75-
7675
public:
7776
const TCSInitialization Initialization;
7877
TTxProgressCounters TxProgress;

ydb/core/tx/columnshard/counters/tx_progress.h

+13-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ class TTxProgressCounters: public TCommonCountersOwner {
2424
NMonitoring::TDynamicCounters::TCounterPtr FinishProposeOnComplete;
2525
NMonitoring::TDynamicCounters::TCounterPtr FinishPlannedTx;
2626
NMonitoring::TDynamicCounters::TCounterPtr AbortTx;
27+
NMonitoring::THistogramPtr HistogramTxProgressDuration;
28+
NMonitoring::THistogramPtr HistogramTxProgressLag;
2729

2830
TProgressCounters(const TCommonCountersOwner& owner)
2931
: TBase(owner)
@@ -34,13 +36,23 @@ class TTxProgressCounters: public TCommonCountersOwner {
3436
, FinishProposeOnExecute(TBase::GetDeriviative("FinishProposeOnExecute"))
3537
, FinishProposeOnComplete(TBase::GetDeriviative("FinishProposeOnComplete"))
3638
, FinishPlannedTx(TBase::GetDeriviative("FinishPlannedTx"))
37-
, AbortTx(TBase::GetDeriviative("AbortTx")) {
39+
, AbortTx(TBase::GetDeriviative("AbortTx"))
40+
, HistogramTxProgressDuration(TBase::GetHistogram("TxProgress/Execution/DurationMs", NMonitoring::ExponentialHistogram(18, 2, 5)))
41+
, HistogramTxProgressLag(TBase::GetHistogram("TxProgress/LagOnComplete/DurationMs", NMonitoring::ExponentialHistogram(18, 2, 5))) {
3842
}
3943
};
4044

4145
THashMap<TOpType, TProgressCounters> CountersByOpType;
4246

4347
public:
48+
void OnTxProgressDuration(const TString& opType, const TDuration d) {
49+
GetSubGroup(opType).HistogramTxProgressDuration->Collect(d.MilliSeconds());
50+
}
51+
52+
void OnTxProgressLag(const TString& opType, const TDuration d) {
53+
GetSubGroup(opType).HistogramTxProgressLag->Collect(d.MilliSeconds());
54+
}
55+
4456
void OnRegisterTx(const TOpType& opType) {
4557
GetSubGroup(opType).RegisterTx->Add(1);
4658
}

ydb/core/tx/columnshard/transactions/tx_controller.h

+3
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,9 @@ class TTxController {
381381
DoOnTabletInit(owner);
382382
}
383383
};
384+
TTxProgressCounters& GetCounters() {
385+
return Counters;
386+
}
384387

385388
private:
386389
const TDuration MaxCommitTxDelay = TDuration::Seconds(30);

ydb/core/tx/tx_proxy/upload_rows_common_impl.cpp

+4-12
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ namespace NKikimr {
1212
RowsCount = TBase::GetDeriviative("Rows/Count");
1313
PackageSize = TBase::GetHistogram("Rows/PackageSize", NMonitoring::ExponentialHistogram(15, 2, 10));
1414

15-
DurationToStartCommit = TBase::GetHistogram("ToStartCommit/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
16-
DurationToFinishCommit = TBase::GetHistogram("ToFinishCommit/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
17-
DurationToStartWriting = TBase::GetHistogram("ToStartWriting/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
18-
DurationToTxStarted = TBase::GetHistogram("ToTxStarted/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
15+
PreparingDuration = TBase::GetHistogram("Preparing/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
16+
WritingDuration = TBase::GetHistogram("Writing/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
17+
CommitDuration = TBase::GetHistogram("Commit/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
18+
PrepareReplyDuration = TBase::GetHistogram("ToReply/DurationMs", NMonitoring::ExponentialHistogram(15, 2, 10));
1919

2020
const google::protobuf::EnumDescriptor* descriptor = ::Ydb::StatusIds::StatusCode_descriptor();
2121
for (ui32 i = 0; i < (ui32)descriptor->value_count(); ++i) {
@@ -24,12 +24,4 @@ namespace NKikimr {
2424
}
2525
}
2626

27-
void TUploadCounters::OnReply(const TDuration d, const ::Ydb::StatusIds::StatusCode code) const {
28-
const TString name = ::Ydb::StatusIds::StatusCode_Name(code);
29-
auto it = CodesCount.find(name);
30-
Y_ABORT_UNLESS(it != CodesCount.end());
31-
it->second->Add(1);
32-
ReplyDuration->Collect(d.MilliSeconds());
33-
}
34-
3527
}

ydb/core/tx/tx_proxy/upload_rows_common_impl.h

+58-22
Original file line numberDiff line numberDiff line change
@@ -45,29 +45,64 @@ class TUploadCounters: public NColumnShard::TCommonCountersOwner {
4545
NMonitoring::TDynamicCounters::TCounterPtr RowsCount;
4646
NMonitoring::THistogramPtr PackageSize;
4747

48-
NMonitoring::THistogramPtr DurationToStartCommit;
49-
NMonitoring::THistogramPtr DurationToFinishCommit;
50-
NMonitoring::THistogramPtr DurationToStartWriting;
51-
NMonitoring::THistogramPtr DurationToTxStarted;
48+
NMonitoring::THistogramPtr PreparingDuration;
49+
NMonitoring::THistogramPtr WritingDuration;
50+
NMonitoring::THistogramPtr CommitDuration;
51+
NMonitoring::THistogramPtr PrepareReplyDuration;
5252

5353
THashMap<TString, NMonitoring::TDynamicCounters::TCounterPtr> CodesCount;
5454
public:
5555
TUploadCounters();
5656

57-
void OnTxStarted(const TDuration d) const {
58-
DurationToTxStarted->Collect(d.MilliSeconds());
59-
}
57+
class TGuard: TMoveOnly {
58+
private:
59+
TMonotonic Start = TMonotonic::Now();
60+
std::optional<TMonotonic> WritingStarted;
61+
std::optional<TMonotonic> CommitStarted;
62+
std::optional<TMonotonic> CommitFinished;
63+
std::optional<TMonotonic> ReplyFinished;
64+
TUploadCounters& Owner;
65+
public:
66+
TGuard(const TMonotonic start, TUploadCounters& owner)
67+
: Start(start)
68+
, Owner(owner)
69+
{
6070

61-
void OnWritingStarted(const TDuration d) const {
62-
DurationToStartWriting->Collect(d.MilliSeconds());
63-
}
71+
}
6472

65-
void OnStartCommit(const TDuration d) const {
66-
DurationToStartCommit->Collect(d.MilliSeconds());
67-
}
73+
void OnWritingStarted() {
74+
WritingStarted = TMonotonic::Now();
75+
Owner.PreparingDuration->Collect((*WritingStarted - Start).MilliSeconds());
76+
}
77+
78+
void OnCommitStarted() {
79+
CommitStarted = TMonotonic::Now();
80+
AFL_VERIFY(WritingStarted);
81+
Owner.WritingDuration->Collect((*CommitStarted - *WritingStarted).MilliSeconds());
82+
}
6883

69-
void OnFinishCommit(const TDuration d) const {
70-
DurationToFinishCommit->Collect(d.MilliSeconds());
84+
void OnCommitFinished() {
85+
CommitFinished = TMonotonic::Now();
86+
AFL_VERIFY(CommitStarted);
87+
Owner.CommitDuration->Collect((*CommitFinished - *CommitStarted).MilliSeconds());
88+
}
89+
90+
void OnReply(const ::Ydb::StatusIds::StatusCode code) {
91+
ReplyFinished = TMonotonic::Now();
92+
if (CommitFinished) {
93+
Owner.PrepareReplyDuration->Collect((*ReplyFinished - *CommitFinished).MilliSeconds());
94+
}
95+
Owner.ReplyDuration->Collect((*ReplyFinished - Start).MilliSeconds());
96+
97+
const TString name = ::Ydb::StatusIds::StatusCode_Name(code);
98+
auto it = Owner.CodesCount.find(name);
99+
Y_ABORT_UNLESS(it != Owner.CodesCount.end());
100+
it->second->Add(1);
101+
}
102+
};
103+
104+
TGuard BuildGuard(const TMonotonic start) {
105+
return TGuard(start, *this);
71106
}
72107

73108
void OnRequest(const ui64 rowsCount) const {
@@ -76,7 +111,7 @@ class TUploadCounters: public NColumnShard::TCommonCountersOwner {
76111
PackageSize->Collect(rowsCount);
77112
}
78113

79-
void OnReply(const TDuration d, const ::Ydb::StatusIds::StatusCode code) const;
114+
void OnReply(const TDuration dFull, const TDuration dDelta, const ::Ydb::StatusIds::StatusCode code) const;
80115
};
81116

82117

@@ -169,6 +204,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
169204
TActorId LeaderPipeCache;
170205
TDuration Timeout;
171206
TInstant StartTime;
207+
std::optional<TInstant> StartCommitTime;
172208
TActorId TimeoutTimerActorId;
173209

174210
TAutoPtr<NSchemeCache::TSchemeCacheRequest> ResolvePartitionsResult;
@@ -185,7 +221,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
185221
std::shared_ptr<NYql::TIssues> Issues = std::make_shared<NYql::TIssues>();
186222
NLongTxService::TLongTxId LongTxId;
187223
TUploadCounters UploadCounters;
188-
224+
TUploadCounters::TGuard UploadCountersGuard;
189225
protected:
190226
enum class EUploadSource {
191227
ProtoValues = 0,
@@ -237,6 +273,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
237273
, LeaderPipeCache(MakePipePerNodeCacheID(false))
238274
, Timeout((timeout && timeout <= DEFAULT_TIMEOUT) ? timeout : DEFAULT_TIMEOUT)
239275
, Status(Ydb::StatusIds::SUCCESS)
276+
, UploadCountersGuard(UploadCounters.BuildGuard(TMonotonic::Now()))
240277
, DiskQuotaExceeded(diskQuotaExceeded)
241278
, Span(std::move(span))
242279
{}
@@ -762,7 +799,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
762799
}
763800

764801
void WriteToColumnTable(const NActors::TActorContext& ctx) {
765-
UploadCounters.OnWritingStarted(TAppData::TimeProvider->Now() - StartTime);
802+
UploadCountersGuard.OnWritingStarted();
766803
TString accessCheckError;
767804
if (!CheckAccess(accessCheckError)) {
768805
return ReplyWithError(Ydb::StatusIds::UNAUTHORIZED, LogPrefix() << accessCheckError, ctx);
@@ -787,7 +824,6 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
787824

788825
void Handle(NLongTxService::TEvLongTxService::TEvBeginTxResult::TPtr& ev, const TActorContext& ctx) {
789826
const auto* msg = ev->Get();
790-
UploadCounters.OnTxStarted(TAppData::TimeProvider->Now() - StartTime);
791827

792828
if (msg->Record.GetStatus() != Ydb::StatusIds::SUCCESS) {
793829
NYql::TIssues issues;
@@ -917,7 +953,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
917953
}
918954

919955
void CommitLongTx(const TActorContext& ctx) {
920-
UploadCounters.OnStartCommit(TAppData::TimeProvider->Now() - StartTime);
956+
UploadCountersGuard.OnCommitStarted();
921957
TActorId longTxServiceId = NLongTxService::MakeLongTxServiceID(ctx.SelfID.NodeId());
922958
ctx.Send(longTxServiceId, new NLongTxService::TEvLongTxService::TEvCommitTx(LongTxId), 0, 0, Span.GetTraceId());
923959
TBase::Become(&TThis::StateWaitCommitLongTx);
@@ -932,7 +968,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
932968
}
933969

934970
void Handle(NLongTxService::TEvLongTxService::TEvCommitTxResult::TPtr& ev, const NActors::TActorContext& ctx) {
935-
UploadCounters.OnFinishCommit(TAppData::TimeProvider->Now() - StartTime);
971+
UploadCountersGuard.OnCommitFinished();
936972
const auto* msg = ev->Get();
937973

938974
if (msg->Record.GetStatus() == Ydb::StatusIds::SUCCESS) {
@@ -1288,7 +1324,7 @@ class TUploadRowsBase : public TActorBootstrapped<TUploadRowsBase<DerivedActivit
12881324
}
12891325

12901326
void ReplyWithResult(::Ydb::StatusIds::StatusCode status, const TActorContext& ctx) {
1291-
UploadCounters.OnReply(TAppData::TimeProvider->Now() - StartTime, status);
1327+
UploadCountersGuard.OnReply(status);
12921328
SendResult(ctx, status);
12931329

12941330
LOG_DEBUG_S(ctx, NKikimrServices::RPC_REQUEST, LogPrefix() << "completed with status " << status);

0 commit comments

Comments
 (0)