Skip to content

Commit e1dbd5e

Browse files
committed
checkoint coordinator: handle failure to save zero checkpoint
1 parent 44b7587 commit e1dbd5e

File tree

2 files changed

+4
-1
lines changed

2 files changed

+4
-1
lines changed

ydb/core/fq/libs/checkpointing/checkpoint_coordinator.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -372,11 +372,12 @@ void TCheckpointCoordinator::Handle(const TEvCheckpointCoordinator::TEvScheduleC
372372
CC_LOG_D("Got TEvScheduleCheckpointing");
373373
ScheduleNextCheckpoint();
374374
const auto checkpointsInFly = PendingCheckpoints.size() + PendingCommitCheckpoints.size();
375-
if (checkpointsInFly >= Settings.GetMaxInflight() || InitingZeroCheckpoint) {
375+
if (checkpointsInFly >= Settings.GetMaxInflight() || (InitingZeroCheckpoint && !FailedZeroCheckpoint)) {
376376
CC_LOG_W("Skip schedule checkpoint event since inflight checkpoint limit exceeded: current: " << checkpointsInFly << ", limit: " << Settings.GetMaxInflight());
377377
Metrics.SkippedDueToInFlightLimit->Inc();
378378
return;
379379
}
380+
FailedZeroCheckpoint = false;
380381
Metrics.SkippedDueToInFlightLimit->Set(0);
381382
InitCheckpoint();
382383
}
@@ -470,6 +471,7 @@ void TCheckpointCoordinator::Handle(const NYql::NDq::TEvDqCompute::TEvSaveTaskSt
470471
CC_LOG_E("[" << checkpointId << "] Got all acks for aborted checkpoint, aborting in storage");
471472
CheckpointingSnapshotRotationIndex = CheckpointingSnapshotRotationPeriod; // Next checkpoint is snapshot.
472473
Send(StorageProxy, new TEvCheckpointStorage::TEvAbortCheckpointRequest(CoordinatorId, checkpointId, "Can't save node state"), IEventHandle::FlagTrackDelivery);
474+
FailedZeroCheckpoint = InitingZeroCheckpoint;
473475
} else {
474476
CC_LOG_I("[" << checkpointId << "] Got all acks, changing checkpoint status to 'PendingCommit'");
475477
Send(StorageProxy, new TEvCheckpointStorage::TEvSetCheckpointPendingCommitStatusRequest(CoordinatorId, checkpointId, checkpoint.GetStats().StateSize), IEventHandle::FlagTrackDelivery);

ydb/core/fq/libs/checkpointing/checkpoint_coordinator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ class TCheckpointCoordinator : public NYql::TTaskControllerImpl<TCheckpointCoord
193193
std::unique_ptr<TPendingInitCoordinator> PendingInit;
194194
bool GraphIsRunning = false;
195195
bool InitingZeroCheckpoint = false;
196+
bool FailedZeroCheckpoint = false;
196197
bool RestoringFromForeignCheckpoint = false;
197198

198199
TCheckpointCoordinatorMetrics Metrics;

0 commit comments

Comments
 (0)