@@ -372,11 +372,12 @@ void TCheckpointCoordinator::Handle(const TEvCheckpointCoordinator::TEvScheduleC
372
372
CC_LOG_D (" Got TEvScheduleCheckpointing" );
373
373
ScheduleNextCheckpoint ();
374
374
const auto checkpointsInFly = PendingCheckpoints.size () + PendingCommitCheckpoints.size ();
375
- if (checkpointsInFly >= Settings.GetMaxInflight () || InitingZeroCheckpoint) {
375
+ if (checkpointsInFly >= Settings.GetMaxInflight () || ( InitingZeroCheckpoint && !FailedZeroCheckpoint) ) {
376
376
CC_LOG_W (" Skip schedule checkpoint event since inflight checkpoint limit exceeded: current: " << checkpointsInFly << " , limit: " << Settings.GetMaxInflight ());
377
377
Metrics.SkippedDueToInFlightLimit ->Inc ();
378
378
return ;
379
379
}
380
+ FailedZeroCheckpoint = false ;
380
381
Metrics.SkippedDueToInFlightLimit ->Set (0 );
381
382
InitCheckpoint ();
382
383
}
@@ -389,6 +390,7 @@ void TCheckpointCoordinator::Handle(const TEvCheckpointStorage::TEvCreateCheckpo
389
390
if (issues) {
390
391
CC_LOG_E (" [" << checkpointId << " ] StorageError: can't create checkpoint: " << issues.ToOneLineString ());
391
392
PendingCheckpoints.erase (checkpointId);
393
+ FailedZeroCheckpoint = InitingZeroCheckpoint;
392
394
UpdateInProgressMetric ();
393
395
++*Metrics.FailedToCreate ;
394
396
++*Metrics.StorageError ;
@@ -470,6 +472,7 @@ void TCheckpointCoordinator::Handle(const NYql::NDq::TEvDqCompute::TEvSaveTaskSt
470
472
CC_LOG_E (" [" << checkpointId << " ] Got all acks for aborted checkpoint, aborting in storage" );
471
473
CheckpointingSnapshotRotationIndex = CheckpointingSnapshotRotationPeriod; // Next checkpoint is snapshot.
472
474
Send (StorageProxy, new TEvCheckpointStorage::TEvAbortCheckpointRequest (CoordinatorId, checkpointId, " Can't save node state" ), IEventHandle::FlagTrackDelivery);
475
+ FailedZeroCheckpoint = InitingZeroCheckpoint;
473
476
} else {
474
477
CC_LOG_I (" [" << checkpointId << " ] Got all acks, changing checkpoint status to 'PendingCommit'" );
475
478
Send (StorageProxy, new TEvCheckpointStorage::TEvSetCheckpointPendingCommitStatusRequest (CoordinatorId, checkpointId, checkpoint.GetStats ().StateSize ), IEventHandle::FlagTrackDelivery);
@@ -494,6 +497,7 @@ void TCheckpointCoordinator::Handle(const TEvCheckpointStorage::TEvSetCheckpoint
494
497
CC_LOG_E (" [" << checkpointId << " ] StorageError: can't change checkpoint status to 'PendingCommit': " << issues.ToString ());
495
498
++*Metrics.StorageError ;
496
499
PendingCheckpoints.erase (it);
500
+ FailedZeroCheckpoint = InitingZeroCheckpoint;
497
501
return ;
498
502
}
499
503
@@ -571,6 +575,7 @@ void TCheckpointCoordinator::Handle(const TEvCheckpointStorage::TEvAbortCheckpoi
571
575
++*Metrics.Aborted ;
572
576
}
573
577
PendingCheckpoints.erase (checkpointId);
578
+ FailedZeroCheckpoint = InitingZeroCheckpoint;
574
579
PendingCommitCheckpoints.erase (checkpointId);
575
580
UpdateInProgressMetric ();
576
581
}
@@ -616,6 +621,8 @@ void TCheckpointCoordinator::Handle(NActors::TEvents::TEvPoison::TPtr& ev) {
616
621
}
617
622
618
623
void TCheckpointCoordinator::Handle (const TEvCheckpointCoordinator::TEvRunGraph::TPtr&) {
624
+ Y_DEBUG_ABORT_UNLESS (InitingZeroCheckpoint);
625
+ Y_DEBUG_ABORT_UNLESS (!FailedZeroCheckpoint);
619
626
InitingZeroCheckpoint = false ;
620
627
// TODO: run graph only now, not before zero checkpoint inited
621
628
}
0 commit comments