@@ -460,18 +460,23 @@ void TCheckpointCoordinator::Handle(const NYql::NDq::TEvDqCompute::TEvSaveTaskSt
460460 if (status == NYql::NDqProto::TEvSaveTaskStateResult::OK) {
461461 checkpoint.Acknowledge (ev->Sender , proto.GetStateSizeBytes ());
462462 CC_LOG_D (" [" << checkpointId << " ] Task state saved, need " << checkpoint.NotYetAcknowledgedCount () << " more acks" );
463- if (checkpoint.GotAllAcknowledges ()) {
463+ } else {
464+ checkpoint.Abort (ev->Sender );
465+ CC_LOG_E (" [" << checkpointId << " ] StorageError: can't save node state, aborting checkpoint" );
466+ ++*Metrics.StorageError ;
467+ }
468+ if (checkpoint.GotAllAcknowledges ()) {
469+ if (checkpoint.GetStats ().Aborted ) {
470+ CC_LOG_E (" [" << checkpointId << " ] Got all acks for aborted checkpoint, aborting in storage" );
471+ CheckpointingSnapshotRotationIndex = CheckpointingSnapshotRotationPeriod; // Next checkpoint is snapshot.
472+ Send (StorageProxy, new TEvCheckpointStorage::TEvAbortCheckpointRequest (CoordinatorId, checkpointId, " Can't save node state" ), IEventHandle::FlagTrackDelivery);
473+ } else {
464474 CC_LOG_I (" [" << checkpointId << " ] Got all acks, changing checkpoint status to 'PendingCommit'" );
465475 Send (StorageProxy, new TEvCheckpointStorage::TEvSetCheckpointPendingCommitStatusRequest (CoordinatorId, checkpointId, checkpoint.GetStats ().StateSize ), IEventHandle::FlagTrackDelivery);
466476 if (InitingZeroCheckpoint) {
467477 Send (RunActorId, new TEvCheckpointCoordinator::TEvZeroCheckpointDone ());
468478 }
469479 }
470- } else {
471- CC_LOG_E (" [" << checkpointId << " ] StorageError: can't save node state, aborting checkpoint" );
472- ++*Metrics.StorageError ;
473- CheckpointingSnapshotRotationIndex = CheckpointingSnapshotRotationPeriod; // Next checkpoint is snapshot.
474- Send (StorageProxy, new TEvCheckpointStorage::TEvAbortCheckpointRequest (CoordinatorId, checkpointId, " Can't save node state" ), IEventHandle::FlagTrackDelivery);
475480 }
476481}
477482
0 commit comments