@@ -43,6 +43,7 @@ namespace NKikimr::NBsController {
4343 std::shared_ptr<TBlobStorageGroupInfo::TTopology> Topology;
4444 TBlobStorageGroupInfo::TGroupVDisks FailedGroupDisks;
4545 const bool IsSelfHealReasonDecommit;
46+ const bool IgnoreDegradedGroupsChecks;
4647 const bool DonorMode;
4748 THashSet<TVDiskID> PendingVDisks;
4849 THashMap<TActorId, TVDiskID> ActorToDiskMap;
@@ -51,14 +52,15 @@ namespace NKikimr::NBsController {
5152 public:
// Constructor: stores each argument in the member of the same name; the body is empty.
// This hunk shows both the old signature line (without ignoreDegradedGroupsChecks) and the new one.
5253 TReassignerActor (TActorId controllerId, TGroupId groupId, TEvControllerUpdateSelfHealInfo::TGroupContent group,
5354 std::optional<TVDiskID> vdiskToReplace, std::shared_ptr<TBlobStorageGroupInfo::TTopology> topology,
54- bool isSelfHealReasonDecommit, bool donorMode)
55+ bool isSelfHealReasonDecommit, bool ignoreDegradedGroupsChecks, bool donorMode)
5556 : ControllerId(controllerId)
5657 , GroupId(groupId)
5758 , Group(std::move(group))
5859 , VDiskToReplace(vdiskToReplace)
5960 , Topology(std::move(topology))
// Reads the Topology member initialized on the previous line, not the moved-from `topology` parameter.
6061 , FailedGroupDisks(Topology.get())
6162 , IsSelfHealReasonDecommit(isSelfHealReasonDecommit)
// New flag; it is later copied into the outgoing request only when true.
63+ , IgnoreDegradedGroupsChecks(ignoreDegradedGroupsChecks)
6264 , DonorMode(donorMode)
6365 {}
6466
@@ -166,6 +168,9 @@ namespace NKikimr::NBsController {
166168 request->SetIgnoreGroupReserve (true );
167169 request->SetSettleOnlyOnOperationalDisks (true );
168170 request->SetIsSelfHealReasonDecommit (IsSelfHealReasonDecommit);
171+ if (IgnoreDegradedGroupsChecks) {
172+ request->SetIgnoreDegradedGroupsChecks (IgnoreDegradedGroupsChecks);
173+ }
169174 request->SetAllowUnusableDisks (true );
170175 if (VDiskToReplace) {
171176 ev->SelfHeal = true ;
@@ -278,6 +283,7 @@ namespace NKikimr::NBsController {
278283 bool AllowMultipleRealmsOccupation;
279284 bool DonorMode;
280285 THostRecordMap HostRecords;
286+ std::shared_ptr<TControlWrapper> EnableSelfHealWithDegraded;
281287
282288 using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>;
283289 THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies;
@@ -289,13 +295,15 @@ namespace NKikimr::NBsController {
289295
290296 public:
// Constructor: stores each argument in the member of the same name; the body is empty.
// This hunk shows the old signature tail and the new one that adds enableSelfHealWithDegraded.
291297 TSelfHealActor (ui64 tabletId, std::shared_ptr<std::atomic_uint64_t > unreassignableGroups, THostRecordMap hostRecords,
292- bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode)
298+ bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode,
299+ std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded)
293300 : TabletId(tabletId)
294301 , UnreassignableGroups(std::move(unreassignableGroups))
295302 , GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled)
296303 , AllowMultipleRealmsOccupation(allowMultipleRealmsOccupation)
297304 , DonorMode(donorMode)
298305 , HostRecords(std::move(hostRecords))
// Shared control handle; FindVDiskToReplace dereferences it when computing ignoreDegradedGroupsChecks.
// NOTE(review): presumably a runtime-tunable setting that is never null here — confirm against the caller.
306+ , EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded))
299307 {}
300308
301309 void Bootstrap (const TActorId& parentId) {
@@ -427,9 +435,11 @@ namespace NKikimr::NBsController {
427435
428436 // check if it is possible to move anything out
429437 bool isSelfHealReasonDecommit;
430- if (const auto v = FindVDiskToReplace (group.Content , now, group.Topology .get (), &isSelfHealReasonDecommit)) {
438+ bool ignoreDegradedGroupsChecks;
439+ if (const auto v = FindVDiskToReplace (group.Content , now, group.Topology .get (), &isSelfHealReasonDecommit,
440+ &ignoreDegradedGroupsChecks)) {
431441 group.ReassignerActorId = Register (new TReassignerActor (ControllerId, group.GroupId , group.Content ,
432- *v, group.Topology , isSelfHealReasonDecommit, DonorMode));
442+ *v, group.Topology , isSelfHealReasonDecommit, ignoreDegradedGroupsChecks, DonorMode));
433443 } else {
434444 ++counter; // this group can't be reassigned right now
435445
@@ -484,7 +494,8 @@ namespace NKikimr::NBsController {
484494 ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG (GroupLayoutSanitizerOperationLog,
485495 " Start sanitizing GroupId# " << group.GroupId << " GroupGeneration# " << group.Content .Generation );
486496 group.ReassignerActorId = Register (new TReassignerActor (ControllerId, group.GroupId , group.Content ,
487- std::nullopt , group.Topology , false /* isSelfHealReasonDecommit*/ , DonorMode));
497+ std::nullopt , group.Topology , false /* isSelfHealReasonDecommit*/ ,
498+ false /* ignoreDegradedGroupsChecks*/ , DonorMode));
488499 }
489500 }
490501 }
@@ -534,7 +545,8 @@ namespace NKikimr::NBsController {
534545 }
535546
536547 std::optional<TVDiskID> FindVDiskToReplace (const TEvControllerUpdateSelfHealInfo::TGroupContent& content,
537- TMonotonic now, TBlobStorageGroupInfo::TTopology *topology, bool *isSelfHealReasonDecommit) {
548+ TMonotonic now, TBlobStorageGroupInfo::TTopology *topology, bool *isSelfHealReasonDecommit,
549+ bool *ignoreDegradedGroupsChecks) {
538550 // main idea of selfhealing is step-by-step healing of bad group; we can allow healing of group with more
539551 // than one disk missing, but we should not move next faulty disk until previous one is replicated, at least
540552 // partially (meaning only phantoms left)
@@ -553,7 +565,7 @@ namespace NKikimr::NBsController {
553565 }
554566 [[fallthrough]];
555567 case NKikimrBlobStorage::EVDiskStatus::INIT_PENDING:
556- return std::nullopt ; // don't touch group with replicating disks
568+ return std::nullopt ; // don't touch group with replicating or starting disks
557569
558570 default :
559571 break ;
@@ -579,6 +591,7 @@ namespace NKikimr::NBsController {
579591 continue ; // this group will become degraded when applying self-heal logic, skip disk
580592 }
581593 *isSelfHealReasonDecommit = vdisk.IsSelfHealReasonDecommit ;
594+ *ignoreDegradedGroupsChecks = checker.IsDegraded (failedByReadiness) && *EnableSelfHealWithDegraded;
582595 return vdiskId;
583596 }
584597 }
@@ -886,7 +899,7 @@ namespace NKikimr::NBsController {
// Builds a new TSelfHealActor from the controller's current settings and returns it as a raw IActor pointer.
// NOTE(review): the caller presumably hands the pointer to the actor system, which takes ownership — confirm.
886899 IActor *TBlobStorageController::CreateSelfHealActor () {
// HostRecords must be set before the actor is created; Y_ABORT_UNLESS presumably aborts otherwise.
887900 Y_ABORT_UNLESS (HostRecords);
888901 return new TSelfHealActor (TabletID (), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled,
889- AllowMultipleRealmsOccupation, DonorMode);
// The new version also passes the controller's EnableSelfHealWithDegraded control to the actor.
902+ AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded );
890903 }
891904
892905 void TBlobStorageController::InitializeSelfHealState () {
0 commit comments