@@ -43,6 +43,7 @@ namespace NKikimr::NBsController {
43
43
std::shared_ptr<TBlobStorageGroupInfo::TTopology> Topology;
44
44
TBlobStorageGroupInfo::TGroupVDisks FailedGroupDisks;
45
45
const bool IsSelfHealReasonDecommit;
46
+ const bool IgnoreDegradedGroupsChecks;
46
47
const bool DonorMode;
47
48
THashSet<TVDiskID> PendingVDisks;
48
49
THashMap<TActorId, TVDiskID> ActorToDiskMap;
@@ -51,14 +52,15 @@ namespace NKikimr::NBsController {
51
52
public:
52
53
// Constructor: stores each argument in the member of the same name. `group` and `topology` are
// taken by value and moved into the members; the remaining arguments are copied.
// The '-' / '+' lines below are the removed / added sides of the diff: the change inserts the
// ignoreDegradedGroupsChecks parameter between isSelfHealReasonDecommit and donorMode.
TReassignerActor (TActorId controllerId, TGroupId groupId, TEvControllerUpdateSelfHealInfo::TGroupContent group,
53
54
std::optional<TVDiskID> vdiskToReplace, std::shared_ptr<TBlobStorageGroupInfo::TTopology> topology,
54
- bool isSelfHealReasonDecommit, bool donorMode)
55
+ bool isSelfHealReasonDecommit, bool ignoreDegradedGroupsChecks, bool donorMode)
55
56
: ControllerId(controllerId)
56
57
, GroupId(groupId)
57
58
, Group(std::move(group))
58
59
, VDiskToReplace(vdiskToReplace)
59
60
, Topology(std::move(topology))
60
61
// Built from the member Topology, not the (already moved-from) parameter `topology`; this
// relies on Topology being declared before FailedGroupDisks in the class.
, FailedGroupDisks(Topology.get())
61
62
, IsSelfHealReasonDecommit(isSelfHealReasonDecommit)
63
// Forwarded to the outgoing request through SetIgnoreDegradedGroupsChecks, only when true.
+ , IgnoreDegradedGroupsChecks(ignoreDegradedGroupsChecks)
62
64
, DonorMode(donorMode)
63
65
{}
64
66
@@ -166,6 +168,9 @@ namespace NKikimr::NBsController {
166
168
request->SetIgnoreGroupReserve (true );
167
169
request->SetSettleOnlyOnOperationalDisks (true );
168
170
request->SetIsSelfHealReasonDecommit (IsSelfHealReasonDecommit);
171
+ if (IgnoreDegradedGroupsChecks) {
172
+ request->SetIgnoreDegradedGroupsChecks (IgnoreDegradedGroupsChecks);
173
+ }
169
174
request->SetAllowUnusableDisks (true );
170
175
if (VDiskToReplace) {
171
176
ev->SelfHeal = true ;
@@ -278,6 +283,7 @@ namespace NKikimr::NBsController {
278
283
bool AllowMultipleRealmsOccupation;
279
284
bool DonorMode;
280
285
THostRecordMap HostRecords;
286
+ std::shared_ptr<TControlWrapper> EnableSelfHealWithDegraded;
281
287
282
288
using TTopologyDescr = std::tuple<TBlobStorageGroupType::EErasureSpecies, ui32, ui32, ui32>;
283
289
THashMap<TTopologyDescr, std::shared_ptr<TBlobStorageGroupInfo::TTopology>> Topologies;
@@ -289,13 +295,15 @@ namespace NKikimr::NBsController {
289
295
290
296
public:
291
297
// Constructor: stores each argument in the member of the same name. The shared pointers and
// hostRecords are taken by value and moved into the members; the flags are copied.
// The '-' / '+' lines below are the removed / added sides of the diff: the change appends the
// enableSelfHealWithDegraded parameter after donorMode.
TSelfHealActor (ui64 tabletId, std::shared_ptr<std::atomic_uint64_t > unreassignableGroups, THostRecordMap hostRecords,
291
- bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode)
298
+ bool groupLayoutSanitizerEnabled, bool allowMultipleRealmsOccupation, bool donorMode,
299
+ std::shared_ptr<TControlWrapper> enableSelfHealWithDegraded)
293
300
: TabletId(tabletId)
294
301
, UnreassignableGroups(std::move(unreassignableGroups))
295
302
, GroupLayoutSanitizerEnabled(groupLayoutSanitizerEnabled)
296
303
, AllowMultipleRealmsOccupation(allowMultipleRealmsOccupation)
297
304
, DonorMode(donorMode)
298
305
, HostRecords(std::move(hostRecords))
306
// NOTE(review): FindVDiskToReplace dereferences this pointer (`*EnableSelfHealWithDegraded`)
// without a null check when the group is degraded; confirm callers never pass an empty pointer.
+ , EnableSelfHealWithDegraded(std::move(enableSelfHealWithDegraded))
299
307
{}
300
308
301
309
void Bootstrap (const TActorId& parentId) {
@@ -427,9 +435,11 @@ namespace NKikimr::NBsController {
427
435
428
436
// check if it is possible to move anything out
429
437
bool isSelfHealReasonDecommit;
430
- if (const auto v = FindVDiskToReplace (group.Content , now, group.Topology .get (), &isSelfHealReasonDecommit)) {
438
+ bool ignoreDegradedGroupsChecks;
439
+ if (const auto v = FindVDiskToReplace (group.Content , now, group.Topology .get (), &isSelfHealReasonDecommit,
440
+ &ignoreDegradedGroupsChecks)) {
431
441
group.ReassignerActorId = Register (new TReassignerActor (ControllerId, group.GroupId , group.Content ,
432
- *v, group.Topology , isSelfHealReasonDecommit, DonorMode));
442
+ *v, group.Topology , isSelfHealReasonDecommit, ignoreDegradedGroupsChecks, DonorMode));
433
443
} else {
434
444
++counter; // this group can't be reassigned right now
435
445
@@ -484,7 +494,8 @@ namespace NKikimr::NBsController {
484
494
ADD_RECORD_WITH_TIMESTAMP_TO_OPERATION_LOG (GroupLayoutSanitizerOperationLog,
485
495
" Start sanitizing GroupId# " << group.GroupId << " GroupGeneration# " << group.Content .Generation );
486
496
group.ReassignerActorId = Register (new TReassignerActor (ControllerId, group.GroupId , group.Content ,
487
- std::nullopt, group.Topology , false /* isSelfHealReasonDecommit*/ , DonorMode));
497
+ std::nullopt, group.Topology , false /* isSelfHealReasonDecommit*/ ,
498
+ false /* ignoreDegradedGroupsChecks*/ , DonorMode));
488
499
}
489
500
}
490
501
}
@@ -534,7 +545,8 @@ namespace NKikimr::NBsController {
534
545
}
535
546
536
547
std::optional<TVDiskID> FindVDiskToReplace (const TEvControllerUpdateSelfHealInfo::TGroupContent& content,
537
- TMonotonic now, TBlobStorageGroupInfo::TTopology *topology, bool *isSelfHealReasonDecommit) {
548
+ TMonotonic now, TBlobStorageGroupInfo::TTopology *topology, bool *isSelfHealReasonDecommit,
549
+ bool *ignoreDegradedGroupsChecks) {
538
550
// main idea of selfhealing is step-by-step healing of bad group; we can allow healing of group with more
539
551
// than one disk missing, but we should not move next faulty disk until previous one is replicated, at least
540
552
// partially (meaning only phantoms left)
@@ -553,7 +565,7 @@ namespace NKikimr::NBsController {
553
565
}
554
566
[[fallthrough]];
555
567
case NKikimrBlobStorage::EVDiskStatus::INIT_PENDING:
556
- return std::nullopt; // don't touch group with replicating disks
568
+ return std::nullopt; // don't touch group with replicating or starting disks
557
569
558
570
default :
559
571
break ;
@@ -579,6 +591,7 @@ namespace NKikimr::NBsController {
579
591
continue ; // this group will become degraded when applying self-heal logic, skip disk
580
592
}
581
593
*isSelfHealReasonDecommit = vdisk.IsSelfHealReasonDecommit ;
594
+ *ignoreDegradedGroupsChecks = checker.IsDegraded (failedByReadiness) && *EnableSelfHealWithDegraded;
582
595
return vdiskId;
583
596
}
584
597
}
@@ -886,7 +899,7 @@ namespace NKikimr::NBsController {
886
899
// Allocates a TSelfHealActor configured from this controller's current state and returns it as a
// raw IActor pointer. Aborts if HostRecords is not set.
// NOTE(review): the result comes from a bare `new`; presumably the caller registers the actor and
// thereby takes ownership — confirm at the call site.
IActor *TBlobStorageController::CreateSelfHealActor () {
887
900
// HostRecords is passed to the actor below, so it must already be set.
Y_ABORT_UNLESS (HostRecords);
888
901
return new TSelfHealActor (TabletID (), SelfHealUnreassignableGroups, HostRecords, GroupLayoutSanitizerEnabled,
889
// The '-' / '+' pair below is the diff: the change additionally passes EnableSelfHealWithDegraded.
- AllowMultipleRealmsOccupation, DonorMode);
902
+ AllowMultipleRealmsOccupation, DonorMode, EnableSelfHealWithDegraded );
890
903
}
891
904
892
905
void TBlobStorageController::InitializeSelfHealState () {
0 commit comments