Skip to content

Commit 7a94da4

Browse files
committed
Make the BSC report on group failure model violations more detailed
1 parent 76567af commit 7a94da4

File tree

10 files changed

+59
-17
lines changed

10 files changed

+59
-17
lines changed

ydb/core/blobstorage/ut_blobstorage/restart_pdisk.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Y_UNIT_TEST_SUITE(BSCRestartPDisk) {
3535
auto& diskId = it->first;
3636

3737
NKikimrBlobStorage::TConfigRequest request;
38+
request.SetIgnoreDegradedGroupsChecks(true);
3839

3940
NKikimrBlobStorage::TRestartPDisk* cmd = request.AddCommand()->MutableRestartPDisk();
4041
auto pdiskId = cmd->MutableTargetPDiskId();
@@ -50,7 +51,7 @@ Y_UNIT_TEST_SUITE(BSCRestartPDisk) {
5051
// Restarting the third disk will not be allowed.
5152
UNIT_ASSERT_C(!response.GetSuccess(), "Restart should've been prohibited");
5253

53-
UNIT_ASSERT_STRING_CONTAINS(response.GetErrorDescription(), "ExpectedStatus# DISINTEGRATED");
54+
UNIT_ASSERT_STRING_CONTAINS(response.GetErrorDescription(), "Disintegrated");
5455
break;
5556
}
5657
}

ydb/core/cms/sentinel.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -984,6 +984,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {
984984
command.SetPDiskId(id.DiskId);
985985
command.SetStatus(info->GetStatus());
986986
}
987+
request->Record.MutableRequest()->SetIgnoreDisintegratedGroupsChecks(true);
987988

988989
NTabletPipe::SendData(SelfId(), CmsState->BSControllerPipe, request.Release(), ++SentinelState->ChangeRequestId);
989990
}

ydb/core/mind/bscontroller/cmds_box.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,11 @@ namespace NKikimr::NBsController {
212212

213213
for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) {
214214
if (slot->Group) {
215+
auto *m = VSlots.FindForUpdate(slot->VSlotId);
216+
m->Status = NKikimrBlobStorage::EVDiskStatus::INIT_PENDING;
217+
m->IsReady = false;
215218
TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID);
216-
219+
GroupFailureModelChanged.insert(slot->Group->ID);
217220
group->CalculateGroupStatus();
218221
}
219222
}

ydb/core/mind/bscontroller/cmds_drive_status.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ namespace NKikimr::NBsController {
3939
for (const auto& [id, slot] : pdisk->VSlotsOnPDisk) {
4040
if (slot->Group) {
4141
TGroupInfo *group = Groups.FindForUpdate(slot->Group->ID);
42+
GroupFailureModelChanged.insert(group->ID);
4243
group->CalculateGroupStatus();
4344
}
4445
}

ydb/core/mind/bscontroller/config.cpp

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ namespace NKikimr::NBsController {
293293

294294
bool TBlobStorageController::CommitConfigUpdates(TConfigState& state, bool suppressFailModelChecking,
295295
bool suppressDegradedGroupsChecking, bool suppressDisintegratedGroupsChecking,
296-
TTransactionContext& txc, TString *errorDescription) {
296+
TTransactionContext& txc, TString *errorDescription, NKikimrBlobStorage::TConfigResponse *response) {
297297
NIceDb::TNiceDb db(txc.DB);
298298

299299
for (TGroupId groupId : state.GroupContentChanged) {
@@ -309,16 +309,20 @@ namespace NKikimr::NBsController {
309309
}
310310
}
311311

312+
bool errors = false;
313+
std::vector<TGroupId> disintegratedByExpectedStatus;
314+
std::vector<TGroupId> disintegrated;
315+
std::vector<TGroupId> degraded;
316+
312317
if (!suppressDisintegratedGroupsChecking) {
313318
for (auto&& [base, overlay] : state.Groups.Diff()) {
314319
if (base && overlay->second) {
315320
const TGroupInfo::TGroupStatus& prev = base->second->Status;
316321
const TGroupInfo::TGroupStatus& status = overlay->second->Status;
317322
if (status.ExpectedStatus == NKikimrBlobStorage::TGroupStatus::DISINTEGRATED &&
318323
status.ExpectedStatus != prev.ExpectedStatus) { // status did really change
319-
*errorDescription = TStringBuilder() << "GroupId# " << overlay->first
320-
<< " ExpectedStatus# DISINTEGRATED";
321-
return false;
324+
disintegratedByExpectedStatus.push_back(overlay->first);
325+
errors = true;
322326
}
323327
}
324328
}
@@ -340,20 +344,44 @@ namespace NKikimr::NBsController {
340344
// check the failure model
341345
auto& checker = *topology.QuorumChecker;
342346
if (!checker.CheckFailModelForGroup(failed)) {
343-
*errorDescription = TStringBuilder() << "GroupId# " << groupId
344-
<< " may lose data while modifying group";
345-
return false;
347+
disintegrated.push_back(groupId);
348+
errors = true;
346349
} else if (!suppressDegradedGroupsChecking && checker.IsDegraded(failed)) {
347-
*errorDescription = TStringBuilder() << "GroupId# " << groupId
348-
<< " may become DEGRADED while modifying group";
349-
return false;
350+
degraded.push_back(groupId);
351+
errors = true;
350352
}
351353
} else {
352354
Y_ABORT_UNLESS(group); // group must exist
353355
}
354356
}
355357
}
356358

359+
if (errors) {
360+
TStringStream msg;
361+
if (!degraded.empty()) {
362+
msg << "Degraded GroupIds# " << FormatList(degraded) << ' ';
363+
if (response) {
364+
response->MutableGroupsGetDegraded()->Add(degraded.begin(), degraded.end());
365+
}
366+
}
367+
if (!disintegrated.empty()) {
368+
msg << "Disintegrated GroupIds# " << FormatList(disintegrated) << ' ';
369+
if (response) {
370+
response->MutableGroupsGetDisintegrated()->Add(disintegrated.begin(), disintegrated.end());
371+
}
372+
}
373+
if (!disintegratedByExpectedStatus.empty()) {
374+
msg << "DisintegratedByExpectedStatus GroupIds# " << FormatList(disintegratedByExpectedStatus) << ' ';
375+
if (response) {
376+
response->MutableGroupsGetDisintegratedByExpectedStatus()->Add(disintegratedByExpectedStatus.begin(),
377+
disintegratedByExpectedStatus.end());
378+
}
379+
}
380+
*errorDescription = msg.Str();
381+
errorDescription->pop_back();
382+
return false;
383+
}
384+
357385
// trim PDisks awaiting deletion
358386
for (const TPDiskId& pdiskId : state.PDisksToRemove) {
359387
TPDiskInfo *pdiskInfo = state.PDisks.FindForUpdate(pdiskId);

ydb/core/mind/bscontroller/config_cmd.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ namespace NKikimr::NBsController {
265265

266266
const bool doLogCommand = Success && State->Changed();
267267
Success = Success && Self->CommitConfigUpdates(*State, Cmd.GetIgnoreGroupFailModelChecks(),
268-
Cmd.GetIgnoreDegradedGroupsChecks(), Cmd.GetIgnoreDisintegratedGroupsChecks(), txc, &Error);
268+
Cmd.GetIgnoreDegradedGroupsChecks(), Cmd.GetIgnoreDisintegratedGroupsChecks(), txc, &Error,
269+
Response);
269270

270271
Finish();
271272
if (doLogCommand) {

ydb/core/mind/bscontroller/defs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <ydb/core/base/tablet_pipe.h>
1313
#include <ydb/core/blobstorage/base/blobstorage_events.h>
1414
#include <ydb/core/blobstorage/base/blobstorage_vdiskid.h>
15+
#include <ydb/core/blobstorage/base/utility.h>
1516
#include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h>
1617
#include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo_blobmap.h>
1718
#include <ydb/core/blobstorage/groupinfo/blobstorage_groupinfo_sets.h>

ydb/core/mind/bscontroller/impl.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,12 +499,12 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
499499
switch (Status) {
500500
case NKikimrBlobStorage::EDriveStatus::UNKNOWN:
501501
case NKikimrBlobStorage::EDriveStatus::BROKEN:
502-
return false;
503-
504-
case NKikimrBlobStorage::EDriveStatus::ACTIVE:
505502
case NKikimrBlobStorage::EDriveStatus::INACTIVE:
506503
case NKikimrBlobStorage::EDriveStatus::FAULTY:
507504
case NKikimrBlobStorage::EDriveStatus::TO_BE_REMOVED:
505+
return false;
506+
507+
case NKikimrBlobStorage::EDriveStatus::ACTIVE:
508508
return true;
509509

510510
case NKikimrBlobStorage::EDriveStatus::EDriveStatus_INT_MIN_SENTINEL_DO_NOT_USE_:
@@ -1572,7 +1572,8 @@ class TBlobStorageController : public TActor<TBlobStorageController>, public TTa
15721572
void UpdateSystemViews();
15731573

15741574
bool CommitConfigUpdates(TConfigState& state, bool suppressFailModelChecking, bool suppressDegradedGroupsChecking,
1575-
bool suppressDisintegratedGroupsChecking, TTransactionContext& txc, TString *errorDescription);
1575+
bool suppressDisintegratedGroupsChecking, TTransactionContext& txc, TString *errorDescription,
1576+
NKikimrBlobStorage::TConfigResponse *response = nullptr);
15761577

15771578
void CommitSelfHealUpdates(TConfigState& state);
15781579
void CommitScrubUpdates(TConfigState& state, TTransactionContext& txc);

ydb/core/mind/bscontroller/virtual_group.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ namespace NKikimr::NBsController {
8787
group->SeenOperational = true;
8888
}
8989

90+
GroupFailureModelChanged.insert(group->ID);
9091
group->CalculateGroupStatus();
9192

9293
NKikimrBlobDepot::TBlobDepotConfig config;
@@ -127,6 +128,7 @@ namespace NKikimr::NBsController {
127128
group->HiveId = cmd.HasHiveId() ? MakeMaybe(cmd.GetHiveId()) : Nothing();
128129
group->Database = cmd.HasDatabase() ? MakeMaybe(cmd.GetDatabase()) : Nothing();
129130
group->NeedAlter = true;
131+
GroupFailureModelChanged.insert(group->ID);
130132
group->CalculateGroupStatus();
131133

132134
NKikimrBlobDepot::TBlobDepotConfig config;

ydb/core/protos/blobstorage_config.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -771,4 +771,7 @@ message TConfigResponse {
771771
bool Success = 2;
772772
string ErrorDescription = 3;
773773
uint64 ConfigTxSeqNo = 4;
774+
repeated uint32 GroupsGetDegraded = 5;
775+
repeated uint32 GroupsGetDisintegrated = 6;
776+
repeated uint32 GroupsGetDisintegratedByExpectedStatus = 7;
774777
}

0 commit comments

Comments
 (0)