Skip to content

Commit 66970a9

Browse files
authored
24-3: Add reason for pending action in maintenance public API (#3289) (#9965)
1 parent b5dae67 commit 66970a9

File tree

14 files changed

+388
-59
lines changed

14 files changed

+388
-59
lines changed

ydb/core/cms/api_adapters.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,33 @@ namespace {
5454
}
5555
}
5656

57+
Ydb::Maintenance::ActionState::ActionReason ConvertReason(NKikimrCms::TAction::TIssue::EType cmsActionIssueType) {
58+
using EIssueType = NKikimrCms::TAction::TIssue;
59+
switch (cmsActionIssueType) {
60+
case EIssueType::UNKNOWN:
61+
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
62+
case EIssueType::GENERIC:
63+
return Ydb::Maintenance::ActionState::ACTION_REASON_GENERIC;
64+
case EIssueType::TOO_MANY_UNAVAILABLE_VDISKS:
65+
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS;
66+
case EIssueType::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS:
67+
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS;
68+
case EIssueType::DISABLED_NODES_LIMIT_REACHED:
69+
return Ydb::Maintenance::ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED;
70+
case EIssueType::TENANT_DISABLED_NODES_LIMIT_REACHED:
71+
return Ydb::Maintenance::ActionState::ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_REACHED;
72+
case EIssueType::SYS_TABLETS_NODE_LIMIT_REACHED:
73+
return Ydb::Maintenance::ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED;
74+
}
75+
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
76+
}
77+
5778
void ConvertAction(const NKikimrCms::TAction& cmsAction, Ydb::Maintenance::ActionState& actionState) {
5879
ConvertAction(cmsAction, *actionState.mutable_action()->mutable_lock_action());
5980
// FIXME: specify action_uid
6081
actionState.set_status(Ydb::Maintenance::ActionState::ACTION_STATUS_PENDING);
61-
actionState.set_reason(Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED); // FIXME: specify
82+
actionState.set_reason(ConvertReason(cmsAction.GetIssue().GetType()));
83+
actionState.set_reason_details(cmsAction.GetIssue().GetMessage());
6284
}
6385

6486
void ConvertActionUid(const TString& taskUid, const TString& permissionId,

ydb/core/cms/cluster_info.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,6 @@ using TClusterInfoPtr = TIntrusivePtr<TClusterInfo>;
3737
struct TCmsState;
3838
using TCmsStatePtr = TIntrusivePtr<TCmsState>;
3939

40-
struct TErrorInfo {
41-
NKikimrCms::TStatus::ECode Code = NKikimrCms::TStatus::ALLOW;
42-
TString Reason;
43-
TInstant Deadline;
44-
ui64 RollbackPoint = 0;
45-
};
46-
4740
/**
4841
* Structure to hold info about issued permission. A set of
4942
* all issued permissions is a part of CMS persistent state.

ydb/core/cms/cms.cpp

Lines changed: 91 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,38 @@ namespace NKikimr::NCms {
3636
using namespace NNodeWhiteboard;
3737
using namespace NKikimrCms;
3838

39+
namespace {
40+
41+
constexpr size_t MAX_ISSUES_TO_STORE = 100;
42+
43+
TAction::TIssue ConvertIssue(const TReason& reason) {
44+
TAction::TIssue issue;
45+
switch (reason.GetType()) {
46+
case TReason::EType::Generic:
47+
issue.SetType(TAction::TIssue::GENERIC);
48+
break;
49+
case TReason::EType::TooManyUnavailableVDisks:
50+
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
51+
break;
52+
case TReason::EType::TooManyUnavailableStateStorageRings:
53+
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS);
54+
break;
55+
case TReason::EType::DisabledNodesLimitReached:
56+
issue.SetType(TAction::TIssue::DISABLED_NODES_LIMIT_REACHED);
57+
break;
58+
case TReason::EType::TenantDisabledNodesLimitReached:
59+
issue.SetType(TAction::TIssue::TENANT_DISABLED_NODES_LIMIT_REACHED);
60+
break;
61+
case TReason::EType::SysTabletsNodeLimitReached:
62+
issue.SetType(TAction::TIssue::SYS_TABLETS_NODE_LIMIT_REACHED);
63+
break;
64+
}
65+
issue.SetMessage(reason.GetMessage());
66+
return issue;
67+
}
68+
69+
} // anonymous namespace
70+
3971
void TCms::DefaultSignalTabletActive(const TActorContext &)
4072
{
4173
// must be empty
@@ -326,6 +358,8 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
326358
};
327359

328360
auto point = ClusterInfo->PushRollbackPoint();
361+
size_t storedIssues = 0;
362+
size_t processedActions = 0;
329363
for (const auto &action : request.GetActions()) {
330364
TDuration permissionDuration = State->Config.DefaultPermissionDuration;
331365
if (request.HasDuration())
@@ -352,28 +386,40 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
352386

353387
auto *permission = response.AddPermissions();
354388
permission->MutableAction()->CopyFrom(action);
389+
permission->MutableAction()->ClearIssue();
355390
permission->SetDeadline(error.Deadline.GetValue());
356391
AddPermissionExtensions(action, *permission);
357392

358393
ClusterInfo->AddTempLocks(action, &ctx);
359394
} else {
360395
LOG_DEBUG(ctx, NKikimrServices::CMS, "Result: %s (reason: %s)",
361-
ToString(error.Code).data(), error.Reason.data());
396+
ToString(error.Code).data(), error.Reason.GetMessage().data());
362397

363398
if (CodesRate[response.GetStatus().GetCode()] > CodesRate[error.Code]) {
364399
response.MutableStatus()->SetCode(error.Code);
365-
response.MutableStatus()->SetReason(error.Reason);
400+
response.MutableStatus()->SetReason(error.Reason.GetMessage());
366401
if (error.Code == TStatus::DISALLOW_TEMP
367402
|| error.Code == TStatus::ERROR_TEMP)
368403
response.SetDeadline(error.Deadline.GetValue());
369404
}
370405

406+
if (schedule) {
407+
auto *scheduledAction = scheduled.AddActions();
408+
scheduledAction->CopyFrom(action);
409+
410+
// Limit stored issues to avoid overloading the local database
411+
if (storedIssues < MAX_ISSUES_TO_STORE) {
412+
*scheduledAction->MutableIssue() = ConvertIssue(error.Reason);
413+
++storedIssues;
414+
} else {
415+
scheduledAction->ClearIssue();
416+
}
417+
}
418+
371419
if (!allowPartial)
372420
break;
373-
374-
if (schedule)
375-
scheduled.AddActions()->CopyFrom(action);
376421
}
422+
++processedActions;
377423
}
378424
ClusterInfo->RollbackLocks(point);
379425

@@ -396,9 +442,21 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
396442
if (schedule && response.GetStatus().GetCode() != TStatus::ALLOW_PARTIAL) {
397443
if (response.GetStatus().GetCode() == TStatus::DISALLOW_TEMP
398444
|| response.GetStatus().GetCode() == TStatus::ERROR_TEMP)
399-
scheduled.MutableActions()->CopyFrom(request.GetActions());
400-
else
445+
{
446+
if (!allowPartial) {
447+
// Only the first problem action was scheduled during
448+
// the actions check loop. Merge it with rest actions.
449+
Y_ABORT_UNLESS(scheduled.ActionsSize() == 1);
450+
TAction::TIssue issue = std::move(*scheduled.MutableActions()->begin()->MutableIssue());
451+
scheduled.MutableActions()->CopyFrom(request.GetActions());
452+
for (auto &action : *scheduled.MutableActions()) {
453+
action.ClearIssue();
454+
}
455+
*scheduled.MutableActions(processedActions)->MutableIssue() = std::move(issue);
456+
}
457+
} else {
401458
scheduled.ClearActions();
459+
}
402460
}
403461

404462
return response.GetStatus().GetCode() == TStatus::ALLOW
@@ -701,26 +759,32 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action,
701759
case MODE_MAX_AVAILABILITY:
702760
if (restartRings + lockedRings > 1) {
703761
error.Code = TStatus::DISALLOW_TEMP;
704-
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
705-
<< ". Restarting rings: "
706-
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
707-
<< ". Temporary (for a 2 minutes) locked rings: "
708-
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
709-
<< ". Maximum allowed number of unavailable rings for this mode: " << 1;
762+
error.Reason = TReason(
763+
TStringBuilder() << "Too many unavailable state storage rings"
764+
<< ". Restarting rings: "
765+
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
766+
<< ". Temporary (for a 2 minutes) locked rings: "
767+
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
768+
<< ". Maximum allowed number of unavailable rings for this mode: " << 1,
769+
TReason::EType::TooManyUnavailableStateStorageRings
770+
);
710771
error.Deadline = defaultDeadline;
711772
return false;
712773
}
713774
break;
714775
case MODE_KEEP_AVAILABLE:
715776
if (restartRings + lockedRings + disabledRings > (nToSelect - 1) / 2) {
716777
error.Code = TStatus::DISALLOW_TEMP;
717-
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
718-
<< ". Restarting rings: "
719-
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
720-
<< ". Temporary (for a 2 minutes) locked rings: "
721-
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
722-
<< ". Disabled rings: " << disabledRings
723-
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2;
778+
error.Reason = TReason(
779+
TStringBuilder() << "Too many unavailable state storage rings"
780+
<< ". Restarting rings: "
781+
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
782+
<< ". Temporary (for a 2 minutes) locked rings: "
783+
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
784+
<< ". Disabled rings: " << disabledRings
785+
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2,
786+
TReason::EType::TooManyUnavailableStateStorageRings
787+
);
724788
error.Deadline = defaultDeadline;
725789
return false;
726790
}
@@ -1484,6 +1548,13 @@ void TCms::CheckAndEnqueueRequest(TEvCms::TEvPermissionRequest::TPtr &ev, const
14841548
ev, TStatus::WRONG_REQUEST, "Priority value is out of range", ctx);
14851549
}
14861550

1551+
for (const auto &action : rec.GetActions()) {
1552+
if (action.HasIssue()) {
1553+
return ReplyWithError<TEvCms::TEvPermissionResponse>(
1554+
ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Action issue is read-only", ctx);
1555+
}
1556+
}
1557+
14871558
EnqueueRequest(ev.Release(), ctx);
14881559
}
14891560

ydb/core/cms/cms_maintenance_api_ut.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,32 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
6969
)
7070
);
7171
}
72+
73+
Y_UNIT_TEST(ActionReason) {
74+
TCmsTestEnv env(8);
75+
76+
auto response = env.CheckMaintenanceTaskCreate("task-1", Ydb::StatusIds::SUCCESS,
77+
MakeActionGroup(
78+
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
79+
),
80+
MakeActionGroup(
81+
MakeLockAction(env.GetNodeId(1), TDuration::Minutes(10))
82+
)
83+
);
84+
85+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 2);
86+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(0).action_states().size(), 1);
87+
const auto &a1 = response.action_group_states(0).action_states(0);
88+
UNIT_ASSERT_VALUES_EQUAL(a1.status(), ActionState::ACTION_STATUS_PERFORMED);
89+
UNIT_ASSERT_VALUES_EQUAL(a1.reason(), ActionState::ACTION_REASON_OK);
90+
UNIT_ASSERT(a1.reason_details().empty());
91+
92+
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(1).action_states().size(), 1);
93+
const auto &a2 = response.action_group_states(1).action_states(0);
94+
UNIT_ASSERT_VALUES_EQUAL(a2.status(), ActionState::ACTION_STATUS_PENDING);
95+
UNIT_ASSERT_VALUES_EQUAL(a2.reason(), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS);
96+
UNIT_ASSERT(a2.reason_details().Contains("too many unavailable vdisks"));
97+
}
7298
}
7399

74100
} // namespace NKikimr::NCmsTest

ydb/core/cms/cms_ut.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,104 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
631631
env.CheckListRequests("user1", 0);
632632
}
633633

634+
Y_UNIT_TEST(ActionIssue)
635+
{
636+
TCmsTestEnv env(16);
637+
638+
// Acquire lock on one node
639+
auto rec = env.CheckPermissionRequest
640+
("user", false, false, true, true, TStatus::ALLOW,
641+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000));
642+
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
643+
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
644+
645+
auto pid = rec.GetPermissions(0).GetId();
646+
647+
// Schedule request
648+
rec = env.CheckPermissionRequest
649+
("user", false, false, true, true, TStatus::DISALLOW_TEMP,
650+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(9), 60000000),
651+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
652+
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 0);
653+
654+
auto rid = rec.GetRequestId();
655+
656+
// Get scheduled request
657+
auto scheduledRec = env.CheckGetRequest("user", rid);
658+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
659+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
660+
auto action1 = scheduledRec.GetRequests(0).GetActions(0);
661+
UNIT_ASSERT(!action1.HasIssue());
662+
auto action2 = scheduledRec.GetRequests(0).GetActions(1);
663+
UNIT_ASSERT(action2.HasIssue());
664+
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
665+
666+
// Try to check request
667+
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);
668+
669+
// Get scheduled request
670+
scheduledRec = env.CheckGetRequest("user", rid);
671+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
672+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
673+
action1 = scheduledRec.GetRequests(0).GetActions(0);
674+
UNIT_ASSERT(!action1.HasIssue());
675+
action2 = scheduledRec.GetRequests(0).GetActions(1);
676+
UNIT_ASSERT(action2.HasIssue());
677+
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
678+
679+
// Done with permission
680+
env.CheckDonePermission("user", pid);
681+
682+
// Try to check request
683+
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 2);
684+
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
685+
UNIT_ASSERT(!rec.GetPermissions(1).GetAction().HasIssue());
686+
687+
env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
688+
}
689+
690+
Y_UNIT_TEST(ActionIssuePartialPermissions)
691+
{
692+
TCmsTestEnv env(8);
693+
694+
// Schedule request
695+
auto rec = env.CheckPermissionRequest
696+
("user", true, false, true, true, TStatus::ALLOW_PARTIAL,
697+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
698+
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
699+
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
700+
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
701+
702+
auto pid = rec.GetPermissions(0).GetId();
703+
auto rid = rec.GetRequestId();
704+
705+
// Get scheduled request
706+
auto scheduledRec = env.CheckGetRequest("user", rid);
707+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
708+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
709+
auto action = scheduledRec.GetRequests(0).GetActions(0);
710+
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
711+
712+
// Try to check request
713+
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);
714+
715+
// Get scheduled request
716+
scheduledRec = env.CheckGetRequest("user", rid);
717+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
718+
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
719+
action = scheduledRec.GetRequests(0).GetActions(0);
720+
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
721+
722+
// Done with permission
723+
env.CheckDonePermission("user", pid);
724+
725+
// Try to check request
726+
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 1);
727+
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
728+
729+
env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
730+
}
731+
634732
Y_UNIT_TEST(WalleTasks)
635733
{
636734
TCmsTestEnv env(24, 4);

0 commit comments

Comments
 (0)