Skip to content

24-3: Add reason for pending action in maintenance public API (#3289) #9965

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion ydb/core/cms/api_adapters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,33 @@ namespace {
}
}

Ydb::Maintenance::ActionState::ActionReason ConvertReason(NKikimrCms::TAction::TIssue::EType cmsActionIssueType) {
using EIssueType = NKikimrCms::TAction::TIssue;
switch (cmsActionIssueType) {
case EIssueType::UNKNOWN:
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
case EIssueType::GENERIC:
return Ydb::Maintenance::ActionState::ACTION_REASON_GENERIC;
case EIssueType::TOO_MANY_UNAVAILABLE_VDISKS:
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS;
case EIssueType::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS:
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS;
case EIssueType::DISABLED_NODES_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED;
case EIssueType::TENANT_DISABLED_NODES_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_REACHED;
case EIssueType::SYS_TABLETS_NODE_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED;
}
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
}

void ConvertAction(const NKikimrCms::TAction& cmsAction, Ydb::Maintenance::ActionState& actionState) {
ConvertAction(cmsAction, *actionState.mutable_action()->mutable_lock_action());
// FIXME: specify action_uid
actionState.set_status(Ydb::Maintenance::ActionState::ACTION_STATUS_PENDING);
actionState.set_reason(Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED); // FIXME: specify
actionState.set_reason(ConvertReason(cmsAction.GetIssue().GetType()));
actionState.set_reason_details(cmsAction.GetIssue().GetMessage());
}

void ConvertActionUid(const TString& taskUid, const TString& permissionId,
Expand Down
7 changes: 0 additions & 7 deletions ydb/core/cms/cluster_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,6 @@ using TClusterInfoPtr = TIntrusivePtr<TClusterInfo>;
struct TCmsState;
using TCmsStatePtr = TIntrusivePtr<TCmsState>;

struct TErrorInfo {
NKikimrCms::TStatus::ECode Code = NKikimrCms::TStatus::ALLOW;
TString Reason;
TInstant Deadline;
ui64 RollbackPoint = 0;
};

/**
* Structure to hold info about issued permission. A set of
* all issued permissions is a part of CMS persistent state.
Expand Down
111 changes: 91 additions & 20 deletions ydb/core/cms/cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,38 @@ namespace NKikimr::NCms {
using namespace NNodeWhiteboard;
using namespace NKikimrCms;

namespace {

constexpr size_t MAX_ISSUES_TO_STORE = 100;

TAction::TIssue ConvertIssue(const TReason& reason) {
TAction::TIssue issue;
switch (reason.GetType()) {
case TReason::EType::Generic:
issue.SetType(TAction::TIssue::GENERIC);
break;
case TReason::EType::TooManyUnavailableVDisks:
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
break;
case TReason::EType::TooManyUnavailableStateStorageRings:
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS);
break;
case TReason::EType::DisabledNodesLimitReached:
issue.SetType(TAction::TIssue::DISABLED_NODES_LIMIT_REACHED);
break;
case TReason::EType::TenantDisabledNodesLimitReached:
issue.SetType(TAction::TIssue::TENANT_DISABLED_NODES_LIMIT_REACHED);
break;
case TReason::EType::SysTabletsNodeLimitReached:
issue.SetType(TAction::TIssue::SYS_TABLETS_NODE_LIMIT_REACHED);
break;
}
issue.SetMessage(reason.GetMessage());
return issue;
}

} // anonymous namespace

void TCms::DefaultSignalTabletActive(const TActorContext &)
{
// must be empty
Expand Down Expand Up @@ -326,6 +358,8 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
};

auto point = ClusterInfo->PushRollbackPoint();
size_t storedIssues = 0;
size_t processedActions = 0;
for (const auto &action : request.GetActions()) {
TDuration permissionDuration = State->Config.DefaultPermissionDuration;
if (request.HasDuration())
Expand All @@ -352,28 +386,40 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,

auto *permission = response.AddPermissions();
permission->MutableAction()->CopyFrom(action);
permission->MutableAction()->ClearIssue();
permission->SetDeadline(error.Deadline.GetValue());
AddPermissionExtensions(action, *permission);

ClusterInfo->AddTempLocks(action, &ctx);
} else {
LOG_DEBUG(ctx, NKikimrServices::CMS, "Result: %s (reason: %s)",
ToString(error.Code).data(), error.Reason.data());
ToString(error.Code).data(), error.Reason.GetMessage().data());

if (CodesRate[response.GetStatus().GetCode()] > CodesRate[error.Code]) {
response.MutableStatus()->SetCode(error.Code);
response.MutableStatus()->SetReason(error.Reason);
response.MutableStatus()->SetReason(error.Reason.GetMessage());
if (error.Code == TStatus::DISALLOW_TEMP
|| error.Code == TStatus::ERROR_TEMP)
response.SetDeadline(error.Deadline.GetValue());
}

if (schedule) {
auto *scheduledAction = scheduled.AddActions();
scheduledAction->CopyFrom(action);

// Limit stored issues to avoid overloading the local database
if (storedIssues < MAX_ISSUES_TO_STORE) {
*scheduledAction->MutableIssue() = ConvertIssue(error.Reason);
++storedIssues;
} else {
scheduledAction->ClearIssue();
}
}

if (!allowPartial)
break;

if (schedule)
scheduled.AddActions()->CopyFrom(action);
}
++processedActions;
}
ClusterInfo->RollbackLocks(point);

Expand All @@ -396,9 +442,21 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
if (schedule && response.GetStatus().GetCode() != TStatus::ALLOW_PARTIAL) {
if (response.GetStatus().GetCode() == TStatus::DISALLOW_TEMP
|| response.GetStatus().GetCode() == TStatus::ERROR_TEMP)
scheduled.MutableActions()->CopyFrom(request.GetActions());
else
{
if (!allowPartial) {
// Only the first problem action was scheduled during
// the actions check loop. Merge it with rest actions.
Y_ABORT_UNLESS(scheduled.ActionsSize() == 1);
TAction::TIssue issue = std::move(*scheduled.MutableActions()->begin()->MutableIssue());
scheduled.MutableActions()->CopyFrom(request.GetActions());
for (auto &action : *scheduled.MutableActions()) {
action.ClearIssue();
}
*scheduled.MutableActions(processedActions)->MutableIssue() = std::move(issue);
}
} else {
scheduled.ClearActions();
}
}

return response.GetStatus().GetCode() == TStatus::ALLOW
Expand Down Expand Up @@ -701,26 +759,32 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action,
case MODE_MAX_AVAILABILITY:
if (restartRings + lockedRings > 1) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Maximum allowed number of unavailable rings for this mode: " << 1;
error.Reason = TReason(
TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Maximum allowed number of unavailable rings for this mode: " << 1,
TReason::EType::TooManyUnavailableStateStorageRings
);
error.Deadline = defaultDeadline;
return false;
}
break;
case MODE_KEEP_AVAILABLE:
if (restartRings + lockedRings + disabledRings > (nToSelect - 1) / 2) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Disabled rings: " << disabledRings
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2;
error.Reason = TReason(
TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Disabled rings: " << disabledRings
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2,
TReason::EType::TooManyUnavailableStateStorageRings
);
error.Deadline = defaultDeadline;
return false;
}
Expand Down Expand Up @@ -1484,6 +1548,13 @@ void TCms::CheckAndEnqueueRequest(TEvCms::TEvPermissionRequest::TPtr &ev, const
ev, TStatus::WRONG_REQUEST, "Priority value is out of range", ctx);
}

for (const auto &action : rec.GetActions()) {
if (action.HasIssue()) {
return ReplyWithError<TEvCms::TEvPermissionResponse>(
ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Action issue is read-only", ctx);
}
}

EnqueueRequest(ev.Release(), ctx);
}

Expand Down
26 changes: 26 additions & 0 deletions ydb/core/cms/cms_maintenance_api_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,32 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
)
);
}

Y_UNIT_TEST(ActionReason) {
TCmsTestEnv env(8);

auto response = env.CheckMaintenanceTaskCreate("task-1", Ydb::StatusIds::SUCCESS,
MakeActionGroup(
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
),
MakeActionGroup(
MakeLockAction(env.GetNodeId(1), TDuration::Minutes(10))
)
);

UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 2);
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(0).action_states().size(), 1);
const auto &a1 = response.action_group_states(0).action_states(0);
UNIT_ASSERT_VALUES_EQUAL(a1.status(), ActionState::ACTION_STATUS_PERFORMED);
UNIT_ASSERT_VALUES_EQUAL(a1.reason(), ActionState::ACTION_REASON_OK);
UNIT_ASSERT(a1.reason_details().empty());

UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(1).action_states().size(), 1);
const auto &a2 = response.action_group_states(1).action_states(0);
UNIT_ASSERT_VALUES_EQUAL(a2.status(), ActionState::ACTION_STATUS_PENDING);
UNIT_ASSERT_VALUES_EQUAL(a2.reason(), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS);
UNIT_ASSERT(a2.reason_details().Contains("too many unavailable vdisks"));
}
}

} // namespace NKikimr::NCmsTest
98 changes: 98 additions & 0 deletions ydb/core/cms/cms_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,104 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckListRequests("user1", 0);
}

Y_UNIT_TEST(ActionIssue)
{
TCmsTestEnv env(16);

// Acquire lock on one node
auto rec = env.CheckPermissionRequest
("user", false, false, true, true, TStatus::ALLOW,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

auto pid = rec.GetPermissions(0).GetId();

// Schedule request
rec = env.CheckPermissionRequest
("user", false, false, true, true, TStatus::DISALLOW_TEMP,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(9), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 0);

auto rid = rec.GetRequestId();

// Get scheduled request
auto scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
auto action1 = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT(!action1.HasIssue());
auto action2 = scheduledRec.GetRequests(0).GetActions(1);
UNIT_ASSERT(action2.HasIssue());
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Try to check request
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);

// Get scheduled request
scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
action1 = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT(!action1.HasIssue());
action2 = scheduledRec.GetRequests(0).GetActions(1);
UNIT_ASSERT(action2.HasIssue());
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Done with permission
env.CheckDonePermission("user", pid);

// Try to check request
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 2);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
UNIT_ASSERT(!rec.GetPermissions(1).GetAction().HasIssue());

env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
}

Y_UNIT_TEST(ActionIssuePartialPermissions)
{
TCmsTestEnv env(8);

// Schedule request
auto rec = env.CheckPermissionRequest
("user", true, false, true, true, TStatus::ALLOW_PARTIAL,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

auto pid = rec.GetPermissions(0).GetId();
auto rid = rec.GetRequestId();

// Get scheduled request
auto scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
auto action = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Try to check request
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);

// Get scheduled request
scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
action = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Done with permission
env.CheckDonePermission("user", pid);

// Try to check request
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
}

Y_UNIT_TEST(WalleTasks)
{
TCmsTestEnv env(24, 4);
Expand Down
Loading
Loading