Skip to content

rework node priorities to make dc preference overridable #14554

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ydb/core/mind/hive/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ using TResourceRawValues = std::tuple<i64, i64, i64, i64>; // CPU, Memory, Netwo
using TResourceNormalizedValues = std::tuple<double, double, double, double>;
using TOwnerIdxType = NScheme::TPairUi64Ui64;
using TSubActorId = ui64; // = LocalId part of TActorId
using TDataCenterPriority = std::unordered_map<TDataCenterId, i32>;

static constexpr std::size_t MAX_TABLET_CHANNELS = 256;

Expand Down
61 changes: 15 additions & 46 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1176,15 +1176,15 @@ TNodeInfo* THive::SelectNode<NKikimrConfig::THiveConfig::HIVE_NODE_SELECT_STRATE
return itNode->Node;
}

TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const
TVector<THive::TSelectedNode> THive::SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const
{
i32 priority = std::numeric_limits<i32>::min();
for (const TSelectedNode& selectedNode : selectedNodes) {
priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet));
priority = std::max(priority, selectedNode.Node->GetPriorityForTablet(tablet, dcPriority));
}

auto it = std::partition(selectedNodes.begin(), selectedNodes.end(), [&] (const TSelectedNode& selectedNode) {
return selectedNode.Node->GetPriorityForTablet(tablet) == priority;
return selectedNode.Node->GetPriorityForTablet(tablet, dcPriority) == priority;
});

selectedNodes.erase(it, selectedNodes.end());
Expand Down Expand Up @@ -1274,53 +1274,21 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
}
}

std::vector<std::vector<TNodeInfo*>> candidateGroups;
candidateGroups.resize(dataCentersGroups.size() + 1);
std::unordered_map<TDataCenterId, std::vector<TNodeInfo*>*> indexDC2Group;
TDataCenterPriority dcPriority;
for (size_t numGroup = 0; numGroup < dataCentersGroups.size(); ++numGroup) {
const NKikimrHive::TDataCentersGroup* dcGroup = dataCentersGroups[numGroup];
if (dcGroup->DataCenterSize()) {
for (TDataCenterId dc : dcGroup->GetDataCenter()) {
indexDC2Group[dc] = candidateGroups.data() + numGroup;
}
} else {
for (const ui64 dcId : dcGroup->GetDataCenterNum()) {
indexDC2Group[DataCenterToString(dcId)] = candidateGroups.data() + numGroup;
}
Comment on lines -1287 to -1289
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Это точно deprecated? Как можно в этом удостовериться?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was introduced in April 2021, made deprecated in September 2021, internal tracker shows no signs of it ever being set anywhere, and it is not set on any of our main prod clusters.

}
}
for (auto it = Nodes.begin(); it != Nodes.end(); ++it) {
TNodeInfo* nodeInfo = &it->second;
if (nodeInfo->IsAlive()) {
TDataCenterId dataCenterId = nodeInfo->GetDataCenter();
auto itDataCenter = indexDC2Group.find(dataCenterId);
if (itDataCenter != indexDC2Group.end()) {
itDataCenter->second->push_back(nodeInfo);
} else {
candidateGroups.back().push_back(nodeInfo);
}
} else {
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo->Id << " is not alive");
debugState.NodesDead++;
for (TDataCenterId dc : dcGroup->GetDataCenter()) {
// First group gets largest priority, last group gets +1 priority, dcs not in any groups get 0
dcPriority[dc] = dataCentersGroups.size() - numGroup;
}
}

TVector<TSelectedNode> selectedNodes;
selectedNodes.reserve(Nodes.size());
bool thereAreNodesWithManyStarts = false;

for (auto itCandidateNodes = candidateGroups.begin(); itCandidateNodes != candidateGroups.end(); ++itCandidateNodes) {
const std::vector<TNodeInfo*>& candidateNodes(*itCandidateNodes);
if (candidateGroups.size() > 1) {
BLOG_TRACE("[FBN] Tablet " << tablet.ToString()
<< " checking candidates group " << (itCandidateNodes - candidateGroups.begin() + 1)
<< " of " << candidateGroups.size());
}

selectedNodes.clear();
selectedNodes.reserve(candidateNodes.size());

for (auto it = candidateNodes.begin(); it != candidateNodes.end(); ++it) {
TNodeInfo& nodeInfo = *(*it);
for (auto& [_, nodeInfo] : Nodes) {
if (nodeInfo.IsAlive()) {
if (nodeInfo.IsAllowedToRunTablet(tablet, &debugState)) {
if (nodeInfo.IsAbleToScheduleTablet()) {
if (nodeInfo.IsAbleToRunTablet(tablet, &debugState)) {
Expand All @@ -1346,11 +1314,12 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su
<< " tablet allowed domains " << tablet.GetNodeFilter().AllowedDomains
<< " tablet effective allowed domains " << tablet.GetNodeFilter().GetEffectiveAllowedDomains());
}
}
if (!selectedNodes.empty()) {
break;
} else {
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " node " << nodeInfo.Id << " is not alive");
debugState.NodesDead++;
}
}

BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected nodes count " << selectedNodes.size());
if (selectedNodes.empty() && thereAreNodesWithManyStarts) {
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " all available nodes are booting too many tablets");
Expand All @@ -1359,7 +1328,7 @@ THive::TBestNodeResult THive::FindBestNode(const TTabletInfo& tablet, TNodeId su

TNodeInfo* selectedNode = nullptr;
if (!selectedNodes.empty()) {
selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet);
selectedNodes = SelectMaxPriorityNodes(std::move(selectedNodes), tablet, dcPriority);
BLOG_TRACE("[FBN] Tablet " << tablet.ToString() << " selected max priority nodes count " << selectedNodes.size());

switch (GetNodeSelectStrategy()) {
Expand Down
6 changes: 5 additions & 1 deletion ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar

template <NKikimrConfig::THiveConfig::EHiveNodeSelectStrategy Strategy>
TNodeInfo* SelectNode(const std::vector<TSelectedNode>& selectedNodes);
TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet) const;
TVector<TSelectedNode> SelectMaxPriorityNodes(TVector<TSelectedNode> selectedNodes, const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;

public:
void AssignTabletGroups(TLeaderTabletInfo& tablet);
Expand Down Expand Up @@ -995,6 +995,10 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
return CurrentConfig.GetMaxPingsInFlight();
}

// Returns the number of recent node restarts that costs a node one point of
// tablet placement priority (config field NodeRestartsForPenalty).
//
// The result is used as an integer divisor when the restart penalty is
// computed, and the setting is editable at runtime, so a configured value of
// 0 must never reach the caller: it is treated as "penalty disabled" and
// mapped to the largest ui64, which makes the quotient 0 for any restart count.
ui64 GetNodeRestartsForPenalty() const {
    const ui64 configured = CurrentConfig.GetNodeRestartsForPenalty();
    if (configured == 0) {
        return ~ui64{0}; // all bits set == max ui64: restarts / max == 0, i.e. no penalty
    }
    return configured;
}

static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);
Expand Down
59 changes: 59 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3810,6 +3810,65 @@ Y_UNIT_TEST_SUITE(THiveTest) {
}
}

Y_UNIT_TEST(TestHiveBalancerWithPreferredDC3) {
    // Tablet prefers DC 1, but the nodes there are constantly crashing
    // Test that it will be eventually launched in DC 2
    static const int NUM_NODES = 4;
    // Upper bound on node restarts before the test gives up. The restart penalty is
    // restarts / NodeRestartsForPenalty (default 3) and the single preferred group
    // gives DC 1 a priority bonus of 1, so about a dozen restarts are expected to be
    // enough with default settings; the bound is deliberately generous.
    // NOTE(review): confirm the margin if the penalty defaults change.
    static const ui32 MAX_RESTARTS = 100;
    TTestBasicRuntime runtime(NUM_NODES, false);

    runtime.LocationCallback = GetLocation;

    Setup(runtime, true);
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);
    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);
    {
        // Wait until every node's local service has reported its status to Hive.
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    // Create a tablet whose only data center preference group is DC "1".
    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500, tabletType, BINDED_CHANNELS));
    ev->Record.SetFollowerCount(3);
    auto* group = ev->Record.MutableDataCentersPreference()->AddDataCentersGroups();
    group->AddDataCenter(ToString(1));
    ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
    MakeSureTabletIsUp(runtime, tabletId, 0);

    // Asks Hive where the tablet currently runs and maps that node to its data center.
    // Returns std::nullopt when the tablet is not listed or has no node assigned (NodeID == 0).
    auto getTabletDC = [&]() -> std::optional<TString> {
        std::unique_ptr<TEvHive::TEvRequestHiveInfo> request = std::make_unique<TEvHive::TEvRequestHiveInfo>();
        runtime.SendToPipe(hiveTablet, senderA, request.release());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
        for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
            if (tablet.GetTabletID() == tabletId) {
                ui32 nodeId = tablet.GetNodeID();
                if (nodeId == 0) {
                    return std::nullopt;
                }
                // GetLocation takes a node index relative to the first node of the runtime.
                auto location = GetLocation(nodeId - nodeBase);
                return location.GetDataCenterId();
            }
        }
        return std::nullopt;
    };

    UNIT_ASSERT_VALUES_EQUAL(getTabletDC(), "1");
    std::optional<TString> dc;
    for (ui32 i = 0; i < MAX_RESTARTS; ++i) {
        // restart node in DC 1
        // (alternates between node indexes 0 and 1 — presumably the two DC 1 nodes
        // according to GetLocation; verify against its definition)
        SendKillLocal(runtime, i % 2);
        CreateLocal(runtime, i % 2);
        dc = getTabletDC();
        Ctest << "tablet is in dc" << dc << Endl;
        if (dc == "2") {
            break;
        }
    }
    // Fail with a clear message instead of spinning forever if the tablet never
    // left the crashing data center.
    UNIT_ASSERT_VALUES_EQUAL(dc, "2");
}

Y_UNIT_TEST(TestHiveFollowersWithChangingDC) {
static const int NUM_NODES = 6;
static const int NUM_TABLETS = 1;
Expand Down
2 changes: 2 additions & 0 deletions ydb/core/mind/hive/monitoring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
UpdateConfig(db, "ScaleInWindowSize", configUpdates);
UpdateConfig(db, "TargetTrackingCPUMargin", configUpdates);
UpdateConfig(db, "DryRunTargetTrackingCPU", configUpdates);
UpdateConfig(db, "NodeRestartsForPenalty", configUpdates);

if (params.contains("BalancerIgnoreTabletTypes")) {
auto value = params.Get("BalancerIgnoreTabletTypes");
Expand Down Expand Up @@ -1195,6 +1196,7 @@ class TTxMonEvent_Settings : public TTransactionBase<THive>, public TLoggedMonTr
ShowConfig(out, "ScaleInWindowSize");
ShowConfig(out, "TargetTrackingCPUMargin");
ShowConfig(out, "DryRunTargetTrackingCPU");
ShowConfig(out, "NodeRestartsForPenalty");

out << "<div class='row' style='margin-top:40px'>";
out << "<div class='col-sm-2' style='padding-top:30px;text-align:right'><label for='allowedMetrics'>AllowedMetrics:</label></div>";
Expand Down
5 changes: 4 additions & 1 deletion ydb/core/mind/hive/node_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ bool TNodeInfo::IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugStat
return true;
}

i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const {
i32 priority = 0;

auto it = TabletAvailability.find(tablet.GetTabletType());
Expand All @@ -221,6 +221,9 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {
--priority;
}

priority += dcPriority[GetDataCenter()];
priority -= GetRestartsPerPeriod() / Hive.GetNodeRestartsForPenalty();

return priority;
}

Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/node_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ struct TNodeInfo {
bool IsAllowedToRunTablet(TTabletDebugState* debugState = nullptr) const;
bool IsAllowedToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
bool IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState = nullptr) const;
i32 GetPriorityForTablet(const TTabletInfo& tablet) const;
i32 GetPriorityForTablet(const TTabletInfo& tablet, TDataCenterPriority& dcPriority) const;
ui64 GetMaxTabletsScheduled() const;
ui64 GetMaxCountForTabletType(TTabletTypes::EType tabletType) const;

Expand Down Expand Up @@ -272,7 +272,7 @@ struct TNodeInfo {

void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics);
void ActualizeNodeStatistics(TInstant now);
ui64 GetRestartsPerPeriod(TInstant barrier) const;
ui64 GetRestartsPerPeriod(TInstant barrier = {}) const;

TDataCenterId GetDataCenter() const {
return Location.GetDataCenterId();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,7 @@ message THiveConfig {
optional uint64 ScaleInWindowSize = 82 [default = 5]; // buckets
optional double TargetTrackingCPUMargin = 83 [default = 0.1]; // percent
optional double DryRunTargetTrackingCPU = 84; // percent
optional uint64 NodeRestartsForPenalty = 85 [default = 3];
}

message TBlobCacheConfig {
Expand Down
Loading