Skip to content

Commit 76c89f5

Browse files
authored
[ML] Release cluster state (#136769) (#136833)
Refactoring TransportMlMemoryAction to not retain cluster state through the lifecycle of the request so that it can be garbage collected as soon as possible. Fix #123243
1 parent 49442ce commit 76c89f5

File tree

4 files changed

+54
-10
lines changed

4 files changed

+54
-10
lines changed

docs/changelog/136769.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136769
2+
summary: Release cluster state
3+
area: Machine Learning
4+
type: bug
5+
issues:
6+
- 123243

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignmentMetadata.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.elasticsearch.cluster.NamedDiff;
1818
import org.elasticsearch.cluster.SimpleDiffable;
1919
import org.elasticsearch.cluster.metadata.Metadata;
20+
import org.elasticsearch.cluster.metadata.ProjectMetadata;
2021
import org.elasticsearch.common.Strings;
2122
import org.elasticsearch.common.collect.Iterators;
2223
import org.elasticsearch.common.io.stream.StreamInput;
@@ -80,6 +81,14 @@ public static TrainedModelAssignmentMetadata fromState(ClusterState clusterState
8081
return trainedModelAssignmentMetadata == null ? EMPTY : trainedModelAssignmentMetadata;
8182
}
8283

84+
public static TrainedModelAssignmentMetadata fromMetadata(ProjectMetadata projectMetadata) {
85+
TrainedModelAssignmentMetadata trainedModelAssignmentMetadata = projectMetadata.custom(NAME);
86+
if (trainedModelAssignmentMetadata == null) {
87+
trainedModelAssignmentMetadata = projectMetadata.custom(DEPRECATED_NAME);
88+
}
89+
return trainedModelAssignmentMetadata == null ? EMPTY : trainedModelAssignmentMetadata;
90+
}
91+
8392
public static List<TrainedModelAssignment> assignmentsForModelId(ClusterState clusterState, String modelId) {
8493
return TrainedModelAssignmentMetadata.fromState(clusterState)
8594
.allAssignments()

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportMlMemoryAction.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@
1616
import org.elasticsearch.client.internal.Client;
1717
import org.elasticsearch.client.internal.OriginSettingClient;
1818
import org.elasticsearch.client.internal.ParentTaskAssigningClient;
19+
import org.elasticsearch.cluster.ClusterName;
1920
import org.elasticsearch.cluster.ClusterState;
2021
import org.elasticsearch.cluster.block.ClusterBlockException;
2122
import org.elasticsearch.cluster.block.ClusterBlockLevel;
2223
import org.elasticsearch.cluster.node.DiscoveryNode;
2324
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
25+
import org.elasticsearch.cluster.project.ProjectResolver;
2426
import org.elasticsearch.cluster.service.ClusterService;
2527
import org.elasticsearch.common.settings.ClusterSettings;
2628
import org.elasticsearch.common.unit.ByteSizeValue;
@@ -35,6 +37,7 @@
3537
import org.elasticsearch.xpack.core.ml.action.MlMemoryAction.Response.MlMemoryStats;
3638
import org.elasticsearch.xpack.core.ml.action.TrainedModelCacheInfoAction;
3739
import org.elasticsearch.xpack.core.ml.action.TrainedModelCacheInfoAction.Response.CacheInfo;
40+
import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignmentMetadata;
3841
import org.elasticsearch.xpack.ml.job.NodeLoad;
3942
import org.elasticsearch.xpack.ml.job.NodeLoadDetector;
4043
import org.elasticsearch.xpack.ml.process.MlMemoryTracker;
@@ -53,6 +56,7 @@ public class TransportMlMemoryAction extends TransportMasterNodeAction<MlMemoryA
5356

5457
private final Client client;
5558
private final MlMemoryTracker memoryTracker;
59+
private final ProjectResolver projectResolver;
5660

5761
@Inject
5862
public TransportMlMemoryAction(
@@ -61,7 +65,8 @@ public TransportMlMemoryAction(
6165
ThreadPool threadPool,
6266
ActionFilters actionFilters,
6367
Client client,
64-
MlMemoryTracker memoryTracker
68+
MlMemoryTracker memoryTracker,
69+
ProjectResolver projectResolver
6570
) {
6671
super(
6772
MlMemoryAction.NAME,
@@ -75,6 +80,7 @@ public TransportMlMemoryAction(
7580
);
7681
this.client = new OriginSettingClient(client, ML_ORIGIN);
7782
this.memoryTracker = memoryTracker;
83+
this.projectResolver = projectResolver;
7884
}
7985

8086
@Override
@@ -87,6 +93,10 @@ protected void masterOperation(
8793

8894
ClusterSettings clusterSettings = clusterService.getClusterSettings();
8995

96+
var clusterName = state.getClusterName();
97+
var projectMetadata = state.projectState(projectResolver.getProjectId()).metadata();
98+
var trainedModelAssignmentMetadata = TrainedModelAssignmentMetadata.fromMetadata(projectMetadata);
99+
PersistentTasksCustomMetadata persistentTasksCustomMetadata = projectMetadata.custom(PersistentTasksCustomMetadata.TYPE);
90100
// Resolve the node specification to some concrete nodes
91101
String[] nodeIds = state.nodes().resolveNodes(request.getNodeId());
92102

@@ -112,7 +122,9 @@ protected void masterOperation(
112122
trainedModelCacheInfoRequest,
113123
delegate2.delegateFailureAndWrap(
114124
(l, trainedModelCacheInfoResponse) -> handleResponses(
115-
state,
125+
clusterName,
126+
persistentTasksCustomMetadata,
127+
trainedModelAssignmentMetadata,
116128
clusterSettings,
117129
nodesStatsResponse,
118130
trainedModelCacheInfoResponse,
@@ -127,15 +139,14 @@ protected void masterOperation(
127139
if (memoryTracker.isEverRefreshed()) {
128140
memoryTrackerRefreshListener.onResponse(null);
129141
} else {
130-
memoryTracker.refresh(
131-
state.getMetadata().getProject().custom(PersistentTasksCustomMetadata.TYPE),
132-
memoryTrackerRefreshListener
133-
);
142+
memoryTracker.refresh(persistentTasksCustomMetadata, memoryTrackerRefreshListener);
134143
}
135144
}
136145

137146
void handleResponses(
138-
ClusterState state,
147+
ClusterName clusterName,
148+
PersistentTasksCustomMetadata persistentTasks,
149+
TrainedModelAssignmentMetadata assignmentMetadata,
139150
ClusterSettings clusterSettings,
140151
NodesStatsResponse nodesStatsResponse,
141152
TrainedModelCacheInfoAction.Response trainedModelCacheInfoResponse,
@@ -173,7 +184,8 @@ void handleResponses(
173184
ByteSizeValue mlNativeInference;
174185
if (node.getRoles().contains(DiscoveryNodeRole.ML_ROLE)) {
175186
NodeLoad nodeLoad = nodeLoadDetector.detectNodeLoad(
176-
state,
187+
persistentTasks,
188+
assignmentMetadata,
177189
node,
178190
maxOpenJobsPerNode,
179191
maxMachineMemoryPercent,
@@ -219,7 +231,7 @@ void handleResponses(
219231
);
220232
}
221233

222-
listener.onResponse(new MlMemoryAction.Response(state.getClusterName(), nodeResponses, failures));
234+
listener.onResponse(new MlMemoryAction.Response(clusterName, nodeResponses, failures));
223235
}
224236

225237
@Override

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/NodeLoadDetector.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,24 @@ public NodeLoad detectNodeLoad(
8787
int maxMachineMemoryPercent,
8888
boolean useAutoMachineMemoryCalculation
8989
) {
90-
PersistentTasksCustomMetadata persistentTasks = clusterState.getMetadata().getProject().custom(PersistentTasksCustomMetadata.TYPE);
90+
return detectNodeLoad(
91+
clusterState.getMetadata().getProject().custom(PersistentTasksCustomMetadata.TYPE),
92+
assignmentMetadata,
93+
node,
94+
maxNumberOfOpenJobs,
95+
maxMachineMemoryPercent,
96+
useAutoMachineMemoryCalculation
97+
);
98+
}
99+
100+
public NodeLoad detectNodeLoad(
101+
PersistentTasksCustomMetadata persistentTasks,
102+
TrainedModelAssignmentMetadata assignmentMetadata,
103+
DiscoveryNode node,
104+
int maxNumberOfOpenJobs,
105+
int maxMachineMemoryPercent,
106+
boolean useAutoMachineMemoryCalculation
107+
) {
91108
Map<String, String> nodeAttributes = node.getAttributes();
92109
List<String> errors = new ArrayList<>();
93110
OptionalLong maxMlMemory = NativeMemoryCalculator.allowedBytesForMl(node, maxMachineMemoryPercent, useAutoMachineMemoryCalculation);

0 commit comments

Comments
 (0)