
Commit c8c4200

[ML] autoscaling context current capacity could be null, this commit handles that (#74822)
The autoscaling context's current capacity may be null. This should only really happen early in a cluster's life cycle, or when a node was just recently brought online, mainly because the current node sizes have not yet been discovered and cached. This change should really have been part of #74691
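As background (not part of the commit): the fixed lambda below returns null when no safe scale-down capacity can be computed, and java.util.Optional.map passes the mapper's result through Optional.ofNullable, so that null simply becomes an empty Optional rather than a NullPointerException. A minimal standalone sketch of that standard-library behaviour:

    import java.util.Optional;

    public class NullMapperSketch {
        public static void main(String[] args) {
            // Optional.map wraps the mapper's return value with Optional.ofNullable,
            // so a lambda that returns null collapses to Optional.empty().
            Optional<String> result = Optional.of("scale-down candidate")
                .map(value -> (String) null);

            System.out.println(result.isPresent()); // prints "false"
        }
    }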
1 parent d70b090 commit c8c4200

File tree

2 files changed: +14, -6 lines


x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java

Lines changed: 4 additions & 3 deletions
@@ -46,7 +46,7 @@ public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {
 
     // This test assumes that xpack.ml.max_machine_memory_percent is 30
     // and that xpack.ml.use_auto_machine_memory_percent is false
-    public void testMLAutoscalingCapacity() {
+    public void testMLAutoscalingCapacity() throws Exception {
         SortedMap<String, Settings> deciders = new TreeMap<>();
         deciders.put(MlAutoscalingDeciderService.NAME,
             Settings.builder().put(MlAutoscalingDeciderService.DOWN_SCALE_DELAY.getKey(), TimeValue.ZERO).build());
@@ -57,14 +57,15 @@ public void testMLAutoscalingCapacity() {
         );
         assertAcked(client().execute(PutAutoscalingPolicyAction.INSTANCE, request).actionGet());
 
-        assertMlCapacity(
+        assertBusy(() -> assertMlCapacity(
             client().execute(
                 GetAutoscalingCapacityAction.INSTANCE,
                 new GetAutoscalingCapacityAction.Request()
             ).actionGet(),
             "Requesting scale down as tier and/or node size could be smaller",
             0L,
-            0L);
+            0L)
+        );
 
         putJob("job1", 100);
         putJob("job2", 200);

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java

Lines changed: 10 additions & 3 deletions
@@ -534,9 +534,13 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
             // Due to weird rounding errors, it may be that a scale down result COULD cause a scale up
             // Ensuring the scaleDown here forces the scale down result to always be lower than the current capacity.
             // This is safe as we know that ALL jobs are assigned at the current capacity
-            .map(result -> new AutoscalingDeciderResult(
-                ensureScaleDown(result.requiredCapacity(), context.currentCapacity()), result.reason()
-            ));
+            .map(result -> {
+                AutoscalingCapacity capacity = ensureScaleDown(result.requiredCapacity(), context.currentCapacity());
+                if (capacity == null) {
+                    return null;
+                }
+                return new AutoscalingDeciderResult(capacity, result.reason());
+            });
 
         if (maybeScaleDown.isPresent()) {
             final AutoscalingDeciderResult scaleDownDecisionResult = maybeScaleDown.get();
@@ -599,6 +603,9 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
     }
 
     static AutoscalingCapacity ensureScaleDown(AutoscalingCapacity scaleDownResult, AutoscalingCapacity currentCapacity) {
+        if (scaleDownResult == null || currentCapacity == null) {
+            return null;
+        }
         AutoscalingCapacity newCapacity = new AutoscalingCapacity(
             new AutoscalingCapacity.AutoscalingResources(
                 currentCapacity.total().storage(),
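Taken together, the two guards mean that an unknown current capacity no longer risks a NullPointerException: ensureScaleDown short-circuits to null, the lambda turns that into an empty Optional, and the decider simply emits no scale-down result for that cycle. A hypothetical test sketch of the guard (not part of the commit; it assumes a test class in the same package, so the package-private ensureScaleDown is visible):

    package org.elasticsearch.xpack.ml.autoscaling;

    import org.junit.Test;

    import static org.junit.Assert.assertNull;

    public class EnsureScaleDownNullGuardSketchTests {

        @Test
        public void nullInputsShortCircuitToNull() {
            // With either argument null, the new guard returns null instead of
            // dereferencing currentCapacity.total().
            assertNull(MlAutoscalingDeciderService.ensureScaleDown(null, null));
        }
    }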
