Commit e31d5f8

[7.13] [ML] prevent accidentally asking for more resources when scaling down and improve scaling size estimations (#74691) (#74782)
* [ML] prevent accidentally asking for more resources when scaling down and improve scaling size estimations (#74691)

This commit addresses two problems:

- Our memory estimations are not very exact. Consequently, it's possible to request too much or too little by a handful of KBs. While this is not a large issue in ESS, for custom tier sizes it may be.
- When scaling down, it was possible that part of the scale down was actually a scale up! This was due to some floating point rounding errors and poor estimations. Even though our estimations are better now, it is best to NOT request higher resources in a scale down, no matter what.

One of the ways we improve the calculation is in the JVM size calculations. Instead of having the knot point be `2gb`, it has been changed to `1.2gb`. This accounts for the "window of uncertainty" around JVM sizes.

closes: #74709
1 parent 42fb9ec commit e31d5f8
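To make the knot-point change concrete, below is a minimal, self-contained sketch (a hypothetical harness, not code from this commit) of the piecewise JVM sizing with the knot point at 1.2gb (1280mb) and its arithmetic inverse, mirroring the NativeMemoryCalculator changes in the diff further down. The point is that a node size derived from a native memory requirement now round-trips back to roughly the same JVM size instead of "jumping the gap" at a knot point.

// Illustrative sketch only; constants and method names mirror NativeMemoryCalculator below.
public class JvmSizingSketch {
    private static final long MB = 1024L * 1024L;
    private static final long GB = 1024L * MB;
    private static final long OS_OVERHEAD = 200L * MB;
    private static final long STATIC_JVM_UPPER_THRESHOLD = 2L * GB;

    // Forward estimate: JVM size from total node size, knot point moved from 2gb to 1.2gb (1280mb).
    static long jvmFromNodeSize(long nodeSize) {
        if (nodeSize < 1280L * MB) {
            return (long) (nodeSize * 0.40);
        }
        if (nodeSize < 8L * GB) {
            return (long) (nodeSize * 0.25);
        }
        return STATIC_JVM_UPPER_THRESHOLD;
    }

    // Inverse estimate: JVM size from the native (ML) memory, derived from the same ratios
    // so that the forward and inverse calculations agree.
    static long jvmFromNativeMemory(long nativeMemory) {
        long nativeAndOverhead = nativeMemory + OS_OVERHEAD;
        if (nativeAndOverhead < 2L * GB * 0.60) {
            return Math.round((nativeAndOverhead / 0.6) * 0.4);
        }
        if (nativeAndOverhead < 8L * GB * 0.75) {
            return Math.round((nativeAndOverhead / 0.75) * 0.25);
        }
        return STATIC_JVM_UPPER_THRESHOLD;
    }

    public static void main(String[] args) {
        long nodeSize = GB;                    // a 1gb node sits below the 1.2gb knot point
        long jvm = jvmFromNodeSize(nodeSize);  // 40% of the node, ~410mb
        long nativeMem = nodeSize - jvm - OS_OVERHEAD;
        // Round-tripping through the inverse lands back within a byte of the same JVM size.
        System.out.println(jvm + " ~= " + jvmFromNativeMemory(nativeMem));
    }
}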

7 files changed: +260 additions, -81 deletions


x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AutoscalingIT.java

Lines changed: 20 additions & 7 deletions

@@ -35,7 +35,7 @@

 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
 import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.hasKey;

 public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {
@@ -46,25 +46,38 @@ public class AutoscalingIT extends MlNativeAutodetectIntegTestCase {

     // This test assumes that xpack.ml.max_machine_memory_percent is 30
     // and that xpack.ml.use_auto_machine_memory_percent is false
-    public void testMLAutoscalingCapacity() {
+    public void testMLAutoscalingCapacity() throws Exception {
         SortedMap<String, Settings> deciders = new TreeMap<>();
         deciders.put(MlAutoscalingDeciderService.NAME,
             Settings.builder().put(MlAutoscalingDeciderService.DOWN_SCALE_DELAY.getKey(), TimeValue.ZERO).build());
         final PutAutoscalingPolicyAction.Request request = new PutAutoscalingPolicyAction.Request(
             "ml_test",
-            new TreeSet<>(org.elasticsearch.common.collect.Set.of("ml")),
+            new TreeSet<>(Arrays.asList(
+                "transform",
+                "data_frozen",
+                "master",
+                "remote_cluster_client",
+                "data",
+                "ml",
+                "data_content",
+                "data_hot",
+                "data_warm",
+                "data_cold",
+                "ingest"
+            )),
             deciders
         );
         assertAcked(client().execute(PutAutoscalingPolicyAction.INSTANCE, request).actionGet());

-        assertMlCapacity(
+        assertBusy(() -> assertMlCapacity(
             client().execute(
                 GetAutoscalingCapacityAction.INSTANCE,
                 new GetAutoscalingCapacityAction.Request()
             ).actionGet(),
             "Requesting scale down as tier and/or node size could be smaller",
             0L,
-            0L);
+            0L)
+        );

         putJob("job1", 100);
         putJob("job2", 200);
@@ -151,8 +164,8 @@ private void assertMlCapacity(GetAutoscalingCapacityAction.Response capacity, St

         AutoscalingDeciderResult autoscalingDeciderResult = autoscalingDeciderResults.results().get("ml");
         assertThat(autoscalingDeciderResult.reason().summary(), containsString(reason));
-        assertThat(autoscalingDeciderResult.requiredCapacity().total().memory().getBytes(), equalTo(tierBytes));
-        assertThat(autoscalingDeciderResult.requiredCapacity().node().memory().getBytes(), equalTo(nodeBytes));
+        assertThat(autoscalingDeciderResult.requiredCapacity().total().memory().getBytes(), greaterThanOrEqualTo(tierBytes - 1L));
+        assertThat(autoscalingDeciderResult.requiredCapacity().node().memory().getBytes(), greaterThanOrEqualTo(nodeBytes - 1L));
     }

     private void putJob(String jobId, long limitMb) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/MlAutoscalingDeciderService.java

Lines changed: 50 additions & 7 deletions

@@ -71,6 +71,8 @@ public class MlAutoscalingDeciderService implements AutoscalingDeciderService,
     private static final Duration DEFAULT_MEMORY_REFRESH_RATE = Duration.ofMinutes(15);
     private static final String MEMORY_STALE = "unable to make scaling decision as job memory requirements are stale";
     private static final long NO_SCALE_DOWN_POSSIBLE = -1L;
+    // If ensureScaleDown changes the calculation by more than this much, log the error
+    private static final long ACCEPTABLE_DIFFERENCE = ByteSizeValue.ofMb(1).getBytes();

     public static final String NAME = "ml";
     public static final Setting<Integer> NUM_ANOMALY_JOBS_IN_QUEUE = Setting.intSetting("num_anomaly_jobs_in_queue", 0, 0);
@@ -359,6 +361,7 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider

         final List<DiscoveryNode> nodes = getNodes(clusterState);
         final NativeMemoryCapacity currentScale = currentScale(nodes);
+
         final MlScalingReason.Builder reasonBuilder = MlScalingReason.builder()
             .setWaitingAnomalyJobs(waitingAnomalyJobs)
             .setWaitingAnalyticsJobs(waitingAnalyticsJobs)
@@ -497,9 +500,21 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
                 .build()));
         }

-        final Optional<AutoscalingDeciderResult> scaleDownDecision = checkForScaleDown(nodeLoads, largestJob, currentScale, reasonBuilder);
+        final Optional<AutoscalingDeciderResult> maybeScaleDown = checkForScaleDown(nodeLoads, largestJob, currentScale, reasonBuilder)
+            // Due to weird rounding errors, it may be that a scale down result COULD cause a scale up
+            // Ensuring the scaleDown here forces the scale down result to always be lower than the current capacity.
+            // This is safe as we know that ALL jobs are assigned at the current capacity
+            .map(result -> {
+                AutoscalingCapacity capacity = ensureScaleDown(result.requiredCapacity(), context.currentCapacity());
+                if (capacity == null) {
+                    return null;
+                }
+                return new AutoscalingDeciderResult(capacity, result.reason());
+            });
+
+        if (maybeScaleDown.isPresent()) {
+            final AutoscalingDeciderResult scaleDownDecisionResult = maybeScaleDown.get();

-        if (scaleDownDecision.isPresent()) {
             // Given maxOpenJobs, could we scale down to just one node?
             // We have no way of saying "we need X nodes"
             if (nodeLoads.size() > 1) {
@@ -516,7 +531,7 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
                     MAX_OPEN_JOBS_PER_NODE.getKey());
                 logger.info(() -> new ParameterizedMessage("{} Calculated potential scaled down capacity [{}] ",
                     msg,
-                    scaleDownDecision.get().requiredCapacity()));
+                    scaleDownDecisionResult.requiredCapacity()));
                 return new AutoscalingDeciderResult(context.currentCapacity(), reasonBuilder.setSimpleReason(msg).build());
             }
         }
@@ -528,14 +543,14 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
             TimeValue downScaleDelay = DOWN_SCALE_DELAY.get(configuration);
             long msLeftToScale = downScaleDelay.millis() - (now - scaleDownDetected);
             if (msLeftToScale <= 0) {
-                return scaleDownDecision.get();
+                return scaleDownDecisionResult;
             }
             logger.debug(() -> new ParameterizedMessage(
                 "not scaling down as the current scale down delay [{}] is not satisfied." +
                     " The last time scale down was detected [{}]. Calculated scaled down capacity [{}] ",
                 downScaleDelay.getStringRep(),
                 XContentElasticsearchExtension.DEFAULT_DATE_PRINTER.print(scaleDownDetected),
-                scaleDownDecision.get().requiredCapacity()));
+                scaleDownDecisionResult.requiredCapacity()));
             return new AutoscalingDeciderResult(
                 context.currentCapacity(),
                 reasonBuilder
@@ -560,6 +575,31 @@ public AutoscalingDeciderResult scale(Settings configuration, AutoscalingDecider
                 .build()));
     }

+    static AutoscalingCapacity ensureScaleDown(AutoscalingCapacity scaleDownResult, AutoscalingCapacity currentCapacity) {
+        if (currentCapacity == null || scaleDownResult == null) {
+            return null;
+        }
+        AutoscalingCapacity newCapacity = new AutoscalingCapacity(
+            new AutoscalingCapacity.AutoscalingResources(
+                currentCapacity.total().storage(),
+                ByteSizeValue.ofBytes(Math.min(scaleDownResult.total().memory().getBytes(), currentCapacity.total().memory().getBytes()))
+            ),
+            new AutoscalingCapacity.AutoscalingResources(
+                currentCapacity.node().storage(),
+                ByteSizeValue.ofBytes(Math.min(scaleDownResult.node().memory().getBytes(), currentCapacity.node().memory().getBytes()))
+            )
+        );
+        if (scaleDownResult.node().memory().getBytes() - newCapacity.node().memory().getBytes() > ACCEPTABLE_DIFFERENCE
+            || scaleDownResult.total().memory().getBytes() - newCapacity.total().memory().getBytes() > ACCEPTABLE_DIFFERENCE) {
+            logger.warn(
+                "scale down accidentally requested a scale up, auto-corrected; initial scaling [{}], corrected [{}]",
+                scaleDownResult,
+                newCapacity
+            );
+        }
+        return newCapacity;
+    }
+
     AutoscalingDeciderResult noScaleResultOrRefresh(MlScalingReason.Builder reasonBuilder,
                                                     boolean memoryTrackingStale,
                                                     AutoscalingDeciderResult potentialResult)
@@ -816,8 +856,11 @@ Optional<AutoscalingDeciderResult> checkForScaleDown(List<NodeLoad> nodeLoads,
         // Or our largest job could be on a smaller node (meaning the same size tier but smaller nodes are possible).
         if (currentlyNecessaryTier < currentCapacity.getTier() || currentlyNecessaryNode < currentCapacity.getNode()) {
             NativeMemoryCapacity nativeMemoryCapacity = new NativeMemoryCapacity(
-                currentlyNecessaryTier,
-                currentlyNecessaryNode,
+                // Since we are in the `scaleDown` branch, we know jobs are running and we could be smaller
+                // If we have some weird rounding errors, it may be that the `currentlyNecessary` values are larger than
+                // current capacity. We never want to accidentally say "scale up" via a scale down.
+                Math.min(currentlyNecessaryTier, currentCapacity.getTier()),
+                Math.min(currentlyNecessaryNode, currentCapacity.getNode()),
                 // If our newly suggested native capacity is the same, we can use the previously stored jvm size
                 currentlyNecessaryNode == currentCapacity.getNode() ? currentCapacity.getJvmSize() : null);
             AutoscalingCapacity requiredCapacity = nativeMemoryCapacity.autoscalingCapacity(maxMachineMemoryPercent, useAuto);
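For orientation, here is a minimal standalone sketch of the capping idea behind ensureScaleDown above, using plain byte counts instead of AutoscalingCapacity objects (class name and example numbers are made up):

// Illustrative only: a scale-down request must never exceed the current capacity,
// even if rounding pushed the estimate slightly above it.
public class EnsureScaleDownSketch {
    static final long ACCEPTABLE_DIFFERENCE = 1024L * 1024L; // 1mb, as in the decider above

    static long ensureScaleDown(long requestedBytes, long currentBytes) {
        long capped = Math.min(requestedBytes, currentBytes);
        if (requestedBytes - capped > ACCEPTABLE_DIFFERENCE) {
            // The real decider logs a warning here; this sketch just prints.
            System.out.println("scale down accidentally requested a scale up, auto-corrected");
        }
        return capped;
    }

    public static void main(String[] args) {
        long currentNodeBytes = 2147483648L;               // 2gb of current node memory
        long estimatedNodeBytes = currentNodeBytes + 4096L; // estimate lands a few KB too high due to rounding
        // The capped value equals the current capacity, so the decision can never become a scale up.
        System.out.println(ensureScaleDown(estimatedNodeBytes, currentNodeBytes)); // prints 2147483648
    }
}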

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/autoscaling/NativeMemoryCapacity.java

Lines changed: 11 additions & 4 deletions

@@ -12,6 +12,9 @@
 import org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator;

 import java.util.Objects;
+import java.util.Optional;
+
+import static org.elasticsearch.xpack.ml.utils.NativeMemoryCalculator.dynamicallyCalculateJvmSizeFromNativeMemorySize;

 // Used for storing native memory capacity and then transforming it into an autoscaling capacity
 // which takes into account the whole node size
@@ -49,22 +52,26 @@ NativeMemoryCapacity merge(NativeMemoryCapacity nativeMemoryCapacity) {
         return this;
     }

-    AutoscalingCapacity autoscalingCapacity(int maxMemoryPercent, boolean useAuto) {
+    public AutoscalingCapacity autoscalingCapacity(int maxMemoryPercent, boolean useAuto) {
+        // We calculate the JVM size here first to ensure it stays the same given the rest of the calculations
+        final Long jvmSize = useAuto ?
+            Optional.ofNullable(this.jvmSize).orElse(dynamicallyCalculateJvmSizeFromNativeMemorySize(node)) :
+            null;
         // We first need to calculate the actual node size given the current native memory size.
         // This way we can accurately determine the required node size AND what the overall memory percentage will be
         long actualNodeSize = NativeMemoryCalculator.calculateApproxNecessaryNodeSize(node, jvmSize, maxMemoryPercent, useAuto);
         // We make the assumption that the JVM size is the same across the entire tier
         // This simplifies calculating the tier as it means that each node in the tier
         // will have the same dynamic memory calculation. And thus the tier is simply the sum of the memory necessary
         // times that scaling factor.
-        int memoryPercentForMl = (int)Math.floor(NativeMemoryCalculator.modelMemoryPercent(
+        double memoryPercentForMl = NativeMemoryCalculator.modelMemoryPercent(
            actualNodeSize,
            jvmSize,
            maxMemoryPercent,
            useAuto
-        ));
+        );
         double inverseScale = memoryPercentForMl <= 0 ? 0 : 100.0 / memoryPercentForMl;
-        long actualTier = (long)Math.ceil(tier * inverseScale);
+        long actualTier = Math.round(tier * inverseScale);
         return new AutoscalingCapacity(
            // Tier should always be AT LEAST the largest node size.
            // This Math.max catches any strange rounding errors or weird input.
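As a rough worked example of the tier scaling above (hypothetical numbers; the 66% figure comes from the "2GB node -> 66%" comment in NativeMemoryCalculator): if ML may use about 66% of each node, a native tier requirement has to be scaled up by 100/66 before it is requested as node memory.

public class TierScalingExample {
    public static void main(String[] args) {
        // Hypothetical numbers only, showing how the inverse scale is applied to the tier.
        double memoryPercentForMl = 66.0;                  // roughly what a 2gb node allows ML under auto mode
        double inverseScale = memoryPercentForMl <= 0 ? 0 : 100.0 / memoryPercentForMl;
        long tier = 3L * 1024 * 1024 * 1024;               // 3gb of native ML memory needed across the tier
        long actualTier = Math.round(tier * inverseScale); // ~4.5gb of total node memory to request
        System.out.println(actualTier);
    }
}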

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/NativeMemoryCalculator.java

Lines changed: 22 additions & 29 deletions

@@ -23,6 +23,7 @@

 public final class NativeMemoryCalculator {

+    private static final long STATIC_JVM_UPPER_THRESHOLD = ByteSizeValue.ofGb(2).getBytes();
     static final long MINIMUM_AUTOMATIC_NODE_SIZE = ByteSizeValue.ofGb(1).getBytes();
     private static final long OS_OVERHEAD = ByteSizeValue.ofMb(200L).getBytes();

@@ -80,15 +81,11 @@ public static long calculateApproxNecessaryNodeSize(long nativeMachineMemory, Lo
         if (useAuto) {
             // TODO utilize official ergonomic JVM size calculations when available.
             jvmSize = jvmSize == null ? dynamicallyCalculateJvmSizeFromNativeMemorySize(nativeMachineMemory) : jvmSize;
-            // We use a Math.floor here to ensure we have AT LEAST enough memory given rounding.
-            int modelMemoryPercent = (int)Math.floor(modelMemoryPercent(
-                nativeMachineMemory + jvmSize + OS_OVERHEAD,
-                jvmSize,
-                maxMemoryPercent,
-                true));
-            // We calculate the inverse percentage of `nativeMachineMemory + OS_OVERHEAD` as `OS_OVERHEAD` is always present
-            // on the native memory side and we need to account for it when we invert the model memory percentage
-            return Math.max((long)Math.ceil((100.0/modelMemoryPercent) * (nativeMachineMemory + OS_OVERHEAD)), MINIMUM_AUTOMATIC_NODE_SIZE);
+            // We haven't reached our 90% threshold, so, simply summing up the values is adequate
+            if ((jvmSize + OS_OVERHEAD)/(double)nativeMachineMemory > 0.1) {
+                return Math.max(nativeMachineMemory + jvmSize + OS_OVERHEAD, MINIMUM_AUTOMATIC_NODE_SIZE);
+            }
+            return Math.round((nativeMachineMemory/0.9));
         }
         return (long) ((100.0/maxMemoryPercent) * nativeMachineMemory);
     }
@@ -118,18 +115,11 @@ public static double modelMemoryPercent(long machineMemory, Long jvmSize, int ma
         return maxMemoryPercent;
     }

-    public static int modelMemoryPercent(long machineMemory, int maxMemoryPercent, boolean useAuto) {
-        return (int)Math.ceil(modelMemoryPercent(machineMemory,
-            null,
-            maxMemoryPercent,
-            useAuto));
-    }
-
-    private static long allowedBytesForMl(long machineMemory, Long jvmSize, int maxMemoryPercent, boolean useAuto) {
+    static long allowedBytesForMl(long machineMemory, Long jvmSize, int maxMemoryPercent, boolean useAuto) {
         if (useAuto && jvmSize != null) {
             // It is conceivable that there is a machine smaller than 200MB.
             // If the administrator wants to use the auto configuration, the node should be larger.
-            if (machineMemory - jvmSize < OS_OVERHEAD || machineMemory == 0) {
+            if (machineMemory - jvmSize <= OS_OVERHEAD || machineMemory == 0) {
                 return machineMemory / 100;
             }
             // This calculation is dynamic and designed to maximally take advantage of the underlying machine for machine learning
@@ -139,8 +129,8 @@ private static long allowedBytesForMl(long machineMemory, Long jvmSize, int maxM
             // 2GB node -> 66%
             // 16GB node -> 87%
             // 64GB node -> 90%
-            long memoryPercent = Math.min(90, (int)Math.ceil(((machineMemory - jvmSize - OS_OVERHEAD) / (double)machineMemory) * 100.0D));
-            return (long)(machineMemory * (memoryPercent / 100.0));
+            double memoryProportion = Math.min(0.90, (machineMemory - jvmSize - OS_OVERHEAD) / (double)machineMemory);
+            return Math.round(machineMemory * memoryProportion);
         }

         return (long)(machineMemory * (maxMemoryPercent / 100.0));
@@ -154,30 +144,33 @@ public static long allowedBytesForMl(long machineMemory, int maxMemoryPercent, b
     }

     // TODO replace with official ergonomic calculation
-    private static long dynamicallyCalculateJvmSizeFromNodeSize(long nodeSize) {
-        if (nodeSize < ByteSizeValue.ofGb(2).getBytes()) {
-            return (long) (nodeSize * 0.40);
+    public static long dynamicallyCalculateJvmSizeFromNodeSize(long nodeSize) {
+        // While the original idea here was to predicate on 2Gb, it has been found that the knot points of
+        // 2GB and 8GB cause weird issues where the JVM size will "jump the gap" from one to the other when
+        // considering true tier sizes in elastic cloud.
+        if (nodeSize < ByteSizeValue.ofMb(1280).getBytes()) {
+            return (long)(nodeSize * 0.40);
         }
         if (nodeSize < ByteSizeValue.ofGb(8).getBytes()) {
-            return (long) (nodeSize * 0.25);
+            return (long)(nodeSize * 0.25);
         }
-        return ByteSizeValue.ofGb(2).getBytes();
+        return STATIC_JVM_UPPER_THRESHOLD;
     }

-    private static long dynamicallyCalculateJvmSizeFromNativeMemorySize(long nativeMachineMemory) {
+    public static long dynamicallyCalculateJvmSizeFromNativeMemorySize(long nativeMachineMemory) {
         // See dynamicallyCalculateJvm the following JVM calculations are arithmetic inverses of JVM calculation
         //
         // Example: For < 2GB node, the JVM is 0.4 * total_node_size. This means, the rest is 0.6 the node size.
         // So, the `nativeAndOverhead` is == 0.6 * total_node_size => total_node_size = (nativeAndOverHead / 0.6)
         // Consequently jvmSize = (nativeAndOverHead / 0.6)*0.4 = nativeAndOverHead * 2/3
         long nativeAndOverhead = nativeMachineMemory + OS_OVERHEAD;
         if (nativeAndOverhead < (ByteSizeValue.ofGb(2).getBytes() * 0.60)) {
-            return (long) Math.ceil(nativeAndOverhead * (2.0 / 3.0));
+            return Math.round((nativeAndOverhead / 0.6) * 0.4);
         }
         if (nativeAndOverhead < (ByteSizeValue.ofGb(8).getBytes() * 0.75)) {
-            return (long) Math.ceil(nativeAndOverhead / 3.0);
+            return Math.round((nativeAndOverhead / 0.75) * 0.25);
         }
-        return ByteSizeValue.ofGb(2).getBytes();
+        return STATIC_JVM_UPPER_THRESHOLD;
     }

 }
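To illustrate the reworked calculateApproxNecessaryNodeSize branch above, here is a small hedged sketch with made-up sizes (not the real method, which also handles the non-auto case and a null JVM size):

public class NodeSizeExample {
    private static final long MB = 1024L * 1024L;
    private static final long GB = 1024L * MB;
    private static final long OS_OVERHEAD = 200L * MB;
    private static final long MINIMUM_AUTOMATIC_NODE_SIZE = GB;

    // Sketch of the new auto branch: while JVM + overhead is more than 10% of the native
    // requirement, the node is simply native + JVM + overhead; otherwise ML is assumed
    // to get 90% of the node, so the node is native / 0.9.
    static long approxNodeSize(long nativeMachineMemory, long jvmSize) {
        if ((jvmSize + OS_OVERHEAD) / (double) nativeMachineMemory > 0.1) {
            return Math.max(nativeMachineMemory + jvmSize + OS_OVERHEAD, MINIMUM_AUTOMATIC_NODE_SIZE);
        }
        return Math.round(nativeMachineMemory / 0.9);
    }

    public static void main(String[] args) {
        // Small job: JVM + overhead dominates, so the node size is just the sum (at least the 1gb minimum).
        System.out.println(approxNodeSize(600 * MB, 410 * MB));
        // Very large native requirement: JVM + overhead is under 10%, so ML gets ~90% of the node.
        System.out.println(approxNodeSize(60 * GB, 2 * GB));
    }
}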
