[Data] Optimize autoscaler to support configurable step size for actor pool scaling

dragongu · dragongu · commit 80b0d688e345 · 2025-11-18T10:46:13.000+08:00
Signed-off-by: dragongu <andrewgu@vip.qq.com>
diff --git a/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py b/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py
@@ -31,6 +31,7 @@ def __init__(
         self._actor_pool_scaling_down_threshold = (
             config.actor_pool_util_downscaling_threshold
         )
+        self._actor_pool_max_upscaling_delta = config.actor_pool_max_upscaling_delta
 
         self._validate_autoscaling_config()
 
@@ -89,14 +90,36 @@ def _derive_target_scaling_config(
                     reason="operator exceeding resource quota"
                 )
             budget = self._resource_manager.get_budget(op)
-            if _get_max_scale_up(actor_pool, budget) == 0:
+            max_scale_up = _get_max_scale_up(actor_pool, budget)
+            if max_scale_up == 0:
                 return ActorPoolScalingRequest.no_op(reason="exceeded resource limits")
 
+            current_size = actor_pool.current_size()
+            # Calculate desired delta based on utilization
+            if current_size > 0 and util > 0:
+                plan_delta = math.ceil(
+                    current_size * (util / self._actor_pool_scaling_up_threshold - 1)
+                )
+            else:
+                plan_delta = 1
+
+            # Apply limits: resource budget, configured max delta, and max pool size
+            # The resource budget already provides protection against resource contention.
+            limits = []
+            if max_scale_up is not None:
+                limits.append(max_scale_up)
+            limits.append(self._actor_pool_max_upscaling_delta)
+            limits.append(actor_pool.max_size() - current_size)
+
+            delta = min(plan_delta, *limits)
+            delta = max(1, delta)  # At least scale up by 1
+
             return ActorPoolScalingRequest.upscale(
-                delta=1,
+                delta=delta,
                 reason=(
-                    f"utilization of {util} >= "
-                    f"{self._actor_pool_scaling_up_threshold}"
+                    f"utilization {util:.2f} >= threshold {self._actor_pool_scaling_up_threshold:.2f} "
+                    f"(plan_delta={plan_delta}, max_scale_up={max_scale_up}, "
+                    f"max_upscaling_delta={self._actor_pool_max_upscaling_delta}, final_delta={delta})"
                 ),
             )
         elif util <= self._actor_pool_scaling_down_threshold:
@@ -120,6 +143,13 @@ def _derive_target_scaling_config(
             )
 
     def _validate_autoscaling_config(self):
+        # The upscaling threshold is a divisor; require > 0 to avoid ZeroDivisionError.
+        if self._actor_pool_scaling_up_threshold <= 0:
+            raise ValueError(
+                f"actor_pool_util_upscaling_threshold must be positive, "
+                f"got {self._actor_pool_scaling_up_threshold}"
+            )
+
         for op, state in self._topology.items():
             for actor_pool in op.get_autoscaling_actor_pools():
                 self._validate_actor_pool_autoscaling_config(actor_pool, op)
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
@@ -240,6 +240,11 @@ class ShuffleStrategy(str, enum.Enum):
     0.5,
 )
 
+DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA: int = env_integer(
+    "RAY_DATA_DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA",
+    1,
+)
+
 
 DEFAULT_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE: bool = env_bool(
     "RAY_DATA_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE", False
@@ -265,6 +270,9 @@ class AutoscalingConfig:
             between autoscaling speed and resource efficiency (i.e.,
             making tasks wait instead of immediately triggering execution).
         actor_pool_util_downscaling_threshold: Actor Pool utilization threshold for downscaling.
+        actor_pool_max_upscaling_delta: Maximum number of actors to add in a single scaling decision.
+            Capping the step size limits resource contention and scheduling pressure. Defaults to 1
+            (conservative scaling); override via RAY_DATA_DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA.
     """
 
     actor_pool_util_upscaling_threshold: float = (
@@ -276,6 +284,9 @@ class AutoscalingConfig:
         DEFAULT_ACTOR_POOL_UTIL_DOWNSCALING_THRESHOLD
     )
 
+    # Maximum number of actors to scale up in a single scaling decision
+    actor_pool_max_upscaling_delta: int = DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA
+
 
 def _execution_options_factory() -> "ExecutionOptions":
     # Lazily import to avoid circular dependencies.
diff --git a/python/ray/data/tests/test_autoscaler.py b/python/ray/data/tests/test_autoscaler.py
@@ -1,15 +1,15 @@
+import math
 import time
 from contextlib import contextmanager
 from types import MethodType
-from typing import Optional
+from typing import Callable, Union
 from unittest.mock import MagicMock
 
 import pytest
 
 import ray
 from ray.data import ExecutionResources
 from ray.data._internal.actor_autoscaler import (
-    ActorPoolScalingRequest,
     DefaultActorAutoscaler,
 )
 from ray.data._internal.cluster_autoscaler import DefaultClusterAutoscaler
@@ -24,7 +24,8 @@
 )
 
 
-def test_actor_pool_scaling():
+@pytest.mark.parametrize("max_upscaling_delta", [1, 2, 5, 10])
+def test_actor_pool_scaling(max_upscaling_delta):
     """Test `_actor_pool_should_scale_up` and `_actor_pool_should_scale_down`
     in `DefaultAutoscaler`"""
 
@@ -37,6 +38,7 @@ def test_actor_pool_scaling():
         config=AutoscalingConfig(
             actor_pool_util_upscaling_threshold=1.0,
             actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=max_upscaling_delta,
         ),
     )
 
@@ -82,22 +84,47 @@ def patch(mock, attr, value, is_method=True):
         yield
         setattr(mock, attr, original)
 
+    ExpectedReason = Union[str, Callable[[str], bool], None]
+
     def assert_autoscaling_action(
-        *, delta: int, expected_reason: Optional[str], force: bool = False
+        *, delta: int, expected_reason: ExpectedReason, force: bool = False
     ):
         nonlocal actor_pool, op, op_state
 
-        assert autoscaler._derive_target_scaling_config(
+        request = autoscaler._derive_target_scaling_config(
             actor_pool=actor_pool,
             op=op,
             op_state=op_state,
-        ) == ActorPoolScalingRequest(delta=delta, force=force, reason=expected_reason)
+        )
+
+        assert request.delta == delta
+        assert request.force == force
+
+        if callable(expected_reason):
+            assert expected_reason(
+                request.reason
+            ), f"Unexpected reason: {request.reason}"
+        else:
+            assert request.reason == expected_reason
+
+    def calculate_plan_delta(util: float, current_size: int, threshold: float) -> int:
+        """Calculate plan_delta based on utilization formula."""
+        return (
+            math.ceil(current_size * (util / threshold - 1))
+            if current_size > 0 and util > 0
+            else 1
+        )
 
     # Should scale up since the util above the threshold.
-    assert actor_pool.get_pool_util() == 1.5
+    util = actor_pool.get_pool_util()
+    assert util == 1.5
+    threshold = autoscaler._actor_pool_scaling_up_threshold
+    plan_delta = calculate_plan_delta(util, actor_pool.current_size(), threshold)
     assert_autoscaling_action(
-        delta=1,
-        expected_reason="utilization of 1.5 >= 1.0",
+        delta=min(plan_delta, max_upscaling_delta),
+        expected_reason=lambda reason: reason.startswith(
+            f"utilization {util:.2f} >= threshold {threshold:.2f}"
+        ),
     )
 
     # Should be no-op since the util is below the threshold.
@@ -161,9 +188,16 @@ def assert_autoscaling_action(
 
             # If the input queue is empty but inputs did not complete,
             # allow to scale up still
+            util = actor_pool.get_pool_util()
+            threshold = autoscaler._actor_pool_scaling_up_threshold
+            plan_delta = calculate_plan_delta(
+                util, actor_pool.current_size(), threshold
+            )
             assert_autoscaling_action(
-                delta=1,
-                expected_reason="utilization of 1.5 >= 1.0",
+                delta=min(plan_delta, max_upscaling_delta),
+                expected_reason=lambda reason: reason.startswith(
+                    f"utilization {util:.2f} >= threshold {threshold:.2f}"
+                ),
             )
 
     # Should be no-op since the op doesn't have enough resources.