[Data] Optimize autoscaler to support configurable step size for actor pool scaling

dragongu · dragongu · commit 191de57a4d54 · 2025-11-19T21:58:56.000+08:00
Signed-off-by: dragongu <andrewgu@vip.qq.com>
diff --git a/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py b/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py
@@ -31,6 +31,7 @@ def __init__(
         self._actor_pool_scaling_down_threshold = (
             config.actor_pool_util_downscaling_threshold
         )
+        self._actor_pool_max_upscaling_delta = config.actor_pool_max_upscaling_delta
 
         self._validate_autoscaling_config()
 
@@ -89,14 +90,27 @@ def _derive_target_scaling_config(
                     reason="operator exceeding resource quota"
                 )
             budget = self._resource_manager.get_budget(op)
-            if _get_max_scale_up(actor_pool, budget) == 0:
+            max_scale_up = _get_max_scale_up(actor_pool, budget)
+            if max_scale_up == 0:
                 return ActorPoolScalingRequest.no_op(reason="exceeded resource limits")
 
+            # Calculate desired delta based on utilization
+            plan_delta = math.ceil(
+                actor_pool.current_size() * (util / self._actor_pool_scaling_up_threshold - 1)
+            )
+
+            delta = min(
+                plan_delta,
+                *self._get_upscale_capacities(actor_pool, max_scale_up),
+            )
+            delta = max(1, delta)  # At least scale up by 1
+
             return ActorPoolScalingRequest.upscale(
-                delta=1,
+                delta=delta,
                 reason=(
-                    f"utilization of {util} >= "
-                    f"{self._actor_pool_scaling_up_threshold}"
+                    f"utilization {util:.2f} >= threshold {self._actor_pool_scaling_up_threshold:.2f} "
+                    f"(plan_delta={plan_delta}, max_scale_up={max_scale_up}, "
+                    f"max_upscaling_delta={self._actor_pool_max_upscaling_delta}, final_delta={delta})"
                 ),
             )
         elif util <= self._actor_pool_scaling_down_threshold:
@@ -120,10 +134,29 @@ def _derive_target_scaling_config(
             )
 
     def _validate_autoscaling_config(self):
+        # Must be positive; otherwise the upscale `max(1, delta)` floor overrides the cap.
+        if self._actor_pool_max_upscaling_delta <= 0:
+            raise ValueError(
+                f"actor_pool_max_upscaling_delta must be positive, "
+                f"got {self._actor_pool_max_upscaling_delta}"
+            )
+
         for op, state in self._topology.items():
             for actor_pool in op.get_autoscaling_actor_pools():
                 self._validate_actor_pool_autoscaling_config(actor_pool, op)
 
+    def _get_upscale_capacities(
+        self,
+        actor_pool: "AutoscalingActorPool",
+        max_scale_up: Optional[int],
+    ) -> list:
+        """List the upper bounds on one upscale step; `max_scale_up` is skipped if None."""
+        limits = [] if max_scale_up is None else [max_scale_up]
+        limits.append(self._actor_pool_max_upscaling_delta)
+        # Headroom before max size; `current_size()` matches the plan_delta computation.
+        limits.append(actor_pool.max_size() - actor_pool.current_size())
+        return limits
+
     def _validate_actor_pool_autoscaling_config(
         self,
         actor_pool: "AutoscalingActorPool",
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
@@ -240,6 +240,11 @@ class ShuffleStrategy(str, enum.Enum):
     0.5,
 )
 
+DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA: int = env_integer(
+    "RAY_DATA_DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA",
+    1,  # conservative default: add at most one actor per scaling decision
+)
+
 
 DEFAULT_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE: bool = env_bool(
     "RAY_DATA_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE", False
@@ -265,6 +270,9 @@ class AutoscalingConfig:
             between autoscaling speed and resource efficiency (i.e.,
             making tasks wait instead of immediately triggering execution).
         actor_pool_util_downscaling_threshold: Actor Pool utilization threshold for downscaling.
+        actor_pool_max_upscaling_delta: Maximum number of actors to scale up in a single scaling decision.
+            This limits how many actors can be added at once to prevent resource contention
+            and scheduling pressure. Defaults to 1 for conservative scaling.
     """
 
     actor_pool_util_upscaling_threshold: float = (
@@ -276,6 +284,9 @@ class AutoscalingConfig:
         DEFAULT_ACTOR_POOL_UTIL_DOWNSCALING_THRESHOLD
     )
 
+    # Maximum number of actors to scale up in a single scaling decision
+    actor_pool_max_upscaling_delta: int = DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA
+
 
 def _execution_options_factory() -> "ExecutionOptions":
     # Lazily import to avoid circular dependencies.
diff --git a/python/ray/data/tests/test_autoscaler.py b/python/ray/data/tests/test_autoscaler.py
@@ -209,6 +209,77 @@ def assert_autoscaling_action(
         )
 
 
+@pytest.fixture
+def autoscaler_max_upscaling_delta_setup():
+    resource_manager = MagicMock(
+        spec=ResourceManager, get_budget=MagicMock(return_value=None)
+    )
+    # Pool sits at 10 of at most 20 actors and reports a utilization of 2.0.
+    actor_pool = MagicMock(
+        spec=_ActorPool,
+        min_size=MagicMock(return_value=5),
+        max_size=MagicMock(return_value=20),
+        current_size=MagicMock(return_value=10),
+        get_current_size=MagicMock(return_value=10),
+        num_pending_actors=MagicMock(return_value=0),
+        get_pool_util=MagicMock(return_value=2.0),
+    )
+    # Unfinished operator; its state reports one enqueued input block.
+    op = MagicMock(
+        spec=InternalQueueOperatorMixin,
+        completed=MagicMock(return_value=False),
+        _inputs_complete=False,
+    )
+    op_state = MagicMock(
+        spec=OpState,
+        total_enqueued_input_blocks=MagicMock(return_value=1),
+    )
+    op_state._scheduling_status = MagicMock(under_resource_limits=True)
+    return resource_manager, actor_pool, op, op_state
+
+def test_actor_pool_scaling_respects_small_max_upscaling_delta(
+    autoscaler_max_upscaling_delta_setup,
+):
+    """A cap below both plan_delta and headroom must bound the upscale step."""
+    resource_manager, actor_pool, op, op_state = autoscaler_max_upscaling_delta_setup
+    autoscaler = DefaultActorAutoscaler(
+        topology=MagicMock(),
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=3,
+        ),
+    )
+    request = autoscaler._derive_target_scaling_config(
+        actor_pool=actor_pool, op=op, op_state=op_state
+    )
+    # plan_delta = ceil(10 * (2.0 / 1.0 - 1)) = 10, headroom = 20 - 10 = 10, cap = 3.
+    assert request.delta == 3
+    assert "max_upscaling_delta=3, final_delta=3" in request.reason
+
+def test_actor_pool_scaling_respects_large_max_upscaling_delta(
+    autoscaler_max_upscaling_delta_setup,
+):
+    """A cap above the pool's headroom must not be the binding limit."""
+    resource_manager, actor_pool, op, op_state = autoscaler_max_upscaling_delta_setup
+    autoscaler = DefaultActorAutoscaler(
+        topology=MagicMock(),
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=100,
+        ),
+    )
+    request = autoscaler._derive_target_scaling_config(
+        actor_pool=actor_pool, op=op, op_state=op_state
+    )
+    # plan_delta = ceil(10 * (2.0 / 1.0 - 1)) = 10 and headroom = 20 - 10 = 10.
+    assert request.delta == 10
+    assert "max_upscaling_delta=100, final_delta=10" in request.reason
+
+
 def test_cluster_scaling():
     """Test `_try_scale_up_cluster` in `DefaultAutoscaler`"""
     op1 = MagicMock(
@@ -417,6 +488,62 @@ def __call__(self, row):
     assert expected_message not in wanr_log_args_str
 
 
+@pytest.fixture
+def autoscaler_config_mocks():
+    resource_manager = MagicMock(spec=ResourceManager)
+    topology = MagicMock()
+    topology.items = MagicMock(return_value=[])  # empty: no actor pools to validate
+    return resource_manager, topology
+
+
+def test_autoscaling_config_validation_zero_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+    # Boundary case: zero is rejected because the delta must be strictly positive.
+    with pytest.raises(
+        ValueError, match="actor_pool_max_upscaling_delta must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=1.0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=0,
+            ),
+        )
+
+def test_autoscaling_config_validation_negative_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+    # Negative values must fail construction with the same error message as zero.
+    with pytest.raises(
+        ValueError, match="actor_pool_max_upscaling_delta must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=1.0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=-1,
+            ),
+        )
+
+
+def test_autoscaling_config_validation_positive_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+    # A positive delta passes validation and is stored on the autoscaler unchanged.
+    autoscaler = DefaultActorAutoscaler(
+        topology=topology,
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=5,
+        ),
+    )
+    assert autoscaler._actor_pool_max_upscaling_delta == 5
+
+
 if __name__ == "__main__":
     import sys