[Data] Optimize autoscaler to support configurable step size for actor pool scaling (#58726)

dragongu · web-flow · commit 814768317813 · 2025-11-20T23:44:35.000Z
## Summary

Add support for configurable upscaling step size in the actor pool
autoscaler. This enables rapid scale-up and efficient resource
utilization by allowing the autoscaler to scale up multiple actors at
once, instead of scaling up one actor at a time.

## Description

### Background

Currently, the actor pool autoscaler scales up actors one at a time,
which can be slow in certain scenarios:

1. **Slow actor startup**: When actor initialization logic is complex,
actors may remain in the pending state for extended periods. Because the
autoscaler skips scaling while it sees pending actors, the pool cannot
grow further until those actors finish starting.

2. **Elastic cluster with unstable resources**: In environments where
available resources are uncertain, users often configure large
concurrency ranges (e.g., (10,1000)) for `map_batches`. In these cases,
rapid startup and scaling are critical to utilize available resources
efficiently.

### Solution

This PR adds support for configurable upscaling step size in the actor
pool autoscaler. Instead of always scaling up by 1 actor at a time, the
autoscaler can now scale up multiple actors based on utilization
metrics, while respecting resource constraints.

## Related issues

<!-- Add related issue numbers if applicable -->

Signed-off-by: dragongu <andrewgu@vip.qq.com>
diff --git a/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py b/python/ray/data/_internal/actor_autoscaler/default_actor_autoscaler.py
@@ -31,6 +31,7 @@ def __init__(
         self._actor_pool_scaling_down_threshold = (
             config.actor_pool_util_downscaling_threshold
         )
+        self._actor_pool_max_upscaling_delta = config.actor_pool_max_upscaling_delta
 
         self._validate_autoscaling_config()
 
@@ -89,11 +90,25 @@ def _derive_target_scaling_config(
                     reason="operator exceeding resource quota"
                 )
             budget = self._resource_manager.get_budget(op)
-            if _get_max_scale_up(actor_pool, budget) == 0:
+            max_scale_up = _get_max_scale_up(actor_pool, budget)
+            if max_scale_up == 0:
                 return ActorPoolScalingRequest.no_op(reason="exceeded resource limits")
 
+            # Calculate desired delta based on utilization
+            plan_delta = math.ceil(
+                actor_pool.current_size()
+                * (util / self._actor_pool_scaling_up_threshold - 1)
+            )
+
+            upscale_capacities = self._get_upscale_capacities(actor_pool, max_scale_up)
+            delta = min(
+                plan_delta,
+                *upscale_capacities,
+            )
+            delta = max(1, delta)  # At least scale up by 1
+
             return ActorPoolScalingRequest.upscale(
-                delta=1,
+                delta=delta,
                 reason=(
                     f"utilization of {util} >= "
                     f"{self._actor_pool_scaling_up_threshold}"
@@ -120,10 +135,36 @@ def _derive_target_scaling_config(
             )
 
     def _validate_autoscaling_config(self):
+        # Validate that max upscaling delta is positive to prevent override by safeguard
+        if self._actor_pool_max_upscaling_delta <= 0:
+            raise ValueError(
+                f"actor_pool_max_upscaling_delta must be positive, "
+                f"got {self._actor_pool_max_upscaling_delta}"
+            )
+        # Validate that upscaling threshold is positive to prevent division by zero
+        # and incorrect scaling calculations
+        if self._actor_pool_scaling_up_threshold <= 0:
+            raise ValueError(
+                f"actor_pool_util_upscaling_threshold must be positive, "
+                f"got {self._actor_pool_scaling_up_threshold}"
+            )
+
         for op, state in self._topology.items():
             for actor_pool in op.get_autoscaling_actor_pools():
                 self._validate_actor_pool_autoscaling_config(actor_pool, op)
 
+    def _get_upscale_capacities(
+        self,
+        actor_pool: "AutoscalingActorPool",
+        max_scale_up: Optional[int],
+    ):
+        limits = []
+        if max_scale_up is not None:
+            limits.append(max_scale_up)
+        limits.append(self._actor_pool_max_upscaling_delta)
+        limits.append(actor_pool.max_size() - actor_pool.current_size())
+        return limits
+
     def _validate_actor_pool_autoscaling_config(
         self,
         actor_pool: "AutoscalingActorPool",
diff --git a/python/ray/data/context.py b/python/ray/data/context.py
@@ -240,6 +240,11 @@ class ShuffleStrategy(str, enum.Enum):
     0.5,
 )
 
+DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA: int = env_integer(
+    "RAY_DATA_DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA",
+    1,
+)
+
 
 DEFAULT_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE: bool = env_bool(
     "RAY_DATA_ENABLE_DYNAMIC_OUTPUT_QUEUE_SIZE_BACKPRESSURE", False
@@ -265,6 +270,9 @@ class AutoscalingConfig:
             between autoscaling speed and resource efficiency (i.e.,
             making tasks wait instead of immediately triggering execution).
         actor_pool_util_downscaling_threshold: Actor Pool utilization threshold for downscaling.
+        actor_pool_max_upscaling_delta: Maximum number of actors to scale up in a single scaling decision.
+            This limits how many actors can be added at once to prevent resource contention
+            and scheduling pressure. Defaults to 1 for conservative scaling.
     """
 
     actor_pool_util_upscaling_threshold: float = (
@@ -276,6 +284,9 @@ class AutoscalingConfig:
         DEFAULT_ACTOR_POOL_UTIL_DOWNSCALING_THRESHOLD
     )
 
+    # Maximum number of actors to scale up in a single scaling decision
+    actor_pool_max_upscaling_delta: int = DEFAULT_ACTOR_POOL_MAX_UPSCALING_DELTA
+
 
 def _execution_options_factory() -> "ExecutionOptions":
     # Lazily import to avoid circular dependencies.
diff --git a/python/ray/data/tests/test_autoscaler.py b/python/ray/data/tests/test_autoscaler.py
@@ -209,6 +209,84 @@ def assert_autoscaling_action(
         )
 
 
+@pytest.fixture
+def autoscaler_max_upscaling_delta_setup():
+    resource_manager = MagicMock(
+        spec=ResourceManager, get_budget=MagicMock(return_value=None)
+    )
+
+    actor_pool = MagicMock(
+        spec=_ActorPool,
+        min_size=MagicMock(return_value=5),
+        max_size=MagicMock(return_value=20),
+        current_size=MagicMock(return_value=10),
+        get_current_size=MagicMock(return_value=10),
+        num_pending_actors=MagicMock(return_value=0),
+        get_pool_util=MagicMock(return_value=2.0),
+    )
+
+    op = MagicMock(
+        spec=InternalQueueOperatorMixin,
+        completed=MagicMock(return_value=False),
+        _inputs_complete=False,
+    )
+    op_state = MagicMock(
+        spec=OpState,
+        total_enqueued_input_blocks=MagicMock(return_value=1),
+    )
+    op_state._scheduling_status = MagicMock(under_resource_limits=True)
+    return resource_manager, actor_pool, op, op_state
+
+
+def test_actor_pool_scaling_respects_small_max_upscaling_delta(
+    autoscaler_max_upscaling_delta_setup,
+):
+    resource_manager, actor_pool, op, op_state = autoscaler_max_upscaling_delta_setup
+    autoscaler = DefaultActorAutoscaler(
+        topology=MagicMock(),
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=3,
+        ),
+    )
+    request = autoscaler._derive_target_scaling_config(
+        actor_pool=actor_pool,
+        op=op,
+        op_state=op_state,
+    )
+    # With current_size=10, util=2.0, threshold=1.0:
+    # plan_delta = ceil(10 * (2.0/1.0 - 1)) = ceil(10) = 10
+    # However, delta is limited by max_upscaling_delta=3, so delta = min(10, 3) = 3
+    assert request.delta == 3
+
+
+def test_actor_pool_scaling_respects_large_max_upscaling_delta(
+    autoscaler_max_upscaling_delta_setup,
+):
+    resource_manager, actor_pool, op, op_state = autoscaler_max_upscaling_delta_setup
+    autoscaler = DefaultActorAutoscaler(
+        topology=MagicMock(),
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=100,
+        ),
+    )
+    request = autoscaler._derive_target_scaling_config(
+        actor_pool=actor_pool,
+        op=op,
+        op_state=op_state,
+    )
+    # With current_size=10, util=2.0, threshold=1.0:
+    # plan_delta = ceil(10 * (2.0/1.0 - 1)) = ceil(10) = 10
+    # max_upscaling_delta=100 is large enough, but delta is limited by max_size:
+    # max_size(20) - current_size(10) = 10, so delta = min(10, 100, 10) = 10
+    assert request.delta == 10
+
+
 def test_cluster_scaling():
     """Test `_try_scale_up_cluster` in `DefaultAutoscaler`"""
     op1 = MagicMock(
@@ -417,6 +495,101 @@ def __call__(self, row):
     assert expected_message not in wanr_log_args_str
 
 
+@pytest.fixture
+def autoscaler_config_mocks():
+    resource_manager = MagicMock(spec=ResourceManager)
+    topology = MagicMock()
+    topology.items = MagicMock(return_value=[])
+    return resource_manager, topology
+
+
+def test_autoscaling_config_validation_zero_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+
+    with pytest.raises(
+        ValueError, match="actor_pool_max_upscaling_delta must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=1.0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=0,
+            ),
+        )
+
+
+def test_autoscaling_config_validation_negative_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+
+    with pytest.raises(
+        ValueError, match="actor_pool_max_upscaling_delta must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=1.0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=-1,
+            ),
+        )
+
+
+def test_autoscaling_config_validation_positive_delta(autoscaler_config_mocks):
+    resource_manager, topology = autoscaler_config_mocks
+
+    autoscaler = DefaultActorAutoscaler(
+        topology=topology,
+        resource_manager=resource_manager,
+        config=AutoscalingConfig(
+            actor_pool_util_upscaling_threshold=1.0,
+            actor_pool_util_downscaling_threshold=0.5,
+            actor_pool_max_upscaling_delta=5,
+        ),
+    )
+    assert autoscaler._actor_pool_max_upscaling_delta == 5
+
+
+def test_autoscaling_config_validation_zero_upscaling_threshold(
+    autoscaler_config_mocks,
+):
+    resource_manager, topology = autoscaler_config_mocks
+
+    with pytest.raises(
+        ValueError, match="actor_pool_util_upscaling_threshold must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=5,
+            ),
+        )
+
+
+def test_autoscaling_config_validation_negative_upscaling_threshold(
+    autoscaler_config_mocks,
+):
+    resource_manager, topology = autoscaler_config_mocks
+
+    with pytest.raises(
+        ValueError, match="actor_pool_util_upscaling_threshold must be positive"
+    ):
+        DefaultActorAutoscaler(
+            topology=topology,
+            resource_manager=resource_manager,
+            config=AutoscalingConfig(
+                actor_pool_util_upscaling_threshold=-1.0,
+                actor_pool_util_downscaling_threshold=0.5,
+                actor_pool_max_upscaling_delta=5,
+            ),
+        )
+
+
 if __name__ == "__main__":
     import sys