
Commit a1036cb

Add ray docs for custom autoscaling in serve (#57600)
1. Add docs under advanced autoscaling. 2. Promote `autoscaling_context` to public API. Signed-off-by: abrar <abrar@anyscale.com>
1 parent d7ac83e commit a1036cb

File tree

10 files changed (+265, -46 lines)


doc/source/serve/advanced-guides/advanced-autoscaling.md

Lines changed: 76 additions & 0 deletions
@@ -439,3 +439,79 @@ makes more conservative downscaling decisions.
| `downscaling_factor = 1` | `downscaling_factor = 0.5` |
| ------------------------------------------------ | ----------------------------------------------- |
| ![downscale-smooth-before](https://raw.githubusercontent.com/ray-project/images/master/docs/serve/autoscaling-guide/downscale_smoothing_factor_before.png) | ![downscale-smooth-after](https://raw.githubusercontent.com/ray-project/images/master/docs/serve/autoscaling-guide/downscale_smoothing_factor_after.png) |


(serve-custom-autoscaling-policies)=
## Custom autoscaling policies

:::{warning}
Custom autoscaling policies are experimental and may change in future releases.
:::

Ray Serve’s built-in, request-driven autoscaling works well for most apps. Use **custom autoscaling policies** when you need more control: for example, scaling on external metrics (CloudWatch, Prometheus), anticipating predictable traffic (scheduled batch jobs), or applying business logic that goes beyond queue thresholds.

Custom policies let you implement scaling logic based on any metrics or rules you choose.

### Custom policy for a deployment

A custom autoscaling policy is a user-provided Python function that takes an [`AutoscalingContext`](../api/doc/ray.serve.config.AutoscalingContext.rst) and returns a tuple `(target_replicas, policy_state)` for a single deployment. The context exposes:

* **Current state:** Current replica count and deployment metadata.
* **Built-in metrics:** Total requests, queued requests, per-replica counts.
* **Custom metrics:** Values your deployment reports via `record_autoscaling_stats()` (see below).
* **Capacity bounds:** `min`/`max` replica limits adjusted for current cluster capacity.
* **Policy state:** A `dict` you can use to persist arbitrary state across control-loop iterations.
* **Timing:** Timestamps of the last scale actions and the current time.
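
A minimal sketch of the expected function shape (the `noop_policy` name is illustrative; it holds the replica count steady and carries its state forward unchanged):

```python
from typing import Any, Dict, Tuple

from ray.serve.config import AutoscalingContext


def noop_policy(ctx: AutoscalingContext) -> Tuple[int, Dict[str, Any]]:
    # Inspect any of the context fields listed above, then return the
    # desired replica count plus state for the next control-loop iteration.
    return ctx.current_num_replicas, ctx.policy_state
```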

The following example shows a policy that scales up during business hours and evening batch processing, and scales down during off-peak hours:

```{literalinclude} ../doc_code/autoscaling_policy.py
:language: python
:start-after: __begin_scheduled_batch_processing_policy__
:end-before: __end_scheduled_batch_processing_policy__
```

```{literalinclude} ../doc_code/scheduled_batch_processing.py
:language: python
:start-after: __serve_example_begin__
:end-before: __serve_example_end__
```

Policies are defined **per deployment**. If you don’t provide one, Ray Serve falls back to its built-in request-based policy.

The Ray Serve controller invokes the policy function every `RAY_SERVE_CONTROL_LOOP_INTERVAL_S` seconds (default **0.1s**), so your logic runs against near-real-time state.

:::{warning}
Keep policy functions **fast and lightweight**. Slow logic can block the Serve controller and degrade cluster responsiveness.
:::


### Custom metrics

You can make richer decisions by emitting your own metrics from the deployment. Implement `record_autoscaling_stats()` to return a `dict[str, float]`; Ray Serve surfaces these values in the [`AutoscalingContext`](../api/doc/ray.serve.config.AutoscalingContext.rst).

This example demonstrates how a deployment can report its own metrics (CPU usage, memory usage) and how an autoscaling policy can use those metrics to make scaling decisions:

```{literalinclude} ../doc_code/autoscaling_policy.py
:language: python
:start-after: __begin_custom_metrics_autoscaling_policy__
:end-before: __end_custom_metrics_autoscaling_policy__
```

```{literalinclude} ../doc_code/custom_metrics_autoscaling.py
:language: python
:start-after: __serve_example_begin__
:end-before: __serve_example_end__
```

:::{note}
The `record_autoscaling_stats()` method can be either synchronous or asynchronous. It must complete within the timeout specified by `RAY_SERVE_RECORD_AUTOSCALING_STATS_TIMEOUT_S` (default 30 seconds).
:::
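
For example, a sketch of an asynchronous variant (the `psutil` sampling and the `AsyncStatsDeployment` name are illustrative assumptions, not part of the Serve API):

```python
import asyncio
from typing import Dict

import psutil  # Assumed to be installed in the environment.

from ray import serve


@serve.deployment
class AsyncStatsDeployment:
    async def __call__(self) -> str:
        return "Hello, world!"

    async def record_autoscaling_stats(self) -> Dict[str, float]:
        # Run the blocking sampling call in a thread so the replica's
        # event loop stays responsive while the sample is taken.
        cpu = await asyncio.to_thread(psutil.cpu_percent, 0.1)
        return {"cpu_usage": cpu}
```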

In your policy, access custom metrics via:

* **`ctx.raw_metrics[metric_name]`**: A mapping of replica IDs to lists of raw metric values. The number of data points stored for each replica depends on [`look_back_period_s`](../api/doc/ray.serve.config.AutoscalingConfig.look_back_period_s.rst) (the sliding-window size) and `RAY_SERVE_REPLICA_AUTOSCALING_METRIC_RECORD_INTERVAL_S` (the metric recording interval).
* **`ctx.aggregated_metrics[metric_name]`**: A time-weighted average computed from the raw metric values for each replica.

> Today, aggregation is a time-weighted average. Future releases may support additional aggregation options.
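
For example, a sketch of a policy that computes its own tail statistic from the raw window instead of relying on the built-in average (the `cpu_usage` metric name and the thresholds are assumptions):

```python
from typing import Any, Dict, Tuple

from ray.serve.config import AutoscalingContext


def tail_aware_policy(ctx: AutoscalingContext) -> Tuple[int, Dict[str, Any]]:
    # Aggregated view: one time-weighted average per replica.
    avg_cpu = ctx.aggregated_metrics.get("cpu_usage", {})

    # Raw view: the full sliding window of samples per replica, which
    # allows custom aggregations such as an approximate p95.
    raw_cpu = ctx.raw_metrics.get("cpu_usage", {})
    p95_cpu = {
        replica_id: sorted(samples)[int(0.95 * (len(samples) - 1))]
        for replica_id, samples in raw_cpu.items()
        if samples
    }

    # Scale up if any replica's tail is hot, even when averages look fine.
    if p95_cpu and max(p95_cpu.values()) > 90:
        return (
            min(ctx.capacity_adjusted_max_replicas, ctx.current_num_replicas + 1),
            {},
        )
    # Scale down only when every replica is cool on average.
    if avg_cpu and max(avg_cpu.values()) < 20:
        return (
            max(ctx.capacity_adjusted_min_replicas, ctx.current_num_replicas - 1),
            {},
        )
    return ctx.current_num_replicas, {}
```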

doc/source/serve/api/index.md

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ See the [model composition guide](serve-model-composition) for how to update cod
 serve.config.HTTPOptions
 serve.config.AutoscalingConfig
 serve.config.AutoscalingPolicy
+serve.config.AutoscalingContext
 serve.config.AggregationFunction
 serve.config.RequestRouterConfig
 ```
doc/source/serve/doc_code/autoscaling_policy.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
# __begin_scheduled_batch_processing_policy__
from datetime import datetime
from typing import Any, Dict

from ray.serve.config import AutoscalingContext


def scheduled_batch_processing_policy(
    ctx: AutoscalingContext,
) -> tuple[int, Dict[str, Any]]:
    current_time = datetime.now()
    current_hour = current_time.hour
    # Scale up during business hours (9 AM - 5 PM)
    if 9 <= current_hour < 17:
        return 2, {"reason": "Business hours"}
    # Scale up for evening batch processing (6 PM - 8 PM)
    elif 18 <= current_hour < 20:
        return 4, {"reason": "Evening batch processing"}
    # Minimal scaling during off-peak hours
    else:
        return 1, {"reason": "Off-peak hours"}


# __end_scheduled_batch_processing_policy__


# __begin_custom_metrics_autoscaling_policy__
def custom_metrics_autoscaling_policy(
    ctx: AutoscalingContext,
) -> tuple[int, Dict[str, Any]]:
    cpu_usage_metric = ctx.aggregated_metrics.get("cpu_usage", {})
    memory_usage_metric = ctx.aggregated_metrics.get("memory_usage", {})
    # Default to 0 so the policy doesn't crash before the first metrics arrive.
    max_cpu_usage = max(cpu_usage_metric.values(), default=0.0)
    max_memory_usage = max(memory_usage_metric.values(), default=0.0)

    if max_cpu_usage > 80 or max_memory_usage > 85:
        return min(ctx.capacity_adjusted_max_replicas, ctx.current_num_replicas + 1), {}
    elif max_cpu_usage < 30 and max_memory_usage < 40:
        return max(ctx.capacity_adjusted_min_replicas, ctx.current_num_replicas - 1), {}
    else:
        return ctx.current_num_replicas, {}


# __end_custom_metrics_autoscaling_policy__
doc/source/serve/doc_code/custom_metrics_autoscaling.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# __serve_example_begin__
import time
from typing import Dict

from ray import serve


@serve.deployment(
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 5,
        "policy": {
            "policy_function": "autoscaling_policy:custom_metrics_autoscaling_policy"
        },
    },
    max_ongoing_requests=5,
)
class CustomMetricsDeployment:
    def __init__(self):
        self.cpu_usage = 50.0
        self.memory_usage = 60.0

    def __call__(self) -> str:
        # Simulate work that drives resource usage up.
        time.sleep(0.1)
        self.cpu_usage = min(100, self.cpu_usage + 5)
        self.memory_usage = min(100, self.memory_usage + 3)
        return "Hello, world!"

    def record_autoscaling_stats(self) -> Dict[str, float]:
        # Simulate usage decaying between requests.
        self.cpu_usage = max(20, self.cpu_usage - 2)
        self.memory_usage = max(30, self.memory_usage - 1)
        return {
            "cpu_usage": self.cpu_usage,
            "memory_usage": self.memory_usage,
        }


# Create the app
app = CustomMetricsDeployment.bind()
# __serve_example_end__

# TODO: uncomment after autoscaling context is populated with all metrics
# if __name__ == "__main__":
#     import requests  # noqa

#     serve.run(app)
#     resp = requests.get("http://localhost:8000/")
#     assert resp.text == "Hello, world!"
doc/source/serve/doc_code/scheduled_batch_processing.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# __serve_example_begin__
import asyncio

from ray import serve
from ray.serve.config import AutoscalingConfig, AutoscalingPolicy


@serve.deployment(
    autoscaling_config=AutoscalingConfig(
        min_replicas=1,
        max_replicas=12,
        policy=AutoscalingPolicy(
            policy_function="autoscaling_policy:scheduled_batch_processing_policy"
        ),
    ),
    max_ongoing_requests=3,
)
class BatchProcessingDeployment:
    async def __call__(self) -> str:
        # Simulate batch processing work
        await asyncio.sleep(0.5)
        return "Hello, world!"


app = BatchProcessingDeployment.bind()
# __serve_example_end__

if __name__ == "__main__":
    import requests  # noqa

    serve.run(app)
    resp = requests.get("http://localhost:8000/")
    assert resp.text == "Hello, world!"

python/ray/serve/_private/autoscaling_state.py

Lines changed: 1 addition & 40 deletions
@@ -1,7 +1,6 @@
 import logging
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Set

 from ray.serve._private.common import (
@@ -25,49 +24,11 @@
     merge_timeseries_dicts,
 )
 from ray.serve._private.utils import get_capacity_adjusted_num_replicas
+from ray.serve.config import AutoscalingContext

 logger = logging.getLogger(SERVE_LOGGER_NAME)


-@dataclass
-class AutoscalingContext:
-    """Rich context provided to custom autoscaling policies."""
-
-    # Deployment information
-    deployment_id: DeploymentID
-    deployment_name: str
-    app_name: Optional[str]
-
-    # Current state
-    current_num_replicas: int
-    target_num_replicas: int
-    running_replicas: List[ReplicaID]
-
-    # Built-in metrics
-    total_num_requests: float
-    queued_requests: Optional[float]
-    requests_per_replica: Dict[ReplicaID, float]
-
-    # Custom metrics
-    aggregated_metrics: Dict[str, Dict[ReplicaID, float]]
-    raw_metrics: Dict[str, Dict[ReplicaID, List[float]]]
-
-    # Capacity and bounds
-    capacity_adjusted_min_replicas: int
-    capacity_adjusted_max_replicas: int
-
-    # Policy state
-    policy_state: Dict[str, Any]
-
-    # Timing
-    last_scale_up_time: Optional[float]
-    last_scale_down_time: Optional[float]
-    current_time: Optional[float]
-
-    # Config
-    config: Optional[Any]
-
-
 class AutoscalingState:
     """Manages autoscaling for a single deployment."""
python/ray/serve/autoscaling_policy.py

Lines changed: 1 addition & 2 deletions
@@ -2,9 +2,8 @@
 import math
 from typing import Any, Dict, Optional, Tuple

-from ray.serve._private.autoscaling_state import AutoscalingContext
 from ray.serve._private.constants import CONTROL_LOOP_INTERVAL_S, SERVE_LOGGER_NAME
-from ray.serve.config import AutoscalingConfig
+from ray.serve.config import AutoscalingConfig, AutoscalingContext
 from ray.util.annotations import PublicAPI

 logger = logging.getLogger(SERVE_LOGGER_NAME)

python/ray/serve/config.py

Lines changed: 60 additions & 0 deletions
@@ -1,6 +1,7 @@
 import json
 import logging
 import warnings
+from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Callable, Dict, List, Optional, Union

@@ -16,6 +17,9 @@
     validator,
 )
 from ray._common.utils import import_attr
+
+# Import types needed for AutoscalingContext
+from ray.serve._private.common import DeploymentID, ReplicaID
 from ray.serve._private.constants import (
     DEFAULT_AUTOSCALING_POLICY_NAME,
     DEFAULT_GRPC_PORT,
@@ -34,6 +38,62 @@
 logger = logging.getLogger(SERVE_LOGGER_NAME)


+@PublicAPI(stability="alpha")
+@dataclass
+class AutoscalingContext:
+    """Rich context provided to custom autoscaling policies.
+
+    This class provides comprehensive information about a deployment's current state,
+    metrics, and configuration that can be used by custom autoscaling policies to
+    make intelligent scaling decisions.
+
+    The context includes deployment metadata, current replica state, built-in and
+    custom metrics, capacity bounds, policy state, and timing information.
+    """
+
+    # Deployment information
+    deployment_id: DeploymentID  #: Unique identifier for the deployment.
+    deployment_name: str  #: Name of the deployment.
+    app_name: Optional[str]  #: Name of the application containing this deployment.
+
+    # Current state
+    current_num_replicas: int  #: Current number of running replicas.
+    target_num_replicas: int  #: Target number of replicas set by the autoscaler.
+    running_replicas: List[ReplicaID]  #: List of currently running replica IDs.
+
+    # Built-in metrics
+    total_num_requests: float  #: Total number of requests across all replicas.
+    queued_requests: Optional[float]  #: Number of requests currently queued.
+    requests_per_replica: Dict[
+        ReplicaID, float
+    ]  #: Mapping of replica ID to number of requests.
+
+    # Custom metrics
+    aggregated_metrics: Dict[
+        str, Dict[ReplicaID, float]
+    ]  #: Time-weighted averages of custom metrics per replica.
+    raw_metrics: Dict[
+        str, Dict[ReplicaID, List[float]]
+    ]  #: Raw custom metric values per replica.
+
+    # Capacity and bounds
+    capacity_adjusted_min_replicas: int  #: Minimum replicas adjusted for cluster capacity.
+    capacity_adjusted_max_replicas: int  #: Maximum replicas adjusted for cluster capacity.
+
+    # Policy state
+    policy_state: Dict[
+        str, Any
+    ]  #: Persistent state dictionary for the autoscaling policy.
+
+    # Timing
+    last_scale_up_time: Optional[float]  #: Timestamp of last scale-up action.
+    last_scale_down_time: Optional[float]  #: Timestamp of last scale-down action.
+    current_time: Optional[float]  #: Current timestamp.
+
+    # Config
+    config: Optional[Any]  #: Autoscaling configuration for this deployment.
+
+
 @PublicAPI(stability="alpha")
 class RequestRouterConfig(BaseModel):
     """Config for the Serve request router.

python/ray/serve/tests/test_autoscaling_policy.py

Lines changed: 1 addition & 2 deletions
@@ -14,7 +14,6 @@
 import ray
 from ray import serve
 from ray._common.test_utils import SignalActor, wait_for_condition
-from ray.serve._private.autoscaling_state import AutoscalingContext
 from ray.serve._private.common import (
     DeploymentID,
     DeploymentStatus,
@@ -36,7 +35,7 @@
     get_num_alive_replicas,
     tlog,
 )
-from ray.serve.config import AutoscalingConfig, AutoscalingPolicy
+from ray.serve.config import AutoscalingConfig, AutoscalingContext, AutoscalingPolicy
 from ray.serve.handle import DeploymentHandle
 from ray.serve.schema import ApplicationStatus, ServeDeploySchema
 from ray.util.state import list_actors

python/ray/serve/tests/unit/test_autoscaling_policy.py

Lines changed: 1 addition & 2 deletions
@@ -2,13 +2,12 @@

 import pytest

-from ray.serve._private.autoscaling_state import AutoscalingContext
 from ray.serve._private.constants import CONTROL_LOOP_INTERVAL_S
 from ray.serve.autoscaling_policy import (
     _calculate_desired_num_replicas,
     replica_queue_length_autoscaling_policy,
 )
-from ray.serve.config import AutoscalingConfig
+from ray.serve.config import AutoscalingConfig, AutoscalingContext


 class TestCalculateDesiredNumReplicas:
