Skip to content

Commit 7dd2629

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Add STZ support for Endpoint.deploy and Model.deploy (preview).
PiperOrigin-RevId: 820898193
1 parent 87ea594 commit 7dd2629

File tree

4 files changed

+472
-3
lines changed

4 files changed

+472
-3
lines changed

google/cloud/aiplatform/models.py

Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1235,6 +1235,9 @@ def _validate_deploy_args(
12351235
traffic_percentage: Optional[int],
12361236
deployment_resource_pool: Optional[DeploymentResourcePool],
12371237
required_replica_count: Optional[int],
1238+
initial_replica_count: Optional[int] = None,
1239+
min_scaleup_period: Optional[int] = None,
1240+
idle_scaledown_period: Optional[int] = None,
12381241
):
12391242
"""Helper method to validate deploy arguments.
12401243
@@ -1290,6 +1293,17 @@ def _validate_deploy_args(
12901293
set, the model deploy/mutate operation will succeed once
12911294
available_replica_count reaches required_replica_count, and the
12921295
rest of the replicas will be retried.
1296+
initial_replica_count (int):
1297+
Optional. The number of replicas to deploy the model with.
1298+
Only applicable for scale-to-zero deployments where
1299+
min_replica_count is 0.
1300+
min_scaleup_period (int):
1301+
Optional. For scale-to-zero deployments, Minimum duration that
1302+
a deployment will be scaled up before traffic is
1303+
evaluated for potential scale-down.
1304+
idle_scaledown_period (int):
1305+
Optional. For scale-to-zero deployments, duration of no traffic
1306+
before scaling to zero.
12931307
12941308
Raises:
12951309
ValueError: if Min or Max replica is negative. Traffic percentage > 100 or
@@ -1305,6 +1319,7 @@ def _validate_deploy_args(
13051319
and max_replica_count != 1
13061320
or required_replica_count
13071321
and required_replica_count != 0
1322+
or initial_replica_count
13081323
):
13091324
raise ValueError(
13101325
"Ignoring explicitly specified replica counts, "
@@ -1327,6 +1342,44 @@ def _validate_deploy_args(
13271342
raise ValueError("Required replica cannot be negative.")
13281343
if accelerator_type:
13291344
utils.validate_accelerator_type(accelerator_type)
1345+
if min_replica_count != 0:
1346+
if initial_replica_count:
1347+
raise ValueError(
1348+
"Initial replica count cannot be set for non-STZ models."
1349+
)
1350+
if min_scaleup_period:
1351+
raise ValueError(
1352+
"Min scaleup period cannot be set for non-STZ models."
1353+
)
1354+
if idle_scaledown_period:
1355+
raise ValueError(
1356+
"Idle scaledown period cannot be set for non-STZ models."
1357+
)
1358+
if min_replica_count == 0 and initial_replica_count:
1359+
if initial_replica_count < 0:
1360+
raise ValueError("Initial replica count must be at least 0.")
1361+
if initial_replica_count > max_replica_count:
1362+
raise ValueError(
1363+
"Initial replica count cannot be greater than max replica count."
1364+
)
1365+
if min_replica_count == 0 and min_scaleup_period:
1366+
if min_scaleup_period < 300:
1367+
raise ValueError(
1368+
"Min scaleup period cannot be less than 300 (5 minutes)."
1369+
)
1370+
if min_scaleup_period > 28800:
1371+
raise ValueError(
1372+
"Min scaleup period cannot be greater than 28800 (8 hours)."
1373+
)
1374+
if min_replica_count == 0 and idle_scaledown_period:
1375+
if idle_scaledown_period < 300:
1376+
raise ValueError(
1377+
"Idle scaledown period cannot be less than 300 (5 minutes)."
1378+
)
1379+
if idle_scaledown_period > 28800:
1380+
raise ValueError(
1381+
"Idle scaledown period cannot be greater than 28800 (8 hours)."
1382+
)
13301383

13311384
if deployed_model_display_name is not None:
13321385
utils.validate_display_name(deployed_model_display_name)

google/cloud/aiplatform/preview/models.py

Lines changed: 129 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
machine_resources_v1beta1 as gca_machine_resources_compat,
4343
model_v1 as gca_model_compat,
4444
)
45+
from google.protobuf import duration_pb2
4546
from google.protobuf import json_format
4647

4748
_DEFAULT_MACHINE_TYPE = "n1-standard-2"
@@ -590,6 +591,9 @@ def _validate_deploy_args(
590591
traffic_percentage: Optional[int],
591592
deployment_resource_pool: Optional[DeploymentResourcePool],
592593
required_replica_count: Optional[int],
594+
initial_replica_count: Optional[int],
595+
min_scaleup_period: Optional[int],
596+
idle_scaledown_period: Optional[int],
593597
):
594598
"""Helper method to validate deploy arguments.
595599
@@ -641,6 +645,17 @@ def _validate_deploy_args(
641645
set, the model deploy/mutate operation will succeed once
642646
available_replica_count reaches required_replica_count, and the
643647
rest of the replicas will be retried.
648+
initial_replica_count (int):
649+
Optional. The number of replicas to deploy the model with.
650+
Only applicable for scale-to-zero deployments where
651+
min_replica_count is 0.
652+
min_scaleup_period (int):
653+
Optional. For scale-to-zero deployments, minimum duration that
654+
a deployment will be scaled up before traffic is
655+
evaluated for potential scale-down.
656+
idle_scaledown_period (int):
657+
Optional. For scale-to-zero deployments, duration of no traffic
658+
before scaling to zero.
644659
645660
Raises:
646661
ValueError: if min/max replica or accelerator type are specified
@@ -650,9 +665,9 @@ def _validate_deploy_args(
650665
not sum to 100, or if the provided accelerator type is invalid.
651666
"""
652667
if not deployment_resource_pool:
653-
if not (min_replica_count and max_replica_count):
668+
if min_replica_count is None or max_replica_count is None:
654669
raise ValueError(
655-
"Minimum and maximum replica counts must not be "
670+
"Minimum and maximum replica counts must not be specified"
656671
"if not using a shared resource pool."
657672
)
658673
return aiplatform.Endpoint._validate_deploy_args(
@@ -664,6 +679,9 @@ def _validate_deploy_args(
664679
traffic_percentage=traffic_percentage,
665680
deployment_resource_pool=deployment_resource_pool,
666681
required_replica_count=required_replica_count,
682+
initial_replica_count=initial_replica_count,
683+
min_scaleup_period=min_scaleup_period,
684+
idle_scaledown_period=idle_scaledown_period,
667685
)
668686

669687
if (
@@ -673,6 +691,9 @@ def _validate_deploy_args(
673691
and max_replica_count != 1
674692
or required_replica_count
675693
and required_replica_count != 0
694+
or initial_replica_count
695+
or min_scaleup_period
696+
or idle_scaledown_period
676697
):
677698
_LOGGER.warning(
678699
"Ignoring explicitly specified replica counts, "
@@ -684,6 +705,11 @@ def _validate_deploy_args(
684705
"deployment_resource_pool may not be specified at the same time"
685706
"as accelerator_type."
686707
)
708+
if initial_replica_count or min_scaleup_period or idle_scaledown_period:
709+
raise ValueError(
710+
"Scale-to-zero parameters may not be specified at the same time"
711+
"as deployment_resource_pool."
712+
)
687713
if traffic_split is None:
688714
if traffic_percentage > 100:
689715
raise ValueError("Traffic percentage cannot be greater than 100.")
@@ -730,6 +756,9 @@ def deploy(
730756
rollout_options: Optional[RolloutOptions] = None,
731757
multihost_gpu_node_count: Optional[int] = None,
732758
max_runtime_duration: Optional[int] = None,
759+
initial_replica_count: Optional[int] = None,
760+
min_scaleup_period: Optional[int] = None,
761+
idle_scaledown_period: Optional[int] = None,
733762
) -> None:
734763
"""Deploys a Model to the Endpoint.
735764
@@ -844,6 +873,17 @@ def deploy(
844873
for a maximum of 7 days or up to the max_runtime_duration specified,
845874
whichever is shorter. After this period, the model will be
846875
automatically undeployed. The value is in seconds.
876+
initial_replica_count (int):
877+
Optional. The number of replicas to deploy the model with.
878+
Only applicable for scale-to-zero deployments where
879+
min_replica_count is 0.
880+
min_scaleup_period (int):
881+
Optional. For scale-to-zero deployments, minimum duration that
882+
a deployment will be scaled up before traffic is
883+
evaluated for potential scale-down.
884+
idle_scaledown_period (int):
885+
Optional. For scale-to-zero deployments, duration of no traffic
886+
before scaling to zero.
847887
"""
848888
self._sync_gca_resource_if_skipped()
849889

@@ -856,6 +896,9 @@ def deploy(
856896
traffic_percentage=traffic_percentage,
857897
deployment_resource_pool=deployment_resource_pool,
858898
required_replica_count=required_replica_count,
899+
initial_replica_count=initial_replica_count,
900+
min_scaleup_period=min_scaleup_period,
901+
idle_scaledown_period=idle_scaledown_period,
859902
)
860903

861904
explanation_spec = _explanation_utils.create_and_validate_explanation_spec(
@@ -891,6 +934,9 @@ def deploy(
891934
rollout_options=rollout_options,
892935
multihost_gpu_node_count=multihost_gpu_node_count,
893936
max_runtime_duration=max_runtime_duration,
937+
initial_replica_count=initial_replica_count,
938+
min_scaleup_period=min_scaleup_period,
939+
idle_scaledown_period=idle_scaledown_period,
894940
)
895941

896942
@base.optional_sync()
@@ -923,6 +969,9 @@ def _deploy(
923969
rollout_options: Optional[RolloutOptions] = None,
924970
multihost_gpu_node_count: Optional[int] = None,
925971
max_runtime_duration: Optional[int] = None,
972+
initial_replica_count: Optional[int] = None,
973+
min_scaleup_period: Optional[int] = None,
974+
idle_scaledown_period: Optional[int] = None,
926975
) -> None:
927976
"""Deploys a Model to the Endpoint.
928977
@@ -1031,6 +1080,17 @@ def _deploy(
10311080
for a maximum of 7 days or up to the max_runtime_duration specified,
10321081
whichever is shorter. After this period, the model will be
10331082
automatically undeployed. The value is in seconds.
1083+
initial_replica_count (int):
1084+
Optional. The number of replicas to deploy the model with.
1085+
Only applicable for scale-to-zero deployments where
1086+
min_replica_count is 0.
1087+
min_scaleup_period (int):
1088+
Optional. For scale-to-zero deployments, minimum duration that
1089+
a deployment will be scaled up before traffic is
1090+
evaluated for potential scale-down.
1091+
idle_scaledown_period (int):
1092+
Optional. For scale-to-zero deployments, duration of no traffic
1093+
before scaling to zero.
10341094
"""
10351095
_LOGGER.log_action_start_against_resource(
10361096
f"Deploying Model {model.resource_name} to", "", self
@@ -1067,6 +1127,9 @@ def _deploy(
10671127
rollout_options=rollout_options,
10681128
multihost_gpu_node_count=multihost_gpu_node_count,
10691129
max_runtime_duration=max_runtime_duration,
1130+
initial_replica_count=initial_replica_count,
1131+
min_scaleup_period=min_scaleup_period,
1132+
idle_scaledown_period=idle_scaledown_period,
10701133
)
10711134

10721135
_LOGGER.log_action_completed_against_resource("model", "deployed", self)
@@ -1106,6 +1169,9 @@ def _deploy_call(
11061169
rollout_options: Optional[RolloutOptions] = None,
11071170
multihost_gpu_node_count: Optional[int] = None,
11081171
max_runtime_duration: Optional[int] = None,
1172+
initial_replica_count: Optional[int] = None,
1173+
min_scaleup_period: Optional[int] = None,
1174+
idle_scaledown_period: Optional[int] = None,
11091175
) -> None:
11101176
"""Helper method to deploy model to endpoint.
11111177
@@ -1221,6 +1287,14 @@ def _deploy_call(
12211287
for a maximum of 7 days or up to the max_runtime_duration specified,
12221288
whichever is shorter. After this period, the model will be
12231289
automatically undeployed. The value is in seconds.
1290+
initial_replica_count (int): Optional. The number of replicas to
1291+
deploy the model with. Only applicable for scale-to-zero
1292+
deployments where min_replica_count is 0.
1293+
min_scaleup_period (int): Optional. For scale-to-zero deployments,
1294+
minimum duration that a deployment will be scaled up before traffic
1295+
is evaluated for potential scale-down.
1296+
idle_scaledown_period (int): Optional. For scale-to-zero deployments,
1297+
duration of no traffic before scaling to zero.
12241298
12251299
Raises:
12261300
ValueError: If only `accelerator_type` or `accelerator_count` is
@@ -1307,6 +1381,24 @@ def _deploy_call(
13071381
required_replica_count=required_replica_count,
13081382
)
13091383

1384+
# If min_replica_count is 0, set Scale to Zero fields.
1385+
if dedicated_resources.min_replica_count == 0:
1386+
# Set initial replica count
1387+
dedicated_resources.initial_replica_count = initial_replica_count
1388+
# Set scale to zero spec.
1389+
stz_spec = (
1390+
gca_machine_resources_compat.DedicatedResources.ScaleToZeroSpec()
1391+
)
1392+
if min_scaleup_period is not None:
1393+
stz_spec.min_scaleup_period = duration_pb2.Duration(
1394+
seconds=min_scaleup_period
1395+
)
1396+
if idle_scaledown_period is not None:
1397+
stz_spec.idle_scaledown_period = duration_pb2.Duration(
1398+
seconds=idle_scaledown_period
1399+
)
1400+
dedicated_resources.scale_to_zero_spec = stz_spec
1401+
13101402
prediction_utils.add_flex_start_to_dedicated_resources(
13111403
dedicated_resources, max_runtime_duration
13121404
)
@@ -1695,6 +1787,9 @@ def deploy(
16951787
rollout_options: Optional[RolloutOptions] = None,
16961788
multihost_gpu_node_count: Optional[int] = None,
16971789
max_runtime_duration: Optional[int] = None,
1790+
initial_replica_count: Optional[int] = None,
1791+
min_scaleup_period: Optional[int] = None,
1792+
idle_scaledown_period: Optional[int] = None,
16981793
) -> Union[Endpoint, models.PrivateEndpoint]:
16991794
"""Deploys model to endpoint.
17001795
@@ -1830,6 +1925,14 @@ def deploy(
18301925
for a maximum of 7 days or up to the max_runtime_duration specified,
18311926
whichever is shorter. After this period, the model will be
18321927
automatically undeployed. The value is in seconds.
1928+
initial_replica_count (int): Optional. The number of replicas to
1929+
deploy the model with. Only applicable for scale-to-zero
1930+
deployments where min_replica_count is 0.
1931+
min_scaleup_period (int): Optional. For scale-to-zero deployments,
1932+
minimum duration that a deployment will be scaled up before traffic
1933+
is evaluated for potential scale-down.
1934+
idle_scaledown_period (int): Optional. For scale-to-zero deployments,
1935+
duration of no traffic before scaling to zero.
18331936
Returns:
18341937
endpoint (Union[Endpoint, models.PrivateEndpoint]):
18351938
Endpoint with the deployed model.
@@ -1848,6 +1951,9 @@ def deploy(
18481951
traffic_percentage=traffic_percentage,
18491952
deployment_resource_pool=deployment_resource_pool,
18501953
required_replica_count=required_replica_count,
1954+
initial_replica_count=initial_replica_count,
1955+
min_scaleup_period=min_scaleup_period,
1956+
idle_scaledown_period=idle_scaledown_period,
18511957
)
18521958

18531959
if isinstance(endpoint, models.PrivateEndpoint):
@@ -1876,7 +1982,10 @@ def deploy(
18761982
service_account=service_account,
18771983
explanation_spec=explanation_spec,
18781984
metadata=metadata,
1879-
encryption_spec_key_name=encryption_spec_key_name
1985+
encryption_spec_key_name=encryption_spec_key_name,
1986+
initial_replica_count=initial_replica_count,
1987+
min_scaleup_period=min_scaleup_period,
1988+
idle_scaledown_period=idle_scaledown_period
18801989
or initializer.global_config.encryption_spec_key_name,
18811990
network=network,
18821991
sync=sync,
@@ -1935,6 +2044,9 @@ def _deploy(
19352044
rollout_options: Optional[RolloutOptions] = None,
19362045
multihost_gpu_node_count: Optional[int] = None,
19372046
max_runtime_duration: Optional[int] = None,
2047+
initial_replica_count: Optional[int] = None,
2048+
min_scaleup_period: Optional[int] = None,
2049+
idle_scaledown_period: Optional[int] = None,
19382050
) -> Union[Endpoint, models.PrivateEndpoint]:
19392051
"""Deploys model to endpoint.
19402052
@@ -2061,6 +2173,17 @@ def _deploy(
20612173
for a maximum of 7 days or up to the max_runtime_duration specified,
20622174
whichever is shorter. After this period, the model will be
20632175
automatically undeployed. The value is in seconds.
2176+
initial_replica_count (int):
2177+
Optional. The number of replicas to deploy the model with.
2178+
Only applicable for scale-to-zero deployments where
2179+
min_replica_count is 0.
2180+
min_scaleup_period (int):
2181+
Optional. For scale-to-zero deployments, minimum duration that
2182+
a deployment will be scaled up before traffic is
2183+
evaluated for potential scale-down.
2184+
idle_scaledown_period (int):
2185+
Optional. For scale-to-zero deployments, duration of no traffic
2186+
before scaling to zero.
20642187
Returns:
20652188
endpoint (Union[Endpoint, models.PrivateEndpoint]):
20662189
Endpoint with the deployed model.
@@ -2130,6 +2253,9 @@ def _deploy(
21302253
required_replica_count=required_replica_count,
21312254
multihost_gpu_node_count=multihost_gpu_node_count,
21322255
max_runtime_duration=max_runtime_duration,
2256+
initial_replica_count=initial_replica_count,
2257+
min_scaleup_period=min_scaleup_period,
2258+
idle_scaledown_period=idle_scaledown_period,
21332259
**preview_kwargs,
21342260
)
21352261

0 commit comments

Comments
 (0)