4242 machine_resources_v1beta1 as gca_machine_resources_compat ,
4343 model_v1 as gca_model_compat ,
4444)
45+ from google .protobuf import duration_pb2
4546from google .protobuf import json_format
4647
4748_DEFAULT_MACHINE_TYPE = "n1-standard-2"
@@ -590,6 +591,9 @@ def _validate_deploy_args(
590591 traffic_percentage : Optional [int ],
591592 deployment_resource_pool : Optional [DeploymentResourcePool ],
592593 required_replica_count : Optional [int ],
594+ initial_replica_count : Optional [int ],
595+ min_scaleup_period : Optional [int ],
596+ idle_scaledown_period : Optional [int ],
593597 ):
594598 """Helper method to validate deploy arguments.
595599
@@ -641,6 +645,17 @@ def _validate_deploy_args(
641645 set, the model deploy/mutate operation will succeed once
642646 available_replica_count reaches required_replica_count, and the
643647 rest of the replicas will be retried.
648+ initial_replica_count (int):
649+ Optional. The number of replicas to deploy the model with.
650+ Only applicable for scale-to-zero deployments where
651+ min_replica_count is 0.
652+ min_scaleup_period (int):
653+ Optional. For scale-to-zero deployments, minimum duration, in
654+ seconds, that a deployment will be scaled up before traffic is
655+ evaluated for potential scale-down.
656+ idle_scaledown_period (int):
657+ Optional. For scale-to-zero deployments, duration, in seconds, of
658+ no traffic before scaling to zero.
644659
645660 Raises:
646661 ValueError: if min/max replica or accelerator type are specified
@@ -650,9 +665,9 @@ def _validate_deploy_args(
650665 not sum to 100, or if the provided accelerator type is invalid.
651666 """
652667 if not deployment_resource_pool :
653- if not ( min_replica_count and max_replica_count ) :
668+ if min_replica_count is None or max_replica_count is None :
654669 raise ValueError (
655- "Minimum and maximum replica counts must not be "
670+ "Minimum and maximum replica counts must not be specified "
656671 "if not using a shared resource pool."
657672 )
658673 return aiplatform .Endpoint ._validate_deploy_args (
@@ -664,6 +679,9 @@ def _validate_deploy_args(
664679 traffic_percentage = traffic_percentage ,
665680 deployment_resource_pool = deployment_resource_pool ,
666681 required_replica_count = required_replica_count ,
682+ initial_replica_count = initial_replica_count ,
683+ min_scaleup_period = min_scaleup_period ,
684+ idle_scaledown_period = idle_scaledown_period ,
667685 )
668686
669687 if (
@@ -673,6 +691,9 @@ def _validate_deploy_args(
673691 and max_replica_count != 1
674692 or required_replica_count
675693 and required_replica_count != 0
694+ or initial_replica_count
695+ or min_scaleup_period
696+ or idle_scaledown_period
676697 ):
677698 _LOGGER .warning (
678699 "Ignoring explicitly specified replica counts, "
@@ -684,6 +705,11 @@ def _validate_deploy_args(
684705 "deployment_resource_pool may not be specified at the same time"
685706 "as accelerator_type."
686707 )
708+ if initial_replica_count or min_scaleup_period or idle_scaledown_period :
709+ raise ValueError (
710+ "Scale-to-zero parameters may not be specified at the same time"
711+ "as deployment_resource_pool."
712+ )
687713 if traffic_split is None :
688714 if traffic_percentage > 100 :
689715 raise ValueError ("Traffic percentage cannot be greater than 100." )
@@ -730,6 +756,9 @@ def deploy(
730756 rollout_options : Optional [RolloutOptions ] = None ,
731757 multihost_gpu_node_count : Optional [int ] = None ,
732758 max_runtime_duration : Optional [int ] = None ,
759+ initial_replica_count : Optional [int ] = None ,
760+ min_scaleup_period : Optional [int ] = None ,
761+ idle_scaledown_period : Optional [int ] = None ,
733762 ) -> None :
734763 """Deploys a Model to the Endpoint.
735764
@@ -844,6 +873,17 @@ def deploy(
844873 for a maximum of 7 days or up to the max_runtime_duration specified,
845874 whichever is shorter. After this period, the model will be
846875 automatically undeployed. The value is in seconds.
876+ initial_replica_count (int):
877+ Optional. The number of replicas to deploy the model with.
878+ Only applicable for scale-to-zero deployments where
879+ min_replica_count is 0.
880+ min_scaleup_period (int):
881+ Optional. For scale-to-zero deployments, minimum duration, in
882+ seconds, that a deployment will be scaled up before traffic is
883+ evaluated for potential scale-down.
884+ idle_scaledown_period (int):
885+ Optional. For scale-to-zero deployments, duration, in seconds, of
886+ no traffic before scaling to zero.
847887 """
848888 self ._sync_gca_resource_if_skipped ()
849889
@@ -856,6 +896,9 @@ def deploy(
856896 traffic_percentage = traffic_percentage ,
857897 deployment_resource_pool = deployment_resource_pool ,
858898 required_replica_count = required_replica_count ,
899+ initial_replica_count = initial_replica_count ,
900+ min_scaleup_period = min_scaleup_period ,
901+ idle_scaledown_period = idle_scaledown_period ,
859902 )
860903
861904 explanation_spec = _explanation_utils .create_and_validate_explanation_spec (
@@ -891,6 +934,9 @@ def deploy(
891934 rollout_options = rollout_options ,
892935 multihost_gpu_node_count = multihost_gpu_node_count ,
893936 max_runtime_duration = max_runtime_duration ,
937+ initial_replica_count = initial_replica_count ,
938+ min_scaleup_period = min_scaleup_period ,
939+ idle_scaledown_period = idle_scaledown_period ,
894940 )
895941
896942 @base .optional_sync ()
@@ -923,6 +969,9 @@ def _deploy(
923969 rollout_options : Optional [RolloutOptions ] = None ,
924970 multihost_gpu_node_count : Optional [int ] = None ,
925971 max_runtime_duration : Optional [int ] = None ,
972+ initial_replica_count : Optional [int ] = None ,
973+ min_scaleup_period : Optional [int ] = None ,
974+ idle_scaledown_period : Optional [int ] = None ,
926975 ) -> None :
927976 """Deploys a Model to the Endpoint.
928977
@@ -1031,6 +1080,17 @@ def _deploy(
10311080 for a maximum of 7 days or up to the max_runtime_duration specified,
10321081 whichever is shorter. After this period, the model will be
10331082 automatically undeployed. The value is in seconds.
1083+ initial_replica_count (int):
1084+ Optional. The number of replicas to deploy the model with.
1085+ Only applicable for scale-to-zero deployments where
1086+ min_replica_count is 0.
1087+ min_scaleup_period (int):
1088+ Optional. For scale-to-zero deployments, minimum duration, in
1089+ seconds, that a deployment will be scaled up before traffic is
1090+ evaluated for potential scale-down.
1091+ idle_scaledown_period (int):
1092+ Optional. For scale-to-zero deployments, duration, in seconds, of
1093+ no traffic before scaling to zero.
10341094 """
10351095 _LOGGER .log_action_start_against_resource (
10361096 f"Deploying Model { model .resource_name } to" , "" , self
@@ -1067,6 +1127,9 @@ def _deploy(
10671127 rollout_options = rollout_options ,
10681128 multihost_gpu_node_count = multihost_gpu_node_count ,
10691129 max_runtime_duration = max_runtime_duration ,
1130+ initial_replica_count = initial_replica_count ,
1131+ min_scaleup_period = min_scaleup_period ,
1132+ idle_scaledown_period = idle_scaledown_period ,
10701133 )
10711134
10721135 _LOGGER .log_action_completed_against_resource ("model" , "deployed" , self )
@@ -1106,6 +1169,9 @@ def _deploy_call(
11061169 rollout_options : Optional [RolloutOptions ] = None ,
11071170 multihost_gpu_node_count : Optional [int ] = None ,
11081171 max_runtime_duration : Optional [int ] = None ,
1172+ initial_replica_count : Optional [int ] = None ,
1173+ min_scaleup_period : Optional [int ] = None ,
1174+ idle_scaledown_period : Optional [int ] = None ,
11091175 ) -> None :
11101176 """Helper method to deploy model to endpoint.
11111177
@@ -1221,6 +1287,14 @@ def _deploy_call(
12211287 for a maximum of 7 days or up to the max_runtime_duration specified,
12221288 whichever is shorter. After this period, the model will be
12231289 automatically undeployed. The value is in seconds.
1290+ initial_replica_count (int): Optional. The number of replicas to
1291+ deploy the model with. Only applicable for scale-to-zero
1292+ deployments where min_replica_count is 0.
1293+ min_scaleup_period (int): Optional. For scale-to-zero deployments,
1294+ minimum duration, in seconds, that a deployment will be scaled up
1295+ before traffic is evaluated for potential scale-down.
1296+ idle_scaledown_period (int): Optional. For scale-to-zero deployments,
1297+ duration, in seconds, of no traffic before scaling to zero.
12241298
12251299 Raises:
12261300 ValueError: If only `accelerator_type` or `accelerator_count` is
@@ -1307,6 +1381,24 @@ def _deploy_call(
13071381 required_replica_count = required_replica_count ,
13081382 )
13091383
1384+ # If min_replica_count is 0, set Scale to Zero fields.
1385+ if dedicated_resources .min_replica_count == 0 :
1386+ # Set initial replica count
1387+ dedicated_resources .initial_replica_count = initial_replica_count
1388+ # Set scale to zero spec.
1389+ stz_spec = (
1390+ gca_machine_resources_compat .DedicatedResources .ScaleToZeroSpec ()
1391+ )
1392+ if min_scaleup_period is not None :
1393+ stz_spec .min_scaleup_period = duration_pb2 .Duration (
1394+ seconds = min_scaleup_period
1395+ )
1396+ if idle_scaledown_period is not None :
1397+ stz_spec .idle_scaledown_period = duration_pb2 .Duration (
1398+ seconds = idle_scaledown_period
1399+ )
1400+ dedicated_resources .scale_to_zero_spec = stz_spec
1401+
13101402 prediction_utils .add_flex_start_to_dedicated_resources (
13111403 dedicated_resources , max_runtime_duration
13121404 )
@@ -1695,6 +1787,9 @@ def deploy(
16951787 rollout_options : Optional [RolloutOptions ] = None ,
16961788 multihost_gpu_node_count : Optional [int ] = None ,
16971789 max_runtime_duration : Optional [int ] = None ,
1790+ initial_replica_count : Optional [int ] = None ,
1791+ min_scaleup_period : Optional [int ] = None ,
1792+ idle_scaledown_period : Optional [int ] = None ,
16981793 ) -> Union [Endpoint , models .PrivateEndpoint ]:
16991794 """Deploys model to endpoint.
17001795
@@ -1830,6 +1925,14 @@ def deploy(
18301925 for a maximum of 7 days or up to the max_runtime_duration specified,
18311926 whichever is shorter. After this period, the model will be
18321927 automatically undeployed. The value is in seconds.
1928+ initial_replica_count (int): Optional. The number of replicas to
1929+ deploy the model with. Only applicable for scale-to-zero
1930+ deployments where min_replica_count is 0.
1931+ min_scaleup_period (int): Optional. For scale-to-zero deployments,
1932+ minimum duration, in seconds, that a deployment will be scaled up
1933+ before traffic is evaluated for potential scale-down.
1934+ idle_scaledown_period (int): Optional. For scale-to-zero deployments,
1935+ duration, in seconds, of no traffic before scaling to zero.
18331936 Returns:
18341937 endpoint (Union[Endpoint, models.PrivateEndpoint]):
18351938 Endpoint with the deployed model.
@@ -1848,6 +1951,9 @@ def deploy(
18481951 traffic_percentage = traffic_percentage ,
18491952 deployment_resource_pool = deployment_resource_pool ,
18501953 required_replica_count = required_replica_count ,
1954+ initial_replica_count = initial_replica_count ,
1955+ min_scaleup_period = min_scaleup_period ,
1956+ idle_scaledown_period = idle_scaledown_period ,
18511957 )
18521958
18531959 if isinstance (endpoint , models .PrivateEndpoint ):
@@ -1876,7 +1982,10 @@ def deploy(
18761982 service_account = service_account ,
18771983 explanation_spec = explanation_spec ,
18781984 metadata = metadata ,
1879- encryption_spec_key_name = encryption_spec_key_name
1985+ encryption_spec_key_name = encryption_spec_key_name ,
1986+ initial_replica_count = initial_replica_count ,
1987+ min_scaleup_period = min_scaleup_period ,
1988+ idle_scaledown_period = idle_scaledown_period
18801989 or initializer .global_config .encryption_spec_key_name ,
18811990 network = network ,
18821991 sync = sync ,
@@ -1935,6 +2044,9 @@ def _deploy(
19352044 rollout_options : Optional [RolloutOptions ] = None ,
19362045 multihost_gpu_node_count : Optional [int ] = None ,
19372046 max_runtime_duration : Optional [int ] = None ,
2047+ initial_replica_count : Optional [int ] = None ,
2048+ min_scaleup_period : Optional [int ] = None ,
2049+ idle_scaledown_period : Optional [int ] = None ,
19382050 ) -> Union [Endpoint , models .PrivateEndpoint ]:
19392051 """Deploys model to endpoint.
19402052
@@ -2061,6 +2173,17 @@ def _deploy(
20612173 for a maximum of 7 days or up to the max_runtime_duration specified,
20622174 whichever is shorter. After this period, the model will be
20632175 automatically undeployed. The value is in seconds.
2176+ initial_replica_count (int):
2177+ Optional. The number of replicas to deploy the model with.
2178+ Only applicable for scale-to-zero deployments where
2179+ min_replica_count is 0.
2180+ min_scaleup_period (int):
2181+ Optional. For scale-to-zero deployments, minimum duration, in
2182+ seconds, that a deployment will be scaled up before traffic is
2183+ evaluated for potential scale-down.
2184+ idle_scaledown_period (int):
2185+ Optional. For scale-to-zero deployments, duration, in seconds, of
2186+ no traffic before scaling to zero.
20642187 Returns:
20652188 endpoint (Union[Endpoint, models.PrivateEndpoint]):
20662189 Endpoint with the deployed model.
@@ -2130,6 +2253,9 @@ def _deploy(
21302253 required_replica_count = required_replica_count ,
21312254 multihost_gpu_node_count = multihost_gpu_node_count ,
21322255 max_runtime_duration = max_runtime_duration ,
2256+ initial_replica_count = initial_replica_count ,
2257+ min_scaleup_period = min_scaleup_period ,
2258+ idle_scaledown_period = idle_scaledown_period ,
21332259 ** preview_kwargs ,
21342260 )
21352261
0 commit comments