From aa68b4c38c8d65a3b4e69351fc8244d4eaf4e412 Mon Sep 17 00:00:00 2001 From: Andrews Arokiam <87992092+andyi2it@users.noreply.github.com> Date: Sun, 12 Jun 2022 01:52:37 +0530 Subject: [PATCH] Added target and metric to components (#2082) * Added target and metric to components Added scaleTarget and scaleMetric to ComponentSpec Made changes to ksvc_reconciler to add annotations to component as required Signed-off-by: Andrews Arokiam Added validation Signed-off-by: Andrews Arokiam Fixed validation bugs Signed-off-by: Andrews Arokiam Updated comments Signed-off-by: Andrews Arokiam Updated python sdk and openapi Signed-off-by: Andrews Arokiam Added test for autoscaling Signed-off-by: Andrews Arokiam Added tests for autoscaling Signed-off-by: Andrews Arokiam Added e2e for autoscaling changes Removed validation for metric concurrency Signed-off-by: Andrews Arokiam Updated CRD Signed-off-by: Andrews Arokiam Fixed linting issues Signed-off-by: Andrews Arokiam Made ScaleMetric an enum Added a test for raw deployment Signed-off-by: Andrews Arokiam Fixed issues with type after changes in ScaleMetric type Signed-off-by: Andrews Arokiam Updated crd Updated crd Signed-off-by: Andrews Arokiam Updated test case. Modified to check deployment mode before checking hpa or kpa. Signed-off-by: Andrews Arokiam Added debug logs to debug e2e failure in workflow alone Signed-off-by: Andrews Arokiam Disabling all tests temporarily to debug failing e2e Signed-off-by: Andrews Arokiam Reverting debugging changes Signed-off-by: Andrews Arokiam Temporary changes to verify if e2e is passing after Dan's port forwarding Signed-off-by: Andrews Arokiam Removed e2e changes. Removed comments. Updated python sdk version to debug test failure. 
Signed-off-by: Andrews Arokiam Updated test variable Signed-off-by: Andrews Arokiam Fixed linting error Added test logs Signed-off-by: Andrews Arokiam Adding logs to print spec Signed-off-by: Andrews Arokiam Added more debugging logs Signed-off-by: Andrews Arokiam Reverting python definition log in run e2e script Signed-off-by: Andrews Arokiam Updated python sdk and docs Fixed linting error Signed-off-by: Andrews Arokiam Fix for failing protobuf issue Signed-off-by: Andrews Arokiam Added debug logs for kserve controller Signed-off-by: Andrews Arokiam Added logs to check controller changes show up Signed-off-by: Andrews Arokiam Added knative hpa Signed-off-by: Andrews Arokiam Updated resource requirements Signed-off-by: Andrews Arokiam Moved knative hpa installation to setup-deps Signed-off-by: Andrews Arokiam Removed skip for tests Signed-off-by: Andrews Arokiam Signed-off-by: Dan Sun * Mutate and add deployment mode annotation if RawDeployment or ModelMesh Signed-off-by: Dan Sun Co-authored-by: Dan Sun --- .../serving.kserve.io_inferenceservices.yaml | 27 +++ pkg/apis/serving/v1beta1/component.go | 21 ++ .../v1beta1/inference_service_defaults.go | 12 +- .../inference_service_defaults_test.go | 15 +- .../v1beta1/inference_service_validation.go | 83 +++++++- pkg/apis/serving/v1beta1/openapi_generated.go | 62 ++++++ pkg/apis/serving/v1beta1/swagger.json | 39 ++++ .../serving/v1beta1/zz_generated.deepcopy.go | 10 + pkg/constants/constants.go | 18 ++ .../reconcilers/hpa/hpa_reconciler.go | 40 ++-- .../reconcilers/knative/ksvc_reconciler.go | 10 + .../service_account_credentials_test.go | 4 +- python/kserve/docs/V1alpha1BuiltInAdapter.md | 2 +- .../docs/V1alpha1ServingRuntimePodSpec.md | 1 + .../kserve/docs/V1alpha1ServingRuntimeSpec.md | 1 + .../docs/V1beta1ComponentExtensionSpec.md | 2 + python/kserve/docs/V1beta1ExplainerSpec.md | 2 + python/kserve/docs/V1beta1PredictorSpec.md | 2 + python/kserve/docs/V1beta1TransformerSpec.md | 2 + 
.../models/v1alpha1_built_in_adapter.py | 4 +- .../v1alpha1_serving_runtime_pod_spec.py | 34 ++- .../models/v1alpha1_serving_runtime_spec.py | 34 ++- .../v1beta1_component_extension_spec.py | 58 ++++- .../kserve/models/v1beta1_explainer_spec.py | 58 ++++- .../kserve/models/v1beta1_predictor_spec.py | 58 ++++- .../kserve/models/v1beta1_transformer_spec.py | 58 ++++- .../serving.kserve.io_inferenceservices.yaml | 27 +++ test/e2e/predictor/test_autoscaling.py | 200 ++++++++++++++++++ test/scripts/gh-actions/setup-deps.sh | 3 + 29 files changed, 852 insertions(+), 35 deletions(-) create mode 100644 test/e2e/predictor/test_autoscaling.py diff --git a/config/crd/serving.kserve.io_inferenceservices.yaml b/config/crd/serving.kserve.io_inferenceservices.yaml index 83e16a4b78c..ba9df191053 100644 --- a/config/crd/serving.kserve.io_inferenceservices.yaml +++ b/config/crd/serving.kserve.io_inferenceservices.yaml @@ -2782,6 +2782,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -8027,6 +8036,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -12133,6 +12151,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: diff --git a/pkg/apis/serving/v1beta1/component.go b/pkg/apis/serving/v1beta1/component.go index 1179f5a52a1..56b31cf8a8a 100644 --- a/pkg/apis/serving/v1beta1/component.go +++ b/pkg/apis/serving/v1beta1/component.go @@ -80,6 +80,16 @@ type ComponentExtensionSpec struct { // Maximum number of replicas for autoscaling. 
// +optional MaxReplicas int `json:"maxReplicas,omitempty"` + // ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. + // concurrency and rps targets are supported by Knative Pod Autoscaler + //(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). + // +optional + ScaleTarget *int `json:"scaleTarget,omitempty"` + // ScaleMetric defines the scaling metric type watched by autoscaler + // possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via + // Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). + // +optional + ScaleMetric *ScaleMetric `json:"scaleMetric,omitempty"` // ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container // concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). // +optional @@ -98,6 +108,17 @@ type ComponentExtensionSpec struct { Batcher *Batcher `json:"batcher,omitempty"` } +// ScaleMetric enum +// +kubebuilder:validation:Enum=cpu;memory;concurrency;rps +type ScaleMetric string + +const ( + MetricCPU ScaleMetric = "cpu" + MetricMemory ScaleMetric = "memory" + MetricConcurrency ScaleMetric = "concurrency" + MetricRPS ScaleMetric = "rps" +) + // Default the ComponentExtensionSpec func (s *ComponentExtensionSpec) Default(config *InferenceServicesConfig) {} diff --git a/pkg/apis/serving/v1beta1/inference_service_defaults.go b/pkg/apis/serving/v1beta1/inference_service_defaults.go index 71e1c1b2d09..d5fa15b82a2 100644 --- a/pkg/apis/serving/v1beta1/inference_service_defaults.go +++ b/pkg/apis/serving/v1beta1/inference_service_defaults.go @@ -72,10 +72,18 @@ func (isvc *InferenceService) Default() { if err != nil { panic(err) } - isvc.DefaultInferenceService(configMap) + deployConfig, err := NewDeployConfig(cli) + if err != nil { + panic(err) + } + isvc.DefaultInferenceService(configMap, deployConfig) } -func (isvc *InferenceService) 
DefaultInferenceService(config *InferenceServicesConfig) { +func (isvc *InferenceService) DefaultInferenceService(config *InferenceServicesConfig, deployConfig *DeployConfig) { + if deployConfig.DefaultDeploymentMode == string(constants.ModelMeshDeployment) || + deployConfig.DefaultDeploymentMode == string(constants.RawDeployment) { + isvc.ObjectMeta.Annotations[constants.DeploymentMode] = deployConfig.DefaultDeploymentMode + } deploymentMode, ok := isvc.ObjectMeta.Annotations[constants.DeploymentMode] if !ok || deploymentMode != string(constants.ModelMeshDeployment) { // Only attempt to assign runtimes for non-modelmesh predictors diff --git a/pkg/apis/serving/v1beta1/inference_service_defaults_test.go b/pkg/apis/serving/v1beta1/inference_service_defaults_test.go index 4c807240ff5..f2c926fbb2a 100644 --- a/pkg/apis/serving/v1beta1/inference_service_defaults_test.go +++ b/pkg/apis/serving/v1beta1/inference_service_defaults_test.go @@ -45,6 +45,9 @@ func TestInferenceServiceDefaults(t *testing.T) { }, }, } + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -83,7 +86,7 @@ func TestInferenceServiceDefaults(t *testing.T) { } resources := v1.ResourceRequirements{Requests: defaultResource, Limits: defaultResource} isvc.Spec.DeepCopy() - isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(*&isvc.Spec.Predictor.Tensorflow).To(gomega.BeNil()) g.Expect(*&isvc.Spec.Predictor.Model).NotTo(gomega.BeNil()) @@ -111,6 +114,9 @@ func TestCustomPredictorDefaults(t *testing.T) { }, }, } + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -135,13 +141,16 @@ func TestCustomPredictorDefaults(t *testing.T) { } resources := v1.ResourceRequirements{Requests: defaultResource, Limits: defaultResource} isvc.Spec.DeepCopy() - 
isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(isvc.Spec.Predictor.PodSpec.Containers[0].Resources).To(gomega.Equal(resources)) } func TestInferenceServiceDefaultsModelMeshAnnotation(t *testing.T) { g := gomega.NewGomegaWithT(t) config := &InferenceServicesConfig{} + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -161,7 +170,7 @@ func TestInferenceServiceDefaultsModelMeshAnnotation(t *testing.T) { }, } isvc.Spec.DeepCopy() - isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(isvc.Spec.Predictor.Model).To(gomega.BeNil()) g.Expect(isvc.Spec.Predictor.Tensorflow).ToNot(gomega.BeNil()) } diff --git a/pkg/apis/serving/v1beta1/inference_service_validation.go b/pkg/apis/serving/v1beta1/inference_service_validation.go index 596f22223db..2b9ef8fd505 100644 --- a/pkg/apis/serving/v1beta1/inference_service_validation.go +++ b/pkg/apis/serving/v1beta1/inference_service_validation.go @@ -26,6 +26,7 @@ import ( "github.com/kserve/kserve/pkg/constants" "github.com/kserve/kserve/pkg/utils" "k8s.io/apimachinery/pkg/runtime" + "knative.dev/serving/pkg/apis/autoscaling" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" ) @@ -49,6 +50,8 @@ var _ webhook.Validator = &InferenceService{} func (isvc *InferenceService) ValidateCreate() error { validatorLogger.Info("validate create", "name", isvc.Name) + annotations := isvc.Annotations + if err := validateInferenceServiceName(isvc); err != nil { return err } @@ -60,6 +63,7 @@ func (isvc *InferenceService) ValidateCreate() error { if err := validateAutoscalerTargetUtilizationPercentage(isvc); err != nil { return err } + for _, component := range []Component{ &isvc.Spec.Predictor, isvc.Spec.Transformer, @@ -72,6 +76,7 @@ func (isvc *InferenceService) ValidateCreate() error { if err := 
utils.FirstNonNilError([]error{ component.GetImplementation().Validate(), component.GetExtensions().Validate(), + validateAutoScalingCompExtension(annotations, component.GetExtensions()), }); err != nil { return err } @@ -80,6 +85,18 @@ func (isvc *InferenceService) ValidateCreate() error { return nil } +// Validate scaling options component extensions +func validateAutoScalingCompExtension(annotations map[string]string, compExtSpec *ComponentExtensionSpec) error { + deploymentMode := annotations["serving.kserve.io/deploymentMode"] + annotationClass := annotations[autoscaling.ClassAnnotationKey] + if deploymentMode == string(constants.RawDeployment) || annotationClass == string(autoscaling.HPA) { + return validateScalingHPACompExtension(compExtSpec) + } + + return validateScalingKPACompExtension(compExtSpec) + +} + // ValidateUpdate implements webhook.Validator so a webhook will be registered for the type func (isvc *InferenceService) ValidateUpdate(old runtime.Object) error { validatorLogger.Info("validate update", "name", isvc.Name) @@ -118,7 +135,7 @@ func validateInferenceServiceAutoscaler(isvc *InferenceService) error { switch class { case constants.AutoscalerClassHPA: if metric, ok := annotations[constants.AutoscalerMetrics]; ok { - return validateHPAMetrics(metric) + return validateHPAMetrics(ScaleMetric(metric)) } else { return nil } @@ -134,7 +151,7 @@ func validateInferenceServiceAutoscaler(isvc *InferenceService) error { } //Validate of autoscaler HPA metrics -func validateHPAMetrics(metric string) error { +func validateHPAMetrics(metric ScaleMetric) error { for _, item := range constants.AutoscalerAllowedMetricsList { if item == constants.AutoscalerMetricsType(metric) { return nil @@ -157,5 +174,67 @@ func validateAutoscalerTargetUtilizationPercentage(isvc *InferenceService) error } } } + + return nil +} + +func validateScalingHPACompExtension(compExtSpec *ComponentExtensionSpec) error { + metric := MetricCPU + if compExtSpec.ScaleMetric != nil { + 
metric = *compExtSpec.ScaleMetric + } + + err := validateHPAMetrics(metric) + + if err != nil { + return err + } + + if compExtSpec.ScaleTarget != nil { + target := *compExtSpec.ScaleTarget + if metric == MetricCPU && target < 1 || target > 100 { + return fmt.Errorf("The target utilization percentage should be a [1-100] integer.") + } + + if metric == MetricMemory && target < 1 { + return fmt.Errorf("The target memory should be greater than 1 MiB") + } + + } + + return nil +} + +func validateKPAMetrics(metric ScaleMetric) error { + for _, item := range constants.AutoScalerKPAMetricsAllowedList { + if item == constants.AutoScalerKPAMetricsType(metric) { + return nil + } + } + return fmt.Errorf("[%s] is not a supported metric.\n", metric) + +} + +func validateScalingKPACompExtension(compExtSpec *ComponentExtensionSpec) error { + metric := MetricConcurrency + if compExtSpec.ScaleMetric != nil { + metric = *compExtSpec.ScaleMetric + } + + err := validateKPAMetrics(metric) + + if err != nil { + return err + } + + if compExtSpec.ScaleTarget != nil { + target := *compExtSpec.ScaleTarget + + if metric == MetricRPS && target < 1 { + return fmt.Errorf("The target for rps should be greater than 1") + } + + } + return nil } diff --git a/pkg/apis/serving/v1beta1/openapi_generated.go b/pkg/apis/serving/v1beta1/openapi_generated.go index 60d5d4da610..ece7a073a25 100644 --- a/pkg/apis/serving/v1beta1/openapi_generated.go +++ b/pkg/apis/serving/v1beta1/openapi_generated.go @@ -1869,6 +1869,20 @@ func schema_pkg_apis_serving_v1beta1_ComponentExtensionSpec(ref common.Reference Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. 
concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -3897,6 +3911,20 @@ func schema_pkg_apis_serving_v1beta1_ExplainerSpec(ref common.ReferenceCallback) Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -4330,6 +4358,12 @@ func schema_pkg_apis_serving_v1beta1_IngressConfig(ref common.ReferenceCallback) Format: "", }, }, + "urlScheme": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, }, }, }, @@ -7055,6 +7089,20 @@ func schema_pkg_apis_serving_v1beta1_PredictorSpec(ref common.ReferenceCallback) Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -8448,6 +8496,20 @@ func schema_pkg_apis_serving_v1beta1_TransformerSpec(ref common.ReferenceCallbac Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", diff --git a/pkg/apis/serving/v1beta1/swagger.json b/pkg/apis/serving/v1beta1/swagger.json index 6687cb062a3..b671e62a0aa 100644 --- a/pkg/apis/serving/v1beta1/swagger.json +++ b/pkg/apis/serving/v1beta1/swagger.json @@ -1030,6 +1030,15 @@ "type": "integer", "format": "int32" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "timeout": { "description": "TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component.", "type": "integer", @@ -2099,6 +2108,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler.", "type": "string" @@ -2389,6 +2407,9 @@ }, "localGatewayService": { "type": "string" + }, + "urlScheme": { + "type": "string" } } }, @@ -3869,6 +3890,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "type": "string" @@ -4674,6 +4704,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. 
More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "type": "string" diff --git a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go index 8bafae4a6de..4e9e100a8b5 100644 --- a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go @@ -116,6 +116,16 @@ func (in *ComponentExtensionSpec) DeepCopyInto(out *ComponentExtensionSpec) { *out = new(int) **out = **in } + if in.ScaleTarget != nil { + in, out := &in.ScaleTarget, &out.ScaleTarget + *out = new(int) + **out = **in + } + if in.ScaleMetric != nil { + in, out := &in.ScaleMetric, &out.ScaleMetric + *out = new(ScaleMetric) + **out = **in + } if in.ContainerConcurrency != nil { in, out := &in.ContainerConcurrency, &out.ContainerConcurrency *out = new(int64) diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 12ae861aec9..da9ddaff00f 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -113,6 +113,12 @@ var ( type AutoscalerClassType string type AutoscalerMetricsType string +type AutoScalerKPAMetricsType string 
+ +var ( + AutoScalerKPAMetricsRPS AutoScalerKPAMetricsType = "rps" + AutoScalerKPAMetricsConcurrency AutoScalerKPAMetricsType = "concurrency" +) // Autoscaler Default Class var ( @@ -129,6 +135,11 @@ var ( AutoScalerMetricsCPU AutoscalerMetricsType = "cpu" ) +// Autoscaler Memory metrics +var ( + AutoScalerMetricsMemory AutoscalerMetricsType = "memory" +) + // Autoscaler Class Allowed List var AutoscalerAllowedClassList = []AutoscalerClassType{ AutoscalerClassHPA, @@ -137,6 +148,13 @@ var AutoscalerAllowedClassList = []AutoscalerClassType{ // Autoscaler Metrics Allowed List var AutoscalerAllowedMetricsList = []AutoscalerMetricsType{ AutoScalerMetricsCPU, + AutoScalerMetricsMemory, +} + +// Autoscaler KPA Metrics Allowed List +var AutoScalerKPAMetricsAllowedList = []AutoScalerKPAMetricsType{ + AutoScalerKPAMetricsConcurrency, + AutoScalerKPAMetricsRPS, } // Autoscaler Default Metrics Value diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go index 235a717e63c..41cbc7f7c91 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go @@ -52,28 +52,41 @@ func NewHPAReconciler(client client.Client, } } -func getHPAMetrics(metadata metav1.ObjectMeta) []v2beta2.MetricSpec { +func getHPAMetrics(metadata metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec) []v2beta2.MetricSpec { var metrics []v2beta2.MetricSpec - var cpuUtilization int32 + var utilization int32 annotations := metadata.Annotations + resourceName := corev1.ResourceCPU + if value, ok := annotations[constants.TargetUtilizationPercentage]; ok { - utilization, _ := strconv.Atoi(value) - cpuUtilization = int32(utilization) + utilizationInt, _ := strconv.Atoi(value) + utilization = int32(utilizationInt) } else { - cpuUtilization = constants.DefaultCPUUtilization + utilization = 
constants.DefaultCPUUtilization + } + + if componentExt.ScaleTarget != nil { + utilization = int32(*componentExt.ScaleTarget) + } + + if componentExt.ScaleMetric != nil { + resourceName = corev1.ResourceName(*componentExt.ScaleMetric) + } + + metricTarget := v2beta2.MetricTarget{ + Type: "Utilization", + AverageUtilization: &utilization, } ms := v2beta2.MetricSpec{ Type: v2beta2.ResourceMetricSourceType, Resource: &v2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: v2beta2.MetricTarget{ - Type: "Utilization", - AverageUtilization: &cpuUtilization, - }, + Name: resourceName, + Target: metricTarget, }, } + metrics = append(metrics, ms) return metrics } @@ -91,7 +104,7 @@ func createHPA(componentMeta metav1.ObjectMeta, if maxReplicas < minReplicas { maxReplicas = minReplicas } - metrics := getHPAMetrics(componentMeta) + metrics := getHPAMetrics(componentMeta, componentExt) hpa := &v2beta2.HorizontalPodAutoscaler{ ObjectMeta: componentMeta, Spec: v2beta2.HorizontalPodAutoscalerSpec{ @@ -102,8 +115,9 @@ func createHPA(componentMeta metav1.ObjectMeta, }, MinReplicas: &minReplicas, MaxReplicas: maxReplicas, - Metrics: metrics, - Behavior: &v2beta2.HorizontalPodAutoscalerBehavior{}, + + Metrics: metrics, + Behavior: &v2beta2.HorizontalPodAutoscalerBehavior{}, }, } return hpa diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go index 0602556d00b..61c5e74b7e3 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go @@ -19,6 +19,7 @@ package knative import ( "context" "fmt" + "github.com/golang/protobuf/proto" "github.com/kserve/kserve/pkg/apis/serving/v1beta1" "github.com/kserve/kserve/pkg/constants" @@ -84,6 +85,15 @@ func createKnativeService(componentMeta metav1.ObjectMeta, if _, ok := 
annotations[autoscaling.ClassAnnotationKey]; !ok { annotations[autoscaling.ClassAnnotationKey] = autoscaling.KPA } + + if componentExtension.ScaleTarget != nil { + annotations[autoscaling.TargetAnnotationKey] = fmt.Sprint(*componentExtension.ScaleTarget) + } + + if componentExtension.ScaleMetric != nil { + annotations[autoscaling.MetricAnnotationKey] = fmt.Sprint(*componentExtension.ScaleMetric) + } + lastRolledoutRevision := componentStatus.LatestRolledoutRevision // Log component status and canary traffic percent diff --git a/pkg/credentials/service_account_credentials_test.go b/pkg/credentials/service_account_credentials_test.go index 1d4bf84c917..c1285573ec1 100644 --- a/pkg/credentials/service_account_credentials_test.go +++ b/pkg/credentials/service_account_credentials_test.go @@ -22,7 +22,7 @@ import ( "github.com/kserve/kserve/pkg/credentials/azure" "github.com/kserve/kserve/pkg/credentials/gcs" - "github.com/kserve/kserve/pkg/credentials/hdfs" + "github.com/kserve/kserve/pkg/credentials/hdfs" "github.com/kserve/kserve/pkg/credentials/s3" "github.com/google/go-cmp/cmp" @@ -634,4 +634,4 @@ func TestAzureStorageAccessKeyCredentialBuilder(t *testing.T) { g.Expect(c.Delete(context.TODO(), customAzureSecret)).NotTo(gomega.HaveOccurred()) g.Expect(c.Delete(context.TODO(), customOnlyServiceAccount)).NotTo(gomega.HaveOccurred()) -} \ No newline at end of file +} diff --git a/python/kserve/docs/V1alpha1BuiltInAdapter.md b/python/kserve/docs/V1alpha1BuiltInAdapter.md index 86449fbf63e..09dc36f111b 100644 --- a/python/kserve/docs/V1alpha1BuiltInAdapter.md +++ b/python/kserve/docs/V1alpha1BuiltInAdapter.md @@ -7,7 +7,7 @@ Name | Type | Description | Notes **mem_buffer_bytes** | **int** | Fixed memory overhead to subtract from runtime container's memory allocation to determine model capacity | [optional] **model_loading_timeout_millis** | **int** | Timeout for model loading operations in milliseconds | [optional] **runtime_management_port** | **int** | Port which the 
runtime server listens for model management requests | [optional] -**server_type** | **str** | ServerType can be one of triton/mlserver and the runtime's container must have the same name | [optional] +**server_type** | **str** | ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md b/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md index c364cfe226e..62a3444c1d2 100644 --- a/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md +++ b/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md @@ -7,6 +7,7 @@ Name | Type | Description | Notes **containers** | [**list[V1Container]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md) | List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. | **node_selector** | **dict(str, str)** | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ | [optional] **tolerations** | [**list[V1Toleration]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Toleration.md) | If specified, the pod's tolerations. | [optional] +**volumes** | [**list[V1Volume]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md) | List of volumes that can be mounted by containers belonging to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1alpha1ServingRuntimeSpec.md b/python/kserve/docs/V1alpha1ServingRuntimeSpec.md index 871c3fa906d..84aa679c96c 100644 --- a/python/kserve/docs/V1alpha1ServingRuntimeSpec.md +++ b/python/kserve/docs/V1alpha1ServingRuntimeSpec.md @@ -18,6 +18,7 @@ Name | Type | Description | Notes **storage_helper** | [**V1alpha1StorageHelper**](V1alpha1StorageHelper.md) | | [optional] **supported_model_formats** | [**list[V1alpha1SupportedModelFormat]**](V1alpha1SupportedModelFormat.md) | Model formats and version supported by this runtime | [optional] **tolerations** | [**list[V1Toleration]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Toleration.md) | If specified, the pod's tolerations. | [optional] +**volumes** | [**list[V1Volume]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md) | List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1beta1ComponentExtensionSpec.md b/python/kserve/docs/V1beta1ComponentExtensionSpec.md index 2dbbc8c2276..569eb94d6b9 100644 --- a/python/kserve/docs/V1beta1ComponentExtensionSpec.md +++ b/python/kserve/docs/V1beta1ComponentExtensionSpec.md @@ -10,6 +10,8 @@ Name | Type | Description | Notes **logger** | [**V1beta1LoggerSpec**](V1beta1LoggerSpec.md) | | [optional] **max_replicas** | **int** | Maximum number of replicas for autoscaling. 
| [optional] **min_replicas** | **int** | Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **timeout** | **int** | TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1beta1ExplainerSpec.md b/python/kserve/docs/V1beta1ExplainerSpec.md index f0a3e36dcc1..54c38e1dc94 100644 --- a/python/kserve/docs/V1beta1ExplainerSpec.md +++ b/python/kserve/docs/V1beta1ExplainerSpec.md @@ -37,6 +37,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. 
If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. | [optional] diff --git a/python/kserve/docs/V1beta1PredictorSpec.md b/python/kserve/docs/V1beta1PredictorSpec.md index e0169c3033b..3362362b56f 100644 --- a/python/kserve/docs/V1beta1PredictorSpec.md +++ b/python/kserve/docs/V1beta1PredictorSpec.md @@ -40,6 +40,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. 
Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. 
| [optional] diff --git a/python/kserve/docs/V1beta1TransformerSpec.md b/python/kserve/docs/V1beta1TransformerSpec.md index 786f881aeb3..080d67e026a 100644 --- a/python/kserve/docs/V1beta1TransformerSpec.md +++ b/python/kserve/docs/V1beta1TransformerSpec.md @@ -34,6 +34,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. | [optional] diff --git a/python/kserve/kserve/models/v1alpha1_built_in_adapter.py b/python/kserve/kserve/models/v1alpha1_built_in_adapter.py index 9a5908f6067..2375e7ed77d 100644 --- a/python/kserve/kserve/models/v1alpha1_built_in_adapter.py +++ b/python/kserve/kserve/models/v1alpha1_built_in_adapter.py @@ -182,7 +182,7 @@ def runtime_management_port(self, runtime_management_port): def server_type(self): """Gets the server_type of this V1alpha1BuiltInAdapter. # noqa: E501 - ServerType can be one of triton/mlserver and the runtime's container must have the same name # noqa: E501 + ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name # noqa: E501 :return: The server_type of this V1alpha1BuiltInAdapter. # noqa: E501 :rtype: str @@ -193,7 +193,7 @@ def server_type(self): def server_type(self, server_type): """Sets the server_type of this V1alpha1BuiltInAdapter. - ServerType can be one of triton/mlserver and the runtime's container must have the same name # noqa: E501 + ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name # noqa: E501 :param server_type: The server_type of this V1alpha1BuiltInAdapter. 
# noqa: E501 :type: str diff --git a/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py b/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py index 7ca054f1fe0..2b674959adf 100644 --- a/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py +++ b/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py @@ -50,17 +50,19 @@ class V1alpha1ServingRuntimePodSpec(object): 'affinity': 'V1Affinity', 'containers': 'list[V1Container]', 'node_selector': 'dict(str, str)', - 'tolerations': 'list[V1Toleration]' + 'tolerations': 'list[V1Toleration]', + 'volumes': 'list[V1Volume]' } attribute_map = { 'affinity': 'affinity', 'containers': 'containers', 'node_selector': 'nodeSelector', - 'tolerations': 'tolerations' + 'tolerations': 'tolerations', + 'volumes': 'volumes' } - def __init__(self, affinity=None, containers=None, node_selector=None, tolerations=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, affinity=None, containers=None, node_selector=None, tolerations=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1alpha1ServingRuntimePodSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -70,6 +72,7 @@ def __init__(self, affinity=None, containers=None, node_selector=None, toleratio self._containers = None self._node_selector = None self._tolerations = None + self._volumes = None self.discriminator = None if affinity is not None: @@ -79,6 +82,8 @@ def __init__(self, affinity=None, containers=None, node_selector=None, toleratio self.node_selector = node_selector if tolerations is not None: self.tolerations = tolerations + if volumes is not None: + self.volumes = volumes @property def affinity(self): @@ -172,6 +177,29 @@ def tolerations(self, tolerations): self._tolerations = tolerations + @property + def volumes(self): + """Gets the volumes of this V1alpha1ServingRuntimePodSpec. 
# noqa: E501 + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :return: The volumes of this V1alpha1ServingRuntimePodSpec. # noqa: E501 + :rtype: list[V1Volume] + """ + return self._volumes + + @volumes.setter + def volumes(self, volumes): + """Sets the volumes of this V1alpha1ServingRuntimePodSpec. + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :param volumes: The volumes of this V1alpha1ServingRuntimePodSpec. # noqa: E501 + :type: list[V1Volume] + """ + + self._volumes = volumes + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py b/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py index 2a0e40dbc8f..6841032a6ed 100644 --- a/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py +++ b/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py @@ -60,7 +60,8 @@ class V1alpha1ServingRuntimeSpec(object): 'replicas': 'int', 'storage_helper': 'V1alpha1StorageHelper', 'supported_model_formats': 'list[V1alpha1SupportedModelFormat]', - 'tolerations': 'list[V1Toleration]' + 'tolerations': 'list[V1Toleration]', + 'volumes': 'list[V1Volume]' } attribute_map = { @@ -77,10 +78,11 @@ class V1alpha1ServingRuntimeSpec(object): 'replicas': 'replicas', 'storage_helper': 'storageHelper', 'supported_model_formats': 'supportedModelFormats', - 'tolerations': 'tolerations' + 'tolerations': 'tolerations', + 'volumes': 'volumes' } - def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabled=None, grpc_data_endpoint=None, grpc_endpoint=None, http_data_endpoint=None, multi_model=None, node_selector=None, protocol_versions=None, replicas=None, storage_helper=None, supported_model_formats=None, tolerations=None, 
local_vars_configuration=None): # noqa: E501 + def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabled=None, grpc_data_endpoint=None, grpc_endpoint=None, http_data_endpoint=None, multi_model=None, node_selector=None, protocol_versions=None, replicas=None, storage_helper=None, supported_model_formats=None, tolerations=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1alpha1ServingRuntimeSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -100,6 +102,7 @@ def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabl self._storage_helper = None self._supported_model_formats = None self._tolerations = None + self._volumes = None self.discriminator = None if affinity is not None: @@ -129,6 +132,8 @@ def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabl self.supported_model_formats = supported_model_formats if tolerations is not None: self.tolerations = tolerations + if volumes is not None: + self.volumes = volumes @property def affinity(self): @@ -448,6 +453,29 @@ def tolerations(self, tolerations): self._tolerations = tolerations + @property + def volumes(self): + """Gets the volumes of this V1alpha1ServingRuntimeSpec. # noqa: E501 + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :return: The volumes of this V1alpha1ServingRuntimeSpec. # noqa: E501 + :rtype: list[V1Volume] + """ + return self._volumes + + @volumes.setter + def volumes(self, volumes): + """Sets the volumes of this V1alpha1ServingRuntimeSpec. + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :param volumes: The volumes of this V1alpha1ServingRuntimeSpec. 
# noqa: E501 + :type: list[V1Volume] + """ + + self._volumes = volumes + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/python/kserve/kserve/models/v1beta1_component_extension_spec.py b/python/kserve/kserve/models/v1beta1_component_extension_spec.py index 0a724563ff1..b2d4cdfe1a1 100644 --- a/python/kserve/kserve/models/v1beta1_component_extension_spec.py +++ b/python/kserve/kserve/models/v1beta1_component_extension_spec.py @@ -53,6 +53,8 @@ class V1beta1ComponentExtensionSpec(object): 'logger': 'V1beta1LoggerSpec', 'max_replicas': 'int', 'min_replicas': 'int', + 'scale_metric': 'str', + 'scale_target': 'int', 'timeout': 'int' } @@ -63,10 +65,12 @@ class V1beta1ComponentExtensionSpec(object): 'logger': 'logger', 'max_replicas': 'maxReplicas', 'min_replicas': 'minReplicas', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'timeout': 'timeout' } - def __init__(self, batcher=None, canary_traffic_percent=None, container_concurrency=None, logger=None, max_replicas=None, min_replicas=None, timeout=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, batcher=None, canary_traffic_percent=None, container_concurrency=None, logger=None, max_replicas=None, min_replicas=None, scale_metric=None, scale_target=None, timeout=None, local_vars_configuration=None): # noqa: E501 """V1beta1ComponentExtensionSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -78,6 +82,8 @@ def __init__(self, batcher=None, canary_traffic_percent=None, container_concurre self._logger = None self._max_replicas = None self._min_replicas = None + self._scale_metric = None + self._scale_target = None self._timeout = None self.discriminator = None @@ -93,6 +99,10 @@ def __init__(self, batcher=None, canary_traffic_percent=None, container_concurre self.max_replicas = max_replicas if min_replicas is not None: self.min_replicas = min_replicas + if 
scale_metric is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if timeout is not None: self.timeout = timeout @@ -230,6 +240,52 @@ def min_replicas(self, min_replicas): self._min_replicas = min_replicas + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1ComponentExtensionSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1ComponentExtensionSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1ComponentExtensionSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1ComponentExtensionSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1ComponentExtensionSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def timeout(self): """Gets the timeout of this V1beta1ComponentExtensionSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_explainer_spec.py b/python/kserve/kserve/models/v1beta1_explainer_spec.py index cb9cd7c7de2..cf651caf320 100644 --- a/python/kserve/kserve/models/v1beta1_explainer_spec.py +++ b/python/kserve/kserve/models/v1beta1_explainer_spec.py @@ -80,6 +80,8 @@ class V1beta1ExplainerSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -128,6 +130,8 @@ class V1beta1ExplainerSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -142,7 +146,7 @@ class V1beta1ExplainerSpec(object): 'volumes': 'volumes' } - def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi=None, art=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, 
share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi=None, art=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1beta1ExplainerSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -181,6 +185,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi= self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -261,6 +267,10 @@ def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi= self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric is not None: + 
self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -1031,6 +1041,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1ExplainerSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1ExplainerSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1ExplainerSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1ExplainerSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1ExplainerSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1ExplainerSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_predictor_spec.py b/python/kserve/kserve/models/v1beta1_predictor_spec.py index 255adc6ad51..a1ce6ed880f 100644 --- a/python/kserve/kserve/models/v1beta1_predictor_spec.py +++ b/python/kserve/kserve/models/v1beta1_predictor_spec.py @@ -83,6 +83,8 @@ class V1beta1PredictorSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -138,6 +140,8 @@ class V1beta1PredictorSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -156,7 +160,7 @@ class V1beta1PredictorSpec(object): 'xgboost': 'xgboost' } - def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, lightgbm=None, logger=None, max_replicas=None, min_replicas=None, model=None, node_name=None, node_selector=None, onnx=None, overhead=None, paddle=None, pmml=None, preemption_policy=None, priority=None, priority_class_name=None, pytorch=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, 
set_hostname_as_fqdn=None, share_process_namespace=None, sklearn=None, subdomain=None, tensorflow=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, triton=None, volumes=None, xgboost=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, lightgbm=None, logger=None, max_replicas=None, min_replicas=None, model=None, node_name=None, node_selector=None, onnx=None, overhead=None, paddle=None, pmml=None, preemption_policy=None, priority=None, priority_class_name=None, pytorch=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, sklearn=None, subdomain=None, tensorflow=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, triton=None, volumes=None, xgboost=None, local_vars_configuration=None): # noqa: E501 """V1beta1PredictorSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -198,6 +202,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -288,6 +294,10 @@ def __init__(self, active_deadline_seconds=None, 
affinity=None, automount_servic self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -1129,6 +1139,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1PredictorSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1PredictorSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1PredictorSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1PredictorSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1PredictorSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1PredictorSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1PredictorSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). 
# noqa: E501 + + :param scale_target: The scale_target of this V1beta1PredictorSpec. # noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1PredictorSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_transformer_spec.py b/python/kserve/kserve/models/v1beta1_transformer_spec.py index 30d321191d2..7054789f8f8 100644 --- a/python/kserve/kserve/models/v1beta1_transformer_spec.py +++ b/python/kserve/kserve/models/v1beta1_transformer_spec.py @@ -77,6 +77,8 @@ class V1beta1TransformerSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -122,6 +124,8 @@ class V1beta1TransformerSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -136,7 +140,7 @@ class V1beta1TransformerSpec(object): 'volumes': 'volumes' } - def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, 
service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1beta1TransformerSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -172,6 +176,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -246,6 +252,10 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric 
is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -953,6 +963,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1TransformerSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1TransformerSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1TransformerSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1TransformerSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1TransformerSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1TransformerSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1TransformerSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1TransformerSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1TransformerSpec. # noqa: E501 diff --git a/test/crds/serving.kserve.io_inferenceservices.yaml b/test/crds/serving.kserve.io_inferenceservices.yaml index 6b06189f202..3baeedf2811 100644 --- a/test/crds/serving.kserve.io_inferenceservices.yaml +++ b/test/crds/serving.kserve.io_inferenceservices.yaml @@ -4676,6 +4676,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -9921,6 +9930,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -14027,6 +14045,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: diff --git a/test/e2e/predictor/test_autoscaling.py b/test/e2e/predictor/test_autoscaling.py new file mode 100644 index 00000000000..4bcdba9303a --- /dev/null +++ b/test/e2e/predictor/test_autoscaling.py @@ -0,0 +1,200 @@ +# Copyright 2022 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from kubernetes import client +from kserve import ( + constants, + KServeClient, + V1beta1InferenceService, + V1beta1InferenceServiceSpec, + V1beta1PredictorSpec, + V1beta1SKLearnSpec, +) +from kubernetes.client import V1ResourceRequirements +import pytest + +from ..common.utils import predict +from ..common.utils import KSERVE_TEST_NAMESPACE + +TARGET = "autoscaling.knative.dev/target" +METRIC = "autoscaling.knative.dev/metric" +MODEL = "gs://kfserving-examples/models/sklearn/1.0/model" +INPUT = "./data/iris_input.json" + + +def test_sklearn_kserve_concurrency(): + service_name = "isvc-sklearn-scale-concurrency" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='concurrency', + scale_target=2, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + isvc_annotations = pods.items[0].metadata.annotations + + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + assert(isvc_annotations[METRIC] == 'concurrency') + assert(isvc_annotations[TARGET] == '2') + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +def test_sklearn_kserve_rps(): + service_name = "isvc-sklearn-scale-rps" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='rps', + 
scale_target=5, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + annotations = pods.items[0].metadata.annotations + + assert(annotations[METRIC] == 'rps') + assert(annotations[TARGET] == '5') + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +@pytest.mark.skip() +def test_sklearn_kserve_cpu(): + service_name = "isvc-sklearn-scale-cpu" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='cpu', + scale_target=50, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + annotations = dict() + annotations['autoscaling.knative.dev/class'] = 'hpa.autoscaling.knative.dev' + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE, + annotations=annotations + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + 
kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + isvc_annotations = pods.items[0].metadata.annotations + + assert(isvc_annotations[METRIC] == 'cpu') + assert(isvc_annotations[TARGET] == '50') + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +def test_sklearn_kserve_raw(): + service_name = "isvc-sklearn-scale-raw" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='cpu', + scale_target=50, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + annotations = dict() + annotations['serving.kserve.io/deploymentMode'] = 'RawDeployment' + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE, + annotations=annotations + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + api_instance = kserve_client.api_instance + hpa_resp = api_instance.list_namespaced_custom_object(group='autoscaling', version='v1', + namespace=KSERVE_TEST_NAMESPACE, + plural='horizontalpodautoscalers') + + assert(hpa_resp['items'][0]['spec']['targetCPUUtilizationPercentage'] == 50) + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) diff --git a/test/scripts/gh-actions/setup-deps.sh b/test/scripts/gh-actions/setup-deps.sh index 
e7db7c3f1ab..e2cd68c76dc 100755 --- a/test/scripts/gh-actions/setup-deps.sh +++ b/test/scripts/gh-actions/setup-deps.sh @@ -77,6 +77,9 @@ for i in 1 2 3 ; do kustomize build test/overlays/knative | kubectl apply -f - & echo "Waiting for Knative to be ready ..." kubectl wait --for=condition=Ready pods --all --timeout=180s -n knative-serving -l 'app in (activator,autoscaler,autoscaler-hpa,controller,net-istio-controller,net-istio-webhook)' +echo "Add knative hpa..." +# kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.0.0/serving-hpa.yaml + # Skip tag resolution for certain domains kubectl patch cm config-deployment --patch '{"data":{"registries-skipping-tag-resolving":"nvcr.io,index.docker.io"}}' -n knative-serving