From aa68b4c38c8d65a3b4e69351fc8244d4eaf4e412 Mon Sep 17 00:00:00 2001 From: Andrews Arokiam <87992092+andyi2it@users.noreply.github.com> Date: Sun, 12 Jun 2022 01:52:37 +0530 Subject: [PATCH] Added target and metric to components (#2082) * Added target and metric to components Added scaleTarget and scaleMetric to ComponentSpec Made changes to ksvc_reconciler to add annotations to component as required Signed-off-by: Andrews Arokiam Added validation Signed-off-by: Andrews Arokiam Fixed validation bugs Signed-off-by: Andrews Arokiam Updated comments Signed-off-by: Andrews Arokiam Updated python sdk and openapi Signed-off-by: Andrews Arokiam Added test for autoscaling Signed-off-by: Andrews Arokiam Added tests for autoscaling Signed-off-by: Andrews Arokiam Added e2e for autoscaling changes Removed validation for metric concurrency Signed-off-by: Andrews Arokiam Updated CRD Signed-off-by: Andrews Arokiam Fixed linting issues Signed-off-by: Andrews Arokiam Made ScaleMetric an enum Added a test for raw deployment Signed-off-by: Andrews Arokiam Fixed issues with type after changes in ScaleMetric type Signed-off-by: Andrews Arokiam Updated crd Updated crd Signed-off-by: Andrews Arokiam Updated test case. Modified to check deployment mode before checking hpa or kpa. Signed-off-by: Andrews Arokiam Added debug logs to debug e2e failure in workflow alone Signed-off-by: Andrews Arokiam Disabling all tests temporarily to debug failing e2e Signed-off-by: Andrews Arokiam Reverting debugging changes Signed-off-by: Andrews Arokiam Temporary changes to verify if e2e is passing after Dan's port forwarding Signed-off-by: Andrews Arokiam Removed e2e changes. Removed comments. Updated python sdk version to debug test failure. 
Signed-off-by: Andrews Arokiam Updated test variable Signed-off-by: Andrews Arokiam Fixed linting error Added test logs Signed-off-by: Andrews Arokiam Adding logs to print spec Signed-off-by: Andrews Arokiam Added more debugging logs Signed-off-by: Andrews Arokiam Reverting python definition log in run e2e script Signed-off-by: Andrews Arokiam Updated python sdk and docs Fixed linting error Signed-off-by: Andrews Arokiam Fix for failing protobuf issue Signed-off-by: Andrews Arokiam Added debug logs for kserve controller Signed-off-by: Andrews Arokiam Added logs to check controller changes show up Signed-off-by: Andrews Arokiam Added knative hpa Signed-off-by: Andrews Arokiam Updated resource requirements Signed-off-by: Andrews Arokiam Moved knative hpa installation to setup-deps Signed-off-by: Andrews Arokiam Removed skip for tests Signed-off-by: Andrews Arokiam Signed-off-by: Dan Sun * Mutate and add deployment mode annotation if RawDeployment or ModelMesh Signed-off-by: Dan Sun Co-authored-by: Dan Sun --- .../serving.kserve.io_inferenceservices.yaml | 27 +++ pkg/apis/serving/v1beta1/component.go | 21 ++ .../v1beta1/inference_service_defaults.go | 12 +- .../inference_service_defaults_test.go | 15 +- .../v1beta1/inference_service_validation.go | 83 +++++++- pkg/apis/serving/v1beta1/openapi_generated.go | 62 ++++++ pkg/apis/serving/v1beta1/swagger.json | 39 ++++ .../serving/v1beta1/zz_generated.deepcopy.go | 10 + pkg/constants/constants.go | 18 ++ .../reconcilers/hpa/hpa_reconciler.go | 40 ++-- .../reconcilers/knative/ksvc_reconciler.go | 10 + .../service_account_credentials_test.go | 4 +- python/kserve/docs/V1alpha1BuiltInAdapter.md | 2 +- .../docs/V1alpha1ServingRuntimePodSpec.md | 1 + .../kserve/docs/V1alpha1ServingRuntimeSpec.md | 1 + .../docs/V1beta1ComponentExtensionSpec.md | 2 + python/kserve/docs/V1beta1ExplainerSpec.md | 2 + python/kserve/docs/V1beta1PredictorSpec.md | 2 + python/kserve/docs/V1beta1TransformerSpec.md | 2 + 
.../models/v1alpha1_built_in_adapter.py | 4 +- .../v1alpha1_serving_runtime_pod_spec.py | 34 ++- .../models/v1alpha1_serving_runtime_spec.py | 34 ++- .../v1beta1_component_extension_spec.py | 58 ++++- .../kserve/models/v1beta1_explainer_spec.py | 58 ++++- .../kserve/models/v1beta1_predictor_spec.py | 58 ++++- .../kserve/models/v1beta1_transformer_spec.py | 58 ++++- .../serving.kserve.io_inferenceservices.yaml | 27 +++ test/e2e/predictor/test_autoscaling.py | 200 ++++++++++++++++++ test/scripts/gh-actions/setup-deps.sh | 3 + 29 files changed, 852 insertions(+), 35 deletions(-) create mode 100644 test/e2e/predictor/test_autoscaling.py diff --git a/config/crd/serving.kserve.io_inferenceservices.yaml b/config/crd/serving.kserve.io_inferenceservices.yaml index 83e16a4b78c..ba9df191053 100644 --- a/config/crd/serving.kserve.io_inferenceservices.yaml +++ b/config/crd/serving.kserve.io_inferenceservices.yaml @@ -2782,6 +2782,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -8027,6 +8036,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -12133,6 +12151,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: diff --git a/pkg/apis/serving/v1beta1/component.go b/pkg/apis/serving/v1beta1/component.go index 1179f5a52a1..56b31cf8a8a 100644 --- a/pkg/apis/serving/v1beta1/component.go +++ b/pkg/apis/serving/v1beta1/component.go @@ -80,6 +80,16 @@ type ComponentExtensionSpec struct { // Maximum number of replicas for autoscaling. 
// +optional MaxReplicas int `json:"maxReplicas,omitempty"` + // ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. + // concurrency and rps targets are supported by Knative Pod Autoscaler + //(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). + // +optional + ScaleTarget *int `json:"scaleTarget,omitempty"` + // ScaleMetric defines the scaling metric type watched by autoscaler + // possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via + // Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics). + // +optional + ScaleMetric *ScaleMetric `json:"scaleMetric,omitempty"` // ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container // concurrency(https://knative.dev/docs/serving/autoscaling/concurrency). // +optional @@ -98,6 +108,17 @@ type ComponentExtensionSpec struct { Batcher *Batcher `json:"batcher,omitempty"` } +// ScaleMetric enum +// +kubebuilder:validation:Enum=cpu;memory;concurrency;rps +type ScaleMetric string + +const ( + MetricCPU ScaleMetric = "cpu" + MetricMemory ScaleMetric = "memory" + MetricConcurrency ScaleMetric = "concurrency" + MetricRPS ScaleMetric = "rps" +) + // Default the ComponentExtensionSpec func (s *ComponentExtensionSpec) Default(config *InferenceServicesConfig) {} diff --git a/pkg/apis/serving/v1beta1/inference_service_defaults.go b/pkg/apis/serving/v1beta1/inference_service_defaults.go index 71e1c1b2d09..d5fa15b82a2 100644 --- a/pkg/apis/serving/v1beta1/inference_service_defaults.go +++ b/pkg/apis/serving/v1beta1/inference_service_defaults.go @@ -72,10 +72,18 @@ func (isvc *InferenceService) Default() { if err != nil { panic(err) } - isvc.DefaultInferenceService(configMap) + deployConfig, err := NewDeployConfig(cli) + if err != nil { + panic(err) + } + isvc.DefaultInferenceService(configMap, deployConfig) } -func (isvc *InferenceService) 
DefaultInferenceService(config *InferenceServicesConfig) { +func (isvc *InferenceService) DefaultInferenceService(config *InferenceServicesConfig, deployConfig *DeployConfig) { + if deployConfig.DefaultDeploymentMode == string(constants.ModelMeshDeployment) || + deployConfig.DefaultDeploymentMode == string(constants.RawDeployment) { + isvc.ObjectMeta.Annotations[constants.DeploymentMode] = deployConfig.DefaultDeploymentMode + } deploymentMode, ok := isvc.ObjectMeta.Annotations[constants.DeploymentMode] if !ok || deploymentMode != string(constants.ModelMeshDeployment) { // Only attempt to assign runtimes for non-modelmesh predictors diff --git a/pkg/apis/serving/v1beta1/inference_service_defaults_test.go b/pkg/apis/serving/v1beta1/inference_service_defaults_test.go index 4c807240ff5..f2c926fbb2a 100644 --- a/pkg/apis/serving/v1beta1/inference_service_defaults_test.go +++ b/pkg/apis/serving/v1beta1/inference_service_defaults_test.go @@ -45,6 +45,9 @@ func TestInferenceServiceDefaults(t *testing.T) { }, }, } + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -83,7 +86,7 @@ func TestInferenceServiceDefaults(t *testing.T) { } resources := v1.ResourceRequirements{Requests: defaultResource, Limits: defaultResource} isvc.Spec.DeepCopy() - isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(*&isvc.Spec.Predictor.Tensorflow).To(gomega.BeNil()) g.Expect(*&isvc.Spec.Predictor.Model).NotTo(gomega.BeNil()) @@ -111,6 +114,9 @@ func TestCustomPredictorDefaults(t *testing.T) { }, }, } + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -135,13 +141,16 @@ func TestCustomPredictorDefaults(t *testing.T) { } resources := v1.ResourceRequirements{Requests: defaultResource, Limits: defaultResource} isvc.Spec.DeepCopy() - 
isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(isvc.Spec.Predictor.PodSpec.Containers[0].Resources).To(gomega.Equal(resources)) } func TestInferenceServiceDefaultsModelMeshAnnotation(t *testing.T) { g := gomega.NewGomegaWithT(t) config := &InferenceServicesConfig{} + deployConfig := &DeployConfig{ + DefaultDeploymentMode: "Serverless", + } isvc := InferenceService{ ObjectMeta: metav1.ObjectMeta{ Name: "foo", @@ -161,7 +170,7 @@ func TestInferenceServiceDefaultsModelMeshAnnotation(t *testing.T) { }, } isvc.Spec.DeepCopy() - isvc.DefaultInferenceService(config) + isvc.DefaultInferenceService(config, deployConfig) g.Expect(isvc.Spec.Predictor.Model).To(gomega.BeNil()) g.Expect(isvc.Spec.Predictor.Tensorflow).ToNot(gomega.BeNil()) } diff --git a/pkg/apis/serving/v1beta1/inference_service_validation.go b/pkg/apis/serving/v1beta1/inference_service_validation.go index 596f22223db..2b9ef8fd505 100644 --- a/pkg/apis/serving/v1beta1/inference_service_validation.go +++ b/pkg/apis/serving/v1beta1/inference_service_validation.go @@ -26,6 +26,7 @@ import ( "github.com/kserve/kserve/pkg/constants" "github.com/kserve/kserve/pkg/utils" "k8s.io/apimachinery/pkg/runtime" + "knative.dev/serving/pkg/apis/autoscaling" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" ) @@ -49,6 +50,8 @@ var _ webhook.Validator = &InferenceService{} func (isvc *InferenceService) ValidateCreate() error { validatorLogger.Info("validate create", "name", isvc.Name) + annotations := isvc.Annotations + if err := validateInferenceServiceName(isvc); err != nil { return err } @@ -60,6 +63,7 @@ func (isvc *InferenceService) ValidateCreate() error { if err := validateAutoscalerTargetUtilizationPercentage(isvc); err != nil { return err } + for _, component := range []Component{ &isvc.Spec.Predictor, isvc.Spec.Transformer, @@ -72,6 +76,7 @@ func (isvc *InferenceService) ValidateCreate() error { if err := 
utils.FirstNonNilError([]error{ component.GetImplementation().Validate(), component.GetExtensions().Validate(), + validateAutoScalingCompExtension(annotations, component.GetExtensions()), }); err != nil { return err } @@ -80,6 +85,18 @@ func (isvc *InferenceService) ValidateCreate() error { return nil } +// Validate scaling options component extensions +func validateAutoScalingCompExtension(annotations map[string]string, compExtSpec *ComponentExtensionSpec) error { + deploymentMode := annotations["serving.kserve.io/deploymentMode"] + annotationClass := annotations[autoscaling.ClassAnnotationKey] + if deploymentMode == string(constants.RawDeployment) || annotationClass == string(autoscaling.HPA) { + return validateScalingHPACompExtension(compExtSpec) + } + + return validateScalingKPACompExtension(compExtSpec) + +} + // ValidateUpdate implements webhook.Validator so a webhook will be registered for the type func (isvc *InferenceService) ValidateUpdate(old runtime.Object) error { validatorLogger.Info("validate update", "name", isvc.Name) @@ -118,7 +135,7 @@ func validateInferenceServiceAutoscaler(isvc *InferenceService) error { switch class { case constants.AutoscalerClassHPA: if metric, ok := annotations[constants.AutoscalerMetrics]; ok { - return validateHPAMetrics(metric) + return validateHPAMetrics(ScaleMetric(metric)) } else { return nil } @@ -134,7 +151,7 @@ func validateInferenceServiceAutoscaler(isvc *InferenceService) error { } //Validate of autoscaler HPA metrics -func validateHPAMetrics(metric string) error { +func validateHPAMetrics(metric ScaleMetric) error { for _, item := range constants.AutoscalerAllowedMetricsList { if item == constants.AutoscalerMetricsType(metric) { return nil @@ -157,5 +174,67 @@ func validateAutoscalerTargetUtilizationPercentage(isvc *InferenceService) error } } } + + return nil +} + +func validateScalingHPACompExtension(compExtSpec *ComponentExtensionSpec) error { + metric := MetricCPU + if compExtSpec.ScaleMetric != nil { + 
metric = *compExtSpec.ScaleMetric + } + + err := validateHPAMetrics(metric) + + if err != nil { + return err + } + + if compExtSpec.ScaleTarget != nil { + target := *compExtSpec.ScaleTarget + if metric == MetricCPU && target < 1 || target > 100 { + return fmt.Errorf("The target utilization percentage should be a [1-100] integer.") + } + + if metric == MetricMemory && target < 1 { + return fmt.Errorf("The target memory should be greater than 1 MiB") + } + + } + + return nil +} + +func validateKPAMetrics(metric ScaleMetric) error { + for _, item := range constants.AutoScalerKPAMetricsAllowedList { + if item == constants.AutoScalerKPAMetricsType(metric) { + return nil + } + } + return fmt.Errorf("[%s] is not a supported metric.\n", metric) + +} + +func validateScalingKPACompExtension(compExtSpec *ComponentExtensionSpec) error { + metric := MetricConcurrency + if compExtSpec.ScaleMetric != nil { + metric = *compExtSpec.ScaleMetric + } + + err := validateKPAMetrics(metric) + + if err != nil { + return err + } + + if compExtSpec.ScaleTarget != nil { + target := *compExtSpec.ScaleTarget + + if metric == MetricRPS && target < 1 { + return fmt.Errorf("The target for rps should be greater than 1") + } + + } + return nil } diff --git a/pkg/apis/serving/v1beta1/openapi_generated.go b/pkg/apis/serving/v1beta1/openapi_generated.go index 60d5d4da610..ece7a073a25 100644 --- a/pkg/apis/serving/v1beta1/openapi_generated.go +++ b/pkg/apis/serving/v1beta1/openapi_generated.go @@ -1869,6 +1869,20 @@ func schema_pkg_apis_serving_v1beta1_ComponentExtensionSpec(ref common.Reference Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. 
concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -3897,6 +3911,20 @@ func schema_pkg_apis_serving_v1beta1_ExplainerSpec(ref common.ReferenceCallback) Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -4330,6 +4358,12 @@ func schema_pkg_apis_serving_v1beta1_IngressConfig(ref common.ReferenceCallback) Format: "", }, }, + "urlScheme": { + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, }, }, }, @@ -7055,6 +7089,20 @@ func schema_pkg_apis_serving_v1beta1_PredictorSpec(ref common.ReferenceCallback) Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", @@ -8448,6 +8496,20 @@ func schema_pkg_apis_serving_v1beta1_TransformerSpec(ref common.ReferenceCallbac Format: "int32", }, }, + "scaleTarget": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + Type: []string{"integer"}, + Format: "int32", + }, + }, + "scaleMetric": { + SchemaProps: spec.SchemaProps{ + Description: "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + Type: []string{"string"}, + Format: "", + }, + }, "containerConcurrency": { SchemaProps: spec.SchemaProps{ Description: "ContainerConcurrency specifies how many requests can be processed concurrently, this sets the hard limit of the container concurrency(https://knative.dev/docs/serving/autoscaling/concurrency).", diff --git a/pkg/apis/serving/v1beta1/swagger.json b/pkg/apis/serving/v1beta1/swagger.json index 6687cb062a3..b671e62a0aa 100644 --- a/pkg/apis/serving/v1beta1/swagger.json +++ b/pkg/apis/serving/v1beta1/swagger.json @@ -1030,6 +1030,15 @@ "type": "integer", "format": "int32" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. 
concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "timeout": { "description": "TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component.", "type": "integer", @@ -2099,6 +2108,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler.", "type": "string" @@ -2389,6 +2407,9 @@ }, "localGatewayService": { "type": "string" + }, + "urlScheme": { + "type": "string" } } }, @@ -3869,6 +3890,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "type": "string" @@ -4674,6 +4704,15 @@ "description": "RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. 
More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14.", "type": "string" }, + "scaleMetric": { + "description": "ScaleMetric defines the scaling metric type watched by autoscaler possible values are concurrency, rps, cpu, memory. concurrency, rps are supported via Knative Pod Autoscaler(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics).", + "type": "string" + }, + "scaleTarget": { + "description": "ScaleTarget specifies the integer target value of the metric type the Autoscaler watches for. concurrency and rps targets are supported by Knative Pod Autoscaler (https://knative.dev/docs/serving/autoscaling/autoscaling-targets/).", + "type": "integer", + "format": "int32" + }, "schedulerName": { "description": "If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler.", "type": "string" diff --git a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go index 8bafae4a6de..4e9e100a8b5 100644 --- a/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/serving/v1beta1/zz_generated.deepcopy.go @@ -116,6 +116,16 @@ func (in *ComponentExtensionSpec) DeepCopyInto(out *ComponentExtensionSpec) { *out = new(int) **out = **in } + if in.ScaleTarget != nil { + in, out := &in.ScaleTarget, &out.ScaleTarget + *out = new(int) + **out = **in + } + if in.ScaleMetric != nil { + in, out := &in.ScaleMetric, &out.ScaleMetric + *out = new(ScaleMetric) + **out = **in + } if in.ContainerConcurrency != nil { in, out := &in.ContainerConcurrency, &out.ContainerConcurrency *out = new(int64) diff --git a/pkg/constants/constants.go b/pkg/constants/constants.go index 12ae861aec9..da9ddaff00f 100644 --- a/pkg/constants/constants.go +++ b/pkg/constants/constants.go @@ -113,6 +113,12 @@ var ( type AutoscalerClassType string type AutoscalerMetricsType string +type AutoScalerKPAMetricsType string 
+ +var ( + AutoScalerKPAMetricsRPS AutoScalerKPAMetricsType = "rps" + AutoScalerKPAMetricsConcurrency AutoScalerKPAMetricsType = "concurrency" +) // Autoscaler Default Class var ( @@ -129,6 +135,11 @@ var ( AutoScalerMetricsCPU AutoscalerMetricsType = "cpu" ) +// Autoscaler Memory metrics +var ( + AutoScalerMetricsMemory AutoscalerMetricsType = "memory" +) + // Autoscaler Class Allowed List var AutoscalerAllowedClassList = []AutoscalerClassType{ AutoscalerClassHPA, @@ -137,6 +148,13 @@ var AutoscalerAllowedClassList = []AutoscalerClassType{ // Autoscaler Metrics Allowed List var AutoscalerAllowedMetricsList = []AutoscalerMetricsType{ AutoScalerMetricsCPU, + AutoScalerMetricsMemory, +} + +// Autoscaler KPA Metrics Allowed List +var AutoScalerKPAMetricsAllowedList = []AutoScalerKPAMetricsType{ + AutoScalerKPAMetricsConcurrency, + AutoScalerKPAMetricsRPS, } // Autoscaler Default Metrics Value diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go index 235a717e63c..41cbc7f7c91 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/hpa/hpa_reconciler.go @@ -52,28 +52,41 @@ func NewHPAReconciler(client client.Client, } } -func getHPAMetrics(metadata metav1.ObjectMeta) []v2beta2.MetricSpec { +func getHPAMetrics(metadata metav1.ObjectMeta, componentExt *v1beta1.ComponentExtensionSpec) []v2beta2.MetricSpec { var metrics []v2beta2.MetricSpec - var cpuUtilization int32 + var utilization int32 annotations := metadata.Annotations + resourceName := corev1.ResourceCPU + if value, ok := annotations[constants.TargetUtilizationPercentage]; ok { - utilization, _ := strconv.Atoi(value) - cpuUtilization = int32(utilization) + utilizationInt, _ := strconv.Atoi(value) + utilization = int32(utilizationInt) } else { - cpuUtilization = constants.DefaultCPUUtilization + utilization = 
constants.DefaultCPUUtilization + } + + if componentExt.ScaleTarget != nil { + utilization = int32(*componentExt.ScaleTarget) + } + + if componentExt.ScaleMetric != nil { + resourceName = corev1.ResourceName(*componentExt.ScaleMetric) + } + + metricTarget := v2beta2.MetricTarget{ + Type: "Utilization", + AverageUtilization: &utilization, } ms := v2beta2.MetricSpec{ Type: v2beta2.ResourceMetricSourceType, Resource: &v2beta2.ResourceMetricSource{ - Name: corev1.ResourceCPU, - Target: v2beta2.MetricTarget{ - Type: "Utilization", - AverageUtilization: &cpuUtilization, - }, + Name: resourceName, + Target: metricTarget, }, } + metrics = append(metrics, ms) return metrics } @@ -91,7 +104,7 @@ func createHPA(componentMeta metav1.ObjectMeta, if maxReplicas < minReplicas { maxReplicas = minReplicas } - metrics := getHPAMetrics(componentMeta) + metrics := getHPAMetrics(componentMeta, componentExt) hpa := &v2beta2.HorizontalPodAutoscaler{ ObjectMeta: componentMeta, Spec: v2beta2.HorizontalPodAutoscalerSpec{ @@ -102,8 +115,9 @@ func createHPA(componentMeta metav1.ObjectMeta, }, MinReplicas: &minReplicas, MaxReplicas: maxReplicas, - Metrics: metrics, - Behavior: &v2beta2.HorizontalPodAutoscalerBehavior{}, + + Metrics: metrics, + Behavior: &v2beta2.HorizontalPodAutoscalerBehavior{}, }, } return hpa diff --git a/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go b/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go index 0602556d00b..61c5e74b7e3 100644 --- a/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go +++ b/pkg/controller/v1beta1/inferenceservice/reconcilers/knative/ksvc_reconciler.go @@ -19,6 +19,7 @@ package knative import ( "context" "fmt" + "github.com/golang/protobuf/proto" "github.com/kserve/kserve/pkg/apis/serving/v1beta1" "github.com/kserve/kserve/pkg/constants" @@ -84,6 +85,15 @@ func createKnativeService(componentMeta metav1.ObjectMeta, if _, ok := 
annotations[autoscaling.ClassAnnotationKey]; !ok { annotations[autoscaling.ClassAnnotationKey] = autoscaling.KPA } + + if componentExtension.ScaleTarget != nil { + annotations[autoscaling.TargetAnnotationKey] = fmt.Sprint(*componentExtension.ScaleTarget) + } + + if componentExtension.ScaleMetric != nil { + annotations[autoscaling.MetricAnnotationKey] = fmt.Sprint(*componentExtension.ScaleMetric) + } + lastRolledoutRevision := componentStatus.LatestRolledoutRevision // Log component status and canary traffic percent diff --git a/pkg/credentials/service_account_credentials_test.go b/pkg/credentials/service_account_credentials_test.go index 1d4bf84c917..c1285573ec1 100644 --- a/pkg/credentials/service_account_credentials_test.go +++ b/pkg/credentials/service_account_credentials_test.go @@ -22,7 +22,7 @@ import ( "github.com/kserve/kserve/pkg/credentials/azure" "github.com/kserve/kserve/pkg/credentials/gcs" - "github.com/kserve/kserve/pkg/credentials/hdfs" + "github.com/kserve/kserve/pkg/credentials/hdfs" "github.com/kserve/kserve/pkg/credentials/s3" "github.com/google/go-cmp/cmp" @@ -634,4 +634,4 @@ func TestAzureStorageAccessKeyCredentialBuilder(t *testing.T) { g.Expect(c.Delete(context.TODO(), customAzureSecret)).NotTo(gomega.HaveOccurred()) g.Expect(c.Delete(context.TODO(), customOnlyServiceAccount)).NotTo(gomega.HaveOccurred()) -} \ No newline at end of file +} diff --git a/python/kserve/docs/V1alpha1BuiltInAdapter.md b/python/kserve/docs/V1alpha1BuiltInAdapter.md index 86449fbf63e..09dc36f111b 100644 --- a/python/kserve/docs/V1alpha1BuiltInAdapter.md +++ b/python/kserve/docs/V1alpha1BuiltInAdapter.md @@ -7,7 +7,7 @@ Name | Type | Description | Notes **mem_buffer_bytes** | **int** | Fixed memory overhead to subtract from runtime container's memory allocation to determine model capacity | [optional] **model_loading_timeout_millis** | **int** | Timeout for model loading operations in milliseconds | [optional] **runtime_management_port** | **int** | Port which the 
runtime server listens for model management requests | [optional] -**server_type** | **str** | ServerType can be one of triton/mlserver and the runtime's container must have the same name | [optional] +**server_type** | **str** | ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md b/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md index c364cfe226e..62a3444c1d2 100644 --- a/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md +++ b/python/kserve/docs/V1alpha1ServingRuntimePodSpec.md @@ -7,6 +7,7 @@ Name | Type | Description | Notes **containers** | [**list[V1Container]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md) | List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. | **node_selector** | **dict(str, str)** | NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ | [optional] **tolerations** | [**list[V1Toleration]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Toleration.md) | If specified, the pod's tolerations. | [optional] +**volumes** | [**list[V1Volume]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md) | List of volumes that can be mounted by containers belonging to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1alpha1ServingRuntimeSpec.md b/python/kserve/docs/V1alpha1ServingRuntimeSpec.md index 871c3fa906d..84aa679c96c 100644 --- a/python/kserve/docs/V1alpha1ServingRuntimeSpec.md +++ b/python/kserve/docs/V1alpha1ServingRuntimeSpec.md @@ -18,6 +18,7 @@ Name | Type | Description | Notes **storage_helper** | [**V1alpha1StorageHelper**](V1alpha1StorageHelper.md) | | [optional] **supported_model_formats** | [**list[V1alpha1SupportedModelFormat]**](V1alpha1SupportedModelFormat.md) | Model formats and version supported by this runtime | [optional] **tolerations** | [**list[V1Toleration]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Toleration.md) | If specified, the pod's tolerations. | [optional] +**volumes** | [**list[V1Volume]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Volume.md) | List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1beta1ComponentExtensionSpec.md b/python/kserve/docs/V1beta1ComponentExtensionSpec.md index 2dbbc8c2276..569eb94d6b9 100644 --- a/python/kserve/docs/V1beta1ComponentExtensionSpec.md +++ b/python/kserve/docs/V1beta1ComponentExtensionSpec.md @@ -10,6 +10,8 @@ Name | Type | Description | Notes **logger** | [**V1beta1LoggerSpec**](V1beta1LoggerSpec.md) | | [optional] **max_replicas** | **int** | Maximum number of replicas for autoscaling. 
| [optional] **min_replicas** | **int** | Minimum number of replicas, defaults to 1 but can be set to 0 to enable scale-to-zero. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **timeout** | **int** | TimeoutSeconds specifies the number of seconds to wait before timing out a request to the component. | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/python/kserve/docs/V1beta1ExplainerSpec.md b/python/kserve/docs/V1beta1ExplainerSpec.md index f0a3e36dcc1..54c38e1dc94 100644 --- a/python/kserve/docs/V1beta1ExplainerSpec.md +++ b/python/kserve/docs/V1beta1ExplainerSpec.md @@ -37,6 +37,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. 
If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. | [optional] diff --git a/python/kserve/docs/V1beta1PredictorSpec.md b/python/kserve/docs/V1beta1PredictorSpec.md index e0169c3033b..3362362b56f 100644 --- a/python/kserve/docs/V1beta1PredictorSpec.md +++ b/python/kserve/docs/V1beta1PredictorSpec.md @@ -40,6 +40,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. 
Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. 
| [optional] diff --git a/python/kserve/docs/V1beta1TransformerSpec.md b/python/kserve/docs/V1beta1TransformerSpec.md index 786f881aeb3..080d67e026a 100644 --- a/python/kserve/docs/V1beta1TransformerSpec.md +++ b/python/kserve/docs/V1beta1TransformerSpec.md @@ -34,6 +34,8 @@ Name | Type | Description | Notes **readiness_gates** | [**list[V1PodReadinessGate]**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodReadinessGate.md) | If specified, all readiness gates will be evaluated for pod readiness. A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to \"True\" More info: https://git.k8s.io/enhancements/keps/sig-network/0007-pod-ready%2B%2B.md | [optional] **restart_policy** | **str** | Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy | [optional] **runtime_class_name** | **str** | RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the \"legacy\" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/runtime-class.md This is a beta feature as of Kubernetes v1.14. | [optional] +**scale_metric** | **str** | ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). | [optional] +**scale_target** | **int** | ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). | [optional] **scheduler_name** | **str** | If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler. | [optional] **security_context** | [**V1PodSecurityContext**](https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1PodSecurityContext.md) | | [optional] **service_account** | **str** | DeprecatedServiceAccount is a depreciated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. | [optional] diff --git a/python/kserve/kserve/models/v1alpha1_built_in_adapter.py b/python/kserve/kserve/models/v1alpha1_built_in_adapter.py index 9a5908f6067..2375e7ed77d 100644 --- a/python/kserve/kserve/models/v1alpha1_built_in_adapter.py +++ b/python/kserve/kserve/models/v1alpha1_built_in_adapter.py @@ -182,7 +182,7 @@ def runtime_management_port(self, runtime_management_port): def server_type(self): """Gets the server_type of this V1alpha1BuiltInAdapter. # noqa: E501 - ServerType can be one of triton/mlserver and the runtime's container must have the same name # noqa: E501 + ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name # noqa: E501 :return: The server_type of this V1alpha1BuiltInAdapter. # noqa: E501 :rtype: str @@ -193,7 +193,7 @@ def server_type(self): def server_type(self, server_type): """Sets the server_type of this V1alpha1BuiltInAdapter. - ServerType can be one of triton/mlserver and the runtime's container must have the same name # noqa: E501 + ServerType must be one of the supported built-in types such as \"triton\" or \"mlserver\", and the runtime's container must have the same name # noqa: E501 :param server_type: The server_type of this V1alpha1BuiltInAdapter. 
# noqa: E501 :type: str diff --git a/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py b/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py index 7ca054f1fe0..2b674959adf 100644 --- a/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py +++ b/python/kserve/kserve/models/v1alpha1_serving_runtime_pod_spec.py @@ -50,17 +50,19 @@ class V1alpha1ServingRuntimePodSpec(object): 'affinity': 'V1Affinity', 'containers': 'list[V1Container]', 'node_selector': 'dict(str, str)', - 'tolerations': 'list[V1Toleration]' + 'tolerations': 'list[V1Toleration]', + 'volumes': 'list[V1Volume]' } attribute_map = { 'affinity': 'affinity', 'containers': 'containers', 'node_selector': 'nodeSelector', - 'tolerations': 'tolerations' + 'tolerations': 'tolerations', + 'volumes': 'volumes' } - def __init__(self, affinity=None, containers=None, node_selector=None, tolerations=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, affinity=None, containers=None, node_selector=None, tolerations=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1alpha1ServingRuntimePodSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -70,6 +72,7 @@ def __init__(self, affinity=None, containers=None, node_selector=None, toleratio self._containers = None self._node_selector = None self._tolerations = None + self._volumes = None self.discriminator = None if affinity is not None: @@ -79,6 +82,8 @@ def __init__(self, affinity=None, containers=None, node_selector=None, toleratio self.node_selector = node_selector if tolerations is not None: self.tolerations = tolerations + if volumes is not None: + self.volumes = volumes @property def affinity(self): @@ -172,6 +177,29 @@ def tolerations(self, tolerations): self._tolerations = tolerations + @property + def volumes(self): + """Gets the volumes of this V1alpha1ServingRuntimePodSpec. 
# noqa: E501 + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :return: The volumes of this V1alpha1ServingRuntimePodSpec. # noqa: E501 + :rtype: list[V1Volume] + """ + return self._volumes + + @volumes.setter + def volumes(self, volumes): + """Sets the volumes of this V1alpha1ServingRuntimePodSpec. + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :param volumes: The volumes of this V1alpha1ServingRuntimePodSpec. # noqa: E501 + :type: list[V1Volume] + """ + + self._volumes = volumes + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py b/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py index 2a0e40dbc8f..6841032a6ed 100644 --- a/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py +++ b/python/kserve/kserve/models/v1alpha1_serving_runtime_spec.py @@ -60,7 +60,8 @@ class V1alpha1ServingRuntimeSpec(object): 'replicas': 'int', 'storage_helper': 'V1alpha1StorageHelper', 'supported_model_formats': 'list[V1alpha1SupportedModelFormat]', - 'tolerations': 'list[V1Toleration]' + 'tolerations': 'list[V1Toleration]', + 'volumes': 'list[V1Volume]' } attribute_map = { @@ -77,10 +78,11 @@ class V1alpha1ServingRuntimeSpec(object): 'replicas': 'replicas', 'storage_helper': 'storageHelper', 'supported_model_formats': 'supportedModelFormats', - 'tolerations': 'tolerations' + 'tolerations': 'tolerations', + 'volumes': 'volumes' } - def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabled=None, grpc_data_endpoint=None, grpc_endpoint=None, http_data_endpoint=None, multi_model=None, node_selector=None, protocol_versions=None, replicas=None, storage_helper=None, supported_model_formats=None, tolerations=None, 
local_vars_configuration=None): # noqa: E501 + def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabled=None, grpc_data_endpoint=None, grpc_endpoint=None, http_data_endpoint=None, multi_model=None, node_selector=None, protocol_versions=None, replicas=None, storage_helper=None, supported_model_formats=None, tolerations=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1alpha1ServingRuntimeSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -100,6 +102,7 @@ def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabl self._storage_helper = None self._supported_model_formats = None self._tolerations = None + self._volumes = None self.discriminator = None if affinity is not None: @@ -129,6 +132,8 @@ def __init__(self, affinity=None, built_in_adapter=None, containers=None, disabl self.supported_model_formats = supported_model_formats if tolerations is not None: self.tolerations = tolerations + if volumes is not None: + self.volumes = volumes @property def affinity(self): @@ -448,6 +453,29 @@ def tolerations(self, tolerations): self._tolerations = tolerations + @property + def volumes(self): + """Gets the volumes of this V1alpha1ServingRuntimeSpec. # noqa: E501 + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :return: The volumes of this V1alpha1ServingRuntimeSpec. # noqa: E501 + :rtype: list[V1Volume] + """ + return self._volumes + + @volumes.setter + def volumes(self, volumes): + """Sets the volumes of this V1alpha1ServingRuntimeSpec. + + List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes # noqa: E501 + + :param volumes: The volumes of this V1alpha1ServingRuntimeSpec. 
# noqa: E501 + :type: list[V1Volume] + """ + + self._volumes = volumes + def to_dict(self): """Returns the model properties as a dict""" result = {} diff --git a/python/kserve/kserve/models/v1beta1_component_extension_spec.py b/python/kserve/kserve/models/v1beta1_component_extension_spec.py index 0a724563ff1..b2d4cdfe1a1 100644 --- a/python/kserve/kserve/models/v1beta1_component_extension_spec.py +++ b/python/kserve/kserve/models/v1beta1_component_extension_spec.py @@ -53,6 +53,8 @@ class V1beta1ComponentExtensionSpec(object): 'logger': 'V1beta1LoggerSpec', 'max_replicas': 'int', 'min_replicas': 'int', + 'scale_metric': 'str', + 'scale_target': 'int', 'timeout': 'int' } @@ -63,10 +65,12 @@ class V1beta1ComponentExtensionSpec(object): 'logger': 'logger', 'max_replicas': 'maxReplicas', 'min_replicas': 'minReplicas', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'timeout': 'timeout' } - def __init__(self, batcher=None, canary_traffic_percent=None, container_concurrency=None, logger=None, max_replicas=None, min_replicas=None, timeout=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, batcher=None, canary_traffic_percent=None, container_concurrency=None, logger=None, max_replicas=None, min_replicas=None, scale_metric=None, scale_target=None, timeout=None, local_vars_configuration=None): # noqa: E501 """V1beta1ComponentExtensionSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -78,6 +82,8 @@ def __init__(self, batcher=None, canary_traffic_percent=None, container_concurre self._logger = None self._max_replicas = None self._min_replicas = None + self._scale_metric = None + self._scale_target = None self._timeout = None self.discriminator = None @@ -93,6 +99,10 @@ def __init__(self, batcher=None, canary_traffic_percent=None, container_concurre self.max_replicas = max_replicas if min_replicas is not None: self.min_replicas = min_replicas + if 
scale_metric is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if timeout is not None: self.timeout = timeout @@ -230,6 +240,52 @@ def min_replicas(self, min_replicas): self._min_replicas = min_replicas + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1ComponentExtensionSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1ComponentExtensionSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1ComponentExtensionSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1ComponentExtensionSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1ComponentExtensionSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1ComponentExtensionSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def timeout(self): """Gets the timeout of this V1beta1ComponentExtensionSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_explainer_spec.py b/python/kserve/kserve/models/v1beta1_explainer_spec.py index cb9cd7c7de2..cf651caf320 100644 --- a/python/kserve/kserve/models/v1beta1_explainer_spec.py +++ b/python/kserve/kserve/models/v1beta1_explainer_spec.py @@ -80,6 +80,8 @@ class V1beta1ExplainerSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -128,6 +130,8 @@ class V1beta1ExplainerSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -142,7 +146,7 @@ class V1beta1ExplainerSpec(object): 'volumes': 'volumes' } - def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi=None, art=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, 
share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi=None, art=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1beta1ExplainerSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -181,6 +185,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi= self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -261,6 +267,10 @@ def __init__(self, active_deadline_seconds=None, affinity=None, aix=None, alibi= self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric is not None: + 
self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -1031,6 +1041,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1ExplainerSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1ExplainerSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1ExplainerSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1ExplainerSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1ExplainerSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1ExplainerSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1ExplainerSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_predictor_spec.py b/python/kserve/kserve/models/v1beta1_predictor_spec.py index 255adc6ad51..a1ce6ed880f 100644 --- a/python/kserve/kserve/models/v1beta1_predictor_spec.py +++ b/python/kserve/kserve/models/v1beta1_predictor_spec.py @@ -83,6 +83,8 @@ class V1beta1PredictorSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -138,6 +140,8 @@ class V1beta1PredictorSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -156,7 +160,7 @@ class V1beta1PredictorSpec(object): 'xgboost': 'xgboost' } - def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, lightgbm=None, logger=None, max_replicas=None, min_replicas=None, model=None, node_name=None, node_selector=None, onnx=None, overhead=None, paddle=None, pmml=None, preemption_policy=None, priority=None, priority_class_name=None, pytorch=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, 
set_hostname_as_fqdn=None, share_process_namespace=None, sklearn=None, subdomain=None, tensorflow=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, triton=None, volumes=None, xgboost=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, lightgbm=None, logger=None, max_replicas=None, min_replicas=None, model=None, node_name=None, node_selector=None, onnx=None, overhead=None, paddle=None, pmml=None, preemption_policy=None, priority=None, priority_class_name=None, pytorch=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, sklearn=None, subdomain=None, tensorflow=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, triton=None, volumes=None, xgboost=None, local_vars_configuration=None): # noqa: E501 """V1beta1PredictorSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -198,6 +202,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -288,6 +294,10 @@ def __init__(self, active_deadline_seconds=None, 
affinity=None, automount_servic self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -1129,6 +1139,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1PredictorSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1PredictorSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1PredictorSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1PredictorSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1PredictorSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1PredictorSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1PredictorSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). 
# noqa: E501 + + :param scale_target: The scale_target of this V1beta1PredictorSpec. # noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1PredictorSpec. # noqa: E501 diff --git a/python/kserve/kserve/models/v1beta1_transformer_spec.py b/python/kserve/kserve/models/v1beta1_transformer_spec.py index 30d321191d2..7054789f8f8 100644 --- a/python/kserve/kserve/models/v1beta1_transformer_spec.py +++ b/python/kserve/kserve/models/v1beta1_transformer_spec.py @@ -77,6 +77,8 @@ class V1beta1TransformerSpec(object): 'readiness_gates': 'list[V1PodReadinessGate]', 'restart_policy': 'str', 'runtime_class_name': 'str', + 'scale_metric': 'str', + 'scale_target': 'int', 'scheduler_name': 'str', 'security_context': 'V1PodSecurityContext', 'service_account': 'str', @@ -122,6 +124,8 @@ class V1beta1TransformerSpec(object): 'readiness_gates': 'readinessGates', 'restart_policy': 'restartPolicy', 'runtime_class_name': 'runtimeClassName', + 'scale_metric': 'scaleMetric', + 'scale_target': 'scaleTarget', 'scheduler_name': 'schedulerName', 'security_context': 'securityContext', 'service_account': 'serviceAccount', @@ -136,7 +140,7 @@ class V1beta1TransformerSpec(object): 'volumes': 'volumes' } - def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scheduler_name=None, security_context=None, service_account=None, 
service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, affinity=None, automount_service_account_token=None, batcher=None, canary_traffic_percent=None, container_concurrency=None, containers=None, dns_config=None, dns_policy=None, enable_service_links=None, ephemeral_containers=None, host_aliases=None, host_ipc=None, host_network=None, host_pid=None, hostname=None, image_pull_secrets=None, init_containers=None, logger=None, max_replicas=None, min_replicas=None, node_name=None, node_selector=None, overhead=None, preemption_policy=None, priority=None, priority_class_name=None, readiness_gates=None, restart_policy=None, runtime_class_name=None, scale_metric=None, scale_target=None, scheduler_name=None, security_context=None, service_account=None, service_account_name=None, set_hostname_as_fqdn=None, share_process_namespace=None, subdomain=None, termination_grace_period_seconds=None, timeout=None, tolerations=None, topology_spread_constraints=None, volumes=None, local_vars_configuration=None): # noqa: E501 """V1beta1TransformerSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -172,6 +176,8 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self._readiness_gates = None self._restart_policy = None self._runtime_class_name = None + self._scale_metric = None + self._scale_target = None self._scheduler_name = None self._security_context = None self._service_account = None @@ -246,6 +252,10 @@ def __init__(self, active_deadline_seconds=None, affinity=None, automount_servic self.restart_policy = restart_policy if runtime_class_name is not None: self.runtime_class_name = runtime_class_name + if scale_metric 
is not None: + self.scale_metric = scale_metric + if scale_target is not None: + self.scale_target = scale_target if scheduler_name is not None: self.scheduler_name = scheduler_name if security_context is not None: @@ -953,6 +963,52 @@ def runtime_class_name(self, runtime_class_name): self._runtime_class_name = runtime_class_name + @property + def scale_metric(self): + """Gets the scale_metric of this V1beta1TransformerSpec. # noqa: E501 + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :return: The scale_metric of this V1beta1TransformerSpec. # noqa: E501 + :rtype: str + """ + return self._scale_metric + + @scale_metric.setter + def scale_metric(self, scale_metric): + """Sets the scale_metric of this V1beta1TransformerSpec. + + ScaleMetric specifies scaling metric of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-metrics/). # noqa: E501 + + :param scale_metric: The scale_metric of this V1beta1TransformerSpec. # noqa: E501 + :type: str + """ + + self._scale_metric = scale_metric + + @property + def scale_target(self): + """Gets the scale_target of this V1beta1TransformerSpec. # noqa: E501 + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :return: The scale_target of this V1beta1TransformerSpec. # noqa: E501 + :rtype: int + """ + return self._scale_target + + @scale_target.setter + def scale_target(self, scale_target): + """Sets the scale_target of this V1beta1TransformerSpec. + + ScaleTarget specifies scaling value of the component concurrency(https://knative.dev/docs/serving/autoscaling/autoscaling-targets/). # noqa: E501 + + :param scale_target: The scale_target of this V1beta1TransformerSpec. 
# noqa: E501 + :type: int + """ + + self._scale_target = scale_target + @property def scheduler_name(self): """Gets the scheduler_name of this V1beta1TransformerSpec. # noqa: E501 diff --git a/test/crds/serving.kserve.io_inferenceservices.yaml b/test/crds/serving.kserve.io_inferenceservices.yaml index 6b06189f202..3baeedf2811 100644 --- a/test/crds/serving.kserve.io_inferenceservices.yaml +++ b/test/crds/serving.kserve.io_inferenceservices.yaml @@ -4676,6 +4676,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -9921,6 +9930,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: @@ -14027,6 +14045,15 @@ spec: type: string runtimeClassName: type: string + scaleMetric: + enum: + - cpu + - memory + - concurrency + - rps + type: string + scaleTarget: + type: integer schedulerName: type: string securityContext: diff --git a/test/e2e/predictor/test_autoscaling.py b/test/e2e/predictor/test_autoscaling.py new file mode 100644 index 00000000000..4bcdba9303a --- /dev/null +++ b/test/e2e/predictor/test_autoscaling.py @@ -0,0 +1,200 @@ +# Copyright 2022 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from kubernetes import client +from kserve import ( + constants, + KServeClient, + V1beta1InferenceService, + V1beta1InferenceServiceSpec, + V1beta1PredictorSpec, + V1beta1SKLearnSpec, +) +from kubernetes.client import V1ResourceRequirements +import pytest + +from ..common.utils import predict +from ..common.utils import KSERVE_TEST_NAMESPACE + +TARGET = "autoscaling.knative.dev/target" +METRIC = "autoscaling.knative.dev/metric" +MODEL = "gs://kfserving-examples/models/sklearn/1.0/model" +INPUT = "./data/iris_input.json" + + +def test_sklearn_kserve_concurrency(): + service_name = "isvc-sklearn-scale-concurrency" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='concurrency', + scale_target=2, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + isvc_annotations = pods.items[0].metadata.annotations + + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + assert(isvc_annotations[METRIC] == 'concurrency') + assert(isvc_annotations[TARGET] == '2') + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +def test_sklearn_kserve_rps(): + service_name = "isvc-sklearn-scale-rps" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='rps', + 
scale_target=5, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + annotations = pods.items[0].metadata.annotations + + assert(annotations[METRIC] == 'rps') + assert(annotations[TARGET] == '5') + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +@pytest.mark.skip() +def test_sklearn_kserve_cpu(): + service_name = "isvc-sklearn-scale-cpu" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='cpu', + scale_target=50, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + annotations = dict() + annotations['autoscaling.knative.dev/class'] = 'hpa.autoscaling.knative.dev' + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE, + annotations=annotations + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + 
kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + pods = kserve_client.core_api.list_namespaced_pod(KSERVE_TEST_NAMESPACE, + label_selector='serving.kserve.io/inferenceservice={}' + .format(service_name)) + + isvc_annotations = pods.items[0].metadata.annotations + + assert(isvc_annotations[METRIC] == 'cpu') + assert(isvc_annotations[TARGET] == '50') + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) + + +def test_sklearn_kserve_raw(): + service_name = "isvc-sklearn-scale-raw" + predictor = V1beta1PredictorSpec( + min_replicas=1, + scale_metric='cpu', + scale_target=50, + sklearn=V1beta1SKLearnSpec( + storage_uri=MODEL, + resources=V1ResourceRequirements( + requests={"cpu": "50m", "memory": "128Mi"}, + limits={"cpu": "100m", "memory": "256Mi"}, + ), + ), + ) + + annotations = dict() + annotations['serving.kserve.io/deploymentMode'] = 'RawDeployment' + + isvc = V1beta1InferenceService( + api_version=constants.KSERVE_V1BETA1, + kind=constants.KSERVE_KIND, + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE, + annotations=annotations + ), + spec=V1beta1InferenceServiceSpec(predictor=predictor), + ) + + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + kserve_client.create(isvc) + kserve_client.wait_isvc_ready(service_name, namespace=KSERVE_TEST_NAMESPACE) + api_instance = kserve_client.api_instance + hpa_resp = api_instance.list_namespaced_custom_object(group='autoscaling', version='v1', + namespace=KSERVE_TEST_NAMESPACE, + plural='horizontalpodautoscalers') + + assert(hpa_resp['items'][0]['spec']['targetCPUUtilizationPercentage'] == 50) + res = predict(service_name, INPUT) + assert res["predictions"] == [1, 1] + kserve_client.delete(service_name, KSERVE_TEST_NAMESPACE) diff --git a/test/scripts/gh-actions/setup-deps.sh b/test/scripts/gh-actions/setup-deps.sh index 
e7db7c3f1ab..e2cd68c76dc 100755 --- a/test/scripts/gh-actions/setup-deps.sh +++ b/test/scripts/gh-actions/setup-deps.sh @@ -77,6 +77,9 @@ for i in 1 2 3 ; do kustomize build test/overlays/knative | kubectl apply -f - & echo "Waiting for Knative to be ready ..." kubectl wait --for=condition=Ready pods --all --timeout=180s -n knative-serving -l 'app in (activator,autoscaler,autoscaler-hpa,controller,net-istio-controller,net-istio-webhook)' +echo "Add knative hpa..." +# kubectl apply -f https://github.com/knative/serving/releases/download/knative-v1.0.0/serving-hpa.yaml + # Skip tag resolution for certain domains kubectl patch cm config-deployment --patch '{"data":{"registries-skipping-tag-resolving":"nvcr.io,index.docker.io"}}' -n knative-serving