Remove default queue proxy resource limit and Add KFServing benchmarking (kubeflow#894)

* Add benchmark script
* Add sklearn benchmarking
* Add tf benchmark
* Add perf result for autoscaler and activator
* Add HPA experiments
* Address comments
* Bump up tf flower example cpu limit
* Bump cpu limit for tf benchmark
* Update hpa result for tf
* Fix hpa benchmark
magdalenakuhn17 · Jul 6, 2020 · 941a38e · 941a38e
1 parent 97d1c74
commit 941a38e
Show file tree

Hide file tree

Showing 14 changed files with 482 additions and 58 deletions.
diff --git a/README.md b/README.md
@@ -146,6 +146,7 @@ Error Set:
 [Developer Guide](/docs/DEVELOPER_GUIDE.md).
 
 ### Performance Tests
+[KFServing benchmark test comparing Knative and Kubernetes Deployment with HPA](test/benchmark/README.md)
 [Performance Tests](https://docs.google.com/document/d/1ss7M3cx1qD1PVpTaKTu_Y3C80JJz4nvMZlIyuZutZoE/edit#)
 
 ### Contributor Guide

diff --git a/pkg/controller/inferenceservice/controller_test.go b/pkg/controller/inferenceservice/controller_test.go
@@ -25,8 +25,6 @@ import (
 
 	"knative.dev/pkg/network"
 
-	"github.com/kubeflow/kfserving/pkg/controller/inferenceservice/resources/knative"
-
 	"k8s.io/apimachinery/pkg/api/errors"
 
 	"github.com/google/go-cmp/cmp"
@@ -174,7 +172,6 @@ func TestInferenceServiceWithOnlyPredictor(t *testing.T) {
 							"autoscaling.knative.dev/class":                            "kpa.autoscaling.knative.dev",
 							"autoscaling.knative.dev/maxScale":                         "3",
 							"autoscaling.knative.dev/minScale":                         "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage":     knative.DefaultQueueSideCarResourcePercentage,
 							constants.StorageInitializerSourceUriInternalAnnotationKey: defaultInstance.Spec.Default.Predictor.Tensorflow.StorageURI,
 						},
 					},
@@ -495,7 +492,6 @@ func TestInferenceServiceWithDefaultAndCanaryPredictor(t *testing.T) {
 							"autoscaling.knative.dev/class":                            "kpa.autoscaling.knative.dev",
 							"autoscaling.knative.dev/maxScale":                         "3",
 							"autoscaling.knative.dev/minScale":                         "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage":     knative.DefaultQueueSideCarResourcePercentage,
 							constants.StorageInitializerSourceUriInternalAnnotationKey: canary.Spec.Canary.Predictor.Tensorflow.StorageURI,
 						},
 					},
@@ -1097,11 +1093,10 @@ func TestInferenceServiceWithTransformer(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Transformer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/target":                       "1",
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/maxScale":                     "3",
-							"autoscaling.knative.dev/minScale":                     "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/target":   "1",
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/maxScale": "3",
+							"autoscaling.knative.dev/minScale": "1",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{
@@ -1661,11 +1656,10 @@ func TestInferenceServiceWithExplainer(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Explainer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/target":                       "1",
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/maxScale":                     "3",
-							"autoscaling.knative.dev/minScale":                     "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/target":   "1",
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/maxScale": "3",
+							"autoscaling.knative.dev/minScale": "1",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{

diff --git a/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go b/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go
@@ -22,8 +22,6 @@ import (
 	"testing"
 	"time"
 
-	"github.com/kubeflow/kfserving/pkg/controller/inferenceservice/resources/knative"
-
 	"github.com/google/go-cmp/cmp"
 	"github.com/kubeflow/kfserving/pkg/apis/serving/v1alpha2"
 	"github.com/kubeflow/kfserving/pkg/constants"
@@ -130,7 +128,6 @@ func TestKnativeServiceReconcile(t *testing.T) {
 									"autoscaling.knative.dev/minScale":                            "1",
 									"autoscaling.knative.dev/target":                              "1",
 									"internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":        knative.DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -175,7 +172,6 @@ func TestKnativeServiceReconcile(t *testing.T) {
 									"autoscaling.knative.dev/minScale":                            "1",
 									"autoscaling.knative.dev/target":                              "1",
 									"internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri2",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":        knative.DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -238,7 +234,6 @@ func TestKnativeServiceReconcile(t *testing.T) {
 									"autoscaling.knative.dev/minScale":                            "1",
 									"autoscaling.knative.dev/target":                              "1",
 									"internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":        knative.DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{

diff --git a/pkg/controller/inferenceservice/resources/knative/service.go b/pkg/controller/inferenceservice/resources/knative/service.go
@@ -29,7 +29,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"knative.dev/serving/pkg/apis/autoscaling"
-	"knative.dev/serving/pkg/apis/serving"
 	knservingv1 "knative.dev/serving/pkg/apis/serving/v1"
 )
 
@@ -40,12 +39,6 @@ var serviceAnnotationDisallowedList = []string{
 	"kubectl.kubernetes.io/last-applied-configuration",
 }
 
-const (
-	// Set to 20% of the resource for main container, InferenceService defaults to 1CPU which is 200m for queue-proxy
-	// https://github.com/knative/serving/blob/1d263950f9f2fea85a4dd394948a029c328af9d9/pkg/reconciler/revision/resources/resourceboundary.go#L30
-	DefaultQueueSideCarResourcePercentage = "20"
-)
-
 type ServiceBuilder struct {
 	inferenceServiceConfig *v1alpha2.InferenceServicesConfig
 	credentialBuilder      *credentials.CredentialBuilder
@@ -393,9 +386,6 @@ func (c *ServiceBuilder) buildAnnotations(metadata metav1.ObjectMeta, minReplica
 		annotations[autoscaling.MaxScaleAnnotationKey] = fmt.Sprint(maxReplicas)
 	}
 
-	if _, ok := annotations[serving.QueueSideCarResourcePercentageAnnotation]; !ok {
-		annotations[serving.QueueSideCarResourcePercentageAnnotation] = DefaultQueueSideCarResourcePercentage
-	}
 	// User can pass down scaling target annotation to overwrite the target default 1
 	if _, ok := annotations[autoscaling.TargetAnnotationKey]; !ok {
 		if parallelism == 0 {

diff --git a/pkg/controller/inferenceservice/resources/knative/service_test.go b/pkg/controller/inferenceservice/resources/knative/service_test.go
@@ -46,8 +46,7 @@ var isvc = v1alpha2.InferenceService{
 		Name:      "mnist",
 		Namespace: "default",
 		Annotations: map[string]string{
-			constants.InferenceServiceGKEAcceleratorAnnotationKey:  "nvidia-tesla-t4",
-			"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+			constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4",
 		},
 	},
 	Spec: v1alpha2.InferenceServiceSpec{
@@ -100,7 +99,6 @@ var defaultService = &knservingv1.Service{
 						"autoscaling.knative.dev/target":                           "1",
 						"autoscaling.knative.dev/minScale":                         "1",
 						"autoscaling.knative.dev/maxScale":                         "3",
-						"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 						constants.InferenceServiceGKEAcceleratorAnnotationKey:      "nvidia-tesla-t4",
 						constants.StorageInitializerSourceUriInternalAnnotationKey: isvc.Spec.Default.Predictor.Tensorflow.StorageURI,
 					},
@@ -149,7 +147,6 @@ var canaryService = &knservingv1.Service{
 						"autoscaling.knative.dev/target":                           "1",
 						"autoscaling.knative.dev/minScale":                         "1",
 						"autoscaling.knative.dev/maxScale":                         "3",
-						"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 						constants.InferenceServiceGKEAcceleratorAnnotationKey:      "nvidia-tesla-t4",
 						constants.StorageInitializerSourceUriInternalAnnotationKey: "s3://test/mnist-2/export",
 					},
@@ -195,8 +192,7 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 					Name:      "mnist",
 					Namespace: "default",
 					Annotations: map[string]string{
-						constants.InferenceServiceGKEAcceleratorAnnotationKey:  "nvidia-tesla-t4",
-						"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+						constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4",
 					},
 				},
 				Spec: v1alpha2.InferenceServiceSpec{
@@ -273,7 +269,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 									"autoscaling.knative.dev/class":                            "kpa.autoscaling.knative.dev",
 									"autoscaling.knative.dev/minScale":                         "1",
 									"autoscaling.knative.dev/target":                           "1",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -334,7 +329,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 									"autoscaling.knative.dev/class":                            "kpa.autoscaling.knative.dev",
 									"autoscaling.knative.dev/minScale":                         "1",
 									"autoscaling.knative.dev/target":                           "1",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -396,7 +390,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 									"autoscaling.knative.dev/class":                            "kpa.autoscaling.knative.dev",
 									"autoscaling.knative.dev/minScale":                         "1",
 									"autoscaling.knative.dev/target":                           "1",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -434,7 +427,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 						"autoscaling.knative.dev/target":   "2",
 						constants.StorageInitializerSourceUriInternalAnnotationKey: "test",
 						"kubectl.kubernetes.io/last-applied-configuration":         "test2",
-						"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 					},
 				},
 				Spec: v1alpha2.InferenceServiceSpec{
@@ -472,7 +464,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) {
 									"autoscaling.knative.dev/minScale":                         "1",
 									"sourceName":                                               "srcName",
 									"prop1":                                                    "val1",
-									"queue.sidecar.serving.knative.dev/resourcePercentage":     DefaultQueueSideCarResourcePercentage,
 								},
 							},
 							Spec: knservingv1.RevisionSpec{
@@ -613,11 +604,10 @@ func TestTransformerToKnativeService(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Transformer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/target":                       "1",
-							"autoscaling.knative.dev/minScale":                     "1",
-							"autoscaling.knative.dev/maxScale":                     "3",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/target":   "1",
+							"autoscaling.knative.dev/minScale": "1",
+							"autoscaling.knative.dev/maxScale": "3",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{
@@ -660,11 +650,10 @@ func TestTransformerToKnativeService(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Transformer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/target":                       "1",
-							"autoscaling.knative.dev/minScale":                     "2",
-							"autoscaling.knative.dev/maxScale":                     "4",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/target":   "1",
+							"autoscaling.knative.dev/minScale": "2",
+							"autoscaling.knative.dev/maxScale": "4",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{
@@ -807,10 +796,9 @@ func TestExplainerToKnativeService(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Explainer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/minScale":                     "1",
-							"autoscaling.knative.dev/target":                       "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/minScale": "1",
+							"autoscaling.knative.dev/target":   "1",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{
@@ -853,10 +841,9 @@ func TestExplainerToKnativeService(t *testing.T) {
 							constants.KServiceComponentLabel: constants.Explainer.String(),
 						},
 						Annotations: map[string]string{
-							"autoscaling.knative.dev/class":                        "kpa.autoscaling.knative.dev",
-							"autoscaling.knative.dev/minScale":                     "1",
-							"autoscaling.knative.dev/target":                       "1",
-							"queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage,
+							"autoscaling.knative.dev/class":    "kpa.autoscaling.knative.dev",
+							"autoscaling.knative.dev/minScale": "1",
+							"autoscaling.knative.dev/target":   "1",
 						},
 					},
 					Spec: knservingv1.RevisionSpec{