diff --git a/README.md b/README.md index ffb1b444102..fdf78845cb2 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,7 @@ Error Set: [Developer Guide](/docs/DEVELOPER_GUIDE.md). ### Performance Tests +[KFServing benchmark test comparing Knative and Kubernetes Deployment with HPA](test/benchmark/README.md) [Performance Tests](https://docs.google.com/document/d/1ss7M3cx1qD1PVpTaKTu_Y3C80JJz4nvMZlIyuZutZoE/edit#) ### Contributor Guide diff --git a/pkg/controller/inferenceservice/controller_test.go b/pkg/controller/inferenceservice/controller_test.go index b0d68903041..a5a7731a626 100644 --- a/pkg/controller/inferenceservice/controller_test.go +++ b/pkg/controller/inferenceservice/controller_test.go @@ -25,8 +25,6 @@ import ( "knative.dev/pkg/network" - "github.com/kubeflow/kfserving/pkg/controller/inferenceservice/resources/knative" - "k8s.io/apimachinery/pkg/api/errors" "github.com/google/go-cmp/cmp" @@ -174,7 +172,6 @@ func TestInferenceServiceWithOnlyPredictor(t *testing.T) { "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/maxScale": "3", "autoscaling.knative.dev/minScale": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, constants.StorageInitializerSourceUriInternalAnnotationKey: defaultInstance.Spec.Default.Predictor.Tensorflow.StorageURI, }, }, @@ -495,7 +492,6 @@ func TestInferenceServiceWithDefaultAndCanaryPredictor(t *testing.T) { "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/maxScale": "3", "autoscaling.knative.dev/minScale": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, constants.StorageInitializerSourceUriInternalAnnotationKey: canary.Spec.Canary.Predictor.Tensorflow.StorageURI, }, }, @@ -1097,11 +1093,10 @@ func TestInferenceServiceWithTransformer(t *testing.T) { constants.KServiceComponentLabel: constants.Transformer.String(), }, Annotations: 
map[string]string{ - "autoscaling.knative.dev/target": "1", - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/maxScale": "3", - "autoscaling.knative.dev/minScale": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/target": "1", + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/maxScale": "3", + "autoscaling.knative.dev/minScale": "1", }, }, Spec: knservingv1.RevisionSpec{ @@ -1661,11 +1656,10 @@ func TestInferenceServiceWithExplainer(t *testing.T) { constants.KServiceComponentLabel: constants.Explainer.String(), }, Annotations: map[string]string{ - "autoscaling.knative.dev/target": "1", - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/maxScale": "3", - "autoscaling.knative.dev/minScale": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/target": "1", + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/maxScale": "3", + "autoscaling.knative.dev/minScale": "1", }, }, Spec: knservingv1.RevisionSpec{ diff --git a/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go b/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go index 2882b342eb2..fd9fe486f43 100644 --- a/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go +++ b/pkg/controller/inferenceservice/reconcilers/knative/service_reconciler_test.go @@ -22,8 +22,6 @@ import ( "testing" "time" - "github.com/kubeflow/kfserving/pkg/controller/inferenceservice/resources/knative" - "github.com/google/go-cmp/cmp" "github.com/kubeflow/kfserving/pkg/apis/serving/v1alpha2" "github.com/kubeflow/kfserving/pkg/constants" @@ -130,7 +128,6 @@ func TestKnativeServiceReconcile(t *testing.T) { 
"autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", "internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -175,7 +172,6 @@ func TestKnativeServiceReconcile(t *testing.T) { "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", "internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri2", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -238,7 +234,6 @@ func TestKnativeServiceReconcile(t *testing.T) { "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", "internal.serving.kubeflow.org/storage-initializer-sourceuri": "gs://testuri", - "queue.sidecar.serving.knative.dev/resourcePercentage": knative.DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ diff --git a/pkg/controller/inferenceservice/resources/knative/service.go b/pkg/controller/inferenceservice/resources/knative/service.go index ea36b5d294e..74495837771 100644 --- a/pkg/controller/inferenceservice/resources/knative/service.go +++ b/pkg/controller/inferenceservice/resources/knative/service.go @@ -29,7 +29,6 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "knative.dev/serving/pkg/apis/autoscaling" - "knative.dev/serving/pkg/apis/serving" knservingv1 "knative.dev/serving/pkg/apis/serving/v1" ) @@ -40,12 +39,6 @@ var serviceAnnotationDisallowedList = []string{ "kubectl.kubernetes.io/last-applied-configuration", } -const ( - // Set to 20% of the resource for main container, InferenceService defaults to 1CPU which is 200m for queue-proxy - // https://github.com/knative/serving/blob/1d263950f9f2fea85a4dd394948a029c328af9d9/pkg/reconciler/revision/resources/resourceboundary.go#L30 - 
DefaultQueueSideCarResourcePercentage = "20" -) - type ServiceBuilder struct { inferenceServiceConfig *v1alpha2.InferenceServicesConfig credentialBuilder *credentials.CredentialBuilder @@ -393,9 +386,6 @@ func (c *ServiceBuilder) buildAnnotations(metadata metav1.ObjectMeta, minReplica annotations[autoscaling.MaxScaleAnnotationKey] = fmt.Sprint(maxReplicas) } - if _, ok := annotations[serving.QueueSideCarResourcePercentageAnnotation]; !ok { - annotations[serving.QueueSideCarResourcePercentageAnnotation] = DefaultQueueSideCarResourcePercentage - } // User can pass down scaling target annotation to overwrite the target default 1 if _, ok := annotations[autoscaling.TargetAnnotationKey]; !ok { if parallelism == 0 { diff --git a/pkg/controller/inferenceservice/resources/knative/service_test.go b/pkg/controller/inferenceservice/resources/knative/service_test.go index 0fcec86004d..9be797786ed 100644 --- a/pkg/controller/inferenceservice/resources/knative/service_test.go +++ b/pkg/controller/inferenceservice/resources/knative/service_test.go @@ -46,8 +46,7 @@ var isvc = v1alpha2.InferenceService{ Name: "mnist", Namespace: "default", Annotations: map[string]string{ - constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", }, }, Spec: v1alpha2.InferenceServiceSpec{ @@ -100,7 +99,6 @@ var defaultService = &knservingv1.Service{ "autoscaling.knative.dev/target": "1", "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/maxScale": "3", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", constants.StorageInitializerSourceUriInternalAnnotationKey: isvc.Spec.Default.Predictor.Tensorflow.StorageURI, }, @@ -149,7 +147,6 @@ var canaryService = &knservingv1.Service{ 
"autoscaling.knative.dev/target": "1", "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/maxScale": "3", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", constants.StorageInitializerSourceUriInternalAnnotationKey: "s3://test/mnist-2/export", }, @@ -195,8 +192,7 @@ func TestInferenceServiceToKnativeService(t *testing.T) { Name: "mnist", Namespace: "default", Annotations: map[string]string{ - constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + constants.InferenceServiceGKEAcceleratorAnnotationKey: "nvidia-tesla-t4", }, }, Spec: v1alpha2.InferenceServiceSpec{ @@ -273,7 +269,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) { "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -334,7 +329,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) { "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -396,7 +390,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) { "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/minScale": "1", "autoscaling.knative.dev/target": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -434,7 +427,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) { 
"autoscaling.knative.dev/target": "2", constants.StorageInitializerSourceUriInternalAnnotationKey: "test", "kubectl.kubernetes.io/last-applied-configuration": "test2", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, }, }, Spec: v1alpha2.InferenceServiceSpec{ @@ -472,7 +464,6 @@ func TestInferenceServiceToKnativeService(t *testing.T) { "autoscaling.knative.dev/minScale": "1", "sourceName": "srcName", "prop1": "val1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, }, }, Spec: knservingv1.RevisionSpec{ @@ -613,11 +604,10 @@ func TestTransformerToKnativeService(t *testing.T) { constants.KServiceComponentLabel: constants.Transformer.String(), }, Annotations: map[string]string{ - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/target": "1", - "autoscaling.knative.dev/minScale": "1", - "autoscaling.knative.dev/maxScale": "3", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/target": "1", + "autoscaling.knative.dev/minScale": "1", + "autoscaling.knative.dev/maxScale": "3", }, }, Spec: knservingv1.RevisionSpec{ @@ -660,11 +650,10 @@ func TestTransformerToKnativeService(t *testing.T) { constants.KServiceComponentLabel: constants.Transformer.String(), }, Annotations: map[string]string{ - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/target": "1", - "autoscaling.knative.dev/minScale": "2", - "autoscaling.knative.dev/maxScale": "4", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/target": "1", + "autoscaling.knative.dev/minScale": "2", + "autoscaling.knative.dev/maxScale": "4", }, }, Spec: knservingv1.RevisionSpec{ 
@@ -807,10 +796,9 @@ func TestExplainerToKnativeService(t *testing.T) { constants.KServiceComponentLabel: constants.Explainer.String(), }, Annotations: map[string]string{ - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/minScale": "1", - "autoscaling.knative.dev/target": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/minScale": "1", + "autoscaling.knative.dev/target": "1", }, }, Spec: knservingv1.RevisionSpec{ @@ -853,10 +841,9 @@ func TestExplainerToKnativeService(t *testing.T) { constants.KServiceComponentLabel: constants.Explainer.String(), }, Annotations: map[string]string{ - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/minScale": "1", - "autoscaling.knative.dev/target": "1", - "queue.sidecar.serving.knative.dev/resourcePercentage": DefaultQueueSideCarResourcePercentage, + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/minScale": "1", + "autoscaling.knative.dev/target": "1", }, }, Spec: knservingv1.RevisionSpec{ diff --git a/test/benchmark/README.md b/test/benchmark/README.md new file mode 100644 index 00000000000..8d3bcdf0a78 --- /dev/null +++ b/test/benchmark/README.md @@ -0,0 +1,289 @@ +# Benchmark + +This benchmark focus on testing KFServing performance with and without Knative queue proxy/activator on the request path. + +* Knative queue proxy does the following for the KFServing main container. + - Enforces concurrency level for the pod + - Emit metrics for autoscaling(KPA) + - Timeout enforcement + - Readiness probe + - Queue limiting + - Distributed tracing + - Graceful shutdown handling + +* Knative activator buffers the requests while pods are scaled down to zero and report metrics to autoscaler. 
The activator +also effectively acts as a load balancer which distributes the load across all the pods as they become available in a way that +does not overload them with regard to their concurrency settings. So it protects the app from bursts so you do not see messages +queuing in the user pods. + +## Environment Setup +- K8S: v1.14.10-gke.36(8 nodes n1-standard) +- Istio: 1.1.6 +- Knative: 0.11.2 +- KFServing: master(with fix for https://github.com/kubeflow/kfserving/issues/844) + +Note that `v1.14.10-gke.36` suffers from the [CFS throttling bug](https://github.com/kubernetes/kubernetes/issues/67577), +and `1.15.11-gke.15` includes the CFS throttling fix. + +## Benchmarking + +### Results on KFServing SKLearn Iris Example +- Create `InferenceService` +```bash +kubectl apply -f ./sklearn.yaml +``` +- Create the input vegeta configmap +```bash +kubectl apply -f ./sklearn_vegeta_cfg.yaml +``` +- Create the benchmark job using [vegeta](https://github.com/tsenart/vegeta) +Note that you can configure pod anti-affinity to run vegeta on a different node from the one on which the inference pod is running. +```bash +kubectl create -f ./sk_benchmark.yaml +``` + +#### CC=8 With queue proxy and activator on the request path +Create an `InferenceService` with `ContainerConcurrency`(cc) set to 8 which is equal to the number of cores on the node. 
+```yaml +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "sklearn-iris" +spec: + default: + parallelism: 8 # CC=8 + predictor: + sklearn: + storageUri: "gs://kfserving-samples/models/sklearn/iris" +``` + +| QPS/Replicas | mean | p50 | p95 | p99 | Success Rate | +| --- | --- | --- | --- | --- | --- | +| 5/s minReplicas=1 | 6.213ms | 5.915ms | 6.992ms | 7.615ms | 100% | +| 50/s minReplicas=1 | 5.738ms | 5.608ms | 6.483ms | 6.801ms | 100% | +| 500/s minReplicas=1 | 4.083ms | 3.743ms | 4.929ms | 5.642ms | 100% | +| 1000/s minReplicas=1 | 398.562ms | 5.95ms | 2.945s | 3.691s | 100% | + +#### Raw Kubernetes Service(Without queue proxy and activator on the request path) +- Update the SKLearn Iris `InferenceService` with the following YAML to use HPA +```yaml +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "sklearn-iris" + annotations: + autoscaling.knative.dev/class: hpa.autoscaling.knative.dev + autoscaling.knative.dev/metric: cpu + autoscaling.knative.dev/target: "80" +spec: + default: + predictor: + sklearn: + storageUri: "gs://kfserving-samples/models/sklearn/iris" +``` +```bash +kubectl apply -f ./sklearn_hpa.yaml +``` +- Set up a virtual service to go directly to the private service to bypass the Knative Activator and queue-proxy, and change the benchmark +test target URL host to `sklearn-iris-raw.default.svc.cluster.local`. 
+```yaml +apiVersion: v1 +kind: Service +metadata: + name: sklearn-iris-raw +spec: + externalName: cluster-local-gateway.istio-system.svc.cluster.local + sessionAffinity: None + type: ExternalName +--- +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: sklearn-iris-raw +spec: + gateways: + - knative-serving/cluster-local-gateway + hosts: + - sklearn-iris-raw.default.svc.cluster.local + http: + - match: + - authority: + regex: ^sklearn-iris-raw\.default(\.svc(\.cluster\.local)?)?(?::\d{1,5})?$ + gateways: + - knative-serving/cluster-local-gateway + uri: + regex: ^/v1/models/[\w-]+(:predict)? + route: + - destination: + host: sklearn-iris-predictor-default-xt264-private.default.svc.cluster.local #this is the private service to user container + port: + number: 80 + weight: 100 +``` + +| QPS/Replicas | mean | p50 | p95 | p99 | Success Rate | +| --- | --- | --- | --- | --- | --- | +| 5/s Replicas=1 | 2.673ms | 2.381ms | 4.352ms | 5.966ms | 100% | +| 50/s Replicas=1 | 2.188ms | 2.117ms | 2.684ms | 3.02ms | 100% | +| 500/s Replicas=1 | 1.376ms | 1.283ms | 1.713ms | 2.205ms | 100% | +| 1000/s Replicas=1 | 7.969s | 8.658s | 16.669s | 20.307s | 93.72% | + +So you can see that queue-proxy and activator add 2-3 milliseconds of overhead, but you get the advantage of KPA and +smart load balancing. For this example we do not see much benefit because the request takes only 1-2 ms to process; +however, you can see the obvious advantage when request volume goes to 1000/s and KPA reacts faster and performs better +than HPA. 
+ +### Results on KFServing with TFServing Flower Example +- Create `InferenceService` +```bash +kubectl apply -f ../../docs/samples/tensorflow/tensorflow.yaml +``` +- Create the input vegeta configmap +```bash +kubectl apply -f ./tf_vegeta_cfg.yaml +``` +- Create the benchmark job using [vegeta](https://github.com/tsenart/vegeta) +Note that you can configure pod anti-affinity to run vegeta on a different node from the one on which the inference pod is running. +```bash +kubectl create -f ./tf_benchmark.yaml +``` + +#### CC=0 +- Create `InferenceService` with default `ContainerConcurrency` set to 0 which is unlimited concurrency; the activator in this case just passes +through and you would still expect requests queued on the user container in case of request overload. +```yaml +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "flowers-sample" +spec: + default: + predictor: + tensorflow: + storageUri: "gs://kfserving-samples/models/tensorflow/flowers" + resources: + requests: + cpu: "4" + memory: 2Gi + limits: + cpu: "4" + memory: 2Gi +``` + +```bash +kubectl apply -f ./tf_flowers.yaml +``` + +| QPS/Replicas | mean | p50 | p95 | p99 | Success Rate | +| --- | --- | --- | --- | --- | --- | +| 1/s minReplicas=1 | 110.54ms | 110.343ms | 116.116ms | 117.298ms | 100% | +| 5/s minReplicas=1 | 133.272ms | 131.242ms | 148.195ms | 153.291ms | 100% | +| 10/s minReplicas=1 | 946.376ms | 127.961ms | 4.635s | 6.934s | 100% | + +#### CC=1 +- Create `InferenceService` with `ContainerConcurrency` set to 1; the activator respects the container queue limit of 1 so that requests do +not get queued on user pods and the activator chooses to route the requests to the pods which have capacity. 
+ +```yaml +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "flowers-sample" +spec: + default: + predictor: + parallelism: 1 #CC=1 + tensorflow: + storageUri: "gs://kfserving-samples/models/tensorflow/flowers" + resources: + requests: + cpu: "4" + memory: 2Gi + limits: + cpu: "4" + memory: 2Gi +``` + +| QPS/Replicas | mean | p50 | p95 | p99 | Success Rate | +| --- | --- | --- | --- | --- | --- | +| 1/s minReplicas=1 | 103.766ms | 102.869ms | 111.559ms | 116.577ms | 100% | +| 5/s minReplicas=1 | 117.456ms | 117.117ms | 122.346ms | 126.139ms | 100% | +| 10/s minReplicas=1 | 702.249ms | 111.289ms | 3.469s | 3.831s | 100% | + + +So here you can see that when you send one request at a time the latency does not differ much between CC=0 and CC=1. +However when you send more concurrent requests you start to notice a pronounced difference with CC=1 because the activator starts to take effect and you +will observe better tail latency at p95 and p99 thanks to the Knative activator's [smarter load balancing](https://github.com/knative/serving/issues/5692) compared to random load balancing. 
+ +#### Raw Kubernetes Service(Without queue proxy and activator) +```yaml +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "flowers-sample-hpa" + annotations: + autoscaling.knative.dev/class: hpa.autoscaling.knative.dev + autoscaling.knative.dev/metric: cpu + autoscaling.knative.dev/target: "60" +spec: + default: + predictor: + tensorflow: + storageUri: "gs://kfserving-samples/models/tensorflow/flowers" + resources: + requests: + cpu: "4" + memory: 2Gi + limits: + cpu: "4" + memory: 2Gi +``` +Set up a virtual service to bypass the Knative proxy and update the vegeta config target URL to +`http://flowers-sample-raw.default.svc.cluster.local/v1/models/flowers-sample-hpa:predict` + + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: flowers-sample-raw + namespace: default +spec: + externalName: cluster-local-gateway.istio-system.svc.cluster.local + sessionAffinity: None + type: ExternalName +--- +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: flowers-sample-raw +spec: + gateways: + - knative-serving/cluster-local-gateway + hosts: + - flowers-sample-raw.default.svc.cluster.local + http: + - match: + - authority: + regex: ^flowers-sample-raw\.default(\.svc(\.cluster\.local)?)?(?::\d{1,5})?$ + gateways: + - knative-serving/cluster-local-gateway + uri: + regex: ^/v1/models/[\w-]+(:predict)? 
+ route: + - destination: + host: flowers-sample-hpa-predictor-default-95bbz-private.default.svc.cluster.local #this is the private service to user container + port: + number: 80 + weight: 100 +``` + +| QPS/Replicas | mean | p50 | p95 | p99 | Success Rate | +| --- | --- | --- | --- | --- | --- | +| 1/s Replicas=1 | 129.143ms | 112.853ms | 118.143ms | 128.557ms | 100% | +| 5/s Replicas=1 | 127.947ms | 127.549ms | 132.171ms | 135.801ms | 100% | +| 10/s Replicas=1 | 5.461s | 5.087s | 12.992s | 14.587s | 100% | + +This experiment runs the `InferenceService` using HPA with average target utilization 80% of CPU and calls directly to Kubernetes Service bypassing +the Knative queue proxy and activator. You can see that KPA reacts faster with the load and performs better than HPA for both low latency and high latency +requests. diff --git a/test/benchmark/sk_benchmark.yaml b/test/benchmark/sk_benchmark.yaml new file mode 100644 index 00000000000..c2aee232260 --- /dev/null +++ b/test/benchmark/sk_benchmark.yaml @@ -0,0 +1,37 @@ +apiVersion: batch/v1 +kind: Job +metadata: + generateName: sklearn-load-test +spec: + backoffLimit: 6 + parallelism: 1 + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - args: + - vegeta -cpus=1 attack -duration=10m -rate=5/1s -targets=/var/vegeta/cfg + | vegeta report -type=text + command: + - sh + - -c + image: peterevans/vegeta:latest + imagePullPolicy: Always + name: vegeta + resources: + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /var/vegeta + name: vegeta-cfg + restartPolicy: Never + volumes: + - configMap: + defaultMode: 420 + name: vegeta-cfg + name: vegeta-cfg +--- + diff --git a/test/benchmark/sklearn.yaml b/test/benchmark/sklearn.yaml new file mode 100644 index 00000000000..f99a9721413 --- /dev/null +++ b/test/benchmark/sklearn.yaml @@ -0,0 +1,10 @@ +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "sklearn-iris" +spec: + 
default: + parallelism: 8 # CC=8 + predictor: + sklearn: + storageUri: "gs://kfserving-samples/models/sklearn/iris" diff --git a/test/benchmark/sklearn_hpa.yaml b/test/benchmark/sklearn_hpa.yaml new file mode 100644 index 00000000000..0c87959ca70 --- /dev/null +++ b/test/benchmark/sklearn_hpa.yaml @@ -0,0 +1,13 @@ +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "sklearn-iris" + annotations: + autoscaling.knative.dev/class: hpa.autoscaling.knative.dev + autoscaling.knative.dev/metric: cpu + autoscaling.knative.dev/target: "80" +spec: + default: + predictor: + sklearn: + storageUri: "gs://kfserving-samples/models/sklearn/iris" diff --git a/test/benchmark/sklearn_vegeta_cfg.yaml b/test/benchmark/sklearn_vegeta_cfg.yaml new file mode 100644 index 00000000000..9671c1efea3 --- /dev/null +++ b/test/benchmark/sklearn_vegeta_cfg.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +data: + cfg: | + POST http://sklearn-iris.default.svc.cluster.local/v1/models/sklearn-iris:predict + @/var/vegeta/payload + payload: | + { + "instances": [ + [6.8, 2.8, 4.8, 1.4], + [6.0, 3.4, 4.5, 1.6] + ] + } +kind: ConfigMap +metadata: + annotations: + name: vegeta-cfg diff --git a/test/benchmark/tf_benchmark.yaml b/test/benchmark/tf_benchmark.yaml new file mode 100644 index 00000000000..15ae07f2b73 --- /dev/null +++ b/test/benchmark/tf_benchmark.yaml @@ -0,0 +1,35 @@ +apiVersion: batch/v1 +kind: Job +metadata: + generateName: tf-load-test +spec: + backoffLimit: 6 + parallelism: 1 + template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - args: + - vegeta -cpus=5 attack -duration=1m -rate=5/1s -targets=/var/vegeta/cfg + | vegeta report -type=text + command: + - sh + - -c + image: peterevans/vegeta:latest + imagePullPolicy: Always + name: vegeta + resources: + requests: + cpu: "1" + memory: 1Gi + volumeMounts: + - mountPath: /var/vegeta + name: tf-vegeta-cfg + restartPolicy: Never + volumes: + - configMap: + defaultMode: 420 + 
name: tf-vegeta-cfg + name: tf-vegeta-cfg diff --git a/test/benchmark/tf_flowers.yaml b/test/benchmark/tf_flowers.yaml new file mode 100644 index 00000000000..49fd81924f4 --- /dev/null +++ b/test/benchmark/tf_flowers.yaml @@ -0,0 +1,17 @@ +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "flowers-sample" +spec: + default: + predictor: + parallelism: 1 #CC=1 + tensorflow: + storageUri: "gs://kfserving-samples/models/tensorflow/flowers" + resources: + requests: + cpu: "4" + memory: "2Gi" + limits: + cpu: "4" + memory: "2Gi" diff --git a/test/benchmark/tf_flowers_hpa.yaml b/test/benchmark/tf_flowers_hpa.yaml new file mode 100644 index 00000000000..df4e464faec --- /dev/null +++ b/test/benchmark/tf_flowers_hpa.yaml @@ -0,0 +1,20 @@ +apiVersion: "serving.kubeflow.org/v1alpha2" +kind: "InferenceService" +metadata: + name: "flowers-sample-hpa" + annotations: + autoscaling.knative.dev/class: hpa.autoscaling.knative.dev + autoscaling.knative.dev/metric: cpu + autoscaling.knative.dev/target: "60" +spec: + default: + predictor: + tensorflow: + storageUri: "gs://kfserving-samples/models/tensorflow/flowers" + resources: + requests: + cpu: "4" + memory: "2Gi" + limits: + cpu: "4" + memory: "2Gi" diff --git a/test/benchmark/tf_vegeta_cfg.yaml b/test/benchmark/tf_vegeta_cfg.yaml new file mode 100644 index 00000000000..b354e912356 --- /dev/null +++ b/test/benchmark/tf_vegeta_cfg.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +data: + cfg: | + POST http://flowers-sample.default.svc.cluster.local/v1/models/flowers-sample:predict + @/var/vegeta/payload + payload: | + { + "instances":[ + { + "image_bytes":{ + "b64":"" + }, + "key":"1" + } + ] + } +kind: ConfigMap +metadata: + annotations: + name: tf-vegeta-cfg