InftyAI
diff --git a/‎api/inference/v1alpha1/service_types.go‎
Lines changed: 10 additions & 6 deletions b/‎api/inference/v1alpha1/service_types.go‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎api/inference/v1alpha1/zz_generated.deepcopy.go‎
Lines changed: 6 additions & 0 deletions b/‎api/inference/v1alpha1/zz_generated.deepcopy.go‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎chart/crds/backendruntime-crd.yaml‎
Lines changed: 0 additions & 16 deletions b/‎chart/crds/backendruntime-crd.yaml‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎chart/crds/playground-crd.yaml‎
Lines changed: 38 additions & 39 deletions b/‎chart/crds/playground-crd.yaml‎
Lines changed: 38 additions & 39 deletions
@@ -35,12 +35,16 @@ const (
 type ServiceSpec struct {
 	// ModelClaims represents multiple claims for different models.
 	ModelClaims coreapi.ModelClaims `json:"modelClaims,omitempty"`
-	// WorkloadTemplate defines the underlying workload layout and configuration.
-	// Note: the LWS spec might be twisted with various LWS instances to support
-	// accelerator fungibility or other cutting-edge researches.
-	// LWS supports both single-host and multi-host scenarios, for single host
-	// cases, only need to care about replicas, rolloutStrategy and workerTemplate.
-	WorkloadTemplate lws.LeaderWorkerSetSpec `json:"workloadTemplate"`
+	// Replicas represents the replica number of inference workloads.
+	// +kubebuilder:default=1
+	// +optional
+	Replicas *int32 `json:"replicas,omitempty"`
+	// WorkloadTemplate defines the template for leader/worker pods.
+	WorkloadTemplate lws.LeaderWorkerTemplate `json:"workloadTemplate"`
+	// RolloutStrategy defines the strategy that will be applied to update replicas
+	// when a revision is made to the workloadTemplate.
+	// +optional
+	RolloutStrategy lws.RolloutStrategy `json:"rolloutStrategy,omitempty"`
 }
 
 const (
 
@@ -322,22 +322,6 @@ spec:
                     format: int32
                     type: integer
                 type: object
-              multiHostCommands:
-                description: |-
-                  MultiHostCommands represents leader and worker commands for nodes with
-                  different roles.
-                properties:
-                  leader:
-                    description: Leader commands.
-                    items:
-                      type: string
-                    type: array
-                  worker:
-                    description: Worker commands.
-                    items:
-                      type: string
-                    type: array
-                type: object
               readinessProbe:
                 description: |-
                   Periodic probe of backend readiness.
 
@@ -47,9 +47,8 @@ spec:
                 properties:
                   args:
                     description: |-
-                      Args represents all the arguments for the command.
-                      Argument around with {{ .CONFIG }} is a configuration waiting for render.
-                      Args defined here will "append" the args in the recommendedConfig.
+                      Args defined here will "append" the args defined in the recommendedConfig,
+                      either explicitly configured in configName or inferred in the runtime.
                     items:
                       type: string
                     type: array
@@ -62,7 +61,7 @@ spec:
                     description: |-
                       ConfigName represents the recommended configuration name for the backend,
                       It will be inferred from the models in the runtime if not specified, e.g. default,
-                      speculative-decoding or model-parallelism.
+                      speculative-decoding.
                     type: string
                   envs:
                     description: Envs represents the environments set to the container.
@@ -216,6 +215,41 @@ spec:
                           More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
                         type: object
                     type: object
+                  sharedMemorySize:
+                    anyOf:
+                    - type: integer
+                    - type: string
+                    description: |-
+                      SharedMemorySize represents the size of /dev/shm required in the runtime of
+                      inference workload.
+                      SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                  version:
+                    description: |-
+                      Version represents the backend version if you want a different one
+                      from the default version.
+                    type: string
+                type: object
+              elasticConfig:
+                description: |-
+                  ElasticConfig defines the configuration for elastic usage,
+                  e.g. the max/min replicas.
+                properties:
+                  maxReplicas:
+                    description: |-
+                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                      Default to nil means there's no limit for the instance number.
+                    format: int32
+                    type: integer
+                  minReplicas:
+                    default: 1
+                    description: |-
+                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                      Default to 1.
+                      MinReplicas couldn't be 0 now, will support serverless in the future.
+                    format: int32
+                    type: integer
                   scaleTrigger:
                     description: |-
                       ScaleTrigger defines the rules to scale the workloads.
@@ -829,41 +863,6 @@ spec:
                             type: array
                         type: object
                     type: object
-                  sharedMemorySize:
-                    anyOf:
-                    - type: integer
-                    - type: string
-                    description: |-
-                      SharedMemorySize represents the size of /dev/shm required in the runtime of
-                      inference workload.
-                      SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
-                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                    x-kubernetes-int-or-string: true
-                  version:
-                    description: |-
-                      Version represents the backend version if you want a different one
-                      from the default version.
-                    type: string
-                type: object
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to 1.
-                      MinReplicas couldn't be 0 now, will support serverless in the future.
-                    format: int32
-                    type: integer
                 type: object
               modelClaim:
                 description: |-