|
47 | 47 | properties: |
48 | 48 | args: |
49 | 49 | description: |- |
50 | | - Args represents all the arguments for the command. |
51 | | - Argument around with {{ .CONFIG }} is a configuration waiting for render. |
52 | | - Args defined here will "append" the args in the recommendedConfig. |
| 50 | + Args defined here will be "appended" to the args defined in the recommendedConfig, |
| 51 | + either explicitly configured in configName or inferred in the runtime. |
53 | 52 | items: |
54 | 53 | type: string |
55 | 54 | type: array |
|
62 | 61 | description: |- |
63 | 62 | ConfigName represents the recommended configuration name for the backend. |
64 | 63 | It will be inferred from the models in the runtime if not specified, e.g. default, |
65 | | - speculative-decoding or model-parallelism. |
| 64 | + speculative-decoding. |
66 | 65 | type: string |
67 | 66 | envs: |
68 | 67 | description: Envs represents the environments set to the container. |
@@ -216,6 +215,41 @@ spec: |
216 | 215 | More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ |
217 | 216 | type: object |
218 | 217 | type: object |
| 218 | + sharedMemorySize: |
| 219 | + anyOf: |
| 220 | + - type: integer |
| 221 | + - type: string |
| 222 | + description: |- |
| 223 | + SharedMemorySize represents the size of /dev/shm required in the runtime of |
| 224 | + inference workload. |
| 225 | + SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig. |
| 226 | + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ |
| 227 | + x-kubernetes-int-or-string: true |
| 228 | + version: |
| 229 | + description: |- |
| 230 | + Version represents the backend version if you want a different one |
| 231 | + from the default version. |
| 232 | + type: string |
| 233 | + type: object |
| 234 | + elasticConfig: |
| 235 | + description: |- |
| 236 | + ElasticConfig defines the configuration for elastic usage, |
| 237 | + e.g. the max/min replicas. |
| 238 | + properties: |
| 239 | + maxReplicas: |
| 240 | + description: |- |
| 241 | + MaxReplicas indicates the maximum number of inference workloads based on the traffic. |
| 242 | + Defaults to nil, which means there is no limit on the number of instances. |
| 243 | + format: int32 |
| 244 | + type: integer |
| 245 | + minReplicas: |
| 246 | + default: 1 |
| 247 | + description: |- |
| 248 | + MinReplicas indicates the minimum number of inference workloads based on the traffic. |
| 249 | + Defaults to 1. |
| 250 | + MinReplicas cannot be 0 for now; serverless will be supported in the future. |
| 251 | + format: int32 |
| 252 | + type: integer |
219 | 253 | scaleTrigger: |
220 | 254 | description: |- |
221 | 255 | ScaleTrigger defines the rules to scale the workloads. |
@@ -829,41 +863,6 @@ spec: |
829 | 863 | type: array |
830 | 864 | type: object |
831 | 865 | type: object |
832 | | - sharedMemorySize: |
833 | | - anyOf: |
834 | | - - type: integer |
835 | | - - type: string |
836 | | - description: |- |
837 | | - SharedMemorySize represents the size of /dev/shm required in the runtime of |
838 | | - inference workload. |
839 | | - SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig. |
840 | | - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ |
841 | | - x-kubernetes-int-or-string: true |
842 | | - version: |
843 | | - description: |- |
844 | | - Version represents the backend version if you want a different one |
845 | | - from the default version. |
846 | | - type: string |
847 | | - type: object |
848 | | - elasticConfig: |
849 | | - description: |- |
850 | | - ElasticConfig defines the configuration for elastic usage, |
851 | | - e.g. the max/min replicas. |
852 | | - properties: |
853 | | - maxReplicas: |
854 | | - description: |- |
855 | | - MaxReplicas indicates the maximum number of inference workloads based on the traffic. |
856 | | - Default to nil means there's no limit for the instance number. |
857 | | - format: int32 |
858 | | - type: integer |
859 | | - minReplicas: |
860 | | - default: 1 |
861 | | - description: |- |
862 | | - MinReplicas indicates the minimum number of inference workloads based on the traffic. |
863 | | - Default to 1. |
864 | | - MinReplicas couldn't be 0 now, will support serverless in the future. |
865 | | - format: int32 |
866 | | - type: integer |
867 | 866 | type: object |
868 | 867 | modelClaim: |
869 | 868 | description: |- |
|
0 commit comments