add some more fixes
Qing Lan committed Feb 19, 2024
1 parent 51aea31 commit 57ab1d7
Showing 4 changed files with 18 additions and 7 deletions.
@@ -20,6 +20,8 @@

class VllmQuantizeMethods(str, Enum):
awq = 'awq'
gptq = 'gptq'
squeezellm = 'squeezellm'


class VllmRbProperties(Properties):
@@ -32,6 +34,10 @@ class VllmRbProperties(Properties):
# Adjustable prefix model length for certain 32k or longer model
max_model_len: Optional[int] = None
enforce_eager: Optional[bool] = False
# TODO: this default may change with different vLLM versions
# TODO: try to get good default from vLLM to prevent revisiting
# TODO: last time check: vllm 0.3.1
gpu_memory_utilization: Optional[float] = 0.9

@validator('engine')
def validate_engine(cls, engine):
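To make the new configuration fields concrete, here is a minimal, self-contained sketch (not the project's actual class: the real `VllmRbProperties` extends the project's `Properties` base class and carries more fields and validators) showing how a pydantic model with this enum and these defaults behaves:

```python
# Hedged sketch, not the project's actual class; only the fields touched by
# this diff are reproduced, and the Properties base class is replaced by
# pydantic's BaseModel.
from enum import Enum
from typing import Optional

from pydantic import BaseModel, ValidationError


class VllmQuantizeMethods(str, Enum):
    awq = 'awq'
    gptq = 'gptq'
    squeezellm = 'squeezellm'


class VllmSketchProperties(BaseModel):
    quantize: Optional[VllmQuantizeMethods] = None
    max_model_len: Optional[int] = None
    enforce_eager: Optional[bool] = False
    gpu_memory_utilization: Optional[float] = 0.9


# Strings (as they would arrive from a properties file) are coerced to the
# typed fields.
props = VllmSketchProperties(quantize='gptq', gpu_memory_utilization='0.85')
print(props.quantize, props.gpu_memory_utilization)  # e.g. VllmQuantizeMethods.gptq 0.85

try:
    VllmSketchProperties(quantize='int8')  # not in the enum, so it is rejected
except ValidationError:
    print('invalid quantize value rejected')
```

Because pydantic coerces strings, a value such as `"0.85"` ends up as the float `0.85`, which is what the updated test further down relies on.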
@@ -51,6 +51,7 @@ def __init__(self, model_id_or_path, properties, **kwargs):
seed=0,
max_model_len=self.vllm_configs.max_model_len,
enforce_eager=self.vllm_configs.enforce_eager,
gpu_memory_utilization=self.vllm_configs.gpu_memory_utilization,
max_num_batched_tokens=self.vllm_configs.
max_rolling_batch_prefill_tokens,
trust_remote_code=self.vllm_configs.trust_remote_code,
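For context on where these keyword arguments end up, the sketch below illustrates how such a configuration is typically forwarded to vLLM's engine arguments. It assumes a vLLM ~0.3.x-style Python API; the model id and numeric values are placeholders, and the project's surrounding rolling-batch code is omitted:

```python
# Hedged sketch assuming a vLLM ~0.3.x-style Python API; not the project's
# actual rolling-batch handler.
from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(
    model='mistralai/Mistral-7B-v0.1',  # placeholder model id
    seed=0,
    max_model_len=8192,                 # cap on input + output length
    enforce_eager=True,                 # disable CUDA graphs to save memory
    gpu_memory_utilization=0.85,        # fraction of GPU memory given to vLLM
    max_num_batched_tokens=32768,       # prefill token budget
    trust_remote_code=False,
)
engine = LLMEngine.from_engine_args(engine_args)
```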
@@ -443,6 +443,8 @@ def test_vllm_valid():
int(properties['max_model_len']))
self.assertEqual(vllm_configs.enforce_eager,
bool(properties['enforce_eager']))
self.assertEqual(vllm_configs.gpu_memory_utilization,
float(properties['gpu_memory_utilization']))

# test with invalid quantization
def test_invalid_quantization_method():
@@ -460,6 +462,7 @@ def test_invalid_quantization_method():
'dtype': 'fp16',
'quantize': 'awq',
'enforce_eager': "True",
"gpu_memory_utilization": "0.85",
'load_format': 'pt'
}
test_vllm_valid()
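The assertion pattern in this hunk can be reproduced with a small, self-contained unittest sketch. Note that it uses a simplified stand-in for `VllmRbProperties`, limited to the fields exercised by this diff:

```python
# Hedged sketch of the test pattern above; the real test builds the full
# VllmRbProperties, this stand-in keeps only two fields.
import unittest
from typing import Optional

from pydantic import BaseModel


class _VllmConfigSketch(BaseModel):
    enforce_eager: Optional[bool] = False
    gpu_memory_utilization: Optional[float] = 0.9


class TestVllmConfigSketch(unittest.TestCase):

    def test_vllm_valid(self):
        # Property values arrive as strings, as they would from a properties file.
        properties = {'enforce_eager': 'True', 'gpu_memory_utilization': '0.85'}
        configs = _VllmConfigSketch(**properties)
        self.assertEqual(configs.enforce_eager, bool(properties['enforce_eager']))
        self.assertEqual(configs.gpu_memory_utilization,
                         float(properties['gpu_memory_utilization']))


if __name__ == '__main__':
    unittest.main()
```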
@@ -83,13 +83,14 @@ If you specify Engine to be MPI, rolling_batch to auto or lmi-dist in DeepSpeed

If you specify the Engine as Python and rolling_batch as vllm in the DeepSpeed container, the following parameters will be accessible.

| Item | Required | Description | Example value |
|------------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
| option.quantize | No | Quantize the model with the supported quantization methods | `awq` Default: `None` |
| option.max_rolling_batch_prefill_tokens | No | Limits the number of tokens for caching. This needs to be tuned based on batch size and input sequence length to avoid GPU OOM. If you don't set, vLLM will try to find a good number to fit in | Default: `None` |
| option.max_model_len | No | the maximum length (input+output) vLLM should preserve memory for. If not specified, will use the default length the model is capable in config.json. in verion like 0.27.0, sometimes model's maximum length could go to 32k (Mistral 7B) and way beyond the supported KV token size. In that case to deploy on a small instance, we need to adjust this value within the range of KV Cache limit. | Default: `None` |
| option.enforce_eager | No | vLLM by default will run with CUDA graph optimization to reach to the best performance. However, in the situation of very less GPU memory, having CUDA graph enabled will cause OOM. So if you set this option to true, we will use PyTorch Eager mode and disable CUDA graph to save some GBs of memory. | Default: `False` |
| option.load_format | No | The checkpoint format of the model. Default is auto and means bin/safetensors will be used if found. | Default: `auto` |
| Item | Required | Description | Example value |
|------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
| option.quantize | No | Quantize the model with one of the supported quantization methods. | `awq`, `gptq` Default: `None` |
| option.max_rolling_batch_prefill_tokens | No | Limits the number of tokens for caching. This needs to be tuned based on batch size and input sequence length to avoid GPU OOM. If you don't set it, vLLM will try to find a good number that fits. | Default: `None` |
| option.max_model_len | No | The maximum length (input + output) vLLM should reserve memory for. If not specified, the default length from the model's config.json is used. In versions such as 0.27.0, a model's maximum length can reach 32k (e.g. Mistral 7B), far beyond the supported KV token size. In that case, to deploy on a small instance, adjust this value to stay within the KV cache limit. | Default: `None` |
| option.enforce_eager | No | vLLM runs with CUDA graph optimization by default to reach the best performance. However, when GPU memory is very limited, enabling CUDA graphs can cause OOM. If you set this option to true, PyTorch eager mode is used and CUDA graphs are disabled, saving a few GB of memory. | Default: `False` |
| option.gpu_memory_utilization | No | Controls the fraction of GPU memory allocated to PagedAttention. Defaults to 0.9 (90%). We don't recommend changing this value because it impacts the overall GPU memory allocation. | Default: `0.9` |
| option.load_format | No | The checkpoint format of the model. The default is `auto`, which means bin/safetensors will be used if found. | Default: `auto` |
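
For illustration, a hypothetical `serving.properties` exercising these options might look like the sketch below. Every value is a placeholder; `option.model_id` and `option.rolling_batch` are assumed from the surrounding documentation rather than taken from the table above.

```properties
engine=Python
option.rolling_batch=vllm
# Placeholder model id; replace with your own model.
option.model_id=mistralai/Mistral-7B-v0.1
option.quantize=awq
option.max_model_len=8192
option.enforce_eager=true
option.gpu_memory_utilization=0.85
option.load_format=auto
```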

### Transformers-NeuronX ([doc](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-configuration.html#large-model-inference-neuronx-lmi))

