add some more fixes
Qing Lan committed Feb 19, 2024
1 parent 51aea31 commit 57ab1d7
Showing 4 changed files with 18 additions and 7 deletions.
@@ -20,6 +20,8 @@

class VllmQuantizeMethods(str, Enum):
awq = 'awq'
gptq = 'gptq'
squeezellm = 'squeezellm'


class VllmRbProperties(Properties):
@@ -32,6 +34,10 @@ class VllmRbProperties(Properties):
# Adjustable prefix model length for certain 32k or longer model
max_model_len: Optional[int] = None
enforce_eager: Optional[bool] = False
# TODO: this default may change with different vLLM versions
# TODO: try to get good default from vLLM to prevent revisiting
# TODO: last time check: vllm 0.3.1
gpu_memory_utilization: Optional[float] = 0.9

@validator('engine')
def validate_engine(cls, engine):
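To make the new configuration fields concrete, here is a minimal, self-contained sketch (not the project's actual class: the real `VllmRbProperties` extends the project's `Properties` base class and carries more fields and validators) showing how a pydantic model with this enum and these defaults behaves:

```python
# Hedged sketch, not the project's actual class; only the fields touched by
# this diff are reproduced, and the Properties base class is replaced by
# pydantic's BaseModel.
from enum import Enum
from typing import Optional

from pydantic import BaseModel, ValidationError


class VllmQuantizeMethods(str, Enum):
    awq = 'awq'
    gptq = 'gptq'
    squeezellm = 'squeezellm'


class VllmSketchProperties(BaseModel):
    quantize: Optional[VllmQuantizeMethods] = None
    max_model_len: Optional[int] = None
    enforce_eager: Optional[bool] = False
    gpu_memory_utilization: Optional[float] = 0.9


# Strings (as they would arrive from a properties file) are coerced to the
# typed fields.
props = VllmSketchProperties(quantize='gptq', gpu_memory_utilization='0.85')
print(props.quantize, props.gpu_memory_utilization)  # e.g. VllmQuantizeMethods.gptq 0.85

try:
    VllmSketchProperties(quantize='int8')  # not in the enum, so it is rejected
except ValidationError:
    print('invalid quantize value rejected')
```

Because pydantic coerces strings, a value such as `"0.85"` ends up as the float `0.85`, which is what the updated test further down relies on.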
@@ -51,6 +51,7 @@ def __init__(self, model_id_or_path, properties, **kwargs):
seed=0,
max_model_len=self.vllm_configs.max_model_len,
enforce_eager=self.vllm_configs.enforce_eager,
gpu_memory_utilization=self.vllm_configs.gpu_memory_utilization,
max_num_batched_tokens=self.vllm_configs.
max_rolling_batch_prefill_tokens,
trust_remote_code=self.vllm_configs.trust_remote_code,
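For context on where these keyword arguments end up, the sketch below illustrates how such a configuration is typically forwarded to vLLM's engine arguments. It assumes a vLLM ~0.3.x-style Python API; the model id and numeric values are placeholders, and the project's surrounding rolling-batch code is omitted:

```python
# Hedged sketch assuming a vLLM ~0.3.x-style Python API; not the project's
# actual rolling-batch handler.
from vllm import EngineArgs, LLMEngine

engine_args = EngineArgs(
    model='mistralai/Mistral-7B-v0.1',  # placeholder model id
    seed=0,
    max_model_len=8192,                 # cap on input + output length
    enforce_eager=True,                 # disable CUDA graphs to save memory
    gpu_memory_utilization=0.85,        # fraction of GPU memory given to vLLM
    max_num_batched_tokens=32768,       # prefill token budget
    trust_remote_code=False,
)
engine = LLMEngine.from_engine_args(engine_args)
```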
@@ -443,6 +443,8 @@ def test_vllm_valid():
int(properties['max_model_len']))
self.assertEqual(vllm_configs.enforce_eager,
bool(properties['enforce_eager']))
self.assertEqual(vllm_configs.gpu_memory_utilization,
float(properties['gpu_memory_utilization']))

# test with invalid quantization
def test_invalid_quantization_method():
@@ -460,6 +462,7 @@ def test_invalid_quantization_method():
'dtype': 'fp16',
'quantize': 'awq',
'enforce_eager': "True",
"gpu_memory_utilization": "0.85",
'load_format': 'pt'
}
test_vllm_valid()
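The assertion pattern in this hunk can be reproduced with a small, self-contained unittest sketch. Note that it uses a simplified stand-in for `VllmRbProperties`, limited to the fields exercised by this diff:

```python
# Hedged sketch of the test pattern above; the real test builds the full
# VllmRbProperties, this stand-in keeps only two fields.
import unittest
from typing import Optional

from pydantic import BaseModel


class _VllmConfigSketch(BaseModel):
    enforce_eager: Optional[bool] = False
    gpu_memory_utilization: Optional[float] = 0.9


class TestVllmConfigSketch(unittest.TestCase):

    def test_vllm_valid(self):
        # Property values arrive as strings, as they would from a properties file.
        properties = {'enforce_eager': 'True', 'gpu_memory_utilization': '0.85'}
        configs = _VllmConfigSketch(**properties)
        self.assertEqual(configs.enforce_eager, bool(properties['enforce_eager']))
        self.assertEqual(configs.gpu_memory_utilization,
                         float(properties['gpu_memory_utilization']))


if __name__ == '__main__':
    unittest.main()
```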
@@ -83,13 +83,14 @@ If you specify Engine to be MPI, rolling_batch to auto or lmi-dist in DeepSpeed

If you specify the Engine as Python and rolling_batch as vllm in the DeepSpeed container, the following parameters will be accessible.

| Item | Required | Description | Example value |
|------------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
| option.quantize | No | Quantize the model with the supported quantization methods | `awq` Default: `None` |
| option.max_rolling_batch_prefill_tokens | No | Limits the number of tokens for caching. This needs to be tuned based on batch size and input sequence length to avoid GPU OOM. If you don't set, vLLM will try to find a good number to fit in | Default: `None` |
| option.max_model_len | No | the maximum length (input+output) vLLM should preserve memory for. If not specified, will use the default length the model is capable in config.json. in verion like 0.27.0, sometimes model's maximum length could go to 32k (Mistral 7B) and way beyond the supported KV token size. In that case to deploy on a small instance, we need to adjust this value within the range of KV Cache limit. | Default: `None` |
| option.enforce_eager | No | vLLM by default will run with CUDA graph optimization to reach to the best performance. However, in the situation of very less GPU memory, having CUDA graph enabled will cause OOM. So if you set this option to true, we will use PyTorch Eager mode and disable CUDA graph to save some GBs of memory. | Default: `False` |
| option.load_format | No | The checkpoint format of the model. Default is auto and means bin/safetensors will be used if found. | Default: `auto` |
| Item | Required | Description | Example value |
|------------------------------------------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
| option.quantize | No | Quantize the model with one of the supported quantization methods. | `awq`, `gptq` Default: `None` |
| option.max_rolling_batch_prefill_tokens | No | Limits the number of tokens for caching. This needs to be tuned based on batch size and input sequence length to avoid GPU OOM. If you don't set it, vLLM will try to find a good number that fits. | Default: `None` |
| option.max_model_len | No | The maximum length (input + output) vLLM should reserve memory for. If not specified, the default length from the model's config.json is used. In versions such as 0.27.0, a model's maximum length can reach 32k (e.g. Mistral 7B), far beyond the supported KV token size. In that case, to deploy on a small instance, adjust this value to stay within the KV cache limit. | Default: `None` |
| option.enforce_eager | No | vLLM runs with CUDA graph optimization by default to reach the best performance. However, when GPU memory is very limited, enabling CUDA graphs can cause OOM. If you set this option to true, PyTorch eager mode is used and CUDA graphs are disabled, saving a few GB of memory. | Default: `False` |
| option.gpu_memory_utilization | No | Controls the fraction of GPU memory allocated to PagedAttention. Defaults to 0.9 (90%). We don't recommend changing this value because it impacts the overall GPU memory allocation. | Default: `0.9` |
| option.load_format | No | The checkpoint format of the model. The default is `auto`, which means bin/safetensors will be used if found. | Default: `auto` |
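
For illustration, a hypothetical `serving.properties` exercising these options might look like the sketch below. Every value is a placeholder; `option.model_id` and `option.rolling_batch` are assumed from the surrounding documentation rather than taken from the table above.

```properties
engine=Python
option.rolling_batch=vllm
# Placeholder model id; replace with your own model.
option.model_id=mistralai/Mistral-7B-v0.1
option.quantize=awq
option.max_model_len=8192
option.enforce_eager=true
option.gpu_memory_utilization=0.85
option.load_format=auto
```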

### Transformers-NeuronX ([doc](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-configuration.html#large-model-inference-neuronx-lmi))

