[Bugfix] Get available quantization methods from quantization registry #4098

Merged · 8 commits · Apr 18, 2024
Changes from 4 commits
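
The gist of the fix: the CLI choices were hardcoded as `['awq', 'gptq', 'squeezellm', None]` in several places and had drifted out of sync with the engine (`marlin` was in `vllm/config.py`'s supported list but missing from every `--quantization` flag). Deriving the choices from the registry keeps them in sync automatically. A minimal sketch of the pattern, assuming `_QUANTIZATION_CONFIG_REGISTRY` is a dict keyed by method name (the diff only shows the import, not the registry's definition):

    # Hypothetical stand-in for vLLM's _QUANTIZATION_CONFIG_REGISTRY;
    # unpacking a dict with * yields its keys, in insertion order.
    registry = {"awq": ..., "gptq": ..., "squeezellm": ..., "marlin": ...}

    choices_before = ['awq', 'gptq', 'squeezellm', None]  # silently missing 'marlin'
    choices_after = [*registry, None]                     # tracks the registry
    print(choices_after)  # ['awq', 'gptq', 'squeezellm', 'marlin', None]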
benchmarks/benchmark_latency.py (3 additions, 1 deletion)

@@ -9,6 +9,8 @@
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import (
+    _QUANTIZATION_CONFIG_REGISTRY)
 
 
 def main(args: argparse.Namespace):
@@ -101,7 +103,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*_QUANTIZATION_CONFIG_REGISTRY, None],
                         default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
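
For context, a self-contained demonstration of how the rewritten `choices` behaves under argparse (the registry here is a hypothetical stand-in, not the vLLM module). Note that argparse validates only values supplied on the command line against `choices`, so the `None` entry effectively just documents the default:

    import argparse

    # Stand-in for _QUANTIZATION_CONFIG_REGISTRY; only the keys matter here.
    REGISTRY = {"awq": None, "gptq": None, "squeezellm": None, "marlin": None}

    parser = argparse.ArgumentParser()
    parser.add_argument('--quantization', '-q',
                        choices=[*REGISTRY, None],
                        default=None)

    print(parser.parse_args(['-q', 'marlin']).quantization)  # 'marlin' now accepted
    print(parser.parse_args([]).quantization)                # None (default)
    # parser.parse_args(['-q', 'fp8']) would exit with "invalid choice: 'fp8'"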
benchmarks/benchmark_throughput.py (4 additions, 1 deletion)

@@ -10,6 +10,9 @@
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
 
+from vllm.model_executor.layers.quantization import (
+    _QUANTIZATION_CONFIG_REGISTRY)
+
 
 def sample_requests(
     dataset_path: str,
@@ -267,7 +270,7 @@ def main(args: argparse.Namespace):
     parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
-                        choices=['awq', 'gptq', 'squeezellm', None],
+                        choices=[*_QUANTIZATION_CONFIG_REGISTRY, None],
                        default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
vllm/config.py (5 additions, 3 deletions)

@@ -9,6 +9,8 @@
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import (
+    _QUANTIZATION_CONFIG_REGISTRY)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
                         is_neuron)
@@ -118,8 +120,8 @@ def _verify_tokenizer_mode(self) -> None:
         self.tokenizer_mode = tokenizer_mode
 
     def _verify_quantization(self) -> None:
-        supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
-        rocm_not_supported_quantization = ["awq", "marlin"]
+        supported_quantization = [*_QUANTIZATION_CONFIG_REGISTRY]
+        rocm_supported_quantization = ["gptq", "squeezellm"]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
@@ -155,7 +157,7 @@ def _verify_quantization(self) -> None:
                 f"Unknown quantization method: {self.quantization}. Must "
                 f"be one of {supported_quantization}.")
             if is_hip(
-            ) and self.quantization in rocm_not_supported_quantization:
+            ) and self.quantization not in rocm_supported_quantization:
                 raise ValueError(
                     f"{self.quantization} quantization is currently not "
                     f"supported in ROCm.")
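
Beyond swapping in the registry, the ROCm check flips from a blocklist (`rocm_not_supported_quantization`) to an allowlist (`rocm_supported_quantization`), so a newly registered method is rejected on ROCm until someone explicitly marks it supported, failing closed rather than open. A standalone sketch of the resulting validation flow (simplified from the `_verify_quantization` method shown above; the function shape and registry contents are stand-ins):

    REGISTRY = {"awq": None, "gptq": None, "squeezellm": None, "marlin": None}
    ROCM_SUPPORTED = ["gptq", "squeezellm"]  # allowlist, not blocklist

    def verify_quantization(quantization, is_hip=False):
        if quantization is None:
            return None
        quantization = quantization.lower()
        if quantization not in REGISTRY:
            raise ValueError(f"Unknown quantization method: {quantization}. "
                             f"Must be one of {[*REGISTRY]}.")
        # Fail closed: anything not explicitly listed is unsupported on ROCm.
        if is_hip and quantization not in ROCM_SUPPORTED:
            raise ValueError(f"{quantization} quantization is currently not "
                             "supported in ROCm.")
        return quantization

    verify_quantization("marlin")              # ok on CUDA
    # verify_quantization("awq", is_hip=True)  # raises: not supported in ROCm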
vllm/engine/arg_utils.py (3 additions, 1 deletion)

@@ -7,6 +7,8 @@
                          EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.layers.quantization import (
+    _QUANTIZATION_CONFIG_REGISTRY)
 from vllm.utils import str_to_int_tuple
 
 
@@ -286,7 +288,7 @@ def add_cli_args(
         parser.add_argument('--quantization',
                             '-q',
                             type=str,
-                            choices=['awq', 'gptq', 'squeezellm', None],
+                            choices=[*_QUANTIZATION_CONFIG_REGISTRY, None],
                             default=EngineArgs.quantization,
                             help='Method used to quantize the weights. If '
                             'None, we first check the `quantization_config` '
vllm/model_executor/layers/quantization/__init__.py (1 addition)

@@ -24,4 +24,5 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
 __all__ = [
     "QuantizationConfig",
     "get_quantization_config",
+    "_QUANTIZATION_CONFIG_REGISTRY",
 ]
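
Adding `"_QUANTIZATION_CONFIG_REGISTRY"` to `__all__` makes the registry importable via `from ... import *` despite its leading underscore, which star-imports would otherwise skip. Inferred from the visible `get_quantization_config` signature, the module plausibly looks something like the sketch below; the concrete config classes are assumptions for illustration, not vLLM's actual definitions:

    from typing import Dict, Type

    class QuantizationConfig:  # stand-in for vLLM's base class
        pass

    class AWQConfig(QuantizationConfig): pass
    class GPTQConfig(QuantizationConfig): pass

    # Assumed shape: method name -> quantization config class.
    _QUANTIZATION_CONFIG_REGISTRY: Dict[str, Type[QuantizationConfig]] = {
        "awq": AWQConfig,
        "gptq": GPTQConfig,
    }

    def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
        if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
            raise ValueError(f"Invalid quantization method: {quantization}")
        return _QUANTIZATION_CONFIG_REGISTRY[quantization]

    __all__ = [
        "QuantizationConfig",
        "get_quantization_config",
        "_QUANTIZATION_CONFIG_REGISTRY",
    ]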