
[Misc] Remove SqueezeLLM #8220

Merged
2 commits merged on Sep 6, 2024
Changes from 1 commit
remove squeezellm from docs; remove gemms
dsikka committed Sep 6, 2024
commit 039123f53b2c76969d725f9dff5041e925bd52ca
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -181,7 +181,6 @@ set(VLLM_EXT_SRC
"csrc/pos_encoding_kernels.cu"
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
3 changes: 0 additions & 3 deletions csrc/ops.h
@@ -170,9 +170,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scales);

-void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
-torch::Tensor lookup_table);

torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
216 changes: 0 additions & 216 deletions csrc/quantization/squeezellm/quant_cuda_kernel.cu

This file was deleted.

6 changes: 0 additions & 6 deletions csrc/torch_bindings.cpp
@@ -237,12 +237,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);

-// Quantized GEMM for SqueezeLLM.
-ops.def(
-"squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
-"lookup_table) -> ()");
-ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);

// Compute FP8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
11 changes: 0 additions & 11 deletions docs/source/quantization/supported_hardware.rst
@@ -119,17 +119,6 @@ The table below shows the compatibility of various quantization implementations
- ✗
- ✗
- ✗
-* - SqueezeLLM
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✅︎
-  - ✗
-  - ✗
-  - ✗
-  - ✗
-  - ✗

Notes:
^^^^^^
4 changes: 2 additions & 2 deletions examples/fp8/README.md
@@ -62,7 +62,7 @@ This script evaluates the inference throughput of language models using various

python3 benchmarks/benchmark_throughput.py --help
usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
-[--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
+[--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
[--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
[--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
[--quantization-param-path KV_CACHE_quantization_param_path]
@@ -76,7 +76,7 @@ optional arguments:
--output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
--model MODEL
--tokenizer TOKENIZER
---quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None}
+--quantization {awq,gptq,None}, -q {awq,gptq,None}
--tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
--n N Number of generated sequences per prompt.
--use-beam-search
6 changes: 0 additions & 6 deletions vllm/_custom_ops.py
@@ -209,12 +209,6 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)


-# squeezellm
-def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
-lookup_table: torch.Tensor) -> None:
-torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table)


# marlin
def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
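For context, the Python helper deleted above was the last link in the chain that exposed the SqueezeLLM CUDA kernel: the kernel source was listed in CMakeLists.txt, declared in csrc/ops.h, registered in csrc/torch_bindings.cpp, and wrapped in vllm/_custom_ops.py. The remaining quantization ops keep the same thin-wrapper pattern. Below is a minimal sketch mirroring the gptq_shuffle wrapper visible in the hunk above, assuming a CUDA build of vLLM with the _C extension loaded:

import torch

def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                 bit: int) -> None:
    # Thin Python wrapper over the op registered via ops.def/ops.impl in
    # torch_bindings.cpp; torch.ops._C.squeezellm_gemm no longer exists.
    torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)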
4 changes: 2 additions & 2 deletions vllm/config.py
@@ -277,7 +277,7 @@ def _parse_quant_hf_config(self):

def _verify_quantization(self) -> None:
supported_quantization = [*QUANTIZATION_METHODS]
rocm_supported_quantization = ["awq", "gptq", "squeezellm", "fp8"]
rocm_supported_quantization = ["awq", "gptq", "fp8"]
optimized_quantization_methods = [
"fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
"fbgemm_fp8", "compressed_tensors", "compressed-tensors",
@@ -1537,7 +1537,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
if model_config.quantization and model_config.quantization not in [
"awq", "gptq"
]:
-# TODO support marlin and squeezellm
+# TODO support marlin
logger.warning("%s quantization is not tested with LoRA yet.",
model_config.quantization)

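The practical effect of the config.py change above is that "squeezellm" no longer passes verification on ROCm (and, once dropped from QUANTIZATION_METHODS, anywhere else). The sketch below illustrates the kind of guard _verify_quantization applies; the function name and error text are hypothetical, not vLLM's actual implementation:

from typing import Optional

rocm_supported_quantization = ["awq", "gptq", "fp8"]

def check_rocm_quantization(method: Optional[str]) -> None:
    # Reject quantization methods that have no ROCm kernels.
    if method is not None and method not in rocm_supported_quantization:
        raise ValueError(
            f"{method} quantization is currently not supported on ROCm.")

check_rocm_quantization("squeezellm")  # now raises: SqueezeLLM was removed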
2 changes: 1 addition & 1 deletion vllm/entrypoints/llm.py
@@ -55,7 +55,7 @@ class LLM:
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq", "gptq", "squeezellm", and "fp8" (experimental).
we support "awq", "gptq", and "fp8" (experimental).
If None, we first check the `quantization_config` attribute in the
model config file. If that is None, we assume the model weights are
not quantized and use `dtype` to determine the data type of
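Since the docstring above now lists only "awq", "gptq", and "fp8", here is a short usage sketch of the documented quantization argument; the model name is just an illustrative AWQ checkpoint, and passing quantization="squeezellm" would be rejected after this PR:

from vllm import LLM

# Illustrative AWQ-quantized checkpoint; any AWQ model works the same way.
llm = LLM(model="TheBloke/Llama-2-7B-AWQ", quantization="awq")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)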
2 changes: 1 addition & 1 deletion vllm/lora/layers.py
@@ -39,7 +39,7 @@ def _get_lora_device(base_layer: nn.Module) -> torch.device:
# unquantizedLinear
if hasattr(base_layer, "weight"):
return base_layer.weight.device
-# GPTQ/AWQ/SqueezeLLM
+# GPTQ/AWQ
elif hasattr(base_layer, "qweight"):
return base_layer.qweight.device
# marlin