
Use w8a8 quantized matmul Pallas kernel #19170

Status: Draft. Wants to merge 2 commits into base: main.
32 changes: 32 additions & 0 deletions tests/v1/tpu/test_basic.py
@@ -67,6 +67,38 @@ def test_basic(
assert "1024" in output or "0, 1" in output


@pytest.mark.skipif(not current_platform.is_tpu(),
reason="This is a basic test for TPU only")
def test_w8a8_quantization(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None:
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens = 5
tensor_parallel_size = 1
max_num_seqs = 4

prompt = "The next numbers of the sequence " + ", ".join(
str(i) for i in range(1024)) + " are:"
example_prompts = [prompt]

with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

with vllm_runner(
model,
max_num_batched_tokens=64,
max_model_len=4096,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
output = vllm_outputs[0][1]

assert "1024" in output or "0, 1" in output
Review comment (Contributor, severity: medium):

The assertion assert "1024" in output or "0, 1" in output is the same as in the test_basic function. For a test specifically targeting w8a8 quantization, could this assertion be made more specific to validate the correctness of the quantization itself? For example, comparing against known-good outputs for this quantized model or checking for specific numerical properties might provide stronger validation.
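
As a sketch of what a stricter check could look like, the helper below pins the expected greedy continuation for the fixed counting prompt. The expected prefix is a placeholder assumption, not a verified output of this checkpoint; it would need to be captured from a known-good run before being used in CI.

```python
def assert_w8a8_completion(output: str) -> None:
    """Stricter check sketched from the review suggestion.

    EXPECTED_PREFIX is a placeholder; capture it from a known-good run of
    the quantized checkpoint before relying on it in CI.
    """
    EXPECTED_PREFIX = "1024, 1025"  # assumed continuation, verify on TPU
    assert output.startswith(EXPECTED_PREFIX), (
        f"unexpected w8a8 completion: {output!r}")
```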



TP_SIZE_8 = 8


30 changes: 20 additions & 10 deletions vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -5,6 +5,8 @@
from typing import Optional

import torch
# Required to register custom ops.
import torch_xla.experimental.custom_kernel # noqa: F401
from functorch.experimental.control_flow import cond # noqa: F401

from vllm.model_executor.layers.quantization.utils import replace_parameter
@@ -90,16 +92,24 @@ def apply_weights(self,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        w_q, w_s, _, _, _ = self._get_weight_params(layer)

        import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
        out = torch.ops.xla.quantized_matmul(x,
                                             w_q,
                                             w_s,
                                             zero_point=None,
                                             block_size=-1,
                                             int4_weight=False,
                                             quantize_activation=True)
        # `quantized_matmul` output is fp32, cast it down to bf16 for perf
        out = out.to(x.dtype)
        # import torch_xla.experimental.xla_quantized_matmul  # noqa: F401
        # out = torch.ops.xla.quantized_matmul(x,
        #                                      w_q,
        #                                      w_s,
        #                                      zero_point=None,
        #                                      block_size=-1,
        #                                      int4_weight=False,
        #                                      quantize_activation=True)
        # # `quantized_matmul` output is fp32, cast it down to bf16 for perf
        # out = out.to(x.dtype)

        out = torch.ops.xla.quantized_matmul_int8(
            x,
            w_q,
            w_s,
            quantize_activation=True,
        )
Review comment on lines +106 to +111 (Contributor, severity: high):

The previous implementation using torch.ops.xla.quantized_matmul included a cast out = out.to(x.dtype) with the comment "quantized_matmul output is fp32, cast it down to bf16 for perf".

What is the output data type of the new torch.ops.xla.quantized_matmul_int8? If it also outputs in a higher precision (e.g., fp32) and x.dtype is a lower-precision format like bfloat16, is a similar cast to x.dtype still necessary for performance or for type consistency with subsequent layers? If the new op already handles this or outputs directly in x.dtype, then this change is fine.
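
One low-risk way to address this defensively, sketched here with the tensors already in scope in apply_weights and assuming (not asserting) that quantized_matmul_int8 may return fp32, is to cast only when the dtypes actually differ:

```python
# Defensive variant: the cast is a no-op if the kernel already returns
# x.dtype, and restores the old bf16 behavior if it returns fp32.
out = torch.ops.xla.quantized_matmul_int8(
    x,
    w_q,
    w_s,
    quantize_activation=True,
)
if out.dtype != x.dtype:
    out = out.to(x.dtype)  # keep activations in the model dtype downstream
```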


        # Explicitly capture control flow to make dynamo happy.
        # https://pytorch.org/docs/main/generated/exportdb/index.html#cond-branch-class-method # noqa: E501
        return cond(bias is None, self.no_add_bias, self.add_bias, [out, bias])
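
For context on what the kernel computes, here is a minimal eager-mode sketch of a w8a8 matmul, assuming int8 weights with per-output-channel scales and dynamic per-token activation quantization. This is illustrative reference math only, not the Pallas kernel and not the exact contract of quantized_matmul_int8:

```python
import torch


def w8a8_matmul_reference(x: torch.Tensor, w_q: torch.Tensor,
                          w_s: torch.Tensor) -> torch.Tensor:
    # x:   [tokens, in_features] activations (e.g. bf16)
    # w_q: [out_features, in_features] int8 weights (assumed layout)
    # w_s: [out_features] per-output-channel weight scales
    # Dynamically quantize activations to int8, one scale per token (row).
    x_s = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127.0
    x_q = torch.clamp(torch.round(x / x_s), -128, 127).to(torch.int8)
    # Accumulate the integer matmul in int32, then rescale to real values.
    acc = torch.matmul(x_q.to(torch.int32), w_q.to(torch.int32).t())
    return (acc.to(torch.float32) * x_s * w_s).to(x.dtype)
```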
3 changes: 3 additions & 0 deletions vllm/v1/worker/tpu_worker.py
@@ -99,6 +99,9 @@ def init_device(self):
        # ring, the xla tpu compiler flag
        # `xla_tpu_force_1d_allreduce_at_chunk_count` is a temporary solution to
        # fix this. It will be removed after the bug in XLA compiler is fixed.
        # os.environ["LIBTPU_INIT_ARGS"] = (
        #     "--xla_tpu_force_1d_allreduce_at_chunk_count=1
        #     --xla_jf_conv_input_fusion=False")
        os.environ["LIBTPU_INIT_ARGS"] = (
            "--xla_tpu_force_1d_allreduce_at_chunk_count=1")
        torch.set_grad_enabled(False)