vllm-project
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
Lines changed: 10 additions & 10 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu
Lines changed: 4 additions & 5 deletions
@@ -48,8 +48,8 @@ steps:
   commands:
   - bash standalone_tests/pytorch_nightly_dependency.sh
 
-- label: Async Engine, Inputs, Utils, Worker Test # 36min
-  timeout_in_minutes: 50
+- label: Async Engine, Inputs, Utils, Worker Test # 10min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -616,9 +616,9 @@ steps:
   - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
-- label: LM Eval Small Models # 53min
-  timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+- label: LM Eval Small Models # 15min
+  timeout_in_minutes: 20
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -627,8 +627,8 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
-- label: OpenAI API correctness # 22min
-  timeout_in_minutes: 30
+- label: OpenAI API correctness # 10min
+  timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
@@ -859,10 +859,10 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
-- label: Multi-Modal Accuracy Eval (Small Models) # 50min
-  mirror_hardwares: [amdexperimental]
+- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
-  timeout_in_minutes: 70
+  timeout_in_minutes: 15
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - vllm/multimodal/
 
@@ -221,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*
 
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
@@ -1,6 +1,5 @@
 #include "scaled_mm_kernels.hpp"
 #include "scaled_mm_sm100_fp8_dispatch.cuh"
-#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 
 namespace vllm {
 
@@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a,
   if (bias) {
     TORCH_CHECK(bias->dtype() == out.dtype(),
                 "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<true>(out, a, b, a_scales,
+                                                      b_scales, *bias);
   } else {
-    return cutlass_scaled_mm_sm100_fp8_epilogue<c3x::ScaledEpilogue>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_sm100_fp8_epilogue<false>(out, a, b, a_scales,
+                                                       b_scales);
   }
 }