
Commit 8d69392

misc changes
Signed-off-by: ca1207 <ca1207zzz@gmail.com>
1 parent 1154e07 commit 8d69392

8 files changed, +83 −59 lines

benchmarks/kernels/benchmark_polynorm.py

Lines changed: 29 additions & 31 deletions
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch

@@ -23,8 +22,16 @@ def norm(x, eps: float):
         return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)

     x = x.float()
-    return (weight[0] * norm(x**3, eps) + weight[1] * norm(x**2, eps) +
-            weight[2] * norm(x, eps) + bias).to(weight.dtype).view(orig_shape)
+    return (
+        (
+            weight[0] * norm(x**3, eps)
+            + weight[1] * norm(x**2, eps)
+            + weight[2] * norm(x, eps)
+            + bias
+        )
+        .to(weight.dtype)
+        .view(orig_shape)
+    )


 def polynorm_vllm(
@@ -44,18 +51,14 @@ def polynorm_vllm(
     return output


-def calculate_diff(batch_size, seq_len, hidden_size):
+def calculate_diff(batch_size, seq_len, hidden_dim):
     dtype = torch.bfloat16
-    x = torch.randn(batch_size,
-                    seq_len,
-                    hidden_size,
-                    dtype=dtype,
-                    device="cuda")
+    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
     weight = torch.ones(3, dtype=dtype, device="cuda")
     bais = torch.ones(1, dtype=dtype, device="cuda")

-    output_naive = polynorm_naive(x.clone(), weight, bais)
-    output_vllm = polynorm_vllm(x.clone(), weight, bais)
+    output_naive = polynorm_naive(x, weight, bais)
+    output_vllm = polynorm_vllm(x, weight, bais)

     if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
         print("✅ All implementations match")
@@ -65,47 +68,42 @@ def calculate_diff(batch_size, seq_len, hidden_size):

 batch_size_range = [2**i for i in range(0, 7, 2)]
 seq_length_range = [2**i for i in range(6, 11, 1)]
-head_num_range = [32, 48]
-configs = list(
-    itertools.product(head_num_range, batch_size_range, seq_length_range))
+dim_range = [2048, 4096]
+configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))


 def get_benchmark():
-
     @triton.testing.perf_report(
         triton.testing.Benchmark(
-            x_names=["head_num", "batch_size", "seq_len"],
+            x_names=["dim", "batch_size", "seq_len"],
             x_vals=[list(_) for _ in configs],
             line_arg="provider",
             line_vals=["naive", "vllm"],
             line_names=["Naive", "vLLM"],
             styles=[("blue", "-"), ("red", "-")],
             ylabel="us",
-            plot_name=f"polynorm-perf",
+            plot_name="polynorm-perf",
             args={},
-        ))
-    def benchmark(head_num, batch_size, seq_len, provider):
+        )
+    )
+    def benchmark(dim, batch_size, seq_len, provider):
         dtype = torch.bfloat16
-        hidden_size = head_num * 128  # assuming head_dim = 128
+        hidden_dim = dim * 4

-        x = torch.randn(batch_size,
-                        seq_len,
-                        hidden_size,
-                        dtype=dtype,
-                        device="cuda")
+        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
         weight = torch.ones(3, dtype=dtype, device="cuda")
         bias = torch.ones(1, dtype=dtype, device="cuda")

         quantiles = [0.5, 0.2, 0.8]

         if provider == "naive":
             ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x.clone(), weight, bias),
+                lambda: polynorm_naive(x, weight, bias),
                 quantiles=quantiles,
             )
         else:
             ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x.clone(), weight, bias),
+                lambda: polynorm_vllm(x, weight, bias),
                 quantiles=quantiles,
             )

@@ -131,10 +129,10 @@ def benchmark(head_num, batch_size, seq_len, provider):
         help="Sequence length",
     )
     parser.add_argument(
-        "--hidden-size",
+        "--hidden-dim",
         type=int,
-        default=4096,
-        help="Hidden size (2nd dimension) of the sequence",
+        default=8192,
+        help="Intermediate size of MLP",
     )
     parser.add_argument(
         "--save-path",
@@ -149,7 +147,7 @@ def benchmark(head_num, batch_size, seq_len, provider):
     calculate_diff(
         batch_size=args.batch_size,
         seq_len=args.seq_len,
-        hidden_size=args.hidden_size,
+        hidden_dim=args.hidden_dim,
     )

     benchmark = get_benchmark()
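
For reference, polynorm_naive (the baseline the vLLM kernel is compared against) RMS-normalizes x, x**2, and x**3 and mixes them with a three-element weight plus a bias. A self-contained sketch reconstructed from the hunk above; the wrapper signature, the reshaping, and the default eps are assumptions, only the inner norm() and the return expression are verbatim from the diff:

    import torch


    def polynorm_naive(
        x: torch.Tensor,
        weight: torch.Tensor,  # shape [3]
        bias: torch.Tensor,    # shape [1]
        eps: float = 1e-6,
    ):
        # Assumed wrapper: flatten to 2D, normalize, then restore the shape.
        orig_shape = x.shape
        x = x.view(-1, x.shape[-1])

        def norm(x, eps: float):
            # RMS-normalize along the last (hidden) dimension.
            return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)

        x = x.float()
        return (
            (
                weight[0] * norm(x**3, eps)
                + weight[1] * norm(x**2, eps)
                + weight[2] * norm(x, eps)
                + bias
            )
            .to(weight.dtype)
            .view(orig_shape)
        )

With the renamed CLI flag, a correctness check might be run as: python benchmarks/kernels/benchmark_polynorm.py --hidden-dim 8192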

csrc/layernorm_kernels.cu

Lines changed: 34 additions & 17 deletions
@@ -203,7 +203,7 @@ struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 scalar_t* __restrict__ input,         // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
                  const scalar_t* __restrict__ weight,  // [3]
                  const scalar_t* __restrict__ bias,    // [1]
                  const float epsilon, const int hidden_size) {
@@ -215,7 +215,7 @@ poly_norm_kernel(scalar_t* __restrict__ out,  // [..., hidden_size]
      not aliased in practice. Argument pointers should not be dereferenced
      in this kernel as that would be undefined behavior */
   auto* __restrict__ input_v =
-      reinterpret_cast<_f16VecPN<scalar_t, width>*>(input);
+      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
   const int vec_hidden_size = hidden_size / width;
   float variance = 0.0f;
   float variance2 = 0.0f;
@@ -231,14 +231,22 @@ poly_norm_kernel(scalar_t* __restrict__ out,  // [..., hidden_size]
     variance3 += x6;
   }

-  using BlockReduce = cub::BlockReduce<float, 1024>;
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);

-  variance = BlockReduce(reduceStore).Sum(variance, blockDim.x);
-  __syncthreads();
-  variance2 = BlockReduce(reduceStore).Sum(variance2, blockDim.x);
-  __syncthreads();
-  variance3 = BlockReduce(reduceStore).Sum(variance3, blockDim.x);
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;

   __shared__ float s_w2_inv_std;
   __shared__ float s_w1_inv_std2;
@@ -273,7 +281,7 @@ poly_norm_kernel(scalar_t* __restrict__ out,  // [..., hidden_size]
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
-                 scalar_t* __restrict__ input,         // [..., hidden_size]
+                 const scalar_t* __restrict__ input,   // [..., hidden_size]
                  const scalar_t* __restrict__ weight,  // [3]
                  const scalar_t* __restrict__ bias,    // [1]
                  const float epsilon, const int hidden_size) {
@@ -292,14 +300,22 @@ poly_norm_kernel(scalar_t* __restrict__ out,  // [..., hidden_size]
     variance3 += x6;
   }

-  using BlockReduce = cub::BlockReduce<float, 1024>;
+  float3 thread_variances = make_float3(variance, variance2, variance3);
+
+  struct SumOp {
+    __device__ float3 operator()(const float3& a, const float3& b) const {
+      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+  };
+
+  using BlockReduce = cub::BlockReduce<float3, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
+  float3 block_variances =
+      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);

-  variance = BlockReduce(reduceStore).Sum(variance, blockDim.x);
-  __syncthreads();
-  variance2 = BlockReduce(reduceStore).Sum(variance2, blockDim.x);
-  __syncthreads();
-  variance3 = BlockReduce(reduceStore).Sum(variance3, blockDim.x);
+  variance = block_variances.x;
+  variance2 = block_variances.y;
+  variance3 = block_variances.z;

   __shared__ float s_w2_inv_std;
   __shared__ float s_w1_inv_std2;
@@ -323,8 +339,9 @@ poly_norm_kernel(scalar_t* __restrict__ out,  // [..., hidden_size]
     float x2 = x * x;
     float x3 = x2 * x;

-    out[blockIdx.x * hidden_size + idx] = (scalar_t)(
-        x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 + s_bias);
+    out[blockIdx.x * hidden_size + idx] =
+        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
+                   s_bias);
   }
 }
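
The substantive change in both kernel variants is the block reduction: three sequential cub::BlockReduce<float>::Sum calls, separated by __syncthreads() so the shared temp storage could be reused, are fused into a single cub::BlockReduce<float3>::Reduce with a custom SumOp, so the block-wide sums of x^2, x^4, and x^6 come out of one reduction. For orientation, the per-row math those sums feed matches the naive Python reference in the benchmark; a minimal sketch, assuming the sums are turned into means over hidden_size and an eps of 1e-6 (neither detail is visible in this hunk):

    import torch


    def poly_norm_rowwise(x_row: torch.Tensor, weight: torch.Tensor,
                          bias: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # What one thread block computes for a single row; names mirror the kernel.
        x = x_row.float()
        variance = (x**2).mean()   # block-wide sum of x^2, divided by hidden_size
        variance2 = (x**4).mean()  # block-wide sum of x^4, divided by hidden_size
        variance3 = (x**6).mean()  # block-wide sum of x^6, divided by hidden_size
        s_w2_inv_std = weight[2].float() * torch.rsqrt(variance + eps)
        s_w1_inv_std2 = weight[1].float() * torch.rsqrt(variance2 + eps)
        s_w0_inv_std3 = weight[0].float() * torch.rsqrt(variance3 + eps)
        s_bias = bias[0].float()
        return (x * s_w2_inv_std + x**2 * s_w1_inv_std2 +
                x**3 * s_w0_inv_std3 + s_bias).to(x_row.dtype)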

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -382,6 +382,7 @@ th {
 | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `MotifForCausalLM` | Motif-1-Tiny | `Motif-Technologies/Motif-2.6B`, `Motif-Technologies/Motif-2.6b-v1.1-LC`, etc. | | ✅︎ | |
 | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | ✅︎ |
 | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |

tests/kernels/core/test_layernorm.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@

 from tests.kernels.quant_utils import FP8_DTYPE
 from tests.kernels.utils import opcheck
-from vllm.model_executor.layers.layernorm import RMSNorm, PolyNorm
+from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
 from vllm.platforms import current_platform

 DTYPES = [torch.half, torch.bfloat16, torch.float]

tests/models/registry.py

Lines changed: 2 additions & 1 deletion
@@ -258,7 +258,8 @@ def check_available_online(
                                              {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
     "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"),  # noqa: E501
     "MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
-                                        trust_remote_code=True),
+                                        trust_remote_code=True,
+                                        v0_only=True),
     "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
     "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
     "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),

tests/models/test_initialization.py

Lines changed: 3 additions & 2 deletions
@@ -65,8 +65,9 @@ def _initialize_kv_caches_v1(self, vllm_config):
                            _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
             m.setenv("VLLM_USE_V1", "0")
-        if model_arch == "Phi4FlashForCausalLM":
-            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
+            # Phi4FlashForCausalLM and MotifForCausalLM
+            # only supports DIFFERENTIAL_FLASH_ATTN backend
             m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
         if model_arch == "GptOssForCausalLM":
             # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
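
Outside the test harness, the same backend pin can be applied through the environment before vLLM is imported; a rough sketch mirroring what the test sets via monkeypatch (the model choice and generate call are illustrative only):

    import os

    # Motif, like Phi4Flash, currently needs the differential flash-attention
    # backend; the registry change above also marks it as v0-only.
    os.environ["VLLM_ATTENTION_BACKEND"] = "DIFFERENTIAL_FLASH_ATTN"
    os.environ["VLLM_USE_V1"] = "0"

    from vllm import LLM

    llm = LLM(model="Motif-Technologies/Motif-2.6B", trust_remote_code=True)
    print(llm.generate("Hello")[0].outputs[0].text)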

vllm/attention/backends/differential_flash_attn.py

Lines changed: 3 additions & 0 deletions
@@ -734,6 +734,7 @@ def forward_generate_kv_cache(
                 window_size=self.sliding_window,
                 alibi_slopes=self.alibi_slopes,
                 softcap=self.logits_soft_cap,
+                fa_version=self.vllm_flash_attn_version,
             )
             assert prefill_output.shape == output[:
                                                   num_prefill_tokens].shape
@@ -755,6 +756,7 @@ def forward_generate_kv_cache(
                     window_size=self.sliding_window,
                     alibi_slopes=self.alibi_slopes,
                     softcap=self.logits_soft_cap,
+                    fa_version=self.vllm_flash_attn_version,
                 ).squeeze(1)
             except Exception as e:
                 logger.error("Error in PagedAttention.forward_decode: %s",
@@ -787,6 +789,7 @@ def forward_with_kv_cache_only(
             window_size=self.sliding_window,
             alibi_slopes=self.alibi_slopes,
             softcap=self.logits_soft_cap,
+            fa_version=self.vllm_flash_attn_version,
         ).squeeze(1)
         return output

vllm/model_executor/models/motif.py

Lines changed: 10 additions & 7 deletions
@@ -19,7 +19,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.layernorm import RMSNorm, PolyNorm
+from vllm.model_executor.layers.layernorm import PolyNorm, RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
@@ -72,17 +72,20 @@ def __init__(
             prefix=f"{prefix}.down_proj",
         )
         if hidden_act != "poly_norm":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only poly_norm is supported for now.")
+            raise NotImplementedError(f"Unsupported activation: {hidden_act}. "
+                                      "Only poly_norm is supported for now.")
         self.act_fn = PolyNorm()
         self.intermediate_size = intermediate_size
-        self.tp_size = get_tensor_model_parallel_world_size()
+        tp_size = get_tensor_model_parallel_world_size()
+        if hidden_act == "poly_norm" and tp_size > 1:
+            raise NotImplementedError(
+                "Tensor parallelism for poly_norm is not supported yet. "
+                "Support will be added in the future.")

     def forward(self, x):
         x, _ = self.gate_up_proj(x)
         x = self.act_fn(
-            x[..., :self.intermediate_size //
-              self.tp_size]) * x[..., self.intermediate_size // self.tp_size:]
+            x[..., :self.intermediate_size]) * x[..., self.intermediate_size:]
         x, _ = self.down_proj(x)
         return x

@@ -175,7 +178,7 @@ def __init__(
         self.lambda_k2 = nn.Parameter(
             torch.zeros(self.head_dim, dtype=torch.float32).normal_(mean=0,
                                                                     std=0.1))
-        self.subln = RMSNorm(2 * self.head_dim, eps=1e-5)
+        self.subln = RMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)

         params = {
             'differential_flash_attention_config': {
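
The MLP change drops the per-rank slicing by self.tp_size (tensor parallelism with poly_norm now raises NotImplementedError instead), so the activation is applied to the first intermediate_size channels of the fused gate_up projection and gated by the remaining channels. A single-GPU sketch of that forward pass, using plain nn.Linear and a stand-in RMS activation in place of vLLM's parallel linear layers and PolyNorm (all names here are illustrative):

    import torch
    import torch.nn as nn


    class MotifMLPSketch(nn.Module):
        def __init__(self, hidden_size: int, intermediate_size: int):
            super().__init__()
            self.intermediate_size = intermediate_size
            # Stand-ins for MergedColumnParallelLinear / RowParallelLinear.
            self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size)
            self.down_proj = nn.Linear(intermediate_size, hidden_size)
            # Stand-in for PolyNorm: a plain RMS normalization.
            self.act_fn = lambda t: t / torch.sqrt(
                t.pow(2).mean(-1, keepdim=True) + 1e-6)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            x = self.gate_up_proj(x)
            # Activation on the first intermediate_size channels, gated by the rest.
            x = (self.act_fn(x[..., :self.intermediate_size]) *
                 x[..., self.intermediate_size:])
            return self.down_proj(x)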
