@@ -1,3 +1,4 @@
+import math
 from dataclasses import dataclass
 from typing import Any, Tuple
 
@@ -63,6 +64,7 @@ class MLA(nnx.Module):
     attention_chunk_size: int | None = None
     rope_input_ordering: str = "split"
     quant: Any | None = None
+    rope_mscale_all_dim: float = 1.0
 
     def __post_init__(self):
         self.N = self.num_attention_heads
@@ -73,6 +75,13 @@ def __post_init__(self):
 
         assert self.N == self.K, "N and K must be equal for MLA"
 
+        if self.rope_scaling["factor"] <= 1.0:
+            yarn_mscale = 1.0
+        else:
+            yarn_mscale = 0.1 * self.rope_mscale_all_dim * math.log(
+                self.rope_scaling["factor"]) + 1.0
+        self.scale = self.qk_head_dim ** -0.5 * yarn_mscale ** 2
+
         self.rope = DeepseekScalingRotaryEmbedding(
             self.qk_rope_head_dim,
             self.rope_theta,
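The hunk above computes the attention softmax scale once in __post_init__: the YaRN mscale correction (0.1 * mscale_all_dim * ln(factor) + 1.0, squared) is folded into the usual 1/sqrt(qk_head_dim) factor. Below is a minimal standalone sketch of that computation; the example values qk_head_dim=192 and factor=40.0 are illustrative, not taken from this commit.

import math

def mla_softmax_scale(qk_head_dim: int, rope_factor: float,
                      rope_mscale_all_dim: float = 1.0) -> float:
    # YaRN mscale: 0.1 * mscale_all_dim * ln(factor) + 1.0, with no correction
    # when the context is not actually extended (factor <= 1).
    if rope_factor <= 1.0:
        yarn_mscale = 1.0
    else:
        yarn_mscale = 0.1 * rope_mscale_all_dim * math.log(rope_factor) + 1.0
    # Base 1/sqrt(d) scale times the squared mscale, mirroring self.scale above.
    return qk_head_dim ** -0.5 * yarn_mscale ** 2

# factor=40 gives yarn_mscale ~= 1.369, so the scale rises from ~0.0722 to ~0.135.
print(mla_softmax_scale(192, 40.0))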
@@ -180,7 +189,6 @@ def __call__(self,
             # Concatenate the nope and rope queries.
             q_TNH = jnp.concatenate([q_nope_TNH, q_rope_TNH], axis=-1)
             # Multiple the query by scaling factor
-            q_TNH = q_TNH * self.qk_head_dim ** -0.5
             q_TNH = nnx.with_sharding_constraint(q_TNH, self.query_tnh)
 
         with jax.named_scope("kv_proj"):
@@ -293,7 +301,7 @@ def attention(
         def _ragged_paged_attention(*args):
             return ragged_paged_attention(
                 *args,
-                sm_scale=q_TNH.shape[-1] ** -0.5,
+                sm_scale=self.scale,
             )
 
         output_TNH, kv_cache = jax.jit(
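With the scale folded into the kernel's sm_scale argument, the explicit query rescale removed above is no longer needed: scaling the queries before the dot product and scaling the attention logits yield the same softmax. A quick sanity check in plain jax.numpy; the shapes and the scale value are arbitrary examples, not tied to this model.

import jax
import jax.numpy as jnp

q = jax.random.normal(jax.random.PRNGKey(0), (4, 8))  # [query tokens, head_dim]
k = jax.random.normal(jax.random.PRNGKey(1), (6, 8))  # [kv tokens, head_dim]
scale = 0.135  # stand-in for self.scale

# Scale the queries first (the old __call__ behaviour) ...
pre_scaled = jax.nn.softmax((q * scale) @ k.T, axis=-1)
# ... versus scaling the logits, the conventional meaning of sm_scale.
logit_scaled = jax.nn.softmax((q @ k.T) * scale, axis=-1)

assert jnp.allclose(pre_scaled, logit_scaled, atol=1e-6)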