[Qwen3 Next] Use numerically stable rsqrt (#40848)

thalahors · web-flow · commit fc5f9105da6b · 2025-09-15T12:45:13.000+02:00
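Use the numerically stable inverse square root (`torch.rsqrt`) in `l2norm` instead of `1 / torch.sqrt(...)`, in both the modeling and modular Qwen3-Next files.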
diff --git a/src/transformers/models/qwen3_next/modeling_qwen3_next.py b/src/transformers/models/qwen3_next/modeling_qwen3_next.py
@@ -435,7 +435,7 @@ def torch_causal_conv1d_update(
 
 def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6):
     """This function is intended to align with the l2norm implementation in the FLA library."""
-    inv_norm = 1 / torch.sqrt((x * x).sum(dim=dim, keepdim=True) + eps)
+    inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps)
     return x * inv_norm
 
 
diff --git a/src/transformers/models/qwen3_next/modular_qwen3_next.py b/src/transformers/models/qwen3_next/modular_qwen3_next.py
@@ -271,7 +271,7 @@ def torch_causal_conv1d_update(
 
 def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6):
     """This function is intended to align with the l2norm implementation in the FLA library."""
-    inv_norm = 1 / torch.sqrt((x * x).sum(dim=dim, keepdim=True) + eps)
+    inv_norm = torch.rsqrt((x * x).sum(dim=dim, keepdim=True) + eps)
     return x * inv_norm