Skip to content

Commit 071ab90

Browse files
committed
Disables tensor sanitization in attention ops
Removes NaN/Inf zeroing in forward and backward paths to avoid masking numerical issues and to reduce overhead. Preserves raw outputs/gradients for easier debugging and correctness checks; callers can sanitize if required.
1 parent 059776d commit 071ab90

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

flash_dmattn/flash_dmattn_interface.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,7 @@ def _flash_dmattn_forward(
9595
softcap,
9696
return_softmax,
9797
)
98-
_sanitize_tensors(out, nan=0.0, posinf=0.0, neginf=0.0)
98+
# _sanitize_tensors(out, nan=0.0, posinf=0.0, neginf=0.0)
9999
return out, softmax_lse, S_dmask
100100

101101

@@ -163,7 +163,7 @@ def _flash_dmattn_varlen_forward(
163163
softcap,
164164
return_softmax,
165165
)
166-
_sanitize_tensors(out, nan=0.0, posinf=0.0, neginf=0.0)
166+
# _sanitize_tensors(out, nan=0.0, posinf=0.0, neginf=0.0)
167167
return out, softmax_lse, S_dmask
168168

169169

@@ -247,7 +247,7 @@ def _flash_dmattn_backward(
247247
softcap,
248248
deterministic,
249249
)
250-
_sanitize_tensors(dq, dk, dv, dbias, nan=0.0, posinf=0.0, neginf=0.0)
250+
# _sanitize_tensors(dq, dk, dv, dbias, nan=0.0, posinf=0.0, neginf=0.0)
251251
return softmax_d
252252

253253

@@ -335,7 +335,7 @@ def _flash_dmattn_varlen_backward(
335335
softcap,
336336
deterministic,
337337
)
338-
_sanitize_tensors(dq, dk, dv, nan=0.0, posinf=0.0, neginf=0.0)
338+
# _sanitize_tensors(dq, dk, dv, nan=0.0, posinf=0.0, neginf=0.0)
339339
return softmax_d
340340

341341

0 commit comments

Comments (0)