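Summary of the change: the diff below stops nonzero dropout from forcing the GPU flash-attention path onto the Triton fallback kernel. `dropout_rate != 0.0` is removed from the fallback predicate, and the caller's `dropout_rate` is passed through to the cuDNN kernel instead of a hard-coded `0.0`.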
axlearn/common/flash_attention — 1 file changed: +1 −2 lines changed

@@ -224,7 +224,6 @@ def get_segment_ids(segment_ids: SegmentIdAttentionBias) -> Optional[Tensor]:
         or mask.has_value()
         or jnp.float32 in (query.dtype, key.dtype, value.dtype)
         or query.shape[1] != key.shape[1]
-        or dropout_rate != 0.0
     ):
         logging.warning("Flash attention falling back to Triton GPU kernel.")
         logging.warning("explicit_bias after extracting mask: %s", explicit_bias.value())
@@ -253,7 +252,7 @@ def get_segment_ids(segment_ids: SegmentIdAttentionBias) -> Optional[Tensor]:
             bias=explicit_bias.value(),
             softmax_scale=softmax_scale,
             causal=causal.has_value(),
-            dropout_rate=0.0,
+            dropout_rate=dropout_rate,
         )

    elif backend == "tpu":
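For intuition about what the cuDNN kernel is expected to compute once `dropout_rate` is passed through, here is a minimal pure-JAX reference of scaled dot-product attention with dropout applied to the attention probabilities. This is an illustrative sketch only: the function name, the `[batch, seq, heads, dim]` layout, and the RNG handling are assumptions, not axlearn's or cuDNN's actual API.

import jax
import jax.numpy as jnp

def reference_attention(query, key, value, *, softmax_scale, dropout_rate, dropout_key):
    # query/key/value: [batch, seq, num_heads, head_dim] (assumed layout).
    logits = jnp.einsum("bqhd,bkhd->bhqk", query, key) * softmax_scale
    probs = jax.nn.softmax(logits, axis=-1)
    if dropout_rate > 0.0:
        # Drop attention probabilities and rescale the survivors, so the
        # expected output is unchanged (standard inverted-dropout semantics).
        keep = jax.random.bernoulli(dropout_key, 1.0 - dropout_rate, probs.shape)
        probs = jnp.where(keep, probs / (1.0 - dropout_rate), 0.0)
    return jnp.einsum("bhqk,bkhd->bqhd", probs, value)

# bf16 inputs with equal query/key lengths satisfy the predicate above and
# would stay on the cuDNN path rather than falling back to Triton.
q = jax.random.normal(jax.random.PRNGKey(0), (2, 128, 8, 64), dtype=jnp.bfloat16)
out = reference_attention(q, q, q, softmax_scale=64**-0.5,
                          dropout_rate=0.1, dropout_key=jax.random.PRNGKey(1))

With the diff applied, enabling dropout during training no longer changes which kernel runs; it only changes the `dropout_rate` argument handed to the cuDNN kernel.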