FlashAttention needs contiguous gradients

This was not visible before because the test was creating the gradient tensor with the wrong dtype; when that tensor was cast to the right dtype, the cast yielded a contiguous tensor, so the non-contiguous gradient case was never actually exercised.
facebookresearch · fmassa · Nov 29, 2022 · Nov 28, 2022 · Nov 29, 2022 · Nov 28, 2022
commit 5922cdd2a101461cbf86219d375f04e9a464246a
diff --git a/tests/test_mem_eff_attention.py b/tests/test_mem_eff_attention.py
@@ -605,7 +605,7 @@ def test_backward(
 
     grad_out = torch.ones_like(out)
     if grad_out_contiguous is False:
-        grad_out = torch.tensor([1.0], device=device)[None, None, :].expand_as(out)
+        grad_out = torch.tensor([1.0], dtype=query.dtype, device=device)[None, None, :].expand_as(out)
 
     out.backward(grad_out)
     del out

diff --git a/xformers/ops/memory_efficient_attention.py b/xformers/ops/memory_efficient_attention.py
@@ -624,7 +624,7 @@ def _backward(cls, ctx, grad, saved_tensors):
 
         assert grad.dtype in cls.SUPPORTED_DTYPES
         cls._flash_attn_backward(
-            grad.reshape(ctx.kernel_output_shape),
+            grad.reshape(ctx.kernel_output_shape).contiguous(),
             q,
             k,
             v,