
Commit 3fbc94b

speedup sdpa_mask for mindspore (#2113)
Parent: 7fda79a

2 files changed: +19 −19 lines

mindnlp/core/ops/creation.py

Lines changed: 18 additions & 18 deletions
@@ -183,24 +183,24 @@ def empty(*size, dtype=None, device=None, requires_grad=False, pin_memory=False,
     if dtype is None:
         dtype = get_default_dtype()

-    if device:
-        if not isinstance(device, str) and hasattr(device, "type"):
-            device = device.type
-        if device.lower() == 'cpu':
-            device = 'CPU'
-        elif device.lower() == 'npu':
-            device = 'Ascend'
-        elif device.lower() == 'cuda':
-            device = 'GPU'
-    else:
-        device = 'meta'
-
-    # To avoid the problem in irecv and recv of using empty.
-    if device not in ['meta', 'GPU']:
-        out = mindspore.mint.empty(size, dtype=dtype, device=device)
-    else:
-        out = CTensor(dtype=dtype, shape=size)
-    out = mindspore.Tensor(out)
+    # if device:
+    #     if not isinstance(device, str) and hasattr(device, "type"):
+    #         device = device.type
+    #     if device.lower() == 'cpu':
+    #         device = 'CPU'
+    #     elif device.lower() == 'npu':
+    #         device = 'Ascend'
+    #     elif device.lower() == 'cuda':
+    #         device = 'GPU'
+    # else:
+    #     device = 'meta'
+
+    # # To avoid the problem in irecv and recv of using empty.
+    # if device not in ['meta', 'GPU']:
+    #     out = mindspore.mint.empty(size, dtype=dtype, device=device)
+    # else:
+    out = CTensor(dtype=dtype, shape=size)
+    out = mindspore.Tensor(out)
     # else:
     #     out = np.empty(size, dtype=dtype2np[dtype])
     #     out = mindspore.Tensor(out)
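Net effect of this hunk: `empty` no longer normalizes the `device` argument ('cpu' → 'CPU', 'npu' → 'Ascend', 'cuda' → 'GPU') and no longer dispatches to `mindspore.mint.empty` for CPU/Ascend; every call now builds an uninitialized `CTensor` and wraps it. A minimal sketch of the live code path after the change, assuming creation.py's existing `get_default_dtype` helper and `CTensor` import; the parameters elided in the hunk header are abbreviated to `**kwargs`:

    # Sketch of empty() as it reads after this commit, not the verbatim file.
    def empty(*size, dtype=None, device=None, requires_grad=False,
              pin_memory=False, **kwargs):  # **kwargs stands in for elided params
        if dtype is None:
            dtype = get_default_dtype()
        # The device-normalization branch and the mint.empty path are now
        # commented out, so allocation is one uninitialized C tensor
        # regardless of the requested device.
        out = CTensor(dtype=dtype, shape=size)
        out = mindspore.Tensor(out)
        ...  # the rest of the function lies outside this hunk

One consequence worth noting: with the branch commented out, `device` is accepted but effectively ignored on this path, and the string comparisons plus the `mindspore.mint.empty` dispatch disappear from every allocation, which is presumably where part of the speedup comes from.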

mindnlp/transformers/masking_utils.py

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ def _ignore_causal_mask_sdpa(
     allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
     passed).
     """
-    is_tracing = core.jit.is_tracing() or isinstance(padding_mask, core.fx.Proxy) or is_torchdynamo_compiling()
+    is_tracing = core.jit.is_tracing() or isinstance(padding_mask, core.fx.Proxy)
     if padding_mask is not None and padding_mask.shape[-1] > kv_length:
         mask_indices = core.arange(kv_length, device=padding_mask.device)
         mask_indices += kv_offset
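The one-line change drops `is_torchdynamo_compiling()` from the tracing check. Torchdynamo is a PyTorch compiler probe with no MindSpore counterpart, so evaluating it on every mask construction added overhead while (presumably) always returning false in this environment; `is_tracing` now covers only MindSpore graph tracing and FX proxies. A rough sketch of how the flag gates the early exit, simplified from the fuller logic in masking_utils.py (the real function takes more arguments and handles more cases):

    # Simplified sketch, not the verbatim function.
    def _ignore_causal_mask_sdpa(padding_mask, kv_length, kv_offset):
        # After this commit: no torchdynamo probe in the tracing check.
        is_tracing = core.jit.is_tracing() or isinstance(padding_mask, core.fx.Proxy)
        if is_tracing:
            # Data-dependent shortcuts are unsafe while tracing: keep the mask.
            return False
        # With no padded positions, the causal mask can be dropped entirely.
        return padding_mask is None or bool(padding_mask.all())

Returning True tells the caller it can skip materializing the causal mask and pass `attn_mask=None` to scaled dot-product attention, matching the docstring's note about dispatching to the flash attention kernel.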
