Refactor atomic operations in CUDA templates for improved readability

LeiWang1999 · LeiWang1999 · commit dede99f7f232 · 2025-09-24T17:59:07.000+08:00
- Reformatted atomic operation implementations in atomic.h for better code clarity.
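- Adjusted function signatures in tilelang's atomic.py to enhance readability by aligning parameters (one per line), dropped unused imports, and simplified the `return_type` selection in `atomic_addx2` and `atomic_addx4`. Note that this is not purely cosmetic: `atomic_addx2` now uses `dst.dtype` as the return type when `return_prev` is set, and `atomic_addx4` no longer consults `return_prev` when choosing between `"float4"` and `"handle"`.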
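- Cleaned up unnecessary whitespace and comments in customize.py to streamline the codebase: removed the license header comment, unused imports, and runs of blank lines, and marked the `.atomic` import with `# noqa: F401`.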
diff --git a/src/tl_templates/cuda/atomic.h b/src/tl_templates/cuda/atomic.h
@@ -59,10 +59,12 @@ TL_DEVICE T1 AtomicMaxRet(T1 *address, T2 val,
   using NT1 = typename normalize_atomic_type<T1>::type;
   if constexpr (std::is_same_v<NT1, half> ||
                 std::is_same_v<NT1, __nv_bfloat16>) {
-    return static_cast<T1>(atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
+    return static_cast<T1>(
+        atomicMax(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-    return static_cast<T1>(aref.fetch_max(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
+    return static_cast<T1>(
+        aref.fetch_max(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
   }
 }
 
@@ -85,10 +87,12 @@ TL_DEVICE T1 AtomicMinRet(T1 *address, T2 val,
   using NT1 = typename normalize_atomic_type<T1>::type;
   if constexpr (std::is_same_v<NT1, half> ||
                 std::is_same_v<NT1, __nv_bfloat16>) {
-    return static_cast<T1>(atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
+    return static_cast<T1>(
+        atomicMin(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-    return static_cast<T1>(aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
+    return static_cast<T1>(
+        aref.fetch_min(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
   }
 }
 
@@ -111,10 +115,12 @@ TL_DEVICE T1 AtomicAddRet(T1 *address, T2 val,
   using NT1 = typename normalize_atomic_type<T1>::type;
   if constexpr (std::is_same_v<NT1, half> ||
                 std::is_same_v<NT1, __nv_bfloat16>) {
-    return static_cast<T1>(atomicAdd(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
+    return static_cast<T1>(
+        atomicAdd(reinterpret_cast<NT1 *>(address), static_cast<NT1>(val)));
   } else {
     cuda::atomic_ref<NT1, cuda::thread_scope_device> aref(*address);
-    return static_cast<T1>(aref.fetch_add(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
+    return static_cast<T1>(
+        aref.fetch_add(cuda_cast<NT1>(val), cuda::memory_order(memory_order)));
   }
 }
 
diff --git a/src/tl_templates/cuda/common.h b/src/tl_templates/cuda/common.h
@@ -4,10 +4,10 @@
 #include <cuda_runtime.h>
 #endif
 
+#include "atomic.h"
 #include <cutlass/fast_math.h>
 #include <cutlass/numeric_types.h>
 #include <math_constants.h>
-#include "atomic.h"
 
 using cutlass::bfloat16_t;
 using cutlass::half_t;
@@ -138,7 +138,6 @@ TL_DEVICE unsigned int cast_smem_ptr_to_int(const void *const smem_ptr) {
   return smem_int;
 }
 
-
 // DP4A
 template <typename InDatatype, typename OutDatatype>
 TL_DEVICE /**
diff --git a/src/transform/legalize_safe_memory_access.cc b/src/transform/legalize_safe_memory_access.cc
@@ -235,7 +235,8 @@ class SafeMemorysRewriter : public StmtExprMutator {
 
   bool IsLocalBuffer(const Buffer &buffer) {
     String scope = buffer.scope();
-    return scope == "local" || scope == "local.fragment" || scope == "local.var";
+    return scope == "local" || scope == "local.fragment" ||
+           scope == "local.var";
   }
 
   bool isSharedBuffer(const Buffer &buffer) {
diff --git a/tilelang/language/atomic.py b/tilelang/language/atomic.py
@@ -4,8 +4,8 @@
 
 import tilelang.language as T
 from tvm import ir
-from tvm.tir import PrimExpr, Buffer, BufferLoad, BufferRegion, Var, op
-from typing import List, Union, Optional
+from tvm.tir import PrimExpr, Buffer, BufferRegion, Var, op
+from typing import Optional
 
 _MEMORY_ORDER_ID_MAP = {
     "relaxed": 0,
@@ -17,7 +17,10 @@
 }
 
 
-def atomic_max(dst: Buffer, value: PrimExpr, memory_order: Optional[str] = None, return_prev: bool = False) -> PrimExpr:
+def atomic_max(dst: Buffer,
+               value: PrimExpr,
+               memory_order: Optional[str] = None,
+               return_prev: bool = False) -> PrimExpr:
     """
     Perform an atomic maximum on the value stored at dst with an optional memory-order.
 
@@ -61,7 +64,10 @@ def atomic_max(dst: Buffer, value: PrimExpr, memory_order: Optional[str] = None,
                              _MEMORY_ORDER_ID_MAP[memory_order])
 
 
-def atomic_min(dst: Buffer, value: PrimExpr, memory_order: Optional[str] = None, return_prev: bool = False) -> PrimExpr:
+def atomic_min(dst: Buffer,
+               value: PrimExpr,
+               memory_order: Optional[str] = None,
+               return_prev: bool = False) -> PrimExpr:
     """
     Atomically update the value at dst to the minimum of its current value and value.
 
@@ -107,7 +113,10 @@ def atomic_min(dst: Buffer, value: PrimExpr, memory_order: Optional[str] = None,
                              _MEMORY_ORDER_ID_MAP[memory_order])
 
 
-def atomic_add(dst: Buffer, value: PrimExpr, memory_order: Optional[str] = None, return_prev: bool = False) -> PrimExpr:
+def atomic_add(dst: Buffer,
+               value: PrimExpr,
+               memory_order: Optional[str] = None,
+               return_prev: bool = False) -> PrimExpr:
     """
     Atomically add `value` into `dst`, returning a handle to the operation.
 
@@ -210,7 +219,8 @@ def _to_region(data, access_type):
     # Note: tile-region-based atomic operations don't support return_prev yet
     # This would need to be implemented in the tile runtime
     if return_prev:
-        raise NotImplementedError("return_prev is not supported for tile-region-based atomic operations")
+        raise NotImplementedError(
+            "return_prev is not supported for tile-region-based atomic operations")
 
     return T.call_intrin("handle", op.Op.get("tl.atomicadd"), value, dst)
 
@@ -249,19 +259,7 @@ def atomic_addx2(dst: Buffer, value: PrimExpr, return_prev: bool = False) -> Pri
         >>>             atomic_addx2(global_grads[i, j:j+2], grads[i, j:j+2])
     """
     func_name = "AtomicAddx2Ret" if return_prev else "AtomicAddx2"
-    return_type = "handle"  # For vector operations, we need to determine the appropriate return type
-
-    if return_prev:
-        # For return types, we need to infer the vector type based on dst.dtype
-        if "half" in str(dst.dtype).lower():
-            return_type = "half2"
-        elif "bfloat16" in str(dst.dtype).lower():
-            return_type = "__nv_bfloat162"
-        elif "float" in str(dst.dtype).lower():
-            return_type = "float2"
-        else:
-            return_type = "handle"  # Fallback
-
+    return_type = dst.dtype if return_prev else "handle"
     return T.call_extern(return_type, func_name, T.address_of(dst), T.address_of(value))
 
 
@@ -299,15 +297,7 @@ def atomic_addx4(dst: Buffer, value: PrimExpr, return_prev: bool = False) -> Pri
         >>> atomic_addx4(rgba_dst, rgba_add)  # Atomic blend of all 4 channels
     """
     func_name = "AtomicAddx4Ret" if return_prev else "AtomicAddx4"
-    return_type = "handle"
-
-    if return_prev:
-        # For float4 operations
-        if "float" in str(dst.dtype).lower():
-            return_type = "float4"
-        else:
-            return_type = "handle"  # Fallback
-
+    return_type = "float4" if "float" in str(dst.dtype).lower() else "handle"
     return T.call_extern(return_type, func_name, T.address_of(dst), T.address_of(value))
 
 
@@ -402,4 +392,4 @@ def atomic_store(dst: Buffer, src: PrimExpr, memory_order: str = "seq_cst") -> P
         >>> atomic_store(log_counter, 0)  # Reset counter atomically
     """
     return T.call_extern("handle", "AtomicStore", T.address_of(dst), src,
-                         _MEMORY_ORDER_ID_MAP[memory_order])
+                         _MEMORY_ORDER_ID_MAP[memory_order])
diff --git a/tilelang/language/customize.py b/tilelang/language/customize.py
@@ -1,13 +1,9 @@
-# Copyright (c) Tile-AI Corporation.
-# Licensed under the MIT License.
 """The language interface for tl programs."""
 
 import tilelang.language as T
-from tvm import ir
-from tvm.tir import PrimExpr, Buffer, BufferLoad, BufferRegion, Var, op
-from typing import List, Union, Optional
-from .atomic import atomic_max, atomic_min, atomic_add, atomic_addx2, atomic_addx4, atomic_load, atomic_store
-
+from tvm.tir import PrimExpr, Buffer, BufferLoad, BufferRegion, op
+from typing import List, Union
+from .atomic import atomic_max, atomic_min, atomic_add, atomic_addx2, atomic_addx4, atomic_load, atomic_store  # noqa: F401
 
 
 def region(buffer: BufferLoad, access_type: str, *args: PrimExpr):
@@ -97,16 +93,6 @@ def buffer_region_to_tile_region(buffer_region: BufferRegion, access_type: str,
     return region(T.BufferLoad(buffer_region.buffer, mins), access_type, *region_extents)
 
 
-
-
-
-
-
-
-
-
-
-
 def dp4a(A: Buffer, B: Buffer, C: Buffer) -> PrimExpr:
     """Perform a 4-element dot product with accumulation (DP4A).
 
@@ -163,7 +149,3 @@ def view(src: Buffer,
     if dtype is None:
         dtype = src.dtype
     return T.Tensor(shape, dtype, src.data)
-
-
-
-

Original file line number	Diff line number	Diff line change
`@@ -235,7 +235,8 @@ class SafeMemorysRewriter : public StmtExprMutator {`
`235`	`235`
`236`	`236`	`bool IsLocalBuffer(const Buffer &buffer) {`
`237`	`237`	`String scope = buffer.scope();`
`238`		`- return scope == "local" || scope == "local.fragment" || scope == "local.var";`
	`238`	`+ return scope == "local" || scope == "local.fragment" ||`
	`239`	`+ scope == "local.var";`
`239`	`240`	`}`
`240`	`241`
`241`	`242`	`bool isSharedBuffer(const Buffer &buffer) {`