10 changes: 6 additions & 4 deletions src/op/atomic_add.cc
@@ -37,9 +37,9 @@ static int GetArchInt(Target target) {
int arch_int = 0;
auto s = target->GetAttr<String>("arch");
ICHECK(s.defined());
const char *arch_str = s.value().c_str();
if (arch_str[0] == 's' && arch_str[1] == 'm' && arch_str[2] == '_') {
arch_int = atoi(&arch_str[3]);
std::string arch = s.value();
if (arch.rfind("sm_", 0) == 0) {
arch_int = std::stoi(arch.substr(3));
} else {
arch_int = 0;
}
Comment on lines +40 to 45
⚠️ Potential issue

Avoid std::stoi; handle “sm_90a” and prevent exceptions in GetArchInt

std::stoi can throw (std::invalid_argument when the substring has no leading digits, std::out_of_range on overflow), which is a behavior change from the previous lenient atoi-based parsing. Parsing only the leading digit span with std::from_chars avoids exceptions while still handling suffixed arch strings such as “sm_90a”.

Apply this diff:

-  std::string arch = s.value();
-  if (arch.rfind("sm_", 0) == 0) {
-    arch_int = std::stoi(arch.substr(3));
-  } else {
-    arch_int = 0;
-  }
+  std::string arch = s.value();
+  if (arch.rfind("sm_", 0) == 0) {
+    const char* first = arch.data() + 3;
+    const char* last  = arch.data() + arch.size();
+    const char* p = first;
+    while (p < last && std::isdigit(static_cast<unsigned char>(*p))) ++p;
+    if (p > first) {
+      int v = 0;
+      auto res = std::from_chars(first, p, v);
+      if (res.ec == std::errc()) arch_int = v;
+    }
+  } else {
+    arch_int = 0;
+  }

Add required headers (outside the hunk):

#include <charconv>
#include <cctype>
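
For quick local verification, here is a minimal standalone sketch of the digit-span parse suggested above; the ParseArchInt name and the sample strings are illustrative only and not part of the patch:

#include <cctype>
#include <charconv>
#include <cstdio>
#include <string>

// Parse the numeric arch from strings like "sm_90" or "sm_90a".
// Returns 0 when the prefix or the digit span is missing, mirroring the
// lenient behavior of the original atoi-based code.
static int ParseArchInt(const std::string &arch) {
  int arch_int = 0;
  if (arch.rfind("sm_", 0) == 0) {
    const char *first = arch.data() + 3;
    const char *last = arch.data() + arch.size();
    const char *p = first;
    while (p < last && std::isdigit(static_cast<unsigned char>(*p))) ++p;
    if (p > first) {
      int v = 0;
      auto res = std::from_chars(first, p, v);
      if (res.ec == std::errc()) arch_int = v;
    }
  }
  return arch_int;
}

int main() {
  std::printf("%d\n", ParseArchInt("sm_90a"));  // 90
  std::printf("%d\n", ParseArchInt("sm_80"));   // 80
  std::printf("%d\n", ParseArchInt("gfx942"));  // 0 (no "sm_" prefix)
  return 0;
}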

@@ -255,7 +255,7 @@ PrimExpr AtomicAddNode::MakePredicate(arith::Analyzer *analyzer,
*/
For AtomicAddNode::MakeSIMTLoop(arith::Analyzer *analyzer) const {
Array<IterVar> loop_vars = MakeIterVars();
bool is_scalar = loop_vars.size() == 0;
bool is_scalar = loop_vars.empty();
if (is_scalar) {
return For(Var("i"), 0, 1, ForKind::kSerial,
BufferStore(dst, BufferLoad(src, {0}), {0}));
@@ -425,5 +425,7 @@ TIR_REGISTER_TL_OP(AtomicAdd, atomicadd)
.set_attr<TCallEffectKind>("TCallEffectKind",
Integer(CallEffectKind::kOpaque));

TVM_FFI_STATIC_INIT_BLOCK({ AtomicAddNode::RegisterReflection(); });

} // namespace tl
} // namespace tvm
123 changes: 40 additions & 83 deletions src/op/atomic_add.h
@@ -1,7 +1,6 @@
/*!
* \file tl/op/atomic_add.h
* \brief Define atomic add operator.
*
* \brief Atomic addition operations for concurrent memory updates
*/

#ifndef TVM_TL_OP_ATOMIC_ADD_H_
@@ -10,91 +9,20 @@
#include "operator.h"
#include "parallel.h"

/**
* Lower this tile operator into a TIR statement for the given lowering context.
*
* @param T Lowering context containing mapped buffers and iteration
* information.
* @param analyzer Arithmetic analyzer used to simplify and reason about
* expressions.
* @return A TIR Stmt that implements the atomic-add tile operation for the
* provided context.
*/
/**
* Infer memory/layout mapping for tensors and buffers used by this operator.
*
* @param T Layout inference context providing buffer and shape information.
* @param level Inference aggressiveness level; higher levels may perform more
* speculative decisions.
* @return A LayoutMap describing inferred layouts for the operator's inputs and
* outputs.
*/
/**
* Get the Op registration that identifies this tile operator.
*
* @return A reference to the registered Op representing this operator.
*/
/**
* Create a deep copy of this tile operator node wrapped as a TileOperator.
*
* @return A TileOperator handle owning a cloned AtomicAddNode.
*/
/**
* Construct a SIMT-style For loop nest (thread/block mapping) appropriate for
* the operator.
*
* @param analyzer Arithmetic analyzer used to simplify loop bounds and
* predicates.
* @return A For loop node representing the SIMT-parallel loop structure.
*/
/**
* Create iteration variables used by this operator's loop nest.
*
* @return An array of IterVar objects describing the loop iteration axes.
*/
/**
* Produce index expressions for either source or destination buffer access
* based on iteration vars.
*
* @param ivs IterVars created by MakeIterVars().
* @param src_dst Selects which indices to produce: 0 for source indices, 1 for
* destination indices.
* @return An array of PrimExpr index expressions suitable for indexing the
* selected buffer.
*/
/**
* Build a predicate expression that guards out-of-bounds or conditional
* accesses for src or dst.
*
* @param analyzer Arithmetic analyzer used to simplify the predicate.
* @param ivs IterVars created by MakeIterVars().
* @param extents The loop extents corresponding to the itervars.
* @param src_dst Selects which side the predicate is for: 0 for source, 1 for
* destination.
* @return A PrimExpr boolean predicate that evaluates to true for valid
* iterations.
*/
/**
* Construct an AtomicAdd tile operator from operation arguments and a buffer
* mapping.
*
* @param args Operation arguments (e.g., values or indices) specific to the
* atomic-add semantics.
* @param vmap Mapping from buffer names to Buffer objects used by this
* operator.
*/
namespace tvm {
namespace tl {

using namespace tir;

/// Node class for atomic addition operations
class AtomicAddNode : public TileOperatorNode {
public:
Buffer src, dst;
Array<Range> src_range, dst_range;
IntImm coalesced_width;
Buffer src, dst; ///< Source and destination buffers
Array<Range> src_range,
dst_range; ///< Access ranges for source and destination
IntImm coalesced_width; ///< Width for memory coalescing optimization

mutable ParallelOp par_op_;
mutable ParallelOp par_op_; ///< Associated parallel operation
static constexpr const char *_type_key = "tl.AtomicAdd";
TVM_DECLARE_FINAL_OBJECT_INFO(AtomicAddNode, TileOperatorNode);

@@ -104,18 +32,47 @@ class AtomicAddNode : public TileOperatorNode {
static const Op &Get();
TileOperator Clone() const;

static void RegisterReflection() {
namespace refl = tvm::ffi::reflection;
refl::ObjectDef<AtomicAddNode>()
.def_ro("src", &AtomicAddNode::src)
.def_ro("dst", &AtomicAddNode::dst)
.def_ro("src_range", &AtomicAddNode::src_range)
.def_ro("dst_range", &AtomicAddNode::dst_range)
.def_ro("coalesced_width", &AtomicAddNode::coalesced_width);
}
Comment on lines +35 to +43

💡 Verification agent

🧩 Analysis chain

Ensure RegisterReflection() runs at static init.

Please verify a TU calls AtomicAddNode::RegisterReflection() (and node type registration) during startup; otherwise Python reflection won’t work.


🏁 Script executed:

#!/bin/bash
# Expect: at least one call to RegisterReflection() and node/op registrations.
rg -n -C2 -e 'AtomicAddNode::RegisterReflection\(' -e 'TVM_REGISTER_NODE_TYPE\(\s*AtomicAddNode\s*\)' -e 'TIR_REGISTER_TL_OP\(\s*AtomicAdd'

Length of output: 663


Register node type for AtomicAddNode
Add a TVM_REGISTER_NODE_TYPE(AtomicAddNode); invocation (e.g. in src/op/atomic_add.cc, just above the existing TVM_FFI_STATIC_INIT_BLOCK) so the node type is registered with FFI before calling AtomicAddNode::RegisterReflection(). Without it, Python reflection will not work.

🤖 Prompt for AI Agents
In src/op/atomic_add.h around lines 35 to 43 the reflection registration for
AtomicAddNode is defined but the node type itself is not registered with TVM's
FFI, causing Python reflection to fail; add a
TVM_REGISTER_NODE_TYPE(AtomicAddNode); call (place it in src/op/atomic_add.cc
just above the existing TVM_FFI_STATIC_INIT_BLOCK) so the node type is
registered before AtomicAddNode::RegisterReflection() is invoked.
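
As a short sketch, the two registrations could sit together near the end of src/op/atomic_add.cc as suggested above; TVM_REGISTER_NODE_TYPE is the standard TVM macro, and the static-init block is the one already added in this PR:

// src/op/atomic_add.cc (near the end of the file, inside namespace tvm::tl)

// Register the node type with TVM's object system so FFI/reflection can
// resolve the "tl.AtomicAdd" type key.
TVM_REGISTER_NODE_TYPE(AtomicAddNode);

// Run the reflection registration during static initialization.
TVM_FFI_STATIC_INIT_BLOCK({ AtomicAddNode::RegisterReflection(); });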


bool SEqualReduce(const AtomicAddNode *other, SEqualReducer equal) const {
return equal(src, other->src) && equal(dst, other->dst) &&
equal(src_range, other->src_range) &&
equal(dst_range, other->dst_range) &&
equal(coalesced_width, other->coalesced_width);
}

void SHashReduce(SHashReducer hash_reduce) const {
hash_reduce(src);
hash_reduce(dst);
hash_reduce(src_range);
hash_reduce(dst_range);
hash_reduce(coalesced_width);
}

static constexpr bool _type_has_method_sequal_reduce = true;
static constexpr bool _type_has_method_shash_reduce = true;

protected:
/// Create SIMT-style parallel loop structure
For MakeSIMTLoop(arith::Analyzer *analyzer) const;
Comment on lines +64 to 65

⚠️ Potential issue

Bug: scalar path in MakeSIMTLoop does a plain store, not an atomic add.

In src/op/atomic_add.cc, the is_scalar branch emits BufferStore(dst, src) instead of an atomic add. This silently changes semantics.

Patch suggestion (cc file):

@@
-  if (is_scalar) {
-    return For(Var("i"), 0, 1, ForKind::kSerial,
-               BufferStore(dst, BufferLoad(src, {0}), {0}));
-  }
+  if (is_scalar) {
+    Array<PrimExpr> new_args;
+    new_args.push_back(StringImm("AtomicAdd"));
+    PrimExpr src_value = BufferLoad(src, {0});
+    if (src->dtype != dst->dtype) {
+      src_value = Cast(dst->dtype, src_value);
+    }
+    PrimExpr dst_value = BufferLoad(dst, {0});
+    Call address_of_value =
+        tvm::tir::Call(DataType::Handle(), builtin::address_of(), {dst_value});
+    new_args.push_back(address_of_value);
+    new_args.push_back(src_value);
+    Call atomicadd_call =
+        tvm::tir::Call(dst->dtype, builtin::call_extern(), new_args);
+    return For(Var("i"), 0, 1, ForKind::kSerial, Evaluate(atomicadd_call));
+  }
🤖 Prompt for AI Agents
In src/op/atomic_add.h around lines 64-65, the scalar path in MakeSIMTLoop
currently emits a plain BufferStore(dst, src) which changes semantics; replace
the scalar-store with the same atomic-add operation used by the vector/SIMT path
(use the existing BufferAtomicAdd/atomic add intrinsic or helper used
elsewhere), preserving operand order, return/old-value handling, and any
required memory ordering flags; include any missing header or forward
declaration needed for the atomic helper and add/update tests to cover the
scalar branch.

/// Generate iteration variables for loop nest
Array<IterVar> MakeIterVars() const;

// ivs: itervars returned by MakeIterVars()
// src_dst: 0 for src_indices, 1 for dst_indices
/// Generate buffer indices from iteration variables
Array<PrimExpr> MakeIndices(const Array<IterVar> &ivs, int src_dst) const;

/// Create boundary predicate for memory safety
PrimExpr MakePredicate(arith::Analyzer *analyzer, const Array<IterVar> &ivs,
Array<PrimExpr> extents, int src_dst) const;
};

/// Wrapper class for atomic addition operations
class AtomicAdd : public TileOperator {
public:
TVM_DEFINE_OBJECT_REF_METHODS(AtomicAdd, TileOperator, AtomicAddNode);
8 changes: 6 additions & 2 deletions src/op/copy.cc
@@ -297,7 +297,7 @@ PrimExpr CopyNode::MakePredicate(arith::Analyzer *analyzer,
*/
For CopyNode::MakeSIMTLoop(arith::Analyzer *analyzer) const {
Array<IterVar> loop_vars = MakeIterVars();
bool is_scalar = loop_vars.size() == 0;
bool is_scalar = loop_vars.empty();
if (is_scalar) {
return For(Var("i"), 0, 1, ForKind::kSerial,
BufferStore(dst, BufferLoad(src, {0}), {0}));
@@ -1197,7 +1197,7 @@ Stmt CopyNode::LowerBulkCopy(const LowerArgs &T, arith::Analyzer *analyzer,
int swizzle;
int max_dim;
};
static const SwizzleCheck swizzle_checks[] = {
static const std::vector<SwizzleCheck> swizzle_checks = {
{static_cast<int>(CU_TENSOR_MAP_SWIZZLE_32B), 32},
{static_cast<int>(CU_TENSOR_MAP_SWIZZLE_64B), 64},
{static_cast<int>(CU_TENSOR_MAP_SWIZZLE_128B), 128},
@@ -1559,5 +1559,9 @@ TIR_REGISTER_TL_OP(Conv2DIm2ColOp, c2d_im2col)
.set_attr<TCallEffectKind>("TCallEffectKind",
Integer(CallEffectKind::kOpaque));

TVM_FFI_STATIC_INIT_BLOCK({
CopyNode::RegisterReflection();
Conv2DIm2ColOpNode::RegisterReflection();
});
} // namespace tl
} // namespace tvm