Commit 23ef354

Merge branch 'main' of https://github.com/tile-ai/tilelang into v2_1106
2 parents: 3f211ae + 47039f0

7 files changed (+533 / -18 lines)


CMakeLists.txt

Lines changed: 1 addition & 5 deletions
@@ -230,12 +230,8 @@ target_include_directories(tilelang_objs PRIVATE ${TILE_LANG_INCLUDES})
 
 add_library(tilelang SHARED $<TARGET_OBJECTS:tilelang_objs>)
 add_library(tilelang_module SHARED $<TARGET_OBJECTS:tilelang_objs>)
-target_link_libraries(tilelang PUBLIC tvm_runtime)
+target_link_libraries(tilelang PUBLIC tvm_runtime tvm)
 target_link_libraries(tilelang_module PUBLIC tvm)
-if(APPLE)
-  # FIXME: libtilelang should only link against tvm runtime
-  target_link_libraries(tilelang PUBLIC tvm)
-endif()
 # Build cython extension
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})

src/layout/layout.cc

Lines changed: 6 additions & 2 deletions
@@ -5,6 +5,7 @@
 
 #include "layout.h"
 #include <tvm/ffi/reflection/registry.h>
+#include <tvm/runtime/logging.h>
 
 #include <tvm/arith/pattern.h>
 #include <tvm/tir/op.h>
@@ -255,8 +256,11 @@ std::pair<Layout, arith::IterMapLevel> LayoutNode::InverseWithLevel() const {
   }
   arith::IterMapResult res =
       arith::DetectIterMap(forward_index_, getVarMap(), 1, level, &analyzer);
-  ICHECK(res->errors.empty())
-      << "Layout " << DebugOutput() << " has errors: " << res->errors;
+  if (!res->errors.empty()) {
+    std::ostringstream msg;
+    msg << "Layout " << DebugOutput() << " has errors: " << res->errors;
+    throw NormalizeIterException(msg.str());
+  }
 
   auto outputs_shape = OutputShape();
   Array<PrimExpr> outputs;

src/layout/utils.cc

Lines changed: 20 additions & 8 deletions
@@ -115,6 +115,10 @@ Array<IterSplitExpr> get_unused_iters(const IterMark &mark,
   return results;
 }
 
+// Heuristic: detect per-iterator gaps ("unused" pieces) even when the iterator
+// appears in fused forms across multiple index expressions. We first normalize
+// every index into IterSumExpr, collect all splits per source Var, then
+// consolidate them to avoid misclassifying a used split as unused.
 Array<IterSplitExpr> DivideUnusedIterators(const Array<PrimExpr> &exprs,
                                            const Array<IterVar> input_iters,
                                            Analyzer *analyzer) {
@@ -134,17 +138,25 @@ Array<IterSplitExpr> DivideUnusedIterators(const Array<PrimExpr> &exprs,
   }
 
   for (const IterVar &iter : input_iters) {
-    IterMark iv_mark;
+    // Merge splits from all IterMark that share the same source Var as `iter`.
+    std::vector<IterSplitExpr> merged_splits;
     for (const IterMark &mark : collector.visited_) {
-      if (mark->source.as<Var>()->same_as(iter->var)) { // NOLINT(*)
-        iv_mark = mark;
-        break;
+      auto vexpr = mark->source.as<Var>();
+      if (vexpr && vexpr.value().same_as(iter->var)) {
+        auto it = collector.mark2splits_.find(mark);
+        if (it != collector.mark2splits_.end()) {
+          const auto &vec = it->second;
+          merged_splits.insert(merged_splits.end(), vec.begin(), vec.end());
+        }
       }
     }
-    if (iv_mark.defined()) {
-      auto splits =
-          get_unused_iters(iv_mark, collector.mark2splits_[iv_mark], analyzer);
-      // Put the small axis last
+
+    if (!merged_splits.empty()) {
+      // Use a unified mark (Var + full extent) to compute the missing pieces
+      // so that fused usages are honored as "used" and not reintroduced.
+      IterMark unified_mark(iter->var, iter->dom->extent);
+      auto splits = get_unused_iters(unified_mark, merged_splits, analyzer);
+      // Put the small axis last for a flattened ordering.
       results.insert(results.end(), splits.rbegin(), splits.rend());
    } else if (!is_one(iter->dom->extent)) {
      auto mark = IterMark(iter->var, iter->dom->extent);
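
The heuristic above is exactly what the new regression test in this commit exercises: a T.Parallel(BLOCK_MN, BLOCK_K) loop that reads a fragment through the fused index idx = i * BLOCK_K + j, split as (idx // VEC_SIZE, idx % VEC_SIZE). As a purely illustrative aid (plain Python enumeration, not tilelang or TVM code; used_fraction is a made-up helper), the sketch below checks the property the merged-splits handling must preserve: every loop point maps to a distinct index pair, so no piece of i or j is actually unused and no replicate iterator should be introduced.

from itertools import product

def used_fraction(extents, index_fns):
    """Count distinct index tuples produced over the whole loop space."""
    points = list(product(*[range(e) for e in extents]))
    images = {tuple(fn(*p) for fn in index_fns) for p in points}
    return len(images), len(points)

BLOCK_MN, BLOCK_K, VEC_SIZE = 32, 64, 32

# Fused access pattern from the new test kernel:
#   idx = i * BLOCK_K + j;  read a_fp32_local[idx // VEC_SIZE, idx % VEC_SIZE]
fused = [
    lambda i, j: (i * BLOCK_K + j) // VEC_SIZE,
    lambda i, j: (i * BLOCK_K + j) % VEC_SIZE,
]

distinct, total = used_fraction((BLOCK_MN, BLOCK_K), fused)
print(distinct, total)  # 2048 2048: the fused indices cover the loop space bijectively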

src/op/parallel.cc

Lines changed: 56 additions & 1 deletion
@@ -620,11 +620,66 @@ Fragment ParallelOpNode::CompleteBufferFragment(const Buffer &buffer) const {
   if (IsCommonAccessIndice(buffer)) {
     return loop_layout_;
   }
+  // Prefer a simple path: if original 2D indices form a bijective map, invert
+  // them directly and avoid introducing a synthetic replicate dimension.
+  {
+    auto res2d =
+        arith::DetectIterMap(indice_map_[buffer], ToVMap(loop_vars_), 1,
+                             arith::IterMapLevel::Bijective,
+                             const_cast<arith::Analyzer *>(&analyzer_));
+    if (res2d->errors.empty()) {
+      Layout ind_inv2d = Layout(loop_vars_, indice_map_[buffer])->Inverse();
+      PrimExpr indice_rep_extent = 1;
+      PrimExpr loop_rep_extent = loop_layout_->ReplicateExtent();
+      PrimExpr dest_buffer_rep_extent = indice_rep_extent * loop_rep_extent;
+      Array<PrimExpr> fwd2;
+      for (size_t i = 0; i < buffer->shape.size(); i++) {
+        fwd2.push_back(InputPlaceholder(i));
+      }
+      PrimExpr thd_b2 =
+          loop_layout_->ForwardThread(ind_inv2d->Forward(fwd2), std::nullopt);
+      return Fragment(buffer->shape, {}, thd_b2, dest_buffer_rep_extent,
+                      std::nullopt)
+          ->CondenseReplicateVar();
+    }
+  }
+  // Otherwise, infer an extra flattened iterator that captures truly-unused
+  // pieces of the loop space (if any), then try inversion with it.
   PrimExpr rep_b = MakeFlattenedExpression(
       DivideUnusedIterators(indice_map_[buffer], loop_vars_, &analyzer_));
   auto bijective_indice = indice_map_[buffer];
   bijective_indice.push_back(rep_b);
-  Layout ind_inv = Layout(loop_vars_, bijective_indice)->Inverse();
+  Layout layout_before_inv = Layout(loop_vars_, bijective_indice);
+
+  // Pre-check cardinality to guard non-bijective combinations after adding
+  // rep_b.
+  PrimExpr in_prod = 1;
+  for (const auto &iv : loop_vars_)
+    in_prod *= iv->dom->extent;
+  PrimExpr out_prod = 1;
+  for (const auto &d : layout_before_inv->OutputShape())
+    out_prod *= d;
+
+  if (!analyzer_.CanProveEqual(in_prod, out_prod)) {
+    DLOG(WARNING) << " Non-bijective mapping after appending rep_b; falling "
+                     "back to no-rep inversion.";
+    Layout ind_inv_fallback =
+        Layout(loop_vars_, indice_map_[buffer])->Inverse();
+    PrimExpr indice_rep_extent = 1;
+    PrimExpr loop_rep_extent = loop_layout_->ReplicateExtent();
+    PrimExpr dest_buffer_rep_extent = indice_rep_extent * loop_rep_extent;
+    Array<PrimExpr> fwd2;
+    for (size_t i = 0; i < buffer->shape.size(); i++) {
+      fwd2.push_back(InputPlaceholder(i));
+    }
+    PrimExpr thd_b = loop_layout_->ForwardThread(
+        ind_inv_fallback->Forward(fwd2), std::nullopt);
+    return Fragment(buffer->shape, {}, thd_b, dest_buffer_rep_extent,
+                    std::nullopt)
+        ->CondenseReplicateVar();
+  }
+
+  Layout ind_inv = layout_before_inv->Inverse();
   PrimExpr indice_rep_extent =
       ind_inv->InputShape().back(); // this is the size of rep_b
   PrimExpr loop_rep_extent = loop_layout_->ReplicateExtent();
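
The cardinality pre-check added above guards the inversion: the candidate layout (the buffer indices plus the appended rep_b axis) can only be bijective if its output space is exactly as large as the loop space. A minimal numeric sketch of that comparison, with made-up example extents rather than values from any real kernel:

from math import prod

loop_extents = [32, 64]        # extents of loop_vars_ (illustrative values)
output_shape = [64, 32, 1]     # OutputShape() of layout_before_inv, including the rep_b axis
in_prod = prod(loop_extents)   # 2048
out_prod = prod(output_shape)  # 2048

# Mirrors analyzer_.CanProveEqual(in_prod, out_prod): if the products differ,
# the code falls back to inverting the original indices without rep_b.
assert in_prod == out_prod
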
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+import pytest
+import torch
+
+import tilelang
+import tilelang.testing
+import tilelang.language as T
+
+tilelang.testing.set_random_seed()
+
+VEC_SIZE = 32
+
+
+@tilelang.jit
+def fused_index_kernel(B: int, M: int, N: int, BLOCK_MN: int, BLOCK_K: int):
+
+    @T.prim_func
+    def main(
+            a: T.Buffer((B, M, N), "bfloat16"),
+            a_out: T.Buffer((B, M, N), "float32"),
+    ):
+        with T.Kernel(
+                T.ceildiv(M, BLOCK_MN),
+                T.ceildiv(N, BLOCK_K),
+                B,
+                threads=128,
+        ) as (pid_m, pid_n, pid_b):
+            a_fp32_local = T.alloc_fragment((BLOCK_MN * BLOCK_K // VEC_SIZE, VEC_SIZE), "float32")
+            offs_m = pid_m * BLOCK_MN
+            offs_n = pid_n * BLOCK_K
+
+            for i, j in T.Parallel(BLOCK_MN, BLOCK_K):
+                idx = i * BLOCK_K + j
+                a_out[pid_b, offs_m + i, offs_n + j] = a_fp32_local[idx // VEC_SIZE, idx % VEC_SIZE]
+
+    return main
+
+
+def _require_cuda_tensor(shape, dtype):
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+    try:
+        return torch.randn(*shape, device="cuda", dtype=dtype)
+    except RuntimeError as err:
+        pytest.skip(f"CUDA runtime unavailable: {err}")
+
+
+def test_layout_infer_compiles_and_runs():
+    B, M, N = 1, 32, 64
+    BLOCK_MN, BLOCK_K = 32, 64
+    kernel = fused_index_kernel(B, M, N, BLOCK_MN, BLOCK_K)
+
+    a = _require_cuda_tensor((B, M, N), torch.bfloat16)
+    a_out = torch.empty((B, M, N), dtype=torch.float32, device=a.device)
+
+    # Ensure the kernel compiles and executes without layout inversion failure
+    kernel(a, a_out)
+
+    assert a_out.shape == a.shape
+    assert a_out.dtype == torch.float32
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()

tilelang/contrib/nvcc.py

Lines changed: 152 additions & 1 deletion
@@ -7,7 +7,10 @@
 import os
 import subprocess
 import warnings
-from tilelang.env import CUDA_HOME
+import contextlib
+from tilelang.env import CUDA_HOME, CUTLASS_INCLUDE_DIR, TILELANG_TEMPLATE_PATH
+import shutil
+import tempfile
 import tvm_ffi
 from tilelang import tvm as tvm
 from tvm.target import Target
@@ -125,6 +128,154 @@ def compile_cuda(code,
     return data
 
 
+def default_compile_options(compile_flags: list[str] | None = None) -> list[str]:
+    """
+    Build a set of default NVCC compile options for TileLang generated sources.
+
+    Includes C++ standard and common include paths (TileLang templates, CUTLASS,
+    CUDA include). Merges user-provided compile flags if given.
+
+    Parameters
+    ----------
+    compile_flags : Optional[List[str]]
+        Additional flags to include. Items are split on whitespace.
+
+    Returns
+    -------
+    List[str]
+        A list of flags suitable for NVCC's command line.
+    """
+    options: list[str] = ["-std=c++17"]
+    try:
+        if TILELANG_TEMPLATE_PATH:
+            options.append(f"-I{TILELANG_TEMPLATE_PATH}")
+    except Exception:
+        pass
+    try:
+        if CUTLASS_INCLUDE_DIR:
+            options.append(f"-I{CUTLASS_INCLUDE_DIR}")
+    except Exception:
+        pass
+    try:
+        if CUDA_HOME:
+            options.append(f"-I{os.path.join(CUDA_HOME, 'include')}")
+    except Exception:
+        pass
+
+    # Preserve user flags exactly, including repeated tokens required by NVCC
+    # (e.g., multiple "-gencode" pairs or repeated "-Xcompiler" entries).
+    if compile_flags:
+        import shlex
+        for flag in compile_flags:
+            # Split each string like a shell would, preserving quoted args
+            tokens = shlex.split(flag) if isinstance(flag, str) else [str(flag)]
+            options.extend(tokens)
+    return options
+
+
+def get_ptx_from_source(code: str,
+                        compile_flags: list[str] | None = None,
+                        verbose: bool = False) -> str:
+    """
+    Compile CUDA C++ source to PTX using NVCC and return as text.
+
+    Parameters
+    ----------
+    code : str
+        CUDA C++ kernel source code.
+    compile_flags : Optional[List[str]]
+        Additional flags merged with defaults.
+    verbose : bool
+        Print NVCC output when True.
+
+    Returns
+    -------
+    str
+        PTX text.
+    """
+    opts = default_compile_options(compile_flags)
+    ptx_bytes = compile_cuda(code, target_format="ptx", options=opts, verbose=verbose)
+    try:
+        return ptx_bytes.decode("utf-8")
+    except Exception:
+        return str(ptx_bytes)
+
+
+def _find_tool(name: str) -> str | None:
+    """Find a CUDA binary in PATH or under CUDA_HOME/bin."""
+    path = shutil.which(name)
+    if path:
+        return path
+    if CUDA_HOME:
+        candidate = os.path.join(CUDA_HOME, "bin", name)
+        if os.path.exists(candidate):
+            return candidate
+    return None
+
+
+def get_sass_from_source(code: str,
+                         compile_flags: list[str] | None = None,
+                         verbose: bool = False) -> str:
+    """
+    Compile CUDA C++ source to CUBIN and disassemble to SASS.
+
+    Uses nvdisasm if available; otherwise falls back to cuobjdump.
+
+    Parameters
+    ----------
+    code : str
+        CUDA C++ kernel source code.
+    compile_flags : Optional[List[str]]
+        Additional flags merged with defaults.
+    verbose : bool
+        Print tool outputs when True.
+
+    Returns
+    -------
+    str
+        SASS text.
+    """
+    opts = default_compile_options(compile_flags)
+    cubin_bytes = compile_cuda(code, target_format="cubin", options=opts, verbose=verbose)
+
+    # Write to a temp .cubin file
+    with tempfile.NamedTemporaryFile(suffix=".cubin", delete=False) as tmp:
+        tmp.write(cubin_bytes)
+        cubin_path = tmp.name
+
+    # Try disassembly tools (prefer nvdisasm, fall back to cuobjdump)
+    cand_nvdisasm = _find_tool("nvdisasm")
+    cand_cuobjdump = _find_tool("cuobjdump")
+    if not cand_nvdisasm and not cand_cuobjdump:
+        raise RuntimeError(
+            "Cannot find 'nvdisasm' or 'cuobjdump'. Please ensure the CUDA toolkit is installed and in PATH."
+        )
+    last_err: str | None = None
+    try:
+        # Attempt nvdisasm first
+        tools_to_try = []
+        if cand_nvdisasm:
+            tools_to_try.append(("nvdisasm", [cand_nvdisasm, cubin_path]))
+        if cand_cuobjdump:
+            tools_to_try.append(("cuobjdump", [cand_cuobjdump, "--dump-sass", cubin_path]))
+
+        for tool_name, cmd in tools_to_try:
+            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+            out, _ = proc.communicate()
+            text = py_str(out)
+            if verbose:
+                print(f"[{tool_name}] output:\n{text}")
+            if proc.returncode == 0 and text.strip():
+                return text
+            last_err = f"{tool_name} rc={proc.returncode}, output:\n{text}"
+        # If we reach here, all attempts failed
+        raise RuntimeError(f"SASS disassembly failed. Tried tools: "
+                           f"{', '.join(name for name, _ in tools_to_try)}\n{last_err or ''}")
+    finally:
+        with contextlib.suppress(Exception):
+            os.remove(cubin_path)
+
+
 def find_cuda_path():
     """Utility function to find cuda path
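
For reference, a sketch of how the new helpers might be called once this commit is installed. The import path is assumed from the file location tilelang/contrib/nvcc.py, and the sample kernel and -gencode flag are placeholders; a working CUDA toolkit is required.

from tilelang.contrib import nvcc

kernel_src = r"""
extern "C" __global__ void add_one(float *x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += 1.0f;
}
"""

# Adjust the architecture flag for your GPU.
flags = ["-gencode arch=compute_80,code=sm_80"]

ptx = nvcc.get_ptx_from_source(kernel_src, compile_flags=flags)
sass = nvcc.get_sass_from_source(kernel_src, compile_flags=flags)

print(ptx.splitlines()[0])   # first line of the PTX output
print(sass.splitlines()[0])  # first line of the SASS disassembly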
