Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Merged: 2 commits, Jul 16, 2024
src/Profiling.cpp: 8 additions & 5 deletions
@@ -216,8 +216,8 @@ class InjectProfiling : public IRMutator {
                                  const Expr &condition,
                                  const Type &type,
                                  const std::string &name,
-                                 bool &on_stack) {
-        on_stack = true;
+                                 bool &can_fit_on_stack) {
+        can_fit_on_stack = true;
 
         Expr cond = simplify(condition);
         if (is_const_zero(cond)) {  // Condition always false
@@ -236,7 +236,7 @@
         // it would have constant size).
         internal_assert(!extents.empty());
 
-        on_stack = false;
+        can_fit_on_stack = false;
         Expr size = cast<uint64_t>(extents[0]);
         for (size_t i = 1; i < extents.size(); i++) {
             size *= extents[i];
@@ -261,9 +261,12 @@
         auto [new_extents, changed] = mutate_with_changes(op->extents);
         Expr condition = mutate(op->condition);
 
-        bool on_stack;
-        Expr size = compute_allocation_size(new_extents, condition, op->type, op->name, on_stack);
+        bool can_fit_on_stack;
+        Expr size = compute_allocation_size(new_extents, condition, op->type, op->name, can_fit_on_stack);
         internal_assert(size.type() == UInt(64));
+
+        bool on_stack = can_fit_on_stack && !op->new_expr.defined();
 
         func_alloc_sizes.push(op->name, {on_stack, size});
 
         // compute_allocation_size() might return a zero size, if the allocation is
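For context, the change above only affects allocations whose storage is provided by a custom new_expr, such as the combined buffer of an allocation group, which does not carry a Func name. The sketch below is not part of this PR (the Func and Var names f, g, and out are made up); it is a condensed version of the new test added further down, showing the user-visible pattern that produces such a buffer: two heap-stored Funcs computed inside the same GPU tile, realized with the profiler enabled.

#include "Halide.h"
#include <cstdio>

using namespace Halide;

int main() {
    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"}, xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    Func f{"f"}, g{"g"}, out{"out"};
    f(x, y) = cast<float>(x + y);
    g(x, y) = f(x, y) * 2;
    out(x, y) = g(x, y);

    // Two heap-stored Funcs computed at the same GPU block loop; their
    // storage ends up in one combined buffer whose name is not a Func name.
    out.compute_root().gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
    f.compute_at(out, xo).gpu_threads(x, y).store_in(MemoryType::Heap);
    g.compute_at(out, xo).gpu_threads(x, y).store_in(MemoryType::Heap);

    // Enabling the profiler is what used to trigger the bad Func lookup.
    out.realize({64, 64}, t.with_feature(Target::Profile));

    printf("Success!\n");
    return 0;
}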
test/correctness/CMakeLists.txt: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ tests(GROUPS correctness
     gameoflife.cpp
     gather.cpp
     gpu_allocation_cache.cpp
+    gpu_alloc_group_profiling.cpp
     gpu_arg_types.cpp
     gpu_assertion_in_kernel.cpp
     gpu_bounds_inference_failure.cpp
test/correctness/gpu_alloc_group_profiling.cpp: 52 additions & 0 deletions (new file)
@@ -0,0 +1,52 @@
#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

Target t = get_jit_target_from_environment();
if (!t.has_gpu_feature()) {
printf("[SKIP] GPU not enabled\n");
return 0;
}

// There was a bug that causes the inject profiling logic to try to
// lookup a Func from the environment, by the buffer name of an allocation group.
// Of course there is no Func for that name.
// This happens when the buffer originally was intended for GPUShared, but got somehow
// lifted to Heap (which I ran into before, without doing it explicitly like this below).
// --mcourteaux

Var x{"x"}, y{"y"};

Func f1{"f1"}, f2{"f2"};
f1(x, y) = cast<float>(x + y);
f2(x, y) = f1(x, y) * 2;

Func result{"result"};
result(x, y) = f2(x, y);

Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
result
.compute_root()
.gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
.reorder(xi, yi, xo, yo);

f2.compute_at(result, xo)
.gpu_threads(x, y)
.store_in(MemoryType::Heap);

f1.compute_at(result, xo)
.gpu_threads(x, y)
.store_in(MemoryType::Heap);

result.print_loop_nest();

t.set_feature(Target::Profile); // Make sure profiling is enabled!
result.compile_jit(t);
result.realize({64, 64}, t);
// result.compile_to_conceptual_stmt("gpu_alloc_group_profiling.stmt.html", {}, Halide::HTML, t);

printf("Success!\n");
return 0;
}