Fix injection of GPU buffers that do not go by a Func name (i.e. alloc groups). #8333

Merged: 2 commits, Jul 16, 2024
src/Profiling.cpp: 8 additions & 5 deletions
@@ -216,8 +216,8 @@ class InjectProfiling : public IRMutator {
                                  const Expr &condition,
                                  const Type &type,
                                  const std::string &name,
-                                 bool &on_stack) {
-        on_stack = true;
+                                 bool &can_fit_on_stack) {
+        can_fit_on_stack = true;
 
         Expr cond = simplify(condition);
         if (is_const_zero(cond)) {  // Condition always false
@@ -236,7 +236,7 @@
         // it would have constant size).
         internal_assert(!extents.empty());
 
-        on_stack = false;
+        can_fit_on_stack = false;
         Expr size = cast<uint64_t>(extents[0]);
         for (size_t i = 1; i < extents.size(); i++) {
             size *= extents[i];
@@ -261,9 +261,12 @@
         auto [new_extents, changed] = mutate_with_changes(op->extents);
         Expr condition = mutate(op->condition);
 
-        bool on_stack;
-        Expr size = compute_allocation_size(new_extents, condition, op->type, op->name, on_stack);
+        bool can_fit_on_stack;
+        Expr size = compute_allocation_size(new_extents, condition, op->type, op->name, can_fit_on_stack);
         internal_assert(size.type() == UInt(64));
+
+        bool on_stack = can_fit_on_stack && !op->new_expr.defined();
 
         func_alloc_sizes.push(op->name, {on_stack, size});
 
         // compute_allocation_size() might return a zero size, if the allocation is
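For context, the change above only affects allocations whose storage is provided by a custom new_expr, such as the combined buffer of an allocation group, which does not carry a Func name. The sketch below is not part of this PR (the Func and Var names f, g, and out are made up); it is a condensed version of the new test added further down, showing the user-visible pattern that produces such a buffer: two heap-stored Funcs computed inside the same GPU tile, realized with the profiler enabled.

#include "Halide.h"
#include <cstdio>

using namespace Halide;

int main() {
    Target t = get_jit_target_from_environment();
    if (!t.has_gpu_feature()) {
        printf("[SKIP] GPU not enabled\n");
        return 0;
    }

    Var x{"x"}, y{"y"}, xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
    Func f{"f"}, g{"g"}, out{"out"};
    f(x, y) = cast<float>(x + y);
    g(x, y) = f(x, y) * 2;
    out(x, y) = g(x, y);

    // Two heap-stored Funcs computed at the same GPU block loop; their
    // storage ends up in one combined buffer whose name is not a Func name.
    out.compute_root().gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
    f.compute_at(out, xo).gpu_threads(x, y).store_in(MemoryType::Heap);
    g.compute_at(out, xo).gpu_threads(x, y).store_in(MemoryType::Heap);

    // Enabling the profiler is what used to trigger the bad Func lookup.
    out.realize({64, 64}, t.with_feature(Target::Profile));

    printf("Success!\n");
    return 0;
}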
test/correctness/CMakeLists.txt: 1 addition & 0 deletions
@@ -126,6 +126,7 @@ tests(GROUPS correctness
     gameoflife.cpp
     gather.cpp
     gpu_allocation_cache.cpp
+    gpu_alloc_group_profiling.cpp
     gpu_arg_types.cpp
     gpu_assertion_in_kernel.cpp
     gpu_bounds_inference_failure.cpp
test/correctness/gpu_alloc_group_profiling.cpp: 52 additions & 0 deletions (new file)
@@ -0,0 +1,52 @@
#include "Halide.h"

using namespace Halide;

int main(int argc, char *argv[]) {

Target t = get_jit_target_from_environment();
if (!t.has_gpu_feature()) {
printf("[SKIP] GPU not enabled\n");
return 0;
}

// There was a bug that causes the inject profiling logic to try to
// lookup a Func from the environment, by the buffer name of an allocation group.
// Of course there is no Func for that name.
// This happens when the buffer originally was intended for GPUShared, but got somehow
// lifted to Heap (which I ran into before, without doing it explicitly like this below).
// --mcourteaux

Var x{"x"}, y{"y"};

Func f1{"f1"}, f2{"f2"};
f1(x, y) = cast<float>(x + y);
f2(x, y) = f1(x, y) * 2;

Func result{"result"};
result(x, y) = f2(x, y);

Var xo{"xo"}, yo{"yo"}, xi{"xi"}, yi{"yi"};
result
.compute_root()
.gpu_tile(x, y, xo, yo, xi, yi, 16, 16)
.reorder(xi, yi, xo, yo);

f2.compute_at(result, xo)
.gpu_threads(x, y)
.store_in(MemoryType::Heap);

f1.compute_at(result, xo)
.gpu_threads(x, y)
.store_in(MemoryType::Heap);

result.print_loop_nest();

t.set_feature(Target::Profile); // Make sure profiling is enabled!
result.compile_jit(t);
result.realize({64, 64}, t);
// result.compile_to_conceptual_stmt("gpu_alloc_group_profiling.stmt.html", {}, Halide::HTML, t);

printf("Success!\n");
return 0;
}