Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make gpu thread and block for loop names opaque #8133

Merged
merged 1 commit into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 23 additions & 20 deletions src/CanonicalizeGPUVars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,26 @@ namespace Halide {
namespace Internal {

using std::map;
using std::string;
using std::vector;

namespace {
string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};

string get_thread_name(int index) {
const std::string &gpu_thread_name(int index) {
static std::string gpu_thread_names[3] = {"." + unique_name("thread_id_x"),
"." + unique_name("thread_id_y"),
"." + unique_name("thread_id_z")};
internal_assert(index >= 0 && index < 3);
return thread_names[index];
return gpu_thread_names[index];
}

string get_block_name(int index) {
const std::string &gpu_block_name(int index) {
static std::string gpu_block_names[3] = {"." + unique_name("block_id_x"),
"." + unique_name("block_id_y"),
"." + unique_name("block_id_z")};
internal_assert(index >= 0 && index < 3);
return block_names[index];
return gpu_block_names[index];
}

namespace {

class CountGPUBlocksThreads : public IRVisitor {
using IRVisitor::visit;

Expand Down Expand Up @@ -73,12 +76,12 @@ class CountGPUBlocksThreads : public IRVisitor {
};

class CanonicalizeGPUVars : public IRMutator {
map<string, string> gpu_vars;
map<std::string, std::string> gpu_vars;

using IRMutator::visit;

string find_replacement(const string &suffix, const string &name) {
vector<string> v = split_string(name, suffix);
std::string find_replacement(const std::string &suffix, const std::string &name) {
vector<std::string> v = split_string(name, suffix);
internal_assert(v.size() == 2);
const auto &iter = gpu_vars.find(v[0]);
if (iter != gpu_vars.end()) {
Expand All @@ -87,7 +90,7 @@ class CanonicalizeGPUVars : public IRMutator {
return name;
}

string canonicalize_let(const string &name) {
std::string canonicalize_let(const std::string &name) {
if (ends_with(name, ".loop_max")) {
return find_replacement(".loop_max", name);
} else if (ends_with(name, ".loop_min")) {
Expand All @@ -100,7 +103,7 @@ class CanonicalizeGPUVars : public IRMutator {
}

Stmt visit(const For *op) override {
string name = op->name;
std::string name = op->name;
Expr min = mutate(op->min);
Expr extent = mutate(op->extent);
Stmt body = mutate(op->body);
Expand All @@ -113,13 +116,13 @@ class CanonicalizeGPUVars : public IRMutator {
op->body.accept(&counter);

if (op->for_type == ForType::GPUBlock) {
name += "." + get_block_name(counter.nblocks);
name += gpu_block_name(counter.nblocks);
debug(5) << "Replacing " << op->name << " with GPU block name " << name << "\n";
} else if (op->for_type == ForType::GPUThread) {
name += "." + get_thread_name(counter.nthreads);
name += gpu_thread_name(counter.nthreads);
debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
} else if (op->for_type == ForType::GPULane) {
name += "." + get_thread_name(0);
name += gpu_thread_name(0);
}

if (name != op->name) {
Expand All @@ -143,7 +146,7 @@ class CanonicalizeGPUVars : public IRMutator {
}

Stmt visit(const LetStmt *op) override {
vector<std::pair<string, Expr>> lets;
vector<std::pair<std::string, Expr>> lets;
Stmt result;

do {
Expand All @@ -154,7 +157,7 @@ class CanonicalizeGPUVars : public IRMutator {
result = mutate(result);

for (auto it = lets.rbegin(); it != lets.rend(); it++) {
string name = canonicalize_let(it->first);
std::string name = canonicalize_let(it->first);
if (name != it->first) {
Expr new_var = Variable::make(Int(32), name);
result = substitute(it->first, new_var, result);
Expand All @@ -168,7 +171,7 @@ class CanonicalizeGPUVars : public IRMutator {
Stmt visit(const IfThenElse *op) override {
Expr condition = mutate(op->condition);

map<string, string> old_gpu_vars;
map<std::string, std::string> old_gpu_vars;
old_gpu_vars.swap(gpu_vars);
Stmt then_case = mutate(op->then_case);

Expand Down
7 changes: 7 additions & 0 deletions src/CanonicalizeGPUVars.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ namespace Internal {
* by the nesting order: innermost is assigned to x and so on. */
Stmt canonicalize_gpu_vars(Stmt s);

/** Names for the thread and block id variables. Includes the leading
 * dot. Indexed from inside out, so 0 gives you the innermost loop.
 *
 * \param index Which loop level to name; must satisfy 0 <= index < 3
 * (the definitions check this with internal_assert).
 * \return A reference to a string held in a function-local static
 * array. The text is built as "." + unique_name(...), so callers
 * should not hard-code it; match loop or variable names with e.g.
 * ends_with(name, gpu_thread_name(i)) instead. */
// @{
const std::string &gpu_thread_name(int index);
const std::string &gpu_block_name(int index);
// @}

} // namespace Internal
} // namespace Halide

Expand Down
36 changes: 12 additions & 24 deletions src/CodeGen_D3D12Compute_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <sstream>
#include <utility>

#include "CanonicalizeGPUVars.h"
#include "CodeGen_D3D12Compute_Dev.h"
#include "CodeGen_GPU_Dev.h"
#include "CodeGen_Internal.h"
Expand Down Expand Up @@ -221,22 +222,18 @@ string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_reinterpret(Type

namespace {
string simt_intrinsic(const string &name) {
if (ends_with(name, ".__thread_id_x")) {
if (ends_with(name, gpu_thread_name(0))) {
return "tid_in_tgroup.x";
} else if (ends_with(name, ".__thread_id_y")) {
} else if (ends_with(name, gpu_thread_name(1))) {
return "tid_in_tgroup.y";
} else if (ends_with(name, ".__thread_id_z")) {
} else if (ends_with(name, gpu_thread_name(2))) {
return "tid_in_tgroup.z";
} else if (ends_with(name, ".__thread_id_w")) {
user_error << "HLSL (SM5.1) does not support more than three dimensions for compute kernel threads.\n";
} else if (ends_with(name, ".__block_id_x")) {
} else if (ends_with(name, gpu_block_name(0))) {
return "tgroup_index.x";
} else if (ends_with(name, ".__block_id_y")) {
} else if (ends_with(name, gpu_block_name(1))) {
return "tgroup_index.y";
} else if (ends_with(name, ".__block_id_z")) {
} else if (ends_with(name, gpu_block_name(2))) {
return "tgroup_index.z";
} else if (ends_with(name, ".__block_id_w")) {
user_error << "HLSL (SM5.1) does not support more than three dimensions for compute dispatch groups.\n";
}
internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
return "";
Expand Down Expand Up @@ -300,15 +297,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) {
user_assert(loop->for_type != ForType::GPULane)
<< "The D3D12Compute backend does not support the gpu_lanes() scheduling directive.";

if (!is_gpu_var(loop->name)) {
user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n";
if (!is_gpu(loop->for_type)) {
CodeGen_GPU_C::visit(loop);
return;
}

internal_assert((loop->for_type == ForType::GPUBlock) ||
(loop->for_type == ForType::GPUThread))
<< "kernel loop must be either gpu block or gpu thread\n";
internal_assert(is_const_zero(loop->min));

stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
Expand Down Expand Up @@ -1153,7 +1145,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
struct FindThreadGroupSize : public IRVisitor {
using IRVisitor::visit;
void visit(const For *loop) override {
if (!is_gpu_var(loop->name)) {
if (!is_gpu(loop->for_type)) {
return loop->body.accept(this);
}
if (loop->for_type != ForType::GPUThread) {
Expand All @@ -1175,13 +1167,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
loop->body.accept(this);
}
int thread_loop_workgroup_index(const string &name) {
string ids[] = {".__thread_id_x",
".__thread_id_y",
".__thread_id_z",
".__thread_id_w"};
for (auto &id : ids) {
if (ends_with(name, id)) {
return (&id - ids);
for (int i = 0; i < 3; i++) {
if (ends_with(name, gpu_thread_name(i))) {
return i;
}
}
return -1;
Expand Down
74 changes: 29 additions & 45 deletions src/CodeGen_GPU_Dev.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "CodeGen_GPU_Dev.h"
#include "CanonicalizeGPUVars.h"
#include "Deinterleave.h"
#include "ExprUsesVar.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRVisitor.h"
Expand All @@ -9,50 +11,6 @@ namespace Internal {

CodeGen_GPU_Dev::~CodeGen_GPU_Dev() = default;

bool CodeGen_GPU_Dev::is_gpu_var(const std::string &name) {
return is_gpu_block_var(name) || is_gpu_thread_var(name);
}

bool CodeGen_GPU_Dev::is_gpu_block_var(const std::string &name) {
return (ends_with(name, ".__block_id_x") ||
ends_with(name, ".__block_id_y") ||
ends_with(name, ".__block_id_z") ||
ends_with(name, ".__block_id_w"));
}

bool CodeGen_GPU_Dev::is_gpu_thread_var(const std::string &name) {
return (ends_with(name, ".__thread_id_x") ||
ends_with(name, ".__thread_id_y") ||
ends_with(name, ".__thread_id_z") ||
ends_with(name, ".__thread_id_w"));
}

namespace {
// Check to see if an expression is uniform within a block.
// This is done by checking to see if the expression depends on any GPU
// thread indices.
class IsBlockUniform : public IRVisitor {
using IRVisitor::visit;

void visit(const Variable *op) override {
if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) {
result = false;
}
}

public:
bool result = true;

IsBlockUniform() = default;
};
} // namespace

bool CodeGen_GPU_Dev::is_block_uniform(const Expr &expr) {
IsBlockUniform v;
expr.accept(&v);
return v.result;
}

namespace {
// Check to see if a buffer is a candidate for constant memory storage.
// A buffer is a candidate for constant memory if it is never written to,
Expand All @@ -71,14 +29,40 @@ class IsBufferConstant : public IRVisitor {

void visit(const Load *op) override {
if (op->name == buffer &&
!CodeGen_GPU_Dev::is_block_uniform(op->index)) {
expr_uses_vars(op->index, depends_on_thread_var)) {
result = false;
}
if (result) {
IRVisitor::visit(op);
}
}

// Visit a LetStmt: scan the bound value first, then scan the body with the
// let's name tracked in depends_on_thread_var whenever the value uses a name
// that is already tracked there.
// NOTE(review): presumably ScopedBinding only pushes op->name when its first
// argument is true, and pops it again when bind_if is destroyed at the end of
// this function -- confirm against the ScopedBinding definition.
void visit(const LetStmt *op) override {
op->value.accept(this);
// bind_if must stay alive while the body is visited below.
ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var),
depends_on_thread_var,
op->name);
op->body.accept(this);
}

// Visit a Let expression: scan the bound value first, then scan the body with
// the let's name tracked in depends_on_thread_var whenever the value uses a
// name that is already tracked there.
// NOTE(review): presumably ScopedBinding only pushes op->name when its first
// argument is true, and pops it again when bind_if is destroyed at the end of
// this function -- confirm against the ScopedBinding definition.
void visit(const Let *op) override {
op->value.accept(this);
// bind_if must stay alive while the body is visited below.
ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var),
depends_on_thread_var,
op->name);
op->body.accept(this);
}

// Visit a For loop: when the loop type is GPUThread or GPULane, track the loop
// variable's name in depends_on_thread_var while the loop is traversed; other
// loop types are traversed without adding a binding.
// NOTE(review): presumably ScopedBinding only pushes op->name when its first
// argument is true, and pops it again when bind_if is destroyed at the end of
// this function -- confirm against the ScopedBinding definition.
void visit(const For *op) override {
ScopedBinding<> bind_if(op->for_type == ForType::GPUThread ||
op->for_type == ForType::GPULane,
depends_on_thread_var,
op->name);
// Default traversal of the loop's children via the base-class visitor.
IRVisitor::visit(op);
}

Scope<> depends_on_thread_var;

public:
bool result = true;
const std::string &buffer;
Expand Down
4 changes: 0 additions & 4 deletions src/CodeGen_GPU_Dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@ struct CodeGen_GPU_Dev {
return false;
}

static bool is_gpu_var(const std::string &name);
static bool is_gpu_block_var(const std::string &name);
static bool is_gpu_thread_var(const std::string &name);

/** Checks if expr is block uniform, i.e. does not depend on a thread
* var. */
static bool is_block_uniform(const Expr &expr);
Expand Down
22 changes: 8 additions & 14 deletions src/CodeGen_Metal_Dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <sstream>
#include <utility>

#include "CanonicalizeGPUVars.h"
#include "CodeGen_GPU_Dev.h"
#include "CodeGen_Internal.h"
#include "CodeGen_Metal_Dev.h"
Expand Down Expand Up @@ -187,22 +188,18 @@ string CodeGen_Metal_Dev::CodeGen_Metal_C::print_reinterpret(Type type, const Ex

namespace {
string simt_intrinsic(const string &name) {
if (ends_with(name, ".__thread_id_x")) {
if (ends_with(name, gpu_thread_name(0))) {
return "tid_in_tgroup.x";
} else if (ends_with(name, ".__thread_id_y")) {
} else if (ends_with(name, gpu_thread_name(1))) {
return "tid_in_tgroup.y";
} else if (ends_with(name, ".__thread_id_z")) {
} else if (ends_with(name, gpu_thread_name(2))) {
return "tid_in_tgroup.z";
} else if (ends_with(name, ".__thread_id_w")) {
user_error << "Metal does not support more than three dimensions in a kernel (threads).\n";
} else if (ends_with(name, ".__block_id_x")) {
} else if (ends_with(name, gpu_block_name(0))) {
return "tgroup_index.x";
} else if (ends_with(name, ".__block_id_y")) {
} else if (ends_with(name, gpu_block_name(1))) {
return "tgroup_index.y";
} else if (ends_with(name, ".__block_id_z")) {
} else if (ends_with(name, gpu_block_name(2))) {
return "tgroup_index.z";
} else if (ends_with(name, ".__block_id_w")) {
user_error << "Metal does not support more than three dimensions in a kernel (groups).\n";
}
internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
return "";
Expand Down Expand Up @@ -272,10 +269,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) {
user_assert(loop->for_type != ForType::GPULane)
<< "The Metal backend does not support the gpu_lanes() scheduling directive.";

if (is_gpu_var(loop->name)) {
internal_assert((loop->for_type == ForType::GPUBlock) ||
(loop->for_type == ForType::GPUThread))
<< "kernel loop must be either gpu block or gpu thread\n";
if (is_gpu(loop->for_type)) {
internal_assert(is_const_zero(loop->min));

stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
Expand Down
Loading
Loading