halide · abadams · Mar 5, 2024 · Mar 4, 2024
diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp
@@ -11,23 +11,26 @@ namespace Halide {
 namespace Internal {
 
 using std::map;
-using std::string;
 using std::vector;
 
-namespace {
-string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"};
-string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"};
-
-string get_thread_name(int index) {
+const std::string &gpu_thread_name(int index) {
+    static std::string gpu_thread_names[3] = {"." + unique_name("thread_id_x"),
+                                              "." + unique_name("thread_id_y"),
+                                              "." + unique_name("thread_id_z")};
     internal_assert(index >= 0 && index < 3);
-    return thread_names[index];
+    return gpu_thread_names[index];
 }
 
-string get_block_name(int index) {
+const std::string &gpu_block_name(int index) {
+    static std::string gpu_block_names[3] = {"." + unique_name("block_id_x"),
+                                             "." + unique_name("block_id_y"),
+                                             "." + unique_name("block_id_z")};
     internal_assert(index >= 0 && index < 3);
-    return block_names[index];
+    return gpu_block_names[index];
 }
 
+namespace {
+
 class CountGPUBlocksThreads : public IRVisitor {
     using IRVisitor::visit;
 
@@ -73,12 +76,12 @@ class CountGPUBlocksThreads : public IRVisitor {
 };
 
 class CanonicalizeGPUVars : public IRMutator {
-    map<string, string> gpu_vars;
+    map<std::string, std::string> gpu_vars;
 
     using IRMutator::visit;
 
-    string find_replacement(const string &suffix, const string &name) {
-        vector<string> v = split_string(name, suffix);
+    std::string find_replacement(const std::string &suffix, const std::string &name) {
+        vector<std::string> v = split_string(name, suffix);
         internal_assert(v.size() == 2);
         const auto &iter = gpu_vars.find(v[0]);
         if (iter != gpu_vars.end()) {
@@ -87,7 +90,7 @@ class CanonicalizeGPUVars : public IRMutator {
         return name;
     }
 
-    string canonicalize_let(const string &name) {
+    std::string canonicalize_let(const std::string &name) {
         if (ends_with(name, ".loop_max")) {
             return find_replacement(".loop_max", name);
         } else if (ends_with(name, ".loop_min")) {
@@ -100,7 +103,7 @@ class CanonicalizeGPUVars : public IRMutator {
     }
 
     Stmt visit(const For *op) override {
-        string name = op->name;
+        std::string name = op->name;
         Expr min = mutate(op->min);
         Expr extent = mutate(op->extent);
         Stmt body = mutate(op->body);
@@ -113,13 +116,13 @@ class CanonicalizeGPUVars : public IRMutator {
             op->body.accept(&counter);
 
             if (op->for_type == ForType::GPUBlock) {
-                name += "." + get_block_name(counter.nblocks);
+                name += gpu_block_name(counter.nblocks);
                 debug(5) << "Replacing " << op->name << " with GPU block name " << name << "\n";
             } else if (op->for_type == ForType::GPUThread) {
-                name += "." + get_thread_name(counter.nthreads);
+                name += gpu_thread_name(counter.nthreads);
                 debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n";
             } else if (op->for_type == ForType::GPULane) {
-                name += "." + get_thread_name(0);
+                name += gpu_thread_name(0);
             }
 
             if (name != op->name) {
@@ -143,7 +146,7 @@ class CanonicalizeGPUVars : public IRMutator {
     }
 
     Stmt visit(const LetStmt *op) override {
-        vector<std::pair<string, Expr>> lets;
+        vector<std::pair<std::string, Expr>> lets;
         Stmt result;
 
         do {
@@ -154,7 +157,7 @@ class CanonicalizeGPUVars : public IRMutator {
         result = mutate(result);
 
         for (auto it = lets.rbegin(); it != lets.rend(); it++) {
-            string name = canonicalize_let(it->first);
+            std::string name = canonicalize_let(it->first);
             if (name != it->first) {
                 Expr new_var = Variable::make(Int(32), name);
                 result = substitute(it->first, new_var, result);
@@ -168,7 +171,7 @@ class CanonicalizeGPUVars : public IRMutator {
     Stmt visit(const IfThenElse *op) override {
         Expr condition = mutate(op->condition);
 
-        map<string, string> old_gpu_vars;
+        map<std::string, std::string> old_gpu_vars;
         old_gpu_vars.swap(gpu_vars);
         Stmt then_case = mutate(op->then_case);
 

diff --git a/src/CanonicalizeGPUVars.h b/src/CanonicalizeGPUVars.h
@@ -15,6 +15,13 @@ namespace Internal {
  * by the nesting order: innermost is assigned to x and so on. */
 Stmt canonicalize_gpu_vars(Stmt s);
 
+/** Names for the thread and block id variables, including the leading dot.
+ * Indexed from inside out: 0 is the innermost loop. Index must be 0, 1 or 2. */
+// @{
+const std::string &gpu_thread_name(int index);
+const std::string &gpu_block_name(int index);
+// @}
+
 }  // namespace Internal
 }  // namespace Halide
 

diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp
@@ -3,6 +3,7 @@
 #include <sstream>
 #include <utility>
 
+#include "CanonicalizeGPUVars.h"
 #include "CodeGen_D3D12Compute_Dev.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
@@ -221,22 +222,18 @@ string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_reinterpret(Type
 
 namespace {
 string simt_intrinsic(const string &name) {
-    if (ends_with(name, ".__thread_id_x")) {
+    if (ends_with(name, gpu_thread_name(0))) {
         return "tid_in_tgroup.x";
-    } else if (ends_with(name, ".__thread_id_y")) {
+    } else if (ends_with(name, gpu_thread_name(1))) {
         return "tid_in_tgroup.y";
-    } else if (ends_with(name, ".__thread_id_z")) {
+    } else if (ends_with(name, gpu_thread_name(2))) {
         return "tid_in_tgroup.z";
-    } else if (ends_with(name, ".__thread_id_w")) {
-        user_error << "HLSL (SM5.1) does not support more than three dimensions for compute kernel threads.\n";
-    } else if (ends_with(name, ".__block_id_x")) {
+    } else if (ends_with(name, gpu_block_name(0))) {
         return "tgroup_index.x";
-    } else if (ends_with(name, ".__block_id_y")) {
+    } else if (ends_with(name, gpu_block_name(1))) {
         return "tgroup_index.y";
-    } else if (ends_with(name, ".__block_id_z")) {
+    } else if (ends_with(name, gpu_block_name(2))) {
         return "tgroup_index.z";
-    } else if (ends_with(name, ".__block_id_w")) {
-        user_error << "HLSL (SM5.1) does not support more than three dimensions for compute dispatch groups.\n";
     }
     internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
     return "";
@@ -300,15 +297,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) {
     user_assert(loop->for_type != ForType::GPULane)
         << "The D3D12Compute backend does not support the gpu_lanes() scheduling directive.";
 
-    if (!is_gpu_var(loop->name)) {
-        user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n";
+    if (!is_gpu(loop->for_type)) {
         CodeGen_GPU_C::visit(loop);
         return;
     }
-
-    internal_assert((loop->for_type == ForType::GPUBlock) ||
-                    (loop->for_type == ForType::GPUThread))
-        << "kernel loop must be either gpu block or gpu thread\n";
     internal_assert(is_const_zero(loop->min));
 
     stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)
@@ -1153,7 +1145,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
     struct FindThreadGroupSize : public IRVisitor {
         using IRVisitor::visit;
         void visit(const For *loop) override {
-            if (!is_gpu_var(loop->name)) {
+            if (!is_gpu(loop->for_type)) {
                 return loop->body.accept(this);
             }
             if (loop->for_type != ForType::GPUThread) {
@@ -1175,13 +1167,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s,
             loop->body.accept(this);
         }
         int thread_loop_workgroup_index(const string &name) {
-            string ids[] = {".__thread_id_x",
-                            ".__thread_id_y",
-                            ".__thread_id_z",
-                            ".__thread_id_w"};
-            for (auto &id : ids) {
-                if (ends_with(name, id)) {
-                    return (&id - ids);
+            for (int i = 0; i < 3; i++) {
+                if (ends_with(name, gpu_thread_name(i))) {
+                    return i;
                 }
             }
             return -1;

diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp
@@ -1,5 +1,7 @@
 #include "CodeGen_GPU_Dev.h"
+#include "CanonicalizeGPUVars.h"
 #include "Deinterleave.h"
+#include "ExprUsesVar.h"
 #include "IRMutator.h"
 #include "IROperator.h"
 #include "IRVisitor.h"
@@ -9,50 +11,6 @@ namespace Internal {
 
 CodeGen_GPU_Dev::~CodeGen_GPU_Dev() = default;
 
-bool CodeGen_GPU_Dev::is_gpu_var(const std::string &name) {
-    return is_gpu_block_var(name) || is_gpu_thread_var(name);
-}
-
-bool CodeGen_GPU_Dev::is_gpu_block_var(const std::string &name) {
-    return (ends_with(name, ".__block_id_x") ||
-            ends_with(name, ".__block_id_y") ||
-            ends_with(name, ".__block_id_z") ||
-            ends_with(name, ".__block_id_w"));
-}
-
-bool CodeGen_GPU_Dev::is_gpu_thread_var(const std::string &name) {
-    return (ends_with(name, ".__thread_id_x") ||
-            ends_with(name, ".__thread_id_y") ||
-            ends_with(name, ".__thread_id_z") ||
-            ends_with(name, ".__thread_id_w"));
-}
-
-namespace {
-// Check to see if an expression is uniform within a block.
-// This is done by checking to see if the expression depends on any GPU
-// thread indices.
-class IsBlockUniform : public IRVisitor {
-    using IRVisitor::visit;
-
-    void visit(const Variable *op) override {
-        if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) {
-            result = false;
-        }
-    }
-
-public:
-    bool result = true;
-
-    IsBlockUniform() = default;
-};
-}  // namespace
-
-bool CodeGen_GPU_Dev::is_block_uniform(const Expr &expr) {
-    IsBlockUniform v;
-    expr.accept(&v);
-    return v.result;
-}
-
 namespace {
 // Check to see if a buffer is a candidate for constant memory storage.
 // A buffer is a candidate for constant memory if it is never written to,
@@ -71,14 +29,48 @@ class IsBufferConstant : public IRVisitor {
 
     void visit(const Load *op) override {
         if (op->name == buffer &&
-            !CodeGen_GPU_Dev::is_block_uniform(op->index)) {
+            expr_uses_vars(op->index, depends_on_thread_var)) {
             result = false;
         }
         if (result) {
             IRVisitor::visit(op);
         }
     }
 
+    // Common handling for Let and LetStmt: visit the value, then visit the
+    // body with the let's name bound in depends_on_thread_var if the value
+    // uses a name that is already in that scope.
+    template<typename T>
+    void visit_let(const T *op) {
+        op->value.accept(this);
+        ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var),
+                                depends_on_thread_var,
+                                op->name);
+        op->body.accept(this);
+    }
+
+    void visit(const LetStmt *op) override {
+        visit_let(op);
+    }
+
+    void visit(const Let *op) override {
+        visit_let(op);
+    }
+
+    // The loop variable of a GPU thread or lane loop is bound in
+    // depends_on_thread_var while the loop is visited.
+    void visit(const For *op) override {
+        ScopedBinding<> bind_if(op->for_type == ForType::GPUThread ||
+                                    op->for_type == ForType::GPULane,
+                                depends_on_thread_var,
+                                op->name);
+        IRVisitor::visit(op);
+    }
+
+    // Names in scope that come from a GPU thread/lane loop, or from a let
+    // whose value uses such a name.
+    Scope<> depends_on_thread_var;
+
 public:
     bool result = true;
     const std::string &buffer;

diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h
@@ -55,10 +55,3 @@ struct CodeGen_GPU_Dev {
         return false;
     }
 
-    static bool is_gpu_var(const std::string &name);
-    static bool is_gpu_block_var(const std::string &name);
-    static bool is_gpu_thread_var(const std::string &name);
-
-    /** Checks if expr is block uniform, i.e. does not depend on a thread
-     * var. */
-    static bool is_block_uniform(const Expr &expr);

diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp
@@ -2,6 +2,7 @@
 #include <sstream>
 #include <utility>
 
+#include "CanonicalizeGPUVars.h"
 #include "CodeGen_GPU_Dev.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_Metal_Dev.h"
@@ -187,22 +188,18 @@ string CodeGen_Metal_Dev::CodeGen_Metal_C::print_reinterpret(Type type, const Ex
 
 namespace {
 string simt_intrinsic(const string &name) {
-    if (ends_with(name, ".__thread_id_x")) {
+    if (ends_with(name, gpu_thread_name(0))) {
         return "tid_in_tgroup.x";
-    } else if (ends_with(name, ".__thread_id_y")) {
+    } else if (ends_with(name, gpu_thread_name(1))) {
         return "tid_in_tgroup.y";
-    } else if (ends_with(name, ".__thread_id_z")) {
+    } else if (ends_with(name, gpu_thread_name(2))) {
         return "tid_in_tgroup.z";
-    } else if (ends_with(name, ".__thread_id_w")) {
-        user_error << "Metal does not support more than three dimensions in a kernel (threads).\n";
-    } else if (ends_with(name, ".__block_id_x")) {
+    } else if (ends_with(name, gpu_block_name(0))) {
         return "tgroup_index.x";
-    } else if (ends_with(name, ".__block_id_y")) {
+    } else if (ends_with(name, gpu_block_name(1))) {
         return "tgroup_index.y";
-    } else if (ends_with(name, ".__block_id_z")) {
+    } else if (ends_with(name, gpu_block_name(2))) {
         return "tgroup_index.z";
-    } else if (ends_with(name, ".__block_id_w")) {
-        user_error << "Metal does not support more than three dimensions in a kernel (groups).\n";
     }
     internal_error << "simt_intrinsic called on bad variable name: " << name << "\n";
     return "";
@@ -272,10 +269,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) {
     user_assert(loop->for_type != ForType::GPULane)
         << "The Metal backend does not support the gpu_lanes() scheduling directive.";
 
-    if (is_gpu_var(loop->name)) {
-        internal_assert((loop->for_type == ForType::GPUBlock) ||
-                        (loop->for_type == ForType::GPUThread))
-            << "kernel loop must be either gpu block or gpu thread\n";
+    if (is_gpu(loop->for_type)) {
         internal_assert(is_const_zero(loop->min));
 
         stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name)