From 31368194a88405741eaf361e7a793cbb1b8f8fd9 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Fri, 1 Dec 2023 01:59:30 +0800 Subject: [PATCH 001/186] [serialization] Add Halide version and serialization version in serialization format (#7905) * halide version * serialization version * format * Fix Makefile * trigger buildbots --------- Co-authored-by: Andrew Adams Co-authored-by: Steven Johnson --- Makefile | 8 ++++++++ src/CMakeLists.txt | 6 ++++++ src/Deserialization.cpp | 19 +++++++++++++++++++ src/Serialization.cpp | 12 +++++++++++- src/halide_ir.fbs | 6 ++++-- 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d1ebace87bda..7364941941a2 100644 --- a/Makefile +++ b/Makefile @@ -247,6 +247,14 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # On ubuntu, this requires packages flatbuffers-compiler and libflatbuffers-dev ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') +# Note: if updating here, be sure to update in CMakeLists.txt as well +HALIDE_SERIALIZATION_VERSION_MAJOR ?= 0 +HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 +HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 +HALIDE_SERIALIZATION_VERSION=$(HALIDE_SERIALIZATION_VERSION_MAJOR).$(HALIDE_SERIALIZATION_VERSION_MINOR).$(HALIDE_SERIALIZATION_VERSION_PATCH) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MAJOR=$(HALIDE_SERIALIZATION_VERSION_MAJOR) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MINOR=$(HALIDE_SERIALIZATION_VERSION_MINOR) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_PATCH=$(HALIDE_SERIALIZATION_VERSION_PATCH) endif # This is required on some hosts like powerpc64le-linux-gnu because we may build diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9ef902c27be2..771944b10d42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -509,6 +509,12 @@ if (WITH_SERIALIZATION) target_include_directories(Halide PRIVATE "$") target_link_libraries(Halide PRIVATE Halide_flatbuffers) target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION) + # Note: if updating here, be sure to update in Makefile as well + target_compile_definitions(Halide PUBLIC + HALIDE_SERIALIZATION_VERSION_MAJOR=0 + HALIDE_SERIALIZATION_VERSION_MINOR=1 + HALIDE_SERIALIZATION_VERSION_PATCH=0 + ) endif () # Enable serialization testing by intercepting JIT compilation with a serialization roundtrip; diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index c0e9f39de7bf..b27918756886 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1403,6 +1403,25 @@ Pipeline Deserializer::deserialize(const std::vector &data) { user_warning << "deserialized pipeline is empty\n"; return Pipeline(); } + + std::string deserialized_halide_version = deserialize_string(pipeline_obj->halide_version()); + std::string halide_version = std::to_string(HALIDE_VERSION_MAJOR) + "." + + std::to_string(HALIDE_VERSION_MINOR) + "." + + std::to_string(HALIDE_VERSION_PATCH); + if (deserialized_halide_version != halide_version) { + user_warning << "deserialized pipeline is built with Halide version " << deserialized_halide_version + << ", but current Halide version is " << halide_version << "\n"; + } + + std::string deserialized_serialization_version = deserialize_string(pipeline_obj->serialization_version()); + std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." 
+ + std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + if (deserialized_serialization_version != serialization_version) { + user_error << "deserialized pipeline is built with Halide serialization version " << deserialized_serialization_version + << ", but current Halide serialization version is " << serialization_version << "\n"; + } + const std::vector func_names_in_order = deserialize_vector(pipeline_obj->func_names_in_order(), &Deserializer::deserialize_string); diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 2928e3b7ebbf..857c963cceab 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1501,6 +1501,14 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul buffers_serialized.push_back(serialize_buffer(builder, buffer.second)); } + std::string halide_version = std::to_string(HALIDE_VERSION_MAJOR) + "." + + std::to_string(HALIDE_VERSION_MINOR) + "." + + std::to_string(HALIDE_VERSION_PATCH); + + std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + auto pipeline_obj = Serialize::CreatePipeline(builder, builder.CreateVector(funcs_serialized), builder.CreateVector(output_names_serialized), @@ -1509,7 +1517,9 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul builder.CreateVector(func_names_in_order_serialized), builder.CreateVector(parameters_serialized), builder.CreateVector(external_parameters_serialized), - builder.CreateVector(buffers_serialized)); + builder.CreateVector(buffers_serialized), + serialize_string(builder, halide_version), + serialize_string(builder, serialization_version)); builder.Finish(pipeline_obj); uint8_t *buf = builder.GetBufferPointer(); diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 479e488b6739..8148aca639a9 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -1,7 +1,7 @@ namespace Halide.Serialize; -// This corresponds to the corresponding Halide version. -file_identifier "HL17"; +// This identifies the serialized data being a Halide pipeline. Should be exactly 4 bytes. +file_identifier "HLDE"; // File extension of any written files. "hlpipe" stands for Halide Pipeline. file_extension "hlpipe"; @@ -710,6 +710,8 @@ table Pipeline { parameters: [Parameter]; external_parameters: [ExternalParameter]; buffers: [Buffer]; + halide_version: string; + serialization_version: string; } root_type Pipeline; From 4fc2a7d860c08d03ee93d47f743f4f6878b5f8a9 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 30 Nov 2023 16:31:48 -0800 Subject: [PATCH 002/186] Handle many more intrinsics in Bounds.cpp (#7823) * Handle many more intrinsics in Bounds.cpp This addresses many (but not all) of the `signed integer overflow` issues we're seeing in Google due to https://github.com/halide/Halide/pull/7814 -- a lot of the issues seems to be in code that uses intrinsics that had no handling in value bounds checking, so the bounds were naively large and overflowed. - Most of the intrinsics from FindIntrinsics.h weren't handled; now they all are (most by lowering to other IR, though the halving_add variants were modeled directly because the bitwise ops don't mesh well) - strict_float() is just a pass-through - round() is a best guess (basically, if bounds exist, expand by one as a worst-case) There are definitely others we should handle here... trunc/floor/ceil probably? * Fix round() and strict_float() handling * Update Bounds.cpp * Fixes? 
* trigger buildbots * Revert saturating_cast handling * Update Bounds.cpp --------- Co-authored-by: Andrew Adams --- src/Bounds.cpp | 149 +++++++++++++++++++++++++++++++++++++++++-- src/FindIntrinsics.h | 1 + 2 files changed, 143 insertions(+), 7 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index d40922cb6db0..0ba1f5440056 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -41,6 +41,37 @@ using std::string; using std::vector; namespace { + +bool can_widen(const Expr &e) { + // We don't want to widen Xtensa 48-bit integers + return e.type().bits() <= 32; +} + +bool can_widen_all(const std::vector &args) { + for (const auto &e : args) { + if (!can_widen(e)) { + return false; + } + } + return true; +} + +Expr widen(Expr a) { + internal_assert(can_widen(a)); + Type result_type = a.type().widen(); + return Cast::make(result_type, std::move(a)); +} + +Expr narrow(Expr a) { + Type result_type = a.type().narrow(); + return Cast::make(result_type, std::move(a)); +} + +Expr saturating_narrow(const Expr &a) { + Type narrow = a.type().narrow(); + return saturating_cast(narrow, a); +} + int static_sign(const Expr &x) { if (is_positive_const(x)) { return 1; @@ -56,6 +87,7 @@ int static_sign(const Expr &x) { } return 0; } + } // anonymous namespace const FuncValueBounds &empty_func_value_bounds() { @@ -1195,6 +1227,15 @@ class Bounds : public IRVisitor { // else fall thru and continue } + const auto handle_expr_bounds = [this, t](const Expr &e) -> void { + if (e.defined()) { + e.accept(this); + } else { + // Just use the bounds of the type + this->bounds_of_type(t); + } + }; + if (op->is_intrinsic(Call::abs)) { Interval a = arg_bounds.get(0); interval.min = make_zero(t); @@ -1468,6 +1509,7 @@ class Bounds : public IRVisitor { } } else if (op->args.size() == 1 && (op->is_intrinsic(Call::round) || + op->is_intrinsic(Call::strict_float) || op->name == "ceil_f32" || op->name == "ceil_f64" || op->name == "floor_f32" || op->name == "floor_f64" || op->name == "exp_f32" || op->name == "exp_f64" || @@ -1518,14 +1560,107 @@ class Bounds : public IRVisitor { } interval = result; } else if (op->is_intrinsic(Call::widen_right_add)) { - Expr add = Add::make(op->args[0], cast(op->args[0].type(), op->args[1])); - add.accept(this); - } else if (op->is_intrinsic(Call::widen_right_sub)) { - Expr sub = Sub::make(op->args[0], cast(op->args[0].type(), op->args[1])); - sub.accept(this); + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_add(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); } else if (op->is_intrinsic(Call::widen_right_mul)) { - Expr mul = Mul::make(op->args[0], cast(op->args[0].type(), op->args[1])); - mul.accept(this); + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_mul(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widen_right_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_sub(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + lower_widening_add(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_mul)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? 
+ lower_widening_mul(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + lower_widening_sub(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::saturating_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow(clamp(widen(op->args[0]) + widen(op->args[1]), t.min(), t.max())) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::saturating_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow(clamp(widen(op->args[0]) - widen(op->args[1]), t.min(), t.max())) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_shift_left)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[0]) ? + lower_widening_shift_left(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_shift_right)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[0]) ? + lower_widening_shift_right(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_shift_right)) { + internal_assert(op->args.size() == 2); + // TODO: uses bitwise ops we may not handle well + handle_expr_bounds(lower_rounding_shift_right(op->args[0], op->args[1])); + } else if (op->is_intrinsic(Call::rounding_shift_left)) { + internal_assert(op->args.size() == 2); + // TODO: uses bitwise ops we may not handle well + handle_expr_bounds(lower_rounding_shift_left(op->args[0], op->args[1])); + } else if (op->is_intrinsic(Call::halving_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) + widen(op->args[1])) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::halving_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) - widen(op->args[1])) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_halving_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) + widen(op->args[1]) + 1) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_mul_shift_right)) { + internal_assert(op->args.size() == 3); + Expr e = can_widen_all(op->args) ? + saturating_narrow(rounding_shift_right(widening_mul(op->args[0], op->args[1]), op->args[2])) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::mul_shift_right)) { + internal_assert(op->args.size() == 3); + Expr e = can_widen_all(op->args) ? 
+ saturating_narrow(widening_mul(op->args[0], op->args[1]) >> op->args[2]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::sorted_avg)) { + internal_assert(op->args.size() == 2); + Expr e = lower_sorted_avg(op->args[0], op->args[1]); + handle_expr_bounds(e); } else if (op->call_type == Call::Halide) { bounds_of_func(op->name, op->value_index, op->type); } else { diff --git a/src/FindIntrinsics.h b/src/FindIntrinsics.h index f8ddaf171bc3..fc4c2a8e90f5 100644 --- a/src/FindIntrinsics.h +++ b/src/FindIntrinsics.h @@ -30,6 +30,7 @@ Expr lower_saturating_cast(const Type &t, const Expr &a); Expr lower_halving_add(const Expr &a, const Expr &b); Expr lower_halving_sub(const Expr &a, const Expr &b); Expr lower_rounding_halving_add(const Expr &a, const Expr &b); +Expr lower_sorted_avg(const Expr &a, const Expr &b); Expr lower_mul_shift_right(const Expr &a, const Expr &b, const Expr &q); Expr lower_rounding_mul_shift_right(const Expr &a, const Expr &b, const Expr &q); From 674e6cc491e2ea755cb85b61a0a6946ff923fbcc Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 1 Dec 2023 13:18:20 -0800 Subject: [PATCH 003/186] Disallow async nestings that violate read after write dependencies (#7868) * Disallow async nestings that violate read after write dependencies Fixes #7867 * Add test * Add another failure case, and improve error message * Add some more tests * Update test * Add new test to cmakelists * Fix for llvm trunk * Always acquire the folding semaphore, even if unused * Skip async_order test under wasm * trigger buildbots --------- Co-authored-by: Volodymyr Kysenko Co-authored-by: Steven Johnson --- src/AsyncProducers.cpp | 51 ++++++++++++++++ src/StorageFolding.cpp | 5 -- test/correctness/CMakeLists.txt | 1 + test/correctness/async_order.cpp | 94 +++++++++++++++++++++++++++++ test/error/CMakeLists.txt | 2 + test/error/bad_async_producer.cpp | 31 ++++++++++ test/error/bad_async_producer_2.cpp | 23 +++++++ test/performance/async_gpu.cpp | 33 +++++++--- 8 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 test/correctness/async_order.cpp create mode 100644 test/error/bad_async_producer.cpp create mode 100644 test/error/bad_async_producer_2.cpp diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index cf10f51c4663..f633409cce65 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -109,15 +109,55 @@ class NoOpCollapsingMutator : public IRMutator { class GenerateProducerBody : public NoOpCollapsingMutator { const string &func; vector sema; + std::set producers_dropped; + bool found_producer = false; using NoOpCollapsingMutator::visit; + void bad_producer_nesting_error(const string &producer, const string &async_consumer) { + user_error + << "The Func " << producer << " is consumed by async Func " << async_consumer + << " and has a compute_at location in between the store_at " + << "location and the compute_at location of " << async_consumer + << ". This is only legal when " << producer + << " is both async and has a store_at location outside the store_at location of the consumer."; + } + // Preserve produce nodes and add synchronization Stmt visit(const ProducerConsumer *op) override { if (op->name == func && op->is_producer) { + found_producer = true; + // Add post-synchronization internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n"; Stmt body = op->body; + + // We don't currently support waiting on producers to the producer + // half of the fork node. 
Or rather, if you want to do that you have + // to schedule those Funcs as async too. Check for any consume nodes + // where the producer has gone to the consumer side of the fork + // node. + class FindBadConsumeNodes : public IRVisitor { + const std::set &producers_dropped; + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (!op->is_producer && producers_dropped.count(op->name)) { + found = op->name; + } + } + + public: + string found; + FindBadConsumeNodes(const std::set &p) + : producers_dropped(p) { + } + } finder(producers_dropped); + body.accept(&finder); + if (!finder.found.empty()) { + bad_producer_nesting_error(finder.found, func); + } + while (!sema.empty()) { Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern); body = Block::make(body, Evaluate::make(release)); @@ -125,7 +165,18 @@ class GenerateProducerBody : public NoOpCollapsingMutator { } return ProducerConsumer::make_produce(op->name, body); } else { + if (op->is_producer) { + producers_dropped.insert(op->name); + } + bool found_producer_before = found_producer; Stmt body = mutate(op->body); + if (!op->is_producer && producers_dropped.count(op->name) && + found_producer && !found_producer_before) { + // We've found a consume node wrapping our async producer where + // the corresponding producer node was dropped from this half of + // the fork. + bad_producer_nesting_error(op->name, func); + } if (is_no_op(body) || op->is_producer) { return body; } else { diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index b4b13104b424..fd7a12d66995 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -825,11 +825,6 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - if (provided.used.defined()) { - to_acquire = select(provided.used, to_acquire, 0); - } - // We should always release the required region, even if we don't use it. - // On the first iteration, we need to acquire the extent of the region shared // between the producer and consumer, and we need to release it on the last // iteration. 
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 8fc403b298bb..9b72d5ceecb3 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -8,6 +8,7 @@ tests(GROUPS correctness align_bounds.cpp argmax.cpp async_device_copy.cpp + async_order.cpp autodiff.cpp bad_likely.cpp bit_counting.cpp diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp new file mode 100644 index 000000000000..f712d7e19c43 --- /dev/null +++ b/test/correctness/async_order.cpp @@ -0,0 +1,94 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y); + producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_root(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.store_root().compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + printf("Success!\n"); + return 0; +} diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 440851b521cb..337bc667739e 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -9,6 +9,8 @@ tests(GROUPS error auto_schedule_no_parallel.cpp auto_schedule_no_reorder.cpp autodiff_unbounded.cpp + bad_async_producer.cpp + bad_async_producer_2.cpp bad_bound.cpp bad_bound_storage.cpp bad_compute_at.cpp diff --git a/test/error/bad_async_producer.cpp b/test/error/bad_async_producer.cpp new file mode 100644 index 000000000000..9e78e268958c --- /dev/null +++ b/test/error/bad_async_producer.cpp @@ -0,0 +1,31 @@ + +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f{"f"}, g{"g"}, h{"h"}; + Var x; + + f(x) = cast(x + 7); + g(x) = f(x); + h(x) = g(x); + + // The schedule below is an error. 
It should really be: + // f.store_root().compute_at(g, Var::outermost()); + // So that it's nested inside the consumer h. + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x).async(); + + Buffer buf = h.realize({32}); + for (int i = 0; i < buf.dim(0).extent(); i++) { + uint8_t correct = i + 7; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", i, buf(i), correct); + return 1; + } + } + + return 0; +} diff --git a/test/error/bad_async_producer_2.cpp b/test/error/bad_async_producer_2.cpp new file mode 100644 index 000000000000..d9929c56b3c1 --- /dev/null +++ b/test/error/bad_async_producer_2.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" + +using namespace Halide; + +// From https://github.com/halide/Halide/issues/5201 +int main(int argc, char **argv) { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer2(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + return 0; +} diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 9d78efe4022e..55263e39546f 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) { if (c <= 0) { return x; } else { - return expensive(fast_pow(x, x + 1), c - 1); + return expensive(x * (x + 1), c - 1); } } @@ -31,11 +31,12 @@ int main(int argc, char **argv) { } double times[2]; + uint32_t correct = 0; for (int use_async = 0; use_async < 2; use_async++) { Var x, y, t, xi, yi; - ImageParam in(Float(32), 3); - Func cpu, gpu; + ImageParam in(UInt(32), 3); + Func cpu("cpu"), gpu("gpu"); // We have a two-stage pipeline that processes frames. We want // to run the first stage on the GPU and the second stage on @@ -50,19 +51,21 @@ int main(int argc, char **argv) { // Assume GPU memory is limited, and compute the GPU stage one // frame at a time. Hoist the allocation to the top level. - gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8); + gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8); // Stage the copy-back of the GPU result into a host-side // double-buffer. gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2); if (use_async) { + // gpu.async(); gpu.in().async(); - gpu.async(); } - in.set(Buffer(800, 800, 16)); - Buffer out(800, 800, 16); + Buffer in_buf(800, 800, 16); + in_buf.fill(17); + in.set(in_buf); + Buffer out(800, 800, 16); cpu.compile_jit(); @@ -70,6 +73,22 @@ int main(int argc, char **argv) { cpu.realize(out); }); + if (!use_async) { + correct = out(0, 0, 0); + } else { + for (int t = 0; t < out.dim(2).extent(); t++) { + for (int y = 0; y < out.dim(1).extent(); y++) { + for (int x = 0; x < out.dim(0).extent(); x++) { + if (out(x, y, t) != correct) { + printf("Async output at (%d, %d, %d) is %u instead of %u\n", + x, y, t, out(x, y, t), correct); + return 1; + } + } + } + } + } + printf("%s: %f\n", use_async ? "with async" : "without async", times[use_async]); From dea2cf7e2228c7f5ce52fa8236c3d15fdb82b89f Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 3 Dec 2023 13:34:02 -0800 Subject: [PATCH 004/186] complete_x86_target() should enable F16C and FMA when AVX2 is present (#7971) All known AVX2-enabled architectures definitely have these features. 
--- src/CodeGen_X86.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index e34dd30870b4..ab099eef123c 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -47,6 +47,9 @@ Target complete_x86_target(Target t) { } if (t.has_feature(Target::AVX2)) { t.set_feature(Target::AVX); + // All AVX2-enabled architectures have F16C and FMA + t.set_feature(Target::F16C); + t.set_feature(Target::FMA); } if (t.has_feature(Target::AVX)) { t.set_feature(Target::SSE41); From 17578a104b0d9530fbb053a4eaa818580b91b2f7 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Dec 2023 10:08:08 -0800 Subject: [PATCH 005/186] Add two new tail strategies for update definitions (#7949) * Add two new tail strategies for update definitions * Stop printing asm * Update expected number of partitions for Partition::Always * Add a comment explaining why the blend safety check is per dimension * Add serialization support for the new tail strategies * trigger buildbots * Add comment --------- Co-authored-by: Steven Johnson --- src/ApplySplit.cpp | 15 ++++ src/ApplySplit.h | 6 +- src/Deserialization.cpp | 4 + src/Func.cpp | 82 ++++++++++++++++++ src/IRPrinter.cpp | 6 ++ src/Schedule.h | 26 ++++++ src/ScheduleFunctions.cpp | 30 ++++--- src/Serialization.cpp | 4 + src/halide_ir.fbs | 2 + test/correctness/nested_tail_strategies.cpp | 30 ++++++- test/error/CMakeLists.txt | 2 + test/error/round_up_and_blend_race.cpp | 23 +++++ test/error/shift_inwards_and_blend_race.cpp | 19 +++++ test/performance/CMakeLists.txt | 1 + test/performance/blend_tail_strategies.cpp | 93 +++++++++++++++++++++ 15 files changed, 326 insertions(+), 17 deletions(-) create mode 100644 test/error/round_up_and_blend_race.cpp create mode 100644 test/error/shift_inwards_and_blend_race.cpp create mode 100644 test/performance/blend_tail_strategies.cpp diff --git a/src/ApplySplit.cpp b/src/ApplySplit.cpp index 7bde69a38e94..48d27b1ffc7a 100644 --- a/src/ApplySplit.cpp +++ b/src/ApplySplit.cpp @@ -107,6 +107,21 @@ vector apply_split(const Split &split, bool is_update, const s // non-trivial loop. base = likely_if_innermost(base); base = Min::make(base, old_max + (1 - split.factor)); + } else if (tail == TailStrategy::ShiftInwardsAndBlend) { + Expr old_base = base; + base = likely(base); + base = Min::make(base, old_max + (1 - split.factor)); + // Make a mask which will be a loop invariant if inner gets + // vectorized, and apply it if we're in the tail. 
+ Expr unwanted_elems = (-old_extent) % split.factor; + Expr mask = inner >= unwanted_elems; + mask = select(base == old_base, likely(const_true()), mask); + result.emplace_back(mask, ApplySplitResult::BlendProvides); + } else if (tail == TailStrategy::RoundUpAndBlend) { + Expr unwanted_elems = (-old_extent) % split.factor; + Expr mask = inner < split.factor - unwanted_elems; + mask = select(outer < outer_max, likely(const_true()), mask); + result.emplace_back(mask, ApplySplitResult::BlendProvides); } else { internal_assert(tail == TailStrategy::RoundUp); } diff --git a/src/ApplySplit.h b/src/ApplySplit.h index 61774733b02b..5e646b22f08b 100644 --- a/src/ApplySplit.h +++ b/src/ApplySplit.h @@ -36,7 +36,8 @@ struct ApplySplitResult { LetStmt, PredicateCalls, PredicateProvides, - Predicate }; + Predicate, + BlendProvides }; Type type; ApplySplitResult(const std::string &n, Expr val, Type t) @@ -67,6 +68,9 @@ struct ApplySplitResult { bool is_predicate_provides() const { return (type == PredicateProvides); } + bool is_blend_provides() const { + return (type == BlendProvides); + } }; /** Given a Split schedule on a definition (init or update), return a list of diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index b27918756886..bea4ca0d9d92 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -350,6 +350,10 @@ TailStrategy Deserializer::deserialize_tail_strategy(Serialize::TailStrategy tai return TailStrategy::PredicateStores; case Serialize::TailStrategy::ShiftInwards: return TailStrategy::ShiftInwards; + case Serialize::TailStrategy::ShiftInwardsAndBlend: + return TailStrategy::ShiftInwardsAndBlend; + case Serialize::TailStrategy::RoundUpAndBlend: + return TailStrategy::RoundUpAndBlend; case Serialize::TailStrategy::Auto: return TailStrategy::Auto; default: diff --git a/src/Func.cpp b/src/Func.cpp index 37b64df5af5b..8f46e7316531 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -375,6 +375,79 @@ bool is_const_assignment(const string &func_name, const vector &args, cons rhs_checker.has_self_reference || rhs_checker.has_rvar); } + +void check_for_race_conditions_in_split_with_blend(const StageSchedule &sched) { + // Splits with a 'blend' tail strategy do a load and then a store of values + // outside of the region to be computed, so for each split using a 'blend' + // tail strategy, verify that there aren't any parallel vars that stem from + // the same original dimension, so that this load and store doesn't race + // with a true computation of that value happening in some other thread. + + // Note that we only need to check vars in the same dimension, because + // allocation bounds inference is done per-dimension and allocates padding + // based on the values actually accessed by the lowered code (i.e. it covers + // the blend region). So for example, an access beyond the end of a scanline + // can't overflow onto the next scanline. Halide will allocate padding, or + // throw a bounds error if it's an input or output. + + if (sched.allow_race_conditions()) { + return; + } + + std::set parallel; + for (const auto &dim : sched.dims()) { + if (is_unordered_parallel(dim.for_type)) { + parallel.insert(dim.var); + } + } + + // Process the splits in reverse order to figure out which root vars have a + // parallel child. 
+ for (auto it = sched.splits().rbegin(); it != sched.splits().rend(); it++) { + if (it->is_fuse()) { + if (parallel.count(it->old_var)) { + parallel.insert(it->inner); + parallel.insert(it->old_var); + } + } else if (it->is_rename() || it->is_purify()) { + if (parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } else { + if (parallel.count(it->inner) || parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } + } + + // Now propagate back to all children of the identified root vars, to assert + // that none of them use a blending tail strategy. + for (auto it = sched.splits().begin(); it != sched.splits().end(); it++) { + if (it->is_fuse()) { + if (parallel.count(it->inner) || parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } else if (it->is_rename() || it->is_purify()) { + if (parallel.count(it->old_var)) { + parallel.insert(it->outer); + } + } else { + if (parallel.count(it->old_var)) { + parallel.insert(it->inner); + parallel.insert(it->old_var); + if (it->tail == TailStrategy::ShiftInwardsAndBlend || + it->tail == TailStrategy::RoundUpAndBlend) { + user_error << "Tail strategy " << it->tail + << " may not be used to split " << it->old_var + << " because other vars stemming from the same original " + << "Var or RVar are marked as parallel." + << "This could cause a race condition.\n"; + } + } + } + } +} + } // namespace void Stage::set_dim_type(const VarOrRVar &var, ForType t) { @@ -439,6 +512,10 @@ void Stage::set_dim_type(const VarOrRVar &var, ForType t) { << " in vars for function\n" << dump_argument_list(); } + + if (is_unordered_parallel(t)) { + check_for_race_conditions_in_split_with_blend(definition.schedule()); + } } void Stage::set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api) { @@ -1171,6 +1248,11 @@ void Stage::split(const string &old, const string &outer, const string &inner, c } } + if (tail == TailStrategy::ShiftInwardsAndBlend || + tail == TailStrategy::RoundUpAndBlend) { + check_for_race_conditions_in_split_with_blend(definition.schedule()); + } + if (!definition.is_init()) { user_assert(tail != TailStrategy::ShiftInwards) << "When splitting Var " << old_name diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index cd89e76417c0..dc07d0e0f010 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -180,6 +180,12 @@ std::ostream &operator<<(std::ostream &out, const TailStrategy &t) { case TailStrategy::RoundUp: out << "RoundUp"; break; + case TailStrategy::ShiftInwardsAndBlend: + out << "ShiftInwardsAndBlend"; + break; + case TailStrategy::RoundUpAndBlend: + out << "RoundUpAndBlend"; + break; } return out; } diff --git a/src/Schedule.h b/src/Schedule.h index 22908a8425e4..32a654228673 100644 --- a/src/Schedule.h +++ b/src/Schedule.h @@ -100,6 +100,32 @@ enum class TailStrategy { * instead of a multiple of the split factor as with RoundUp. */ ShiftInwards, + /** Equivalent to ShiftInwards, but protects values that would be + * re-evaluated by loading the memory location that would be stored to, + * modifying only the elements not contained within the overlap, and then + * storing the blended result. + * + * This tail strategy is useful when you want to use ShiftInwards to + * vectorize without a scalar tail, but are scheduling a stage where that + * isn't legal (e.g. an update definition). + * + * Because this is a read - modify - write, this tail strategy cannot be + * used on any dimension the stage is parallelized over as it would cause a + * race condition. 
+ */ + ShiftInwardsAndBlend, + + /** Equivalent to RoundUp, but protected values that would be written beyond + * the end by loading the memory location that would be stored to, + * modifying only the elements within the region being computed, and then + * storing the blended result. + * + * This tail strategy is useful when vectorizing an update to some sub-region + * of a larger Func. As with ShiftInwardsAndBlend, it can't be combined with + * parallelism. + */ + RoundUpAndBlend, + /** For pure definitions use ShiftInwards. For pure vars in * update definitions use RoundUp. For RVars in update * definitions use GuardWithIf. */ diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 5c0b63edfe9e..9c5ca9095575 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -126,8 +126,8 @@ Stmt substitute_in(const string &name, const Expr &value, bool calls, bool provi class AddPredicates : public IRGraphMutator { const Expr &cond; - bool calls; - bool provides; + const Function &func; + ApplySplitResult::Type type; using IRMutator::visit; @@ -135,7 +135,13 @@ class AddPredicates : public IRGraphMutator { auto [args, changed_args] = mutate_with_changes(p->args); auto [values, changed_values] = mutate_with_changes(p->values); Expr predicate = mutate(p->predicate); - if (provides) { + if (type == ApplySplitResult::BlendProvides) { + int idx = 0; + for (Expr &v : values) { + v = select(cond, v, Call::make(func, args, idx++)); + } + return Provide::make(p->name, values, args, predicate); + } else if (type == ApplySplitResult::PredicateProvides) { return Provide::make(p->name, values, args, predicate && cond); } else if (changed_args || changed_values || !predicate.same_as(p->predicate)) { return Provide::make(p->name, values, args, predicate); @@ -146,20 +152,20 @@ class AddPredicates : public IRGraphMutator { Expr visit(const Call *op) override { Expr result = IRMutator::visit(op); - if (calls && op->call_type == Call::Halide) { + if (type == ApplySplitResult::PredicateCalls && op->call_type == Call::Halide) { result = Call::make(op->type, Call::if_then_else, {cond, result}, Call::PureIntrinsic); } return result; } public: - AddPredicates(const Expr &cond, bool calls, bool provides) - : cond(cond), calls(calls), provides(provides) { + AddPredicates(const Expr &cond, const Function &func, ApplySplitResult::Type type) + : cond(cond), func(func), type(type) { } }; -Stmt add_predicates(const Expr &cond, bool calls, bool provides, const Stmt &s) { - return AddPredicates(cond, calls, provides).mutate(s); +Stmt add_predicates(const Expr &cond, const Function &func, ApplySplitResult::Type type, const Stmt &s) { + return AddPredicates(cond, func, type).mutate(s); } // Build a loop nest about a provide node using a schedule @@ -227,10 +233,10 @@ Stmt build_loop_nest( stmt = substitute_in(res.name, res.value, true, false, stmt); } else if (res.is_substitution_in_provides()) { stmt = substitute_in(res.name, res.value, false, true, stmt); - } else if (res.is_predicate_calls()) { - stmt = add_predicates(res.value, true, false, stmt); - } else if (res.is_predicate_provides()) { - stmt = add_predicates(res.value, false, true, stmt); + } else if (res.is_blend_provides() || + res.is_predicate_calls() || + res.is_predicate_provides()) { + stmt = add_predicates(res.value, func, res.type, stmt); } else if (res.is_let()) { stmt = LetStmt::make(res.name, res.value, stmt); } else { diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 857c963cceab..0224bef35600 100644 --- 
a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -320,6 +320,10 @@ Serialize::TailStrategy Serializer::serialize_tail_strategy(const TailStrategy & return Serialize::TailStrategy::PredicateStores; case TailStrategy::ShiftInwards: return Serialize::TailStrategy::ShiftInwards; + case TailStrategy::ShiftInwardsAndBlend: + return Serialize::TailStrategy::ShiftInwardsAndBlend; + case TailStrategy::RoundUpAndBlend: + return Serialize::TailStrategy::RoundUpAndBlend; case TailStrategy::Auto: return Serialize::TailStrategy::Auto; default: diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 8148aca639a9..e4ac5ae49aed 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -527,6 +527,8 @@ enum TailStrategy: ubyte { PredicateLoads, PredicateStores, ShiftInwards, + ShiftInwardsAndBlend, + RoundUpAndBlend, Auto, } diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp index 2a0ddc7a6bf8..a1f59d30c0bb 100644 --- a/test/correctness/nested_tail_strategies.cpp +++ b/test/correctness/nested_tail_strategies.cpp @@ -19,10 +19,12 @@ void my_free(JITUserContext *user_context, void *ptr) { void check(Func out, int line, std::vector tails) { bool has_round_up = std::find(tails.begin(), tails.end(), TailStrategy::RoundUp) != tails.end() || + std::find(tails.begin(), tails.end(), TailStrategy::RoundUpAndBlend) != tails.end() || std::find(tails.begin(), tails.end(), TailStrategy::PredicateLoads) != tails.end() || std::find(tails.begin(), tails.end(), TailStrategy::PredicateStores) != tails.end(); bool has_shift_inwards = - std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwards) != tails.end(); + std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwards) != tails.end() || + std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwardsAndBlend) != tails.end(); std::vector sizes_to_try; @@ -68,6 +70,12 @@ int main(int argc, char **argv) { return 0; } + // We'll randomly subsample these tests, because otherwise there are too many of them. + std::mt19937 rng(0); + int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); + rng.seed(seed); + std::cout << "Nested tail strategies seed: " << seed << "\n"; + // Test random compositions of tail strategies in simple // producer-consumer pipelines. The bounds being tight sometimes // depends on the simplifier being able to cancel out things. @@ -76,7 +84,8 @@ int main(int argc, char **argv) { TailStrategy::RoundUp, TailStrategy::GuardWithIf, TailStrategy::ShiftInwards, - }; + TailStrategy::RoundUpAndBlend, + TailStrategy::ShiftInwardsAndBlend}; TailStrategy innermost_tails[] = { TailStrategy::RoundUp, @@ -84,7 +93,8 @@ int main(int argc, char **argv) { TailStrategy::PredicateLoads, TailStrategy::PredicateStores, TailStrategy::ShiftInwards, - }; + TailStrategy::RoundUpAndBlend, + TailStrategy::ShiftInwardsAndBlend}; // Two stages. First stage computed at tiles of second. for (auto t1 : innermost_tails) { @@ -110,6 +120,10 @@ int main(int argc, char **argv) { for (auto t1 : innermost_tails) { for (auto t2 : innermost_tails) { for (auto t3 : innermost_tails) { + if ((rng() & 7) != 0) { + continue; + } + Func in("in"), f("f"), g("g"), h("h"); Var x; @@ -134,6 +148,10 @@ int main(int argc, char **argv) { for (auto t1 : tails) { for (auto t2 : innermost_tails) { for (auto t3 : innermost_tails) { + if ((rng() & 7) != 0) { + continue; + } + Func in, f, g, h; Var x; @@ -158,8 +176,12 @@ int main(int argc, char **argv) { // (but can handle smaller outputs). 
for (auto t1 : innermost_tails) { for (auto t2 : tails) { - for (auto t3 : tails) { // Not innermost_tails because of n^4 complexity here. + for (auto t3 : innermost_tails) { for (auto t4 : tails) { + if ((rng() & 63) != 0) { + continue; + } + Func in("in"), f("f"), g("g"), h("h"); Var x; diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 337bc667739e..ef4f5ffea614 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -94,7 +94,9 @@ tests(GROUPS error reuse_var_in_schedule.cpp reused_args.cpp rfactor_inner_dim_non_commutative.cpp + round_up_and_blend_race.cpp run_with_large_stack_throws.cpp + shift_inwards_and_blend_race.cpp specialize_fail.cpp split_inner_wrong_tail_strategy.cpp split_non_innermost_predicated.cpp diff --git a/test/error/round_up_and_blend_race.cpp b/test/error/round_up_and_blend_race.cpp new file mode 100644 index 000000000000..72244c0a6e8b --- /dev/null +++ b/test/error/round_up_and_blend_race.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f; + Var x; + + f(x) = 0; + f(x) += 4; + + // This schedule should be forbidden, because it causes a race condition. + Var xo, xi; + f.update() + .split(x, xo, xi, 8, TailStrategy::RoundUp) + .vectorize(xi, 16, TailStrategy::RoundUpAndBlend) // Access beyond the end of each slice + .parallel(xo); + + printf("Success!\n"); + return 0; +} diff --git a/test/error/shift_inwards_and_blend_race.cpp b/test/error/shift_inwards_and_blend_race.cpp new file mode 100644 index 000000000000..67b4d9a6bcf1 --- /dev/null +++ b/test/error/shift_inwards_and_blend_race.cpp @@ -0,0 +1,19 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f; + Var x; + + f(x) = 0; + f(x) += 4; + + // This schedule should be forbidden, because it causes a race condition. + f.update().vectorize(x, 8, TailStrategy::ShiftInwardsAndBlend).parallel(x); + + printf("Success!\n"); + return 0; +} diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index f47e92d6436b..1fecb06d0195 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -7,6 +7,7 @@ endif() tests(GROUPS performance SOURCES async_gpu.cpp + blend_tail_strategies.cpp block_transpose.cpp boundary_conditions.cpp clamped_vector_load.cpp diff --git a/test/performance/blend_tail_strategies.cpp b/test/performance/blend_tail_strategies.cpp new file mode 100644 index 000000000000..fa6a6f03d8c4 --- /dev/null +++ b/test/performance/blend_tail_strategies.cpp @@ -0,0 +1,93 @@ +#include "Halide.h" +#include "halide_benchmark.h" + +using namespace Halide; +using namespace Halide::Tools; + +int main(int argc, char **argv) { + Var x("x"), y("y"); + + Target t = get_jit_target_from_environment(); + + // Make sure we don't have predicated instructions available + if ((t.arch != Target::X86 && t.arch != Target::ARM) || + t.has_feature(Target::AVX512) || + t.has_feature(Target::SVE)) { + printf("[SKIP] This is a test for architectures without predication. 
" + "Currently we only test x86 before AVX-512 and ARM without SVE\n"); + return 0; + } + + const int N = t.natural_vector_size() * 2; + const int reps = 1024 * 128; + + Buffer output_buf(N - 1, N - 1); + Buffer correct_output; + + std::map times; + for (auto ts : {TailStrategy::GuardWithIf, + TailStrategy::RoundUp, + TailStrategy::ShiftInwardsAndBlend, + TailStrategy::RoundUpAndBlend}) { + Func f, g; + f(x, y) = cast(x + y); + RDom r(0, reps); + f(x, y) = f(x, y) * 3 + cast(0 * r); + g(x, y) = f(x, y); + + f.compute_root() + .update() + .reorder(x, y, r) + .vectorize(x, N / 2, ts); + + if (ts == TailStrategy::ShiftInwardsAndBlend) { + // Hide the stall from a load that overlaps the previous store by + // doing multiple scanlines at once. We expect the tail in y might + // be large, so force partitioning of x even in the loop tail in y. + f.update() + .reorder(y, x) + .unroll(y, 8, TailStrategy::GuardWithIf) + .reorder(x, y) + .partition(x, Partition::Always); + } + + g.compile_jit(); + // Uncomment to see the assembly + // g.compile_to_assembly("/dev/stdout", {}, "f", t); + double t = benchmark([&]() { + g.realize(output_buf); + }); + + // Check correctness + if (ts == TailStrategy::GuardWithIf) { + correct_output = output_buf.copy(); + } else { + for (int y = 0; y < output_buf.height(); y++) { + for (int x = 0; x < output_buf.width(); x++) { + if (output_buf(x, y) != correct_output(x, y)) { + printf("output_buf(%d, %d) = %d instead of %d\n", + x, y, output_buf(x, y), correct_output(x, y)); + } + } + } + } + times[ts] = t; + } + + for (auto p : times) { + std::cout << p.first << " " << p.second << "\n"; + } + + if (times[TailStrategy::GuardWithIf] < times[TailStrategy::ShiftInwardsAndBlend]) { + printf("ShiftInwardsAndBlend is slower than it should be\n"); + return 1; + } + + if (times[TailStrategy::GuardWithIf] < times[TailStrategy::RoundUpAndBlend]) { + printf("RoundUpAndBlend is slower than it should be\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From 209ec02b372e2f6bc0c7155c70ea2ffe94b15c47 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Dec 2023 14:15:23 -0800 Subject: [PATCH 006/186] Add appropriate mattrs for arm-32 extensions (#7978) * Add appropriate mattrs for arm-32 extensions Fixes #7976 * Pull clauses out of if --- src/CodeGen_ARM.cpp | 46 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 826f3723e4bf..03678e5ef605 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1647,19 +1647,34 @@ string CodeGen_ARM::mcpu_tune() const { } string CodeGen_ARM::mattrs() const { + string arch_flags; + string separator; + if (target.has_feature(Target::ARMFp16)) { + arch_flags += separator + "+fullfp16"; + separator = ","; + } + if (target.has_feature(Target::ARMv81a)) { + arch_flags += separator + "+v8.1a"; + separator = ","; + } + if (target.has_feature(Target::ARMDotProd)) { + arch_flags += separator + "+dotprod"; + separator = ","; + } if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { - return "+neon"; + arch_flags += separator + "+neon"; + separator = ","; } if (!target.has_feature(Target::NoNEON)) { - return "+neon"; + arch_flags += separator + "+neon"; + separator = ","; } else { - return "-neon"; + arch_flags += separator + "-neon"; + separator = ","; } } else { // TODO: Should Halide's SVE flags be 64-bit only? 
- string arch_flags; - string separator; if (target.has_feature(Target::SVE2)) { arch_flags = "+sve2"; separator = ","; @@ -1667,28 +1682,11 @@ string CodeGen_ARM::mattrs() const { arch_flags = "+sve"; separator = ","; } - - if (target.has_feature(Target::ARMv81a)) { - arch_flags += separator + "+v8.1a"; - separator = ","; - } - - if (target.has_feature(Target::ARMDotProd)) { - arch_flags += separator + "+dotprod"; - separator = ","; - } - - if (target.has_feature(Target::ARMFp16)) { - arch_flags += separator + "+fullfp16"; - separator = ","; - } - if (target.os == Target::IOS || target.os == Target::OSX) { - return arch_flags + separator + "+reserve-x18"; - } else { - return arch_flags; + arch_flags += separator + "+reserve-x18"; } } + return arch_flags; } bool CodeGen_ARM::use_soft_float_abi() const { From 17b7366ae50ddeea608c0af0fef2260937ace690 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 6 Dec 2023 15:03:14 -0800 Subject: [PATCH 007/186] Move canonical version numbers into source, not build system (#7980) (#7981) * Move canonical version numbers into source, not build system (#7980) * Fixes --- Makefile | 20 -------------------- src/CMakeLists.txt | 10 +++------- src/Deserialization.cpp | 7 ++++--- src/Serialization.cpp | 6 +++--- src/halide_ir.fbs | 10 ++++++++++ src/runtime/HalideRuntime.h | 9 +++++++++ test/runtime/CMakeLists.txt | 1 - 7 files changed, 29 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 7364941941a2..4140da5c8f30 100644 --- a/Makefile +++ b/Makefile @@ -9,12 +9,6 @@ # For correctness and performance tests this include halide build time and run time. For # the tests in test/generator/ this times only the halide build time. -# Halide project version -HALIDE_VERSION_MAJOR ?= 17 -HALIDE_VERSION_MINOR ?= 0 -HALIDE_VERSION_PATCH ?= 0 -HALIDE_VERSION=$(HALIDE_VERSION_MAJOR).$(HALIDE_VERSION_MINOR).$(HALIDE_VERSION_PATCH) - # Disable built-in makefile rules for all apps to avoid pointless file-system # scanning and general weirdness resulting from implicit rules. 
MAKEFLAGS += --no-builtin-rules @@ -146,12 +140,6 @@ WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty HL_TARGET ?= host HL_JIT_TARGET ?= host -HL_VERSION_FLAGS = \ - -DHALIDE_VERSION="$(HALIDE_VERSION)" \ - -DHALIDE_VERSION_MAJOR=$(HALIDE_VERSION_MAJOR) \ - -DHALIDE_VERSION_MINOR=$(HALIDE_VERSION_MINOR) \ - -DHALIDE_VERSION_PATCH=$(HALIDE_VERSION_PATCH) - X86_CXX_FLAGS=$(if $(WITH_X86), -DWITH_X86, ) X86_LLVM_CONFIG_LIB=$(if $(WITH_X86), x86, ) @@ -222,7 +210,6 @@ LLVM_CXX_FLAGS_LIBCPP := $(findstring -stdlib=libc++, $(LLVM_CXX_FLAGS)) endif CXX_FLAGS = $(CXXFLAGS) $(CXX_WARNING_FLAGS) $(RTTI_CXX_FLAGS) -Woverloaded-virtual $(FPIC) $(OPTIMIZE) -fno-omit-frame-pointer -DCOMPILING_HALIDE -CXX_FLAGS += $(HL_VERSION_FLAGS) CXX_FLAGS += $(LLVM_CXX_FLAGS) CXX_FLAGS += $(PTX_CXX_FLAGS) CXX_FLAGS += $(ARM_CXX_FLAGS) @@ -248,13 +235,8 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') # Note: if updating here, be sure to update in CMakeLists.txt as well -HALIDE_SERIALIZATION_VERSION_MAJOR ?= 0 HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 -HALIDE_SERIALIZATION_VERSION=$(HALIDE_SERIALIZATION_VERSION_MAJOR).$(HALIDE_SERIALIZATION_VERSION_MINOR).$(HALIDE_SERIALIZATION_VERSION_PATCH) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MAJOR=$(HALIDE_SERIALIZATION_VERSION_MAJOR) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MINOR=$(HALIDE_SERIALIZATION_VERSION_MINOR) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_PATCH=$(HALIDE_SERIALIZATION_VERSION_PATCH) endif # This is required on some hosts like powerpc64le-linux-gnu because we may build @@ -307,7 +289,6 @@ TEST_LD_FLAGS = -L$(BIN_DIR) -lHalide $(COMMON_LD_FLAGS) # In the tests, some of our expectations change depending on the llvm version TEST_CXX_FLAGS += -DLLVM_VERSION=$(LLVM_VERSION_TIMES_10) -TEST_CXX_FLAGS += $(HL_VERSION_FLAGS) # In the tests, default to exporting no symbols that aren't explicitly exported TEST_CXX_FLAGS += -fvisibility=hidden -fvisibility-inlines-hidden @@ -1118,7 +1099,6 @@ RUNTIME_CXX_FLAGS = \ -Wno-unused-function \ -Wvla \ -Wsign-compare -RUNTIME_CXX_FLAGS += $(HL_VERSION_FLAGS) $(BUILD_DIR)/initmod.windows_%_x86_32.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 771944b10d42..5d15d55f4416 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -509,12 +509,6 @@ if (WITH_SERIALIZATION) target_include_directories(Halide PRIVATE "$") target_link_libraries(Halide PRIVATE Halide_flatbuffers) target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION) - # Note: if updating here, be sure to update in Makefile as well - target_compile_definitions(Halide PUBLIC - HALIDE_SERIALIZATION_VERSION_MAJOR=0 - HALIDE_SERIALIZATION_VERSION_MINOR=1 - HALIDE_SERIALIZATION_VERSION_PATCH=0 - ) endif () # Enable serialization testing by intercepting JIT compilation with a serialization roundtrip; @@ -549,8 +543,10 @@ set_target_properties(Halide PROPERTIES VERSION ${Halide_VERSION} SOVERSION ${Halide_SOVERSION_OVERRIDE}) +# Note that we (deliberately) redeclare these versions here, even though the macros +# with identical versions are expected to be defined in source; this allows us to +# ensure that the versions defined between all build systems are identical. 
target_compile_definitions(Halide PUBLIC - HALIDE_VERSION=${Halide_VERSION} HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index bea4ca0d9d92..90590d6f15af 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1418,9 +1418,10 @@ Pipeline Deserializer::deserialize(const std::vector &data) { } std::string deserialized_serialization_version = deserialize_string(pipeline_obj->serialization_version()); - std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + std::string serialization_version = std::to_string((int)Serialize::SerializationVersionMajor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionMinor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionPatch::Value); + if (deserialized_serialization_version != serialization_version) { user_error << "deserialized pipeline is built with Halide serialization version " << deserialized_serialization_version << ", but current Halide serialization version is " << serialization_version << "\n"; diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 0224bef35600..a9342d95ba6d 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1509,9 +1509,9 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul std::to_string(HALIDE_VERSION_MINOR) + "." + std::to_string(HALIDE_VERSION_PATCH); - std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + std::string serialization_version = std::to_string((int)Serialize::SerializationVersionMajor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionMinor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionPatch::Value); auto pipeline_obj = Serialize::CreatePipeline(builder, builder.CreateVector(funcs_serialized), diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index e4ac5ae49aed..fe52231ffc49 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -6,6 +6,16 @@ file_identifier "HLDE"; // File extension of any written files. "hlpipe" stands for Halide Pipeline. file_extension "hlpipe"; +enum SerializationVersionMajor: int { + Value = 0 +} +enum SerializationVersionMinor: int { + Value = 1 +} +enum SerializationVersionPatch: int { + Value = 0 +} + // from src/IR.cpp union Stmt { LetStmt, diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 81088971418c..445811009abd 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -18,6 +18,15 @@ #include "runtime_internal.h" #endif +// Note that the canonical Halide version is considered to be defined here +// (rather than in the build system); we redundantly define the value in +// our CMake build, so that we ensure that the in-build metadata (eg soversion) +// matches, but keeping the canonical version here makes it easier to keep +// downstream build systems (eg Blaze/Bazel) properly in sync with the source. +#define HALIDE_VERSION_MAJOR 17 +#define HALIDE_VERSION_MINOR 0 +#define HALIDE_VERSION_PATCH 0 + #ifdef __cplusplus // Forward declare type to allow naming typed handles. // See Type.h for documentation. 
diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt index dbbdba540448..44ebf4c39d9d 100644 --- a/test/runtime/CMakeLists.txt +++ b/test/runtime/CMakeLists.txt @@ -8,7 +8,6 @@ function(_set_target_options NAME) target_compile_definitions( ${NAME} PRIVATE - HALIDE_VERSION=${Halide_VERSION} HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH} From 9f6ec17acafa59d1da959dd39ad4383d43bcd1ee Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 6 Dec 2023 16:59:53 -0800 Subject: [PATCH 008/186] Silence useless "Insufficient parallelism" autoscheduler warning (#7990) --- src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index a83bebc637bc..9ac542cdc38f 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -2804,9 +2804,12 @@ void Partitioner::generate_group_cpu_schedule( } } - if (can_prove(def_par < arch_params.parallelism)) { - user_warning << "Insufficient parallelism for " << f_handle.name() << "\n"; - } + // Silenced: the user can't really do anything about it, + // and it triggers on things like tiny lookup tables + // + // if (can_prove(def_par < arch_params.parallelism)) { + // user_warning << "Insufficient parallelism for " << f_handle.name() << "\n"; + // } // Find the level at which group members will be computed. int tile_inner_index = dims.size() - outer_dims.size() - 1; From 6e57d6cb871720b9af3af24fe2cc6eba8f188fc4 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Thu, 7 Dec 2023 08:06:31 -0800 Subject: [PATCH 009/186] Add a notebook with a visualization of the aprrox_* functions and their errors (#7974) * Add a notebook with a visualization of the aprrox_* functions and their errors * Fix spelling error --- apps/hannk/halide/common_halide.h | 2 + .../docs/approx_log2_and_applications.ipynb | 382 ++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 apps/hannk/halide/docs/approx_log2_and_applications.ipynb diff --git a/apps/hannk/halide/common_halide.h b/apps/hannk/halide/common_halide.h index 82a9e22d408f..e499177a9410 100644 --- a/apps/hannk/halide/common_halide.h +++ b/apps/hannk/halide/common_halide.h @@ -39,6 +39,8 @@ Halide::Expr align(const Halide::Expr &x, const Halide::Expr &n); // where N is the number of bits of the narrowed result minus one. Halide::Expr multiply_2x_high(const Halide::Expr &a, const Halide::Expr &b); +// For a visualization of the approx_* functions and their errors, see: +// apps/hannk/halide/docs/approx_log2_and_applications.ipynb // Approximate log2(x/2^q_x)*2^q. // q must be less than 16. 
Halide::Expr approx_log2(int q, const Halide::Expr &x, int q_x, const Halide::Type &type = Halide::Int(32)); diff --git a/apps/hannk/halide/docs/approx_log2_and_applications.ipynb b/apps/hannk/halide/docs/approx_log2_and_applications.ipynb new file mode 100644 index 000000000000..d4771b5219b3 --- /dev/null +++ b/apps/hannk/halide/docs/approx_log2_and_applications.ipynb @@ -0,0 +1,382 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "r1XiiUQGUjpx" + }, + "source": [ + "import numpy as np\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Many architectures have shifts where the right-hand-side is signed. A negative\n", + "# RHS is the same as a positive shift in the other direction.\n", + "def shift_right(x, y):\n", + " return np.floor(x / 2**y)\n", + "def shift_left(x, y):\n", + " return np.floor(x * 2**y)\n", + "def rounding_shift_right(x, y):\n", + " return np.round(x / 2**y)\n", + "def rounding_shift_left(x, y):\n", + " return np.round(x * 2**y)\n", + "\n", + "def bitwise_and(x, y):\n", + " return np.mod(x, y + 1)\n", + "\n", + "# This is sqrdmulh on ARM\n", + "def multiply_2x_high(x, y):\n", + " return rounding_shift_right(x * y, 15)\n", + "\n", + "def relative_error(x, y):\n", + " return (x - y) / (np.maximum(x, y) + 1e-3)\n", + "\n", + "def plot_results(x, exact, approxs, title, logx = False, logy = False, relative = False, log2_xscale = 0, log2_yscale = 0):\n", + " fig, [p1, p2] = plt.subplots(2, 1)\n", + "\n", + " p1.set_xlabel('x')\n", + " if logx:\n", + " p1.set_xscale('log')\n", + " p1.set_ylabel(title)\n", + " if logy:\n", + " p1.set_yscale('log')\n", + "\n", + " xscale = 2**log2_xscale\n", + " yscale = 2**log2_yscale\n", + "\n", + " exact = np.round(exact*yscale)/yscale\n", + "\n", + " p1.plot(x/xscale, exact)\n", + " for approx in approxs:\n", + " p1.plot(x/xscale, approx/yscale)\n", + "\n", + " p2.set_xlabel('x')\n", + " if logx:\n", + " p2.set_xscale('log')\n", + "\n", + " p2.set_ylabel('relative error' if relative else 'error')\n", + " for approx in approxs:\n", + " p2.plot(x/xscale, relative_error(approx/yscale, exact) if relative else approx/yscale - exact)\n", + "\n", + "def eval_poly(x, p, q):\n", + " x1 = rounding_shift_left(x, 15 - q)\n", + " y = p[0]\n", + " xi = x1\n", + " for i in p[1:]:\n", + " y = y + multiply_2x_high(i, xi)\n", + " xi = multiply_2x_high(xi, x1)\n", + " return rounding_shift_right(y, 15 - q)\n", + "\n", + "points = 6\n", + "degree = 3\n", + "log2_poly_x = np.arange(points, 2 * points + 1) / points\n", + "log2_poly_y = np.log2(log2_poly_x)\n", + "log2_poly = np.polyfit(log2_poly_x - 1, log2_poly_y, degree)\n", + "\n", + "exp2_poly_x = np.arange(points, 2 * points + 1) / points\n", + "exp2_poly_y = np.exp2(exp2_poly_x - 1) - 1\n", + "exp2_poly = np.polyfit(exp2_poly_x - 1, exp2_poly_y, degree)\n", + "\n", + "log2_poly = log2_poly[::-1]\n", + "exp2_poly = exp2_poly[::-1]\n", + "\n", + "print(log2_poly)\n", + "print(exp2_poly)\n", + "\n", + "log2_poly = np.round(log2_poly * 2**15)\n", + "exp2_poly = np.round(exp2_poly * 2**15)\n", + "exp2_poly[0] = 0\n", + "\n", + "print(log2_poly)\n", + "print(exp2_poly)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1xjo4hIEo_z5" + }, + "source": [ + "# Approximate N*log2(x*2^q_x), where N = 2^q, and the intermediate computations 
are\n", + "# restricted to be integers.\n", + "def approx_log2(x, q, q_x = 0):\n", + " # This can be computed with count_leading_zeros\n", + " floor_log2_x = np.select([x > 0], [np.floor(np.log2(x))], [-1])\n", + "\n", + " # We've computed log2(x*2^q_x) = log2(x) + q_x. Subtract that offset now\n", + " # before multiplying by the result quantization.\n", + " result = shift_left(floor_log2_x - q_x, q)\n", + "\n", + " frac = bitwise_and(shift_right(x, floor_log2_x - q), 2**q - 1)\n", + "\n", + " return result + eval_poly(frac, log2_poly, q)\n", + "\n", + "x = np.arange(1, 10000)\n", + "q = 15\n", + "q_x = 2\n", + "log2_x = np.log2(x / 2**q_x)\n", + "approx_log2_x = approx_log2(x, q, q_x)\n", + "\n", + "plot_results(x, log2_x, [approx_log2_x], 'log2(x)', logx=True, log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6uJN5muLsLdE" + }, + "source": [ + "\n", + "# Approximate 2^(x/2^q_x)*2^q\n", + "def approx_exp2(x, q_x, q):\n", + " int_part = shift_right(x, q_x)\n", + " frac_part = x - shift_left(int_part, q_x)\n", + "\n", + " frac_part = eval_poly(frac_part, exp2_poly, q_x)\n", + "\n", + " exp_int_part = shift_left(1, int_part + q)\n", + " return exp_int_part + rounding_shift_right(exp_int_part * frac_part, q_x)\n", + "\n", + "q_x = 10\n", + "q = 15\n", + "x = np.arange(-4000, 2000)\n", + "approx_exp2_x = approx_exp2(x, q_x, q)\n", + "exact = np.exp2(x / 2**q_x)\n", + "\n", + "plot_results(x, exact, [approx_exp2_x], '2^x', False, True, relative=True, log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5BP-edzCmNBi" + }, + "source": [ + "q = 15\n", + "x = np.arange(10, 10000) * 10\n", + "round_trip_x = approx_exp2(approx_log2(x, q), q, 0)\n", + "\n", + "plot_results(x, x, [round_trip_x], '2^log2(x)', logx=True, logy=True, relative=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nyrzI90uNH1s" + }, + "source": [ + "# Approximate 2^q*sqrt(2^(x/2^q_x))\n", + "def sqrt_approx_exp2(x, q_x, q):\n", + " return approx_exp2(x, q_x + 1, q)\n", + "\n", + "q = 11\n", + "q_x = 8\n", + "x = np.arange(-1000, 2000)\n", + "approx_exp2_x = sqrt_approx_exp2(x, q_x, q)\n", + "exact = np.sqrt(np.exp2(x / 2**q_x))\n", + "\n", + "plot_results(x, exact, [approx_exp2_x], 'sqrt(2^x)', relative=True, log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Kno5t4VihCTL" + }, + "source": [ + "# Approximate sqrt(x) = 2^((1/2)*log2(x))\n", + "def approx_sqrt(x, q):\n", + " # log2(x) will never be larger than 32, for 32-bit x. 
So to make the result\n", + " # fit in a 16-bit integer, we can make the precision 2^16/32 = 2048.\n", + " q_x = 11;\n", + "\n", + " log2_sqrt_x = approx_log2(x, q_x - 1)\n", + " return approx_exp2(log2_sqrt_x, q_x, q)\n", + "\n", + "q = 15\n", + "x = np.arange(1, 10000)**2\n", + "sqrt_x = np.sqrt(x)\n", + "approx_sqrt_x = approx_sqrt(x, q)\n", + "\n", + "plot_results(x, sqrt_x, [approx_sqrt_x], 'sqrt(x)', log2_yscale=q, relative=True)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0dMecIGr92WY" + }, + "source": [ + "# Approximate 2^31/sqrt(x) = 2^(-(1/2)*log2(x))\n", + "def approx_reciprocal_sqrt(x):\n", + " q = 15\n", + " log2_sqrt_x = approx_log2(x, q - 1)\n", + " return approx_exp2(-log2_sqrt_x, q, 31)\n", + "\n", + "x = np.arange(1, 10000)**2\n", + "inv_sqrt_x = 1 / np.sqrt(x)\n", + "approx_reciprocal_sqrt_x = approx_reciprocal_sqrt(x)\n", + "\n", + "plot_results(x, inv_sqrt_x, [approx_reciprocal_sqrt_x], '1/sqrt(x)', True, True, True, log2_yscale=31)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VFC9aUFcc8d7" + }, + "source": [ + "# Approximate 2^32/x = 2^32*2^(-log2(x))\n", + "def approx_reciprocal(x):\n", + " q = 15;\n", + " log2_x = approx_log2(x, q)\n", + " return approx_exp2(-log2_x, q, 31)\n", + "\n", + "x = 1.01**np.arange(0, 2000)\n", + "inv_x = 1 / x\n", + "approx_inv_x = approx_reciprocal(x)\n", + "# This is ~sqrt(2) times more accurate, but maybe not practical for large x.\n", + "approx_inv_sqrt_x2 = approx_reciprocal_sqrt(x*x)\n", + "\n", + "plot_results(x, inv_x, [approx_inv_x], '1/x', True, True, log2_yscale=31, relative=True)\n", + "plot_results(x, inv_x, [approx_inv_sqrt_x2], '1/x', True, True, log2_yscale=31, relative=True)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6BhQzLIZCcKC" + }, + "source": [ + "# Approximate log2(exp2(x) + c)\n", + "def approx_log2_exp2_plus_constant(x, c, q_x, q):\n", + " # When x/2^q_x is large, approx_exp2 below will overflow. 
But when it is large\n", + " # we don't need it to be very precise\n", + " q_exp = 16 #np.minimum(16, 16 - np.floor(np.log2(np.maximum(x, 1))))\n", + " one = 2**q_exp\n", + "\n", + " one_plus_exp2_x = one * c + approx_exp2(x, q_x, q_exp)\n", + " # Mimic overflow of int32\n", + " one_plus_exp2_x = np.mod(one_plus_exp2_x, 2**31)\n", + "\n", + " raw = approx_log2(one_plus_exp2_x, q, q_exp)\n", + "\n", + " line = rounding_shift_right(x, q_x - q)\n", + "\n", + " threshold = 30 - q_exp\n", + " result = np.select([shift_right(x, q_x) < threshold], [raw], line)\n", + " return result\n", + "\n", + "def approx_log2p1_exp2(x, q_x, q):\n", + " return approx_log2_exp2_plus_constant(x, 1, q_x, q)\n", + "\n", + "def approx_log2m1_exp2(x, q_x, q):\n", + " return approx_log2_exp2_plus_constant(x, -1, q_x, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 11\n", + "q = 15\n", + "\n", + "exact = np.log2(np.exp2(x / 2**q_x) + 1)\n", + "approx = approx_log2p1_exp2(x, q_x, q)\n", + "plot_results(x, exact, [approx], 'log2(2^x + 1)', log2_xscale=q_x, log2_yscale=q)\n", + "\n", + "x = np.arange(1, 4000)*8\n", + "exact = np.log2(np.exp2(x / 2**q_x) - 1)\n", + "approx = approx_log2m1_exp2(x, q_x, q)\n", + "plot_results(x, exact, [approx], 'log2(2^x - 1)', log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "G6n1u8fcUf-3" + }, + "source": [ + "# Approximate logistic(x) = 1/(e^-x + 1)\n", + "# = 2^log2(1/(e^-x + 1))\n", + "# = 2^-log2(e^-x + 1)\n", + "def approx_logistic(x, q_x, q):\n", + " x2 = multiply_2x_high(x, np.round(-np.log2(np.exp(1)) * 2**14))\n", + " q_exp = 11\n", + " log2_d = approx_log2p1_exp2(x2, q_x - 1, q_exp)\n", + " return approx_exp2(-log2_d, q_exp, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 11\n", + "q = 15\n", + "exact = 1 / (1 + np.exp(-x / 2**q_x))\n", + "approx = approx_logistic(x, q_x, q)\n", + "plot_results(x, exact, [approx], '1/(1 + e^-x)', log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LBXXNc_8twQD" + }, + "source": [ + "# Approximate tanh(x) = (e^2x - 1)/(e^2x + 1)\n", + "# = 2^log2((e^2x - 1)/(e^2x + 1))\n", + "# = 2^(log2(e^2x - 1) - log2(e^2x + 1))\n", + "def approx_tanh(x, q_x, q):\n", + " abs_x_base2 = multiply_2x_high(np.abs(x), np.round(np.log2(np.exp(1)) * 2**14))\n", + " q_exp = 11\n", + " log2_n = approx_log2m1_exp2(abs_x_base2, q_x - 2, q_exp)\n", + " log2_d = approx_log2p1_exp2(abs_x_base2, q_x - 2, q_exp)\n", + " # Saturate at int16\n", + " log2_n = np.clip(log2_n, -(2**15), 2**15)\n", + " log2_d = np.clip(log2_d, -(2**15), 2**15)\n", + " return np.sign(x) * approx_exp2(log2_n - log2_d, q_exp, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 12\n", + "q = 15\n", + "exact = np.tanh(x / 2**q_x)\n", + "approx = approx_tanh(x, q_x, q)\n", + "\n", + "points = 20\n", + "poly_x = np.arange(0, points * 3) / points\n", + "poly_y = np.tanh(poly_x)\n", + "poly = np.polyfit(poly_x, poly_y, 6)\n", + "approx2 = np.polyval(poly, x / 2**q_x) * 2**q\n", + "\n", + "\n", + "plot_results(x, exact, [approx], 'tanh(x)', log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From d1ecc1fb65fcf9dafb573c7f781bb1ab6f41d264 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 7 Dec 2023 08:06:57 -0800 Subject: [PATCH 010/186] Make narrowing float->int casts on wasm go via wider ints (#7973) Fixes #7972 --- 
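To make the rewrite concrete, here is a small illustrative sketch (not code from this patch; the narrowing_cast_example, f, g, and x names are invented): a narrowing cast such as int16 of a float32 value is now emitted on wasm as a same-width float-to-int cast followed by an integer narrowing, i.e. i16(f) becomes i16(i32(f)).

#include "Halide.h"
using namespace Halide;

// Illustrative only: on wasm, the cast in g is now lowered as
// cast<int16_t>(cast<int32_t>(f(x))) rather than a direct f32 -> i16 cast.
Func narrowing_cast_example() {
    Var x("x");
    Func f("f"), g("g");
    f(x) = cast<float>(x) * 0.5f;
    g(x) = cast<int16_t>(f(x));  // narrowing float -> int cast
    return g;
}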
src/CodeGen_WebAssembly.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 69d696ce9f8a..948346ad7c2a 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -175,6 +175,17 @@ void CodeGen_WebAssembly::visit(const Cast *op) { } } } + + // Narrowing float -> int casts should go via an integer type of the + // matching width (see https://github.com/halide/Halide/issues/7972) + if (op->value.type().is_float() && + (op->type.is_int() || op->type.is_uint()) && + op->type.bits() < op->value.type().bits()) { + Expr equiv = Cast::make(op->type.with_bits(op->value.type().bits()), op->value); + equiv = Cast::make(op->type, equiv); + codegen(equiv); + return; + } } CodeGen_Posix::visit(op); From 83febb0ad0919e85e5832371907feaa81e342b26 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 7 Dec 2023 09:46:27 -0800 Subject: [PATCH 011/186] Fix handling of assert statements whose conditions get vectorized (#7989) * Fix handling of assert statements whose conditions get vectorized * Fix test name --- src/IRPrinter.cpp | 1 - src/VectorizeLoops.cpp | 2 +- test/correctness/CMakeLists.txt | 1 + test/correctness/vectorized_assert.cpp | 46 ++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 test/correctness/vectorized_assert.cpp diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index dc07d0e0f010..52cb3714268c 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1109,7 +1109,6 @@ void IRPrinter::visit(const VectorReduce *op) { stream << "(" << op->type << ")vector_reduce_" << op->op << "(" - << ", " << op->value << ")"; } diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index df116c841217..7ced1dab0d92 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -816,7 +816,7 @@ class VectorSubs : public IRMutator { } Stmt visit(const AssertStmt *op) override { - return (op->condition.type().lanes() > 1) ? scalarize(op) : op; + return (mutate(op->condition).type().lanes() > 1) ? scalarize(op) : op; } Stmt visit(const IfThenElse *op) override { diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9b72d5ceecb3..da968c419593 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -335,6 +335,7 @@ tests(GROUPS correctness vectorize_mixed_widths.cpp vectorize_nested.cpp vectorize_varying_allocation_size.cpp + vectorized_assert.cpp vectorized_gpu_allocation.cpp vectorized_initialization.cpp vectorized_load_from_vectorized_allocation.cpp diff --git a/test/correctness/vectorized_assert.cpp b/test/correctness/vectorized_assert.cpp new file mode 100644 index 000000000000..3a71174c347b --- /dev/null +++ b/test/correctness/vectorized_assert.cpp @@ -0,0 +1,46 @@ +#include "Halide.h" + +using namespace Halide; + +int error_count = 0; +void my_error(JITUserContext *ucon, const char *msg) { + error_count++; +} + +int main(int argc, char **argv) { + Func f("f"), g("g"); + Var x("x"); + Param p; + + f(x) = x; + f(x) += 1; + g(x) = f(x) + f(2 * x + p); + + g.vectorize(x, 8); + f.bound_storage(x, 32); + // No way to check this at compile time. The size of f depends on both x and + // p. An assert is injected, but the assert is inside g's vectorized loop. 
+ + g.jit_handlers().custom_error = my_error; + + g.compile_jit(); + + // Will trigger the assert + p.set(256); + g.realize({128}); + if (error_count != 1) { + printf("There should have been an error\n"); + return 1; + } + + // Will not trigger the assert + p.set(0); + g.realize({8}); + if (error_count != 1) { + printf("There should not have been an error\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From df36139fee3a1b751b2878403dfd120d9a18fb9c Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 7 Dec 2023 10:02:42 -0800 Subject: [PATCH 012/186] Fix all "unscheduled update()" warnings in our code (#7991) * Fix all "unscheduled update()" warnings in our code And also fix the Mullapudi scheduler to explicitly touch all update stages. This allows us to mark this warning as an error if we so choose. * fixes * fixes * Update recursive_box_filters.cpp --- apps/hist/hist_generator.cpp | 3 +- apps/iir_blur/iir_blur_generator.cpp | 2 ++ .../anderson2021/cost_model_generator.cpp | 2 +- .../mullapudi2016/AutoSchedule.cpp | 33 ++++++++++++++----- test/correctness/recursive_box_filters.cpp | 4 +++ test/error/tuple_output_bounds_check.cpp | 1 + 6 files changed, 35 insertions(+), 10 deletions(-) diff --git a/apps/hist/hist_generator.cpp b/apps/hist/hist_generator.cpp index 32d86d3d0186..3401088e3672 100644 --- a/apps/hist/hist_generator.cpp +++ b/apps/hist/hist_generator.cpp @@ -181,6 +181,7 @@ class Hist : public Halide::Generator { .compute_at(hist_rows.in(), y) .vectorize(x, vec); + hist_rows.update(0).unscheduled(); hist_rows.in() .compute_root() .vectorize(x, vec) @@ -199,7 +200,7 @@ class Hist : public Halide::Generator { .parallel(x) .reorder(ry, x); - cdf.compute_root(); + cdf.compute_root().update().unscheduled(); output.reorder(c, x, y) .bound(c, 0, 3) .unroll(c) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 1aeb3e0d1a5f..cfb967390f8c 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -51,6 +51,8 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule blur.compute_at(transpose, yo); // Vectorize computations within the strips. + blur.update(0) + .unscheduled(); blur.update(1) .reorder(x, ry) .vectorize(x); diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index 6dfeb0dc62b5..e40971c5729a 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -661,7 +661,7 @@ class CostModel : public Generator> { }; // Pipeline features processing - conv1_stage1.compute_root().vectorize(c); + conv1_stage1.compute_root().vectorize(c).update().vectorize(c); squashed_head1_filter.compute_root().vectorize(c); // Schedule features processing. 
The number of schedule diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 9ac542cdc38f..be2ede0748b0 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -837,20 +837,27 @@ struct AutoSchedule { } } - for (const auto &m : f.second) { - const int stage = m.first; - const vector &schedules = m.second; - internal_assert(!schedules.empty()); + const int num_stages = func.updates().size() + 1; + for (int stage = 0; stage < num_stages; stage++) { schedule_ss << " " << fname; if (stage > 0) { - schedule_ss << ".update(" << std::to_string(stage - 1) << ")"; + schedule_ss << ".update(" << (stage - 1) << ")"; } - for (const std::string &s : schedules) { - schedule_ss << "\n ." << s; + auto it = f.second.find(stage); + if (it != f.second.end()) { + const vector &schedules = it->second; + internal_assert(!schedules.empty()); + for (const std::string &s : schedules) { + internal_assert(!s.empty()); + schedule_ss << "\n ." << s; + } + } else { + if (stage > 0) { + schedule_ss << ".unscheduled()"; + } } schedule_ss << ";\n"; } - schedule_ss << "}\n"; } @@ -3386,6 +3393,16 @@ string generate_schedules(const vector &outputs, const Target &target, debug(2) << "Generating CPU schedule...\n"; part.generate_cpu_schedule(target, sched); + // Ensure that all update stages are "touched" so we get no warnings/errors + for (const auto &f : sched.func_schedules) { + const Function &func = get_element(sched.env, f.first); + const int num_update_stages = func.updates().size(); + for (int stage = 0; stage < num_update_stages; stage++) { + Definition def = get_stage_definition(func, stage + 1); + def.schedule().touched() = true; + } + } + std::ostringstream oss; oss << sched; string sched_string = oss.str(); diff --git a/test/correctness/recursive_box_filters.cpp b/test/correctness/recursive_box_filters.cpp index 443542ed38bc..58012cbb50cc 100644 --- a/test/correctness/recursive_box_filters.cpp +++ b/test/correctness/recursive_box_filters.cpp @@ -26,6 +26,10 @@ int main(int argc, char **argv) { // have to pass 'true' to the atomic call to tell it to skip the check. 
h.update(2).atomic(true).vectorize(r, 16); + // These stages don't need scheduling + h.update(0).unscheduled(); + h.update(1).unscheduled(); + Buffer r0(size); Buffer r1(size); h.realize({r0, r1}); diff --git a/test/error/tuple_output_bounds_check.cpp b/test/error/tuple_output_bounds_check.cpp index 53b3a26a8337..74df02134182 100644 --- a/test/error/tuple_output_bounds_check.cpp +++ b/test/error/tuple_output_bounds_check.cpp @@ -17,6 +17,7 @@ int main(int argc, char **argv) { Var xo, xi; h.split(x, xo, xi, 16, TailStrategy::RoundUp); + h.update(0).unscheduled(); Buffer r0(size); Buffer r1(size); From 5aa891a78ac2aa970aff1d3128756f7884b5dab5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 7 Dec 2023 10:03:06 -0800 Subject: [PATCH 013/186] =?UTF-8?q?Silence=20useless=20'Outer=20dim=20vect?= =?UTF-8?q?orization=20of=20var'=20warning=20in=20Mullapudi=E2=80=A6=20(#7?= =?UTF-8?q?992)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silence useless 'Outer dim vectorization of var' warning in Mullapudi scheduler --- src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index be2ede0748b0..2ce325538a86 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -2479,10 +2479,13 @@ void Partitioner::vectorize_stage(const Group &g, Stage f_handle, int stage_num, // storage dimension of the func. // // TODO: Check if the warning is necessary. - if (vec_dim_index > 0) { - user_warning << "Outer dim vectorization of var \"" << vec_dim_name - << "\" in function \"" << f_handle.name() << "\"\n"; - } + // + // Disabled: this isn't really user actionable, and is just noise. + // + // if (vec_dim_index > 0) { + // user_warning << "Outer dim vectorization of var \"" << vec_dim_name + // << "\" in function \"" << f_handle.name() << "\"\n"; + // } } } From 19c1c81e8946a6d4471b65be7fb609f055b5ae68 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 08:50:01 -0800 Subject: [PATCH 014/186] Make wasm +sign-ext and +nontrapping-fptoint the default (#7995) * Make wasm +sign-ext and +nontrapping-fptoint the default These have been supported in ~all wasm runtimes for a while now, and +nontrapping-fptoint in particular can make a big performance difference. We should enable these by default, and add a new backdoor (wasm_mvponly) for code paths that need to use the original wasm Minimum Viable Product spec only. * Update simd_op_check_wasm.cpp --- README_webassembly.md | 17 ++++++++--------- python_bindings/src/halide/halide_/PyEnums.cpp | 3 +-- src/CodeGen_WebAssembly.cpp | 8 ++------ src/Target.cpp | 3 +-- src/Target.h | 3 +-- src/WasmExecutor.cpp | 6 ++---- src/runtime/HalideRuntime.h | 3 +-- test/correctness/simd_op_check_wasm.cpp | 7 ++++--- 8 files changed, 20 insertions(+), 30 deletions(-) diff --git a/README_webassembly.md b/README_webassembly.md index f5fad7d79995..0fdcf80f15f1 100644 --- a/README_webassembly.md +++ b/README_webassembly.md @@ -6,10 +6,11 @@ backend. As WebAssembly itself is still under active development, Halide's support has some limitations. Some of the most important: +- Sign-extension operations are enabled by default (but can be avoided via + Target::WasmMvpOnly). +- Non-trapping float-to-int conversions are enabled by default (but can be + avoided via Target::WasmMvpOnly). 
- Fixed-width SIMD (128 bit) can be enabled via Target::WasmSimd128. -- Sign-extension operations can be enabled via Target::WasmSignExt. -- Non-trapping float-to-int conversions can be enabled via - Target::WasmSatFloatToInt. - Threads have very limited support via Target::WasmThreads; see [below](#using-threads) for more details. - Halide's JIT for Wasm is extremely limited and really useful only for @@ -152,9 +153,8 @@ cmake -DLLVM_ENABLE_PROJECTS="clang;lld" ... ``` - To run the JIT tests, set `HL_JIT_TARGET=wasm-32-wasmrt` (possibly adding - `wasm_simd128`, `wasm_signext`, and/or `wasm_sat_float_to_int`) and run - CMake/CTest normally. Note that wasm testing is only support under CMake - (not via Make). + `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is + only supported under CMake (not via Make). ## Enabling wasm AOT @@ -165,9 +165,8 @@ will), you need to install Emscripten locally. (https://emscripten.org/docs/getting_started/downloads.html). - To run the AOT tests, set `HL_TARGET=wasm-32-wasmrt` (possibly adding - `wasm_simd128`, `wasm_signext`, and/or `wasm_sat_float_to_int`) and run - CMake/CTest normally. Note that wasm testing is only support under CMake - (not via Make). + `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is + only supported under CMake (not via Make). # Running benchmarks diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index 1913b204fbd4..f86e7072edd5 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -165,9 +165,8 @@ void define_enums(py::module &m) { .value("HexagonDma", Target::Feature::HexagonDma) .value("EmbedBitcode", Target::Feature::EmbedBitcode) .value("EnableLLVMLoopOpt", Target::Feature::EnableLLVMLoopOpt) + .value("WasmMvpOnly", Target::Feature::WasmMvpOnly) .value("WasmSimd128", Target::Feature::WasmSimd128) - .value("WasmSignExt", Target::Feature::WasmSignExt) - .value("WasmSatFloatToInt", Target::Feature::WasmSatFloatToInt) .value("WasmThreads", Target::Feature::WasmThreads) .value("WasmBulkMemory", Target::Feature::WasmBulkMemory) .value("SVE", Target::Feature::SVE) diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 948346ad7c2a..3e9aedca3fbf 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -333,9 +333,10 @@ string CodeGen_WebAssembly::mattrs() const { std::ostringstream s; string sep; - if (target.has_feature(Target::WasmSignExt)) { + if (!target.has_feature(Target::WasmMvpOnly)) { s << sep << "+sign-ext"; sep = ","; + s << sep << "+nontrapping-fptoint"; } if (target.has_feature(Target::WasmSimd128)) { @@ -343,11 +344,6 @@ string CodeGen_WebAssembly::mattrs() const { sep = ","; } - if (target.has_feature(Target::WasmSatFloatToInt)) { - s << sep << "+nontrapping-fptoint"; - sep = ","; - } - if (target.has_feature(Target::WasmThreads)) { // "WasmThreads" doesn't directly affect LLVM codegen, // but it does end up requiring atomics, so be sure to enable them. 
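For reference, a brief sketch of how the new default reads at the target-string level; the wasm_target_examples function name is invented, but the target strings are the ones exercised by the test changes later in this patch.

#include "Halide.h"
#include <vector>

// Sketch: plain wasm targets now assume sign-ext and nontrapping-fptoint;
// wasm_mvponly opts back out to the original MVP spec.
std::vector<Halide::Target> wasm_target_examples() {
    return {
        Halide::Target("wasm-32-wasmrt"),               // sign-ext + nontrapping-fptoint on by default
        Halide::Target("wasm-32-wasmrt-wasm_simd128"),  // additionally enables 128-bit SIMD
        Halide::Target("wasm-32-wasmrt-wasm_mvponly"),  // restrict codegen to the original MVP spec
    };
}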
diff --git a/src/Target.cpp b/src/Target.cpp index 597d5bf5367d..e222e97d5282 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -533,8 +533,7 @@ const std::map feature_name_map = { {"embed_bitcode", Target::EmbedBitcode}, {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt}, {"wasm_simd128", Target::WasmSimd128}, - {"wasm_signext", Target::WasmSignExt}, - {"wasm_sat_float_to_int", Target::WasmSatFloatToInt}, + {"wasm_mvponly", Target::WasmMvpOnly}, {"wasm_threads", Target::WasmThreads}, {"wasm_bulk_memory", Target::WasmBulkMemory}, {"webgpu", Target::WebGPU}, diff --git a/src/Target.h b/src/Target.h index 76b06aed6b8e..331694e34c3a 100644 --- a/src/Target.h +++ b/src/Target.h @@ -143,9 +143,8 @@ struct Target { CheckUnsafePromises = halide_target_feature_check_unsafe_promises, EmbedBitcode = halide_target_feature_embed_bitcode, EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt, + WasmMvpOnly = halide_target_feature_wasm_mvponly, WasmSimd128 = halide_target_feature_wasm_simd128, - WasmSignExt = halide_target_feature_wasm_signext, - WasmSatFloatToInt = halide_target_feature_wasm_sat_float_to_int, WasmThreads = halide_target_feature_wasm_threads, WasmBulkMemory = halide_target_feature_wasm_bulk_memory, WebGPU = halide_target_feature_webgpu, diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index d82932bd3ea0..b99efdc6d67e 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1308,15 +1308,13 @@ wabt::interp::HostFunc::Ptr make_extern_callback(wabt::interp::Store &store, wabt::Features calc_features(const Target &target) { wabt::Features f; - if (target.has_feature(Target::WasmSignExt)) { + if (!target.has_feature(Target::WasmMvpOnly)) { f.enable_sign_extension(); + f.enable_sat_float_to_int(); } if (target.has_feature(Target::WasmSimd128)) { f.enable_simd(); } - if (target.has_feature(Target::WasmSatFloatToInt)) { - f.enable_sat_float_to_int(); - } return f; } #endif // WITH_WABT diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 445811009abd..f50e498ce88e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1386,9 +1386,8 @@ typedef enum halide_target_feature_t { halide_target_feature_hexagon_dma, ///< Enable Hexagon DMA buffers. halide_target_feature_embed_bitcode, ///< Emulate clang -fembed-bitcode flag. halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.) + halide_target_feature_wasm_mvponly, ///< Disable all extensions to WebAssembly codegen (including +sign-ext and +nontrapping-fptoint, which are on by default). halide_target_feature_wasm_simd128, ///< Enable +simd128 instructions for WebAssembly codegen. - halide_target_feature_wasm_signext, ///< Enable +sign-ext instructions for WebAssembly codegen. - halide_target_feature_wasm_sat_float_to_int, ///< Enable saturating (nontrapping) float-to-int instructions for WebAssembly codegen. halide_target_feature_wasm_threads, ///< Enable use of threads in WebAssembly codegen. Requires the use of a wasm runtime that provides pthread-compatible wrappers (typically, Emscripten with the -pthreads flag). Unsupported under WASI. halide_target_feature_wasm_bulk_memory, ///< Enable +bulk-memory instructions for WebAssembly codegen. halide_target_feature_webgpu, ///< Enable the WebGPU runtime. 
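With WasmSignExt and WasmSatFloatToInt gone, code that used to query those features should now test for the absence of WasmMvpOnly instead, which is what the backend above and the simd_op_check test below do. A minimal sketch; the wasm_has_sign_ext and wasm_has_nontrapping_fptoint helpers are invented for illustration.

#include "Halide.h"

// Sketch of the migration: the two old feature queries collapse into a
// single "not MVP-only" check.
bool wasm_has_sign_ext(const Halide::Target &t) {
    return !t.has_feature(Halide::Target::WasmMvpOnly);
}
bool wasm_has_nontrapping_fptoint(const Halide::Target &t) {
    return !t.has_feature(Halide::Target::WasmMvpOnly);
}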
diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 87f3a0263047..6b6898c82b85 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -16,8 +16,8 @@ class SimdOpCheckWASM : public SimdOpCheckTest { SimdOpCheckWASM(Target t, int w = 768, int h = 128) : SimdOpCheckTest(t, w, h) { use_wasm_simd128 = target.has_feature(Target::WasmSimd128); - use_wasm_sat_float_to_int = target.has_feature(Target::WasmSatFloatToInt); - use_wasm_sign_ext = target.has_feature(Target::WasmSignExt); + use_wasm_sign_ext = !target.has_feature(Target::WasmMvpOnly); + use_wasm_sat_float_to_int = !target.has_feature(Target::WasmMvpOnly); } void add_tests() override { @@ -544,6 +544,7 @@ int main(int argc, char **argv) { argc, argv, { Target("wasm-32-wasmrt"), - Target("wasm-32-wasmrt-wasm_simd128-wasm_sat_float_to_int"), + Target("wasm-32-wasmrt-wasm_simd128"), + Target("wasm-32-wasmrt-wasm_mvponly"), }); } From 96435186fe4aef03b075476eabd3618849be35eb Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 09:50:32 -0800 Subject: [PATCH 015/186] Add join_strings() call and use it from mattrs() (#7997) * Add join_strings() call and use it from mattrs() This is a super-nit kind of fix, but the fact that we had rerolled a join-strings algo in a half-dozen places made my teeth hurt, so I decided to fix it: - Add join_strings() to Util.h - revise the mattrs() calls to use it instead of the janky mess they used This doesn't move the needle on code size or speed but it is less weird. Probably other places we could/should use this too. (Does C++20 have join/split strings in the std library yet? If not, why not?) * Update Util.h * Update Util.h * clang-tidy --- src/CodeGen_ARM.cpp | 31 +++++++++++------------------ src/CodeGen_Hexagon.cpp | 11 ++++++----- src/CodeGen_PowerPC.cpp | 32 ++++++++++-------------------- src/CodeGen_RISCV.cpp | 14 +++++++++---- src/CodeGen_WebAssembly.cpp | 32 +++++++++++------------------- src/CodeGen_X86.cpp | 39 +++++++++++++++++++++---------------- src/Util.h | 24 +++++++++++++++++++++++ 7 files changed, 94 insertions(+), 89 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 03678e5ef605..4cf1dc597ab4 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1647,46 +1647,37 @@ string CodeGen_ARM::mcpu_tune() const { } string CodeGen_ARM::mattrs() const { - string arch_flags; - string separator; + std::vector attrs; if (target.has_feature(Target::ARMFp16)) { - arch_flags += separator + "+fullfp16"; - separator = ","; + attrs.emplace_back("+fullfp16"); } if (target.has_feature(Target::ARMv81a)) { - arch_flags += separator + "+v8.1a"; - separator = ","; + attrs.emplace_back("+v8.1a"); } if (target.has_feature(Target::ARMDotProd)) { - arch_flags += separator + "+dotprod"; - separator = ","; + attrs.emplace_back("+dotprod"); } if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { - arch_flags += separator + "+neon"; - separator = ","; + attrs.emplace_back("+neon"); } if (!target.has_feature(Target::NoNEON)) { - arch_flags += separator + "+neon"; - separator = ","; + attrs.emplace_back("+neon"); } else { - arch_flags += separator + "-neon"; - separator = ","; + attrs.emplace_back("-neon"); } } else { // TODO: Should Halide's SVE flags be 64-bit only? 
if (target.has_feature(Target::SVE2)) { - arch_flags = "+sve2"; - separator = ","; + attrs.emplace_back("+sve2"); } else if (target.has_feature(Target::SVE)) { - arch_flags = "+sve"; - separator = ","; + attrs.emplace_back("+sve"); } if (target.os == Target::IOS || target.os == Target::OSX) { - arch_flags += separator + "+reserve-x18"; + attrs.emplace_back("+reserve-x18"); } } - return arch_flags; + return join_strings(attrs, ","); } bool CodeGen_ARM::use_soft_float_abi() const { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 54f084b9c271..9463a4c921aa 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1801,13 +1801,14 @@ string CodeGen_Hexagon::mcpu_tune() const { } string CodeGen_Hexagon::mattrs() const { - std::stringstream attrs; - attrs << "+hvx-length128b"; - attrs << ",+long-calls"; + std::vector attrs = { + "+hvx-length128b", + "+long-calls", + }; if (target.has_feature(Target::HVX)) { - attrs << ",+hvxv" << isa_version; + attrs.push_back("+hvxv" + std::to_string(isa_version)); } - return attrs.str(); + return join_strings(attrs, ","); } bool CodeGen_Hexagon::use_soft_float_abi() const { diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp index 1f9c96c24d3d..6d7303de3b52 100644 --- a/src/CodeGen_PowerPC.cpp +++ b/src/CodeGen_PowerPC.cpp @@ -161,28 +161,16 @@ string CodeGen_PowerPC::mcpu_tune() const { } string CodeGen_PowerPC::mattrs() const { - string features; - string separator; - string enable; - - features += "+altivec"; - separator = ","; - - enable = target.has_feature(Target::VSX) ? "+" : "-"; - features += separator + enable + "vsx"; - separator = ","; - - enable = target.has_feature(Target::POWER_ARCH_2_07) ? "+" : "-"; - features += separator + enable + "power8-altivec"; - separator = ","; - - // These move instructions are defined in POWER ISA 2.06 but we do - // not check for 2.06 currently. So disable this for anything - // lower than ISA 2.07 - features += separator + enable + "direct-move"; - separator = ","; - - return features; + std::vector attrs = { + "+altivec", + target.has_feature(Target::VSX) ? "+vsx" : "-vsx", + target.has_feature(Target::POWER_ARCH_2_07) ? "+power8-altivec" : "-power8-altivec", + // These move instructions are defined in POWER ISA 2.06 but we do + // not check for 2.06 currently. So disable this for anything + // lower than ISA 2.07 + target.has_feature(Target::POWER_ARCH_2_07) ? 
"+direct-move" : "-direct-move", + }; + return join_strings(attrs, ","); } bool CodeGen_PowerPC::use_soft_float_abi() const { diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 234dae37e6ec..a702baff78a2 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -164,17 +164,23 @@ string CodeGen_RISCV::mattrs() const { // +f Single-Precision Floating-Point, // +d Double-Precision Floating-Point, // +c Compressed Instructions, - string arch_flags = "+m,+a,+f,+d,+c"; + std::vector attrs = { + "+m", + "+a", + "+f", + "+d", + "+c", + }; if (target.has_feature(Target::RVV)) { - arch_flags += ",+v"; + attrs.emplace_back("+v"); #if LLVM_VERSION >= 160 if (target.vector_bits != 0) { - arch_flags += ",+zvl" + std::to_string(target.vector_bits) + "b"; + attrs.push_back("+zvl" + std::to_string(target.vector_bits) + "b"); } #endif } - return arch_flags; + return join_strings(attrs, ","); } string CodeGen_RISCV::mabi() const { diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 3e9aedca3fbf..6f37f1447df1 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -330,46 +330,36 @@ string CodeGen_WebAssembly::mcpu_tune() const { } string CodeGen_WebAssembly::mattrs() const { - std::ostringstream s; - string sep; + user_assert(target.os == Target::WebAssemblyRuntime) + << "wasmrt is the only supported 'os' for WebAssembly at this time."; + + std::vector attrs; if (!target.has_feature(Target::WasmMvpOnly)) { - s << sep << "+sign-ext"; - sep = ","; - s << sep << "+nontrapping-fptoint"; + attrs.emplace_back("+sign-ext"); + attrs.emplace_back("+nontrapping-fptoint"); } - if (target.has_feature(Target::WasmSimd128)) { - s << sep << "+simd128"; - sep = ","; + attrs.emplace_back("+simd128"); } - if (target.has_feature(Target::WasmThreads)) { // "WasmThreads" doesn't directly affect LLVM codegen, // but it does end up requiring atomics, so be sure to enable them. - s << sep << ",+atomics"; - sep = ","; + attrs.emplace_back("+atomics"); } - // PIC implies +mutable-globals because the PIC ABI used by the linker // depends on importing and exporting mutable globals. Also -pthread implies // mutable-globals too, so quitely enable it if either of these are specified. if (use_pic() || target.has_feature(Target::WasmThreads)) { - s << sep << "+mutable-globals"; - sep = ","; + attrs.emplace_back("+mutable-globals"); } - // Recent Emscripten builds assume that specifying `-pthread` implies bulk-memory too, // so quietly enable it if either of these are specified. if (target.has_feature(Target::WasmBulkMemory) || target.has_feature(Target::WasmThreads)) { - s << sep << "+bulk-memory"; - sep = ","; + attrs.emplace_back("+bulk-memory"); } - user_assert(target.os == Target::WebAssemblyRuntime) - << "wasmrt is the only supported 'os' for WebAssembly at this time."; - - return s.str(); + return join_strings(attrs, ","); } bool CodeGen_WebAssembly::use_soft_float_abi() const { diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ab099eef123c..8d87f4c1937e 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -987,49 +987,54 @@ string CodeGen_X86::mcpu_tune() const { // FIXME: we should lower everything here, instead of relying // that -mcpu= (`mcpu_target()`) implies/sets features for us. 
string CodeGen_X86::mattrs() const { - string features; - string separator; + std::vector attrs; if (target.has_feature(Target::FMA)) { - features += "+fma"; - separator = ","; + attrs.emplace_back("+fma"); } if (target.has_feature(Target::FMA4)) { - features += separator + "+fma4"; - separator = ","; + attrs.emplace_back("+fma4"); } if (target.has_feature(Target::F16C)) { - features += separator + "+f16c"; - separator = ","; + attrs.emplace_back("+f16c"); } if (target.has_feature(Target::AVX512) || target.has_feature(Target::AVX512_KNL) || target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { - features += separator + "+avx512f,+avx512cd"; - separator = ","; + attrs.emplace_back("+avx512f"); + attrs.emplace_back("+avx512cd"); if (target.has_feature(Target::AVX512_KNL)) { - features += ",+avx512pf,+avx512er"; + attrs.emplace_back("+avx512pf"); + attrs.emplace_back("+avx512er"); } if (target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { - features += ",+avx512vl,+avx512bw,+avx512dq"; + attrs.emplace_back("+avx512vl"); + attrs.emplace_back("+avx512bw"); + attrs.emplace_back("+avx512dq"); } if (target.has_feature(Target::AVX512_Cannonlake)) { - features += ",+avx512ifma,+avx512vbmi"; + attrs.emplace_back("+avx512ifma"); + attrs.emplace_back("+avx512vbmi"); } if (target.has_feature(Target::AVX512_Zen4)) { - features += ",+avx512bf16,+avx512vnni,+avx512bitalg,+avx512vbmi2"; + attrs.emplace_back("+avx512bf16"); + attrs.emplace_back("+avx512vnni"); + attrs.emplace_back("+avx512bitalg"); + attrs.emplace_back("+avx512vbmi2"); } if (target.has_feature(Target::AVX512_SapphireRapids)) { - features += ",+avxvnni,+amx-int8,+amx-bf16"; + attrs.emplace_back("+avxvnni"); + attrs.emplace_back("+amx-int8"); + attrs.emplace_back("+amx-bf16"); } } #if LLVM_VERSION >= 180 if (gather_might_be_slow(target)) { - features += ",+prefer-no-gather"; + attrs.push_back("+prefer-no-gather"); } #endif - return features; + return join_strings(attrs, ","); } bool CodeGen_X86::use_soft_float_abi() const { diff --git a/src/Util.h b/src/Util.h index 1bc53d5b3691..15c297796911 100644 --- a/src/Util.h +++ b/src/Util.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,29 @@ std::string replace_all(const std::string &str, const std::string &find, const s /** Split the source string using 'delim' as the divider. */ std::vector split_string(const std::string &source, const std::string &delim); +/** Join the source vector using 'delim' as the divider. */ +template +std::string join_strings(const std::vector &sources, const std::string &delim) { + size_t sz = 0; + if (!sources.empty()) { + sz += delim.size() * (sources.size() - 1); + } + for (const auto &s : sources) { + sz += s.size(); + } + std::string result; + result.reserve(sz); + bool need_delim = false; + for (const auto &s : sources) { + if (need_delim) { + result += delim; + } + result += s; + need_delim = true; + } + return result; +} + /** Perform a left fold of a vector. Returns a default-constructed * vector element if the vector is empty. Similar to std::accumulate * but with a less clunky syntax. 
*/ From 9c099c29379ea379309109dce9c23b731da2d8a1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 8 Dec 2023 09:53:04 -0800 Subject: [PATCH 016/186] Teach unrolling to exploit conditions in enclosing ifs (#7969) * Teach unrolling to exploit conditions in enclosing ifs Fixes #7968 * Handle vectorization as well * Remove unused usings * Add missing print --- Makefile | 2 + src/BoundConstantExtentLoops.cpp | 136 ++++++++++++++++++ src/BoundConstantExtentLoops.h | 24 ++++ src/BoundsInference.cpp | 4 +- src/CMakeLists.txt | 4 +- src/Lower.cpp | 5 + src/Simplify.cpp | 14 +- src/Simplify.h | 17 ++- src/UnrollLoops.cpp | 79 +--------- src/VectorizeLoops.cpp | 4 +- test/correctness/CMakeLists.txt | 2 + ...roll_loop_with_implied_constant_bounds.cpp | 54 +++++++ .../vectorized_guard_with_if_tail.cpp | 42 ++++++ 13 files changed, 298 insertions(+), 89 deletions(-) create mode 100644 src/BoundConstantExtentLoops.cpp create mode 100644 src/BoundConstantExtentLoops.h create mode 100644 test/correctness/unroll_loop_with_implied_constant_bounds.cpp create mode 100644 test/correctness/vectorized_guard_with_if_tail.cpp diff --git a/Makefile b/Makefile index 4140da5c8f30..b24dfdc2d80d 100644 --- a/Makefile +++ b/Makefile @@ -459,6 +459,7 @@ SOURCE_FILES = \ BoundaryConditions.cpp \ Bounds.cpp \ BoundsInference.cpp \ + BoundConstantExtentLoops.cpp \ BoundSmallAllocations.cpp \ Buffer.cpp \ Callable.cpp \ @@ -654,6 +655,7 @@ HEADER_FILES = \ BoundaryConditions.h \ Bounds.h \ BoundsInference.h \ + BoundConstantExtentLoops.h \ BoundSmallAllocations.h \ Buffer.h \ Callable.h \ diff --git a/src/BoundConstantExtentLoops.cpp b/src/BoundConstantExtentLoops.cpp new file mode 100644 index 000000000000..d2901854f6eb --- /dev/null +++ b/src/BoundConstantExtentLoops.cpp @@ -0,0 +1,136 @@ +#include "BoundConstantExtentLoops.h" +#include "Bounds.h" +#include "CSE.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "SimplifyCorrelatedDifferences.h" +#include "Substitute.h" + +namespace Halide { +namespace Internal { + +namespace { +class BoundLoops : public IRMutator { + using IRMutator::visit; + + std::vector> lets; + + Stmt visit(const LetStmt *op) override { + if (is_pure(op->value)) { + lets.emplace_back(op->name, op->value); + Stmt s = IRMutator::visit(op); + lets.pop_back(); + return s; + } else { + return IRMutator::visit(op); + } + } + + std::vector facts; + Stmt visit(const IfThenElse *op) override { + facts.push_back(op->condition); + Stmt then_case = mutate(op->then_case); + Stmt else_case; + if (op->else_case.defined()) { + facts.back() = simplify(!op->condition); + else_case = mutate(op->else_case); + } + facts.pop_back(); + if (then_case.same_as(op->then_case) && + else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } + } + + Stmt visit(const For *op) override { + if (is_const(op->extent)) { + // Nothing needs to be done + return IRMutator::visit(op); + } + + if (op->for_type == ForType::Unrolled || + op->for_type == ForType::Vectorized) { + // Give it one last chance to simplify to an int + Expr extent = simplify(op->extent); + Stmt body = op->body; + const IntImm *e = extent.as(); + + if (e == nullptr) { + // We're about to hard fail. Get really aggressive + // with the simplifier. 
+ for (auto it = lets.rbegin(); it != lets.rend(); it++) { + extent = Let::make(it->first, it->second, extent); + } + extent = remove_likelies(extent); + extent = substitute_in_all_lets(extent); + extent = simplify(extent, + true, + Scope::empty_scope(), + Scope::empty_scope(), + facts); + e = extent.as(); + } + + Expr extent_upper; + if (e == nullptr) { + // Still no luck. Try taking an upper bound and + // injecting an if statement around the body. + extent_upper = find_constant_bound(extent, Direction::Upper, Scope()); + if (extent_upper.defined()) { + e = extent_upper.as(); + body = + IfThenElse::make(likely_if_innermost(Variable::make(Int(32), op->name) < + op->min + op->extent), + body); + } + } + + if (e == nullptr && permit_failed_unroll && op->for_type == ForType::Unrolled) { + // Still no luck, but we're allowed to fail. Rewrite + // to a serial loop. + user_warning << "HL_PERMIT_FAILED_UNROLL is allowing us to unroll a non-constant loop into a serial loop. Did you mean to do this?\n"; + body = mutate(body); + return For::make(op->name, op->min, op->extent, + ForType::Serial, op->partition_policy, op->device_api, std::move(body)); + } + + user_assert(e) + << "Can only " << (op->for_type == ForType::Unrolled ? "unroll" : "vectorize") + << " for loops over a constant extent.\n" + << "Loop over " << op->name << " has extent " << extent << ".\n"; + body = mutate(body); + + return For::make(op->name, op->min, e, + op->for_type, op->partition_policy, op->device_api, std::move(body)); + } else { + return IRMutator::visit(op); + } + } + bool permit_failed_unroll = false; + +public: + BoundLoops() { + // Experimental autoschedulers may want to unroll without + // being totally confident the loop will indeed turn out + // to be constant-sized. If this feature continues to be + // important, we need to expose it in the scheduling + // language somewhere, but how? For now we do something + // ugly and expedient. + + // For the tracking issue to fix this, see + // https://github.com/halide/Halide/issues/3479 + permit_failed_unroll = get_env_variable("HL_PERMIT_FAILED_UNROLL") == "1"; + } +}; + +} // namespace + +Stmt bound_constant_extent_loops(const Stmt &s) { + return BoundLoops().mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/BoundConstantExtentLoops.h b/src/BoundConstantExtentLoops.h new file mode 100644 index 000000000000..061064f795f9 --- /dev/null +++ b/src/BoundConstantExtentLoops.h @@ -0,0 +1,24 @@ +#ifndef HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H +#define HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H + +/** \file + * Defines the lowering pass that enforces a constant extent on all + * vectorized or unrolled loops. + */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +/** Replace all loop extents of unrolled or vectorized loops with constants, by + * substituting and simplifying as needed. If we can't determine a constant + * extent, but can determine a constant upper bound, inject an if statement into + * the body. If we can't even determine a constant upper bound, throw a user + * error. */ +Stmt bound_constant_extent_loops(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index d8a1ff53cc37..31b441ea4251 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1013,11 +1013,11 @@ class BoundsInference : public IRMutator { } // Dump out the region required of each stage for debugging. 
- /* debug(0) << "Box required of " << producer.name << " by " << consumer.name - << " stage " << consumer.stage << ":\n"; + << " stage " << consumer.stage << ":\n" + << " used: " << b.used << "\n"; for (size_t k = 0; k < b.size(); k++) { debug(0) << " " << b[k].min << " ... " << b[k].max << "\n"; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5d15d55f4416..390fee9a64e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,8 @@ set(HEADER_FILES BoundaryConditions.h Bounds.h BoundsInference.h - BoundSmallAllocations.h + BoundConstantExtentLoops.h + BoundSmallAllocations.h Buffer.h Callable.h CanonicalizeGPUVars.h @@ -189,6 +190,7 @@ set(SOURCE_FILES BoundaryConditions.cpp Bounds.cpp BoundsInference.cpp + BoundConstantExtentLoops.cpp BoundSmallAllocations.cpp Buffer.cpp Callable.cpp diff --git a/src/Lower.cpp b/src/Lower.cpp index 67aedde288d0..37c4bac07efb 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -11,6 +11,7 @@ #include "AddParameterChecks.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" +#include "BoundConstantExtentLoops.h" #include "BoundSmallAllocations.h" #include "Bounds.h" #include "BoundsInference.h" @@ -312,6 +313,10 @@ void lower_impl(const vector &output_funcs, s = simplify_correlated_differences(s); log("Lowering after simplifying correlated differences:", s); + debug(1) << "Bounding constant extent loops...\n"; + s = bound_constant_extent_loops(s); + log("Lowering after bounding constant extent loops:", s); + debug(1) << "Unrolling...\n"; s = unroll_loops(s); log("Lowering after unrolling:", s); diff --git a/src/Simplify.cpp b/src/Simplify.cpp index 7a2cbac5a047..339ef2917c83 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -355,8 +355,13 @@ Simplify::ScopedFact::~ScopedFact() { Expr simplify(const Expr &e, bool remove_dead_let_stmts, const Scope &bounds, - const Scope &alignment) { + const Scope &alignment, + const std::vector &assumptions) { Simplify m(remove_dead_let_stmts, &bounds, &alignment); + std::vector facts; + for (const Expr &a : assumptions) { + facts.push_back(m.scoped_truth(a)); + } Expr result = m.mutate(e, nullptr); if (m.in_unreachable) { return unreachable(e.type()); @@ -366,8 +371,13 @@ Expr simplify(const Expr &e, bool remove_dead_let_stmts, Stmt simplify(const Stmt &s, bool remove_dead_let_stmts, const Scope &bounds, - const Scope &alignment) { + const Scope &alignment, + const std::vector &assumptions) { Simplify m(remove_dead_let_stmts, &bounds, &alignment); + std::vector facts; + for (const Expr &a : assumptions) { + facts.push_back(m.scoped_truth(a)); + } Stmt result = m.mutate(s); if (m.in_unreachable) { return Evaluate::make(unreachable()); diff --git a/src/Simplify.h b/src/Simplify.h index 14dec65fc025..b9335c0c3de9 100644 --- a/src/Simplify.h +++ b/src/Simplify.h @@ -13,19 +13,22 @@ namespace Halide { namespace Internal { -/** Perform a a wide range of simplifications to expressions and - * statements, including constant folding, substituting in trivial - * values, arithmetic rearranging, etc. Simplifies across let - * statements, so must not be called on stmts with dangling or - * repeated variable names. +/** Perform a wide range of simplifications to expressions and statements, + * including constant folding, substituting in trivial values, arithmetic + * rearranging, etc. Simplifies across let statements, so must not be called on + * stmts with dangling or repeated variable names. 
Can optionally be passed + * known bounds of any variables, known alignment properties, and any other + * Exprs that should be assumed to be true. */ // @{ Stmt simplify(const Stmt &, bool remove_dead_code = true, const Scope &bounds = Scope::empty_scope(), - const Scope &alignment = Scope::empty_scope()); + const Scope &alignment = Scope::empty_scope(), + const std::vector &assumptions = std::vector()); Expr simplify(const Expr &, bool remove_dead_code = true, const Scope &bounds = Scope::empty_scope(), - const Scope &alignment = Scope::empty_scope()); + const Scope &alignment = Scope::empty_scope(), + const std::vector &assumptions = std::vector()); // @} /** Attempt to statically prove an expression is true using the simplifier. */ diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index e1726aa28ceb..2823c8b9ac9f 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -1,16 +1,10 @@ #include "UnrollLoops.h" -#include "Bounds.h" -#include "CSE.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" -#include "SimplifyCorrelatedDifferences.h" #include "Substitute.h" #include "UniquifyVariableNames.h" -using std::pair; -using std::vector; - namespace Halide { namespace Internal { @@ -19,62 +13,13 @@ namespace { class UnrollLoops : public IRMutator { using IRMutator::visit; - vector> lets; - - Stmt visit(const LetStmt *op) override { - if (is_pure(op->value)) { - lets.emplace_back(op->name, op->value); - Stmt s = IRMutator::visit(op); - lets.pop_back(); - return s; - } else { - return IRMutator::visit(op); - } - } - Stmt visit(const For *for_loop) override { if (for_loop->for_type == ForType::Unrolled) { - // Give it one last chance to simplify to an int - Expr extent = simplify(for_loop->extent); Stmt body = for_loop->body; - const IntImm *e = extent.as(); - - if (e == nullptr) { - // We're about to hard fail. Get really aggressive - // with the simplifier. - for (auto it = lets.rbegin(); it != lets.rend(); it++) { - extent = Let::make(it->first, it->second, extent); - } - extent = remove_likelies(extent); - extent = substitute_in_all_lets(extent); - extent = simplify(extent); - e = extent.as(); - } + const IntImm *e = for_loop->extent.as(); - Expr extent_upper; - bool use_guard = false; - if (e == nullptr) { - // Still no luck. Try taking an upper bound and - // injecting an if statement around the body. - extent_upper = find_constant_bound(extent, Direction::Upper, Scope()); - if (extent_upper.defined()) { - e = extent_upper.as(); - use_guard = true; - } - } - - if (e == nullptr && permit_failed_unroll) { - // Still no luck, but we're allowed to fail. Rewrite - // to a serial loop. - user_warning << "HL_PERMIT_FAILED_UNROLL is allowing us to unroll a non-constant loop into a serial loop. 
Did you mean to do this?\n"; - body = mutate(body); - return For::make(for_loop->name, for_loop->min, for_loop->extent, - ForType::Serial, for_loop->partition_policy, for_loop->device_api, std::move(body)); - } - - user_assert(e) - << "Can only unroll for loops over a constant extent.\n" - << "Loop over " << for_loop->name << " has extent " << extent << ".\n"; + internal_assert(e) + << "Loop over " << for_loop->name << " should have had a constant extent\n"; body = mutate(body); if (e->value == 1) { @@ -94,9 +39,6 @@ class UnrollLoops : public IRMutator { } else { iters = Block::make(iter, iters); } - if (use_guard) { - iters = IfThenElse::make(likely_if_innermost(i < for_loop->extent), iters); - } } return iters; @@ -105,21 +47,6 @@ class UnrollLoops : public IRMutator { return IRMutator::visit(for_loop); } } - bool permit_failed_unroll = false; - -public: - UnrollLoops() { - // Experimental autoschedulers may want to unroll without - // being totally confident the loop will indeed turn out - // to be constant-sized. If this feature continues to be - // important, we need to expose it in the scheduling - // language somewhere, but how? For now we do something - // ugly and expedient. - - // For the tracking issue to fix this, see - // https://github.com/halide/Halide/issues/3479 - permit_failed_unroll = get_env_variable("HL_PERMIT_FAILED_UNROLL") == "1"; - } }; } // namespace diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 7ced1dab0d92..89c4f020af51 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -951,7 +951,9 @@ class VectorSubs : public IRMutator { if (op->for_type == ForType::Vectorized) { const IntImm *extent_int = extent.as(); - if (!extent_int || extent_int->value <= 1) { + internal_assert(extent_int) + << "Vectorized for loop extent should have been rewritten to a constant\n"; + if (extent_int->value <= 1) { user_error << "Loop over " << op->name << " has extent " << extent << ". 
Can only vectorize loops over a " diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index da968c419593..6b4529be6be5 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -319,6 +319,7 @@ tests(GROUPS correctness uninitialized_read.cpp unique_func_image.cpp unroll_dynamic_loop.cpp + unroll_loop_with_implied_constant_bounds.cpp unrolled_reduction.cpp unsafe_dedup_lets.cpp unsafe_promises.cpp @@ -337,6 +338,7 @@ tests(GROUPS correctness vectorize_varying_allocation_size.cpp vectorized_assert.cpp vectorized_gpu_allocation.cpp + vectorized_guard_with_if_tail.cpp vectorized_initialization.cpp vectorized_load_from_vectorized_allocation.cpp vectorized_reduction_bug.cpp diff --git a/test/correctness/unroll_loop_with_implied_constant_bounds.cpp b/test/correctness/unroll_loop_with_implied_constant_bounds.cpp new file mode 100644 index 000000000000..c38d59c5214a --- /dev/null +++ b/test/correctness/unroll_loop_with_implied_constant_bounds.cpp @@ -0,0 +1,54 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + // This test verifies that unrolling/vectorizing is capable of inferring + // constant bounds of loops that are implied by containing if statement + // conditions, e.g the following structure should work: + + /* + let extent = foo + if (foo == 7) { + unrolled for (x from 0 to extent) {...} + } + */ + + for (int i = 0; i < 2; i++) { + Func intermediate("intermediate"); + + Func output1("output1"), output2("output2"); + + Var x("x"), y("y"), c("c"); + + intermediate(x, y, c) = x + y + c; + + output1(x, y, c) = intermediate(x, y, c); + output2(x, y, c) = intermediate(x, y, c); + + Expr three_channels = + (output1.output_buffer().dim(2).extent() == 3 && + output1.output_buffer().dim(2).min() == 0 && + output2.output_buffer().dim(2).extent() == 3 && + output2.output_buffer().dim(2).min() == 0); + + if (i == 0) { + intermediate.compute_root() + .specialize(three_channels) + .unroll(c); + } else { + intermediate.compute_root() + .specialize(three_channels) + .vectorize(c); + } + + Pipeline p{{output1, output2}}; + + // Should not throw an error in loop unrolling or vectorization. + p.compile_jit(); + } + + printf("Success!\n"); + + return 0; +} diff --git a/test/correctness/vectorized_guard_with_if_tail.cpp b/test/correctness/vectorized_guard_with_if_tail.cpp new file mode 100644 index 000000000000..62bf975d93f1 --- /dev/null +++ b/test/correctness/vectorized_guard_with_if_tail.cpp @@ -0,0 +1,42 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + Var x; + + for (int i = 0; i < 2; i++) { + Func f, g; + f(x) = x; + g(x) = f(x) * 2; + + g.vectorize(x, 8, TailStrategy::GuardWithIf); + + f.compute_at(g, x); + + // A varying amount of f is required depending on if we're in the steady + // state of g or the tail. Nonetheless, the amount required has a constant + // upper bound of 8. Vectorization, unrolling, and variants of store_in that + // require constant extent should all be able to handle this. 
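        // (Note: iteration i == 0 exercises vectorize() on f and i == 1
        // exercises unroll(); both iterations also use
        // store_in(MemoryType::Register), which needs a constant-size
        // allocation, so each path relies on the upper bound of 8 being
        // inferred.)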
+ if (i == 0) { + f.vectorize(x); + } else { + f.unroll(x); + } + f.store_in(MemoryType::Register); + + Buffer buf = g.realize({37}); + + for (int i = 0; i < buf.width(); i++) { + int correct = i * 2; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", + i, buf(i), correct); + return 1; + } + } + } + + printf("Success!\n"); + return 0; +} From 357e64685619ab0aaee03f2efa5a4e38d4fb5372 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 11:17:30 -0800 Subject: [PATCH 017/186] Do some basic validation of Target Features (#7986) (#7987) * Do some basic validation of Target Features (#7986) * Update Target.cpp * Update Target.cpp * Fixes * Update Target.cpp * Improve error messaging. * format * Update Target.cpp --- python_bindings/test/correctness/target.py | 5 +- src/Target.cpp | 83 ++++++++++++++++++++++ src/Target.h | 6 ++ test/correctness/target.cpp | 7 +- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/python_bindings/test/correctness/target.py b/python_bindings/test/correctness/target.py index 18eee2651301..7876bc97ecef 100644 --- a/python_bindings/test/correctness/target.py +++ b/python_bindings/test/correctness/target.py @@ -50,9 +50,6 @@ def test_target(): 32, [ hl.TargetFeature.JIT, - hl.TargetFeature.SSE41, - hl.TargetFeature.AVX, - hl.TargetFeature.AVX2, hl.TargetFeature.CUDA, hl.TargetFeature.OpenCL, hl.TargetFeature.OpenGLCompute, @@ -60,7 +57,7 @@ def test_target(): ], ) ts = t1.to_string() - assert ts == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-openglcompute-sse41" + assert ts == "arm-32-android-cuda-debug-jit-opencl-openglcompute" assert hl.Target.validate_target_string(ts) # Expected failures: diff --git a/src/Target.cpp b/src/Target.cpp index e222e97d5282..49011348544f 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -785,8 +785,90 @@ void bad_target_string(const std::string &target) { << "On this platform, the host target is: " << get_host_target().to_string() << "\n"; } +void do_check_bad(const Target &t, const std::initializer_list &v) { + for (Target::Feature f : v) { + user_assert(!t.has_feature(f)) + << "Target feature " << Target::feature_to_name(f) + << " is incompatible with the Target's architecture. (" << t << ")\n"; + } +} + } // namespace +void Target::validate_features() const { + // Note that the features don't have to be exhaustive, but enough to avoid obvious mistakes is good. 
+ if (arch == X86) { + do_check_bad(*this, { + ARMDotProd, + ARMFp16, + ARMv7s, + ARMv81a, + NoNEON, + POWER_ARCH_2_07, + RVV, + SVE, + SVE2, + VSX, + WasmBulkMemory, + WasmMvpOnly, + WasmSimd128, + WasmThreads, + }); + } else if (arch == ARM) { + do_check_bad(*this, { + AVX, + AVX2, + AVX512, + AVX512_Cannonlake, + AVX512_KNL, + AVX512_SapphireRapids, + AVX512_Skylake, + AVX512_Zen4, + F16C, + FMA, + FMA4, + POWER_ARCH_2_07, + RVV, + SSE41, + VSX, + WasmBulkMemory, + WasmMvpOnly, + WasmSimd128, + WasmThreads, + }); + } else if (arch == WebAssembly) { + do_check_bad(*this, { + ARMDotProd, + ARMFp16, + ARMv7s, + ARMv81a, + AVX, + AVX2, + AVX512, + AVX512_Cannonlake, + AVX512_KNL, + AVX512_SapphireRapids, + AVX512_Skylake, + AVX512_Zen4, + F16C, + FMA, + FMA4, + HVX_128, + HVX_128, + HVX_v62, + HVX_v65, + HVX_v66, + NoNEON, + POWER_ARCH_2_07, + RVV, + SSE41, + SVE, + SVE2, + VSX, + }); + } +} + Target::Target(const std::string &target) { Target host = get_host_target(); @@ -798,6 +880,7 @@ Target::Target(const std::string &target) { bad_target_string(target); } } + validate_features(); } Target::Target(const char *s) diff --git a/src/Target.h b/src/Target.h index 331694e34c3a..97c141f308e5 100644 --- a/src/Target.h +++ b/src/Target.h @@ -177,6 +177,7 @@ struct Target { for (const auto &f : initial_features) { set_feature(f); } + validate_features(); } Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) @@ -357,6 +358,11 @@ struct Target { private: /** A bitmask that stores the active features. */ std::bitset features; + + /** Attempt to validate that all features set are sensible for the base Target. + * This is *not* guaranteed to get all invalid combinations, but is intended + * to catch at least the most common (e.g., setting arm-specific features on x86). */ + void validate_features() const; }; /** Return the target corresponding to the host machine. */ diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 160d870ac09a..8fc03b589a73 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -51,11 +51,10 @@ int main(int argc, char **argv) { // Full specification round-trip, crazy features t1 = Target(Target::Android, Target::ARM, 32, - {Target::JIT, Target::SSE41, Target::AVX, Target::AVX2, - Target::CUDA, Target::OpenCL, Target::OpenGLCompute, - Target::Debug}); + {Target::JIT, Target::CUDA, Target::OpenCL, + Target::OpenGLCompute, Target::Debug}); ts = t1.to_string(); - if (ts != "arm-32-android-avx-avx2-cuda-debug-jit-opencl-openglcompute-sse41") { + if (ts != "arm-32-android-cuda-debug-jit-opencl-openglcompute") { printf("to_string failure: %s\n", ts.c_str()); return 1; } From 3d5cf40cd64b32dfecf7a584cf4790c9c3237b4d Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Tue, 12 Dec 2023 18:50:56 +0100 Subject: [PATCH 018/186] Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. (#7913) * Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. * WIP: I get segfaults. The device_interface pointer is bogus. * Figured it out... * Allow global sync on d3d12. * Cleanly time all buffer copies as well. * Cleanup old comment. * Following Andrews suggestion for suffixing buffer copies in the profiler. * Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host. * Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. * WIP: I get segfaults. 
The device_interface pointer is bogus. * Figured it out... * Allow global sync on d3d12. * Cleanly time all buffer copies as well. * Cleanup old comment. * Following Andrews suggestion for suffixing buffer copies in the profiler. * Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host. * Attempt to fix output parsing. * Fix crash for copy_to_device * halide_device_sync_global(NULL) -> success * Fixed the buffer copy bug. Added a new test that will cause buffer copies in two directions within the compiled pipeline. This will catch this better in the future. Tweaked the profile report section header printing. * Clang-format, my dear friend... --- src/CodeGen_Internal.cpp | 1 + src/OffloadGPULoops.cpp | 8 +- src/Profiling.cpp | 72 ++++++++++ src/runtime/HalideRuntime.h | 9 ++ src/runtime/d3d12compute.cpp | 8 +- src/runtime/device_interface.cpp | 15 ++ src/runtime/profiler_common.cpp | 128 +++++++++++++++--- src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + .../device_buffer_copies_with_profile.cpp | 71 ++++++++++ test/performance/memory_profiler.cpp | 12 +- test/performance/profiler.cpp | 12 +- 12 files changed, 306 insertions(+), 32 deletions(-) create mode 100644 test/correctness/device_buffer_copies_with_profile.cpp diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 5c36ea58aae3..2fc5b5cae0df 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -40,6 +40,7 @@ bool function_takes_user_context(const std::string &name) { "halide_device_malloc", "halide_device_and_host_malloc", "halide_device_sync", + "halide_device_sync_global", "halide_do_par_for", "halide_do_loop_task", "halide_do_task", diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 7b8464211994..46e6544036b7 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -249,7 +249,13 @@ class InjectGpuOffload : public IRMutator { Call::make(Handle(), Call::make_struct, args, Call::Intrinsic), Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic), }; - return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + Stmt run_and_assert = call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) { + Expr device_interface = make_device_interface_call(loop->device_api, MemoryType::Auto); + Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); + return Block::make(run_and_assert, sync_and_assert); + } + return run_and_assert; } public: diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 7bd9a9fe4db7..2be058b3c8a6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -6,6 +6,7 @@ #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" +#include "InjectHostDevBufferCopies.h" #include "Profiling.h" #include "Scope.h" #include "Simplify.h" @@ -422,6 +423,77 @@ class InjectProfiling : public IRMutator { } return IfThenElse::make(std::move(condition), std::move(then_case), std::move(else_case)); } + + Stmt visit(const LetStmt *op) override { + if (const Call *call = op->value.as()) { + Stmt start_profiler; + if (call->name == "halide_copy_to_host" || call->name == "halide_copy_to_device") { + std::string buffer_name; + if (const Variable *var = call->args.front().as()) { + buffer_name = var->name; + if (ends_with(buffer_name, ".buffer")) { + buffer_name = buffer_name.substr(0, buffer_name.size() - 7); + } 
else { + internal_error << "Expected to find a variable ending in .buffer as first argument to function call " << call->name << "\n"; + } + } else { + internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n"; + } + bool requires_sync = false; + if (call->name == "halide_copy_to_host") { + int copy_to_host_id = get_func_id(buffer_name + " (copy to host)"); + start_profiler = set_current_func(copy_to_host_id); + requires_sync = false; + } else if (call->name == "halide_copy_to_device") { + int copy_to_device_id = get_func_id(buffer_name + " (copy to device)"); + start_profiler = set_current_func(copy_to_device_id); + requires_sync = true; + } else { + internal_error << "Unexpected function name.\n"; + } + if (start_profiler.defined()) { + // The copy functions are followed by an assert, which we will wrap in the timed body. + const AssertStmt *copy_assert = nullptr; + Stmt other; + if (const Block *block = op->body.as()) { + if (const AssertStmt *assert = block->first.as()) { + copy_assert = assert; + other = block->rest; + } + } else if (const AssertStmt *assert = op->body.as()) { + copy_assert = assert; + } + if (copy_assert) { + std::vector steps; + steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message)); + if (requires_sync) { + internal_assert(call->name == "halide_copy_to_device"); + Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface. + Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); + steps.push_back(sync_and_assert); + } + steps.push_back(set_current_func(stack.back())); + + if (other.defined()) { + steps.push_back(mutate(other)); + } + return Block::make(start_profiler, + LetStmt::make(op->name, mutate(op->value), + Block::make(steps))); + } else { + internal_error << "No assert found after buffer copy.\n"; + } + } + } + } + + Stmt body = mutate(op->body); + Expr value = mutate(op->value); + if (body.same_as(op->body) && value.same_as(op->value)) { + return op; + } + return LetStmt::make(op->name, value, body); + } }; } // namespace diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index f50e498ce88e..eea4faf7b073 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -885,6 +885,15 @@ extern int halide_device_release_crop(void *user_context, * should rarely be necessary, except maybe for profiling. */ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf); +/** + * Wait for current GPU operations to complete. Calling this explicitly + * should rarely be necessary, except maybe for profiling. + * This variation of the synchronizing is useful when a synchronization is desirable + * without specifying any buffer to synchronize on. + * Calling this with a null device_interface is always illegal. + */ +extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface); + /** Allocate device memory to back a halide_buffer_t. 
*/
 extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
                                 const struct halide_device_interface_t *device_interface);
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 9d652423ff9a..adae690800cc 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2786,8 +2786,12 @@ WEAK int halide_d3d12compute_device_sync(void *user_context, struct halide_buffe
         return d3d12_context.error();
     }
 
-    d3d12_buffer *dbuffer = peel_buffer(buffer);
-    d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
+    if (buffer != nullptr) {
+        d3d12_buffer *dbuffer = peel_buffer(buffer);
+        d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
+    } else {
+        d3d12compute_device_sync_internal(d3d12_context.device, nullptr);
+    }
 
     return halide_error_code_success;
 }
diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp
index 692a28e5fa9f..710d1259678d 100644
--- a/src/runtime/device_interface.cpp
+++ b/src/runtime/device_interface.cpp
@@ -231,6 +231,21 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
     }
 }
 
+/**
+ * Wait for current GPU operations to complete. Calling this explicitly
+ * should rarely be necessary, except maybe for profiling.
+ * This variation of the synchronizing is useful when a synchronization is desirable
+ * without specifying any buffer to synchronize on.
+ */
+WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
+    if (device_interface == nullptr) {
+        return halide_error_code_no_device_interface;
+    }
+    // This function immediately calls the device_interface implementation to synchronize on
+    // "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
+    return device_interface->impl->device_sync(user_context, nullptr);
+}
+
 /** Allocate device memory to back a halide_buffer_t. 
*/ WEAK int halide_device_malloc(void *user_context, struct halide_buffer_t *buf, const halide_device_interface_t *device_interface) { diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp index aed1376b6087..ccbe0bf11ecb 100644 --- a/src/runtime/profiler_common.cpp +++ b/src/runtime/profiler_common.cpp @@ -349,6 +349,14 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st }; } } + bool support_colors = false; + const char *term = getenv("TERM"); + if (term) { + // Check if the terminal supports colors + if (strstr(term, "color") || strstr(term, "xterm")) { + support_colors = true; + } + } for (halide_profiler_pipeline_stats *p = s->pipelines; p; p = (halide_profiler_pipeline_stats *)(p->next)) { @@ -385,14 +393,31 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st if (print_f_states) { int f_stats_count = 0; halide_profiler_func_stats **f_stats = (halide_profiler_func_stats **)__builtin_alloca(p->num_funcs * sizeof(halide_profiler_func_stats *)); + const char *substr_copy_to_device = " (copy to device)"; + const char *substr_copy_to_host = " (copy to host)"; + + int max_func_name_length = 23; // length of the section header + int num_copy_to_device = 0; + int num_copy_to_host = 0; - int max_func_name_length = 0; + uint64_t total_func_time = 0; + uint64_t total_copy_to_device_time = 0; + uint64_t total_copy_to_host_time = 0; for (int i = 0; i < p->num_funcs; i++) { halide_profiler_func_stats *fs = p->funcs + i; int name_len = strlen(fs->name); if (name_len > max_func_name_length) { max_func_name_length = name_len; } + if (strstr(fs->name, substr_copy_to_device)) { + num_copy_to_device++; + total_copy_to_device_time += fs->time; + } else if (strstr(fs->name, substr_copy_to_host)) { + num_copy_to_host++; + total_copy_to_host_time += fs->time; + } else { + total_func_time += fs->time; + } } for (int i = 0; i < p->num_funcs; i++) { @@ -418,18 +443,8 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st } } - for (int i = 0; i < f_stats_count; i++) { - size_t cursor = 0; - sstr.clear(); - halide_profiler_func_stats *fs = f_stats[i]; - - sstr << " " << fs->name << ": "; - cursor += max_func_name_length + 5; - while (sstr.size() < cursor) { - sstr << " "; - } - - float ft = fs->time / (p->runs * 1000000.0f); + const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) { + float ft = time / (p->runs * 1000000.0f); if (ft < 10000) { sstr << " "; } @@ -451,16 +466,40 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st sstr << " "; } - int percent = 0; + int perthousand = 0; if (p->time != 0) { - percent = (100 * fs->time) / p->time; + perthousand = (1000 * time) / p->time; + } + sstr << "("; + if (perthousand < 100) { + sstr << " "; } - sstr << "(" << percent << "%)"; - cursor += 8; + int percent = perthousand / 10; + sstr << percent << "." 
<< (perthousand - percent * 10) << "%)"; + if (!light) { + cursor += 10; + while (sstr.size() < cursor) { + sstr << " "; + } + } + }; + + auto print_report_entry = [&](halide_profiler_func_stats *fs, const char *suffix_cut) { + size_t cursor = 0; + sstr.clear(); + + sstr << " " << fs->name; + if (suffix_cut) { + sstr.erase(strlen(suffix_cut)); + } + sstr << ": "; + cursor += max_func_name_length + 7; while (sstr.size() < cursor) { sstr << " "; } + print_time_and_percentage(fs->time, cursor, false); + if (!serial) { float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10); sstr << "threads: " << threads; @@ -494,6 +533,61 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st sstr << "\n"; halide_print(user_context, sstr.str()); + }; + + if (num_copy_to_host == 0 && num_copy_to_device == 0) { + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + print_report_entry(fs, nullptr); + } + } else { + const auto print_section_header = [&](const char *name, uint64_t total_time) { + size_t cursor = 0; + sstr.clear(); + sstr << " "; + if (support_colors) { + sstr << "\033[90m\033[3m"; + cursor += 9; + } + sstr << "[" << name << " "; + cursor += max_func_name_length + 7; + while (sstr.size() < cursor) { + sstr << ":"; + } + print_time_and_percentage(total_time, cursor, true); + sstr << " ::::]"; + if (support_colors) { + sstr << "\033[0m"; + } + sstr << "\n"; + halide_print(user_context, sstr.str()); + }; + + print_section_header("funcs", total_func_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (!strstr(fs->name, substr_copy_to_device) && !strstr(fs->name, substr_copy_to_host)) { + print_report_entry(fs, nullptr); + } + } + if (num_copy_to_device) { + print_section_header("buffer copies to device", total_copy_to_device_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (strstr(fs->name, substr_copy_to_device)) { + print_report_entry(fs, substr_copy_to_device); + } + } + } + if (num_copy_to_host) { + print_section_header("buffer copies to host", total_copy_to_host_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (strstr(fs->name, substr_copy_to_host)) { + print_report_entry(fs, substr_copy_to_host); + } + } + } } } } diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index 51f5b7245343..5c64391b6259 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -50,6 +50,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_device_malloc, (void *)&halide_device_release, (void *)&halide_device_sync, + (void *)&halide_device_sync_global, (void *)&halide_disable_timer_interrupt, (void *)&halide_do_par_for, (void *)&halide_do_parallel_tasks, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 6b4529be6be5..4ee9f57480dc 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -76,6 +76,7 @@ tests(GROUPS correctness debug_to_file_reorder.cpp deferred_loop_level.cpp deinterleave4.cpp + device_buffer_copies_with_profile.cpp device_buffer_copy.cpp device_copy_at_inner_loop.cpp device_crop.cpp diff --git a/test/correctness/device_buffer_copies_with_profile.cpp b/test/correctness/device_buffer_copies_with_profile.cpp new file mode 100644 index 000000000000..7398334fc06a --- /dev/null +++ 
b/test/correctness/device_buffer_copies_with_profile.cpp @@ -0,0 +1,71 @@ +#include "Halide.h" + +using namespace Halide; + +int run_test(Target t) { + // Sliding window with the producer on the GPU and the consumer on + // the CPU. This requires a copy inside the loop over which we are + // sliding. Currently this copies the entire buffer back and + // forth, which is suboptimal in the general case. In this + // specific case we're folded over y, so copying the entire buffer + // is not much more than just copying the part that was modified. + + Func f0{"f0_on_cpu"}, f1{"f1_on_gpu"}, f2{"f2_on_cpu"}; + Var x, y, tx, ty; + + // Produce something on CPU + f0(x, y) = x + y; + f0.compute_root(); + + // Which we use to produce something on GPU, causing a copy_to_device. + f1(x, y) = f0(x, y) + f0(x, y + 1); + f1.compute_root().gpu_tile(x, y, tx, ty, 8, 8); + + // Which in turn we use to produce something on CPU, causing a copy_to_host. + f2(x, y) = f1(x, y) * 2; + f2.compute_root(); + + // Make the buffer a little bigger so we actually can see the copy time. + Buffer out = f2.realize({2000, 2000}, t); + // Let's only verify a part of it... + for (int y = 0; y < 100; y++) { + for (int x = 0; x < 100; x++) { + int correct = 4 * (x + y) + 2; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + return 1; + } + } + } + return 0; +} + +int main(int argc, char **argv) { + Target t = get_jit_target_from_environment(); + if (!t.has_gpu_feature()) { + printf("[SKIP] no gpu feature enabled\n"); + return 0; + } + printf("Testing without profiler.\n"); + int result = run_test(t); + if (result != 0) { + return 1; + } + + printf("Testing thread based profiler.\n"); + result = run_test(t.with_feature(Target::Profile)); + if (result != 0) { + return 1; + } + if (t.os == Target::Linux) { + printf("Testing timer based profiler.\n"); + result = run_test(t.with_feature(Target::ProfileByTimer)); + if (result != 0) { + return 1; + } + } + + printf("Success!\n"); + return 0; +} diff --git a/test/performance/memory_profiler.cpp b/test/performance/memory_profiler.cpp index 3fb511979265..8ca5cf3c2295 100644 --- a/test/performance/memory_profiler.cpp +++ b/test/performance/memory_profiler.cpp @@ -16,13 +16,13 @@ void reset_stats() { } void my_print(JITUserContext *, const char *msg) { - float this_ms, this_threads; - int idx, this_percentage, this_heap_peak; + float this_ms, this_threads, this_percentage; + int idx, this_heap_peak; int this_num_mallocs, this_malloc_avg, this_stack_peak; int val; // printf("%s", msg); - val = sscanf(msg, " g_%d: %fms (%d%%) threads: %f peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 7) { @@ -31,7 +31,7 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%d%%) peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 6) { @@ -40,13 +40,13 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%d%%) threads: %f stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_stack_peak); if (val == 5) { stack_peak = this_stack_peak; } - val 
= sscanf(msg, " g_%d: %fms (%d%%) stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d", &idx, &this_ms, &this_percentage, &this_stack_peak); if (val == 4) { stack_peak = this_stack_peak; diff --git a/test/performance/profiler.cpp b/test/performance/profiler.cpp index 3912a16c4211..bf5d166c0e81 100644 --- a/test/performance/profiler.cpp +++ b/test/performance/profiler.cpp @@ -3,14 +3,14 @@ using namespace Halide; -int percentage = 0; +float percentage = 0; float ms = 0; void my_print(JITUserContext *, const char *msg) { float this_ms; - int this_percentage; - int val = sscanf(msg, " fn13: %fms (%d", &this_ms, &this_percentage); + float this_percentage; + int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage); if (val != 2) { - val = sscanf(msg, " fn13$1: %fms (%d", &this_ms, &this_percentage); + val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage); } if (val == 2) { ms = this_ms; @@ -59,8 +59,8 @@ int run_test(bool use_timer_profiler) { printf("Time spent in fn13: %fms\n", ms); - if (percentage < 40) { - printf("Percentage of runtime spent in f13: %d\n" + if (percentage < 40.0f) { + printf("Percentage of runtime spent in f13: %.1f%%\n" "This is suspiciously low. It should be more like 66%%\n", percentage); return 1; From 6d29ad5a0b5afd650e3e3d6f977a3b03b23b3655 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 13 Dec 2023 09:02:37 -0800 Subject: [PATCH 019/186] Add missing Python bindings for various recent additions to Func and Stage (#8002) * Add missing Python bindings for various recent additions to Func and Stage We have been sloppy about maintaining these. Also added a bit of testing. * Update PyEnums.cpp --- .../src/halide/halide_/PyEnums.cpp | 10 ++++++ python_bindings/src/halide/halide_/PyFunc.cpp | 10 ++++-- .../src/halide/halide_/PyScheduleMethods.h | 9 +++++ .../src/halide/halide_/PyStage.cpp | 4 ++- .../test/correctness/boundary_conditions.py | 33 ++++++++++++------- .../test/correctness/realize_warnings.py | 24 ++++++++++++++ 6 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index f86e7072edd5..d723d66461d8 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -68,7 +68,12 @@ void define_enums(py::module &m) { py::enum_(m, "TailStrategy") .value("RoundUp", TailStrategy::RoundUp) .value("GuardWithIf", TailStrategy::GuardWithIf) + .value("Predicate", TailStrategy::Predicate) + .value("PredicateLoads", TailStrategy::PredicateLoads) + .value("PredicateStores", TailStrategy::PredicateStores) .value("ShiftInwards", TailStrategy::ShiftInwards) + .value("ShiftInwardsAndBlend", TailStrategy::ShiftInwardsAndBlend) + .value("RoundUpAndBlend", TailStrategy::RoundUpAndBlend) .value("Auto", TailStrategy::Auto); py::enum_(m, "TargetOS") @@ -216,6 +221,11 @@ void define_enums(py::module &m) { .value("stmt", OutputFileType::stmt) .value("stmt_html", OutputFileType::stmt_html) .value("compiler_log", OutputFileType::compiler_log); + + py::enum_(m, "Partition") + .value("Auto", Partition::Auto) + .value("Never", Partition::Never) + .value("Always", Partition::Always); } } // namespace PythonBindings diff --git a/python_bindings/src/halide/halide_/PyFunc.cpp b/python_bindings/src/halide/halide_/PyFunc.cpp index dcbd122c6228..b7e82900a6cf 100644 --- a/python_bindings/src/halide/halide_/PyFunc.cpp +++ b/python_bindings/src/halide/halide_/PyFunc.cpp @@ -205,19 +205,25 @@ void 
define_func(py::module &m) { }) .def("compute_at", (Func & (Func::*)(const Func &, const Var &)) & Func::compute_at, py::arg("f"), py::arg("var")) - .def("compute_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::compute_at, py::arg("f"), py::arg("var")) + .def("compute_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::compute_at, py::arg("f"), py::arg("rvar")) .def("compute_at", (Func & (Func::*)(LoopLevel)) & Func::compute_at, py::arg("loop_level")) .def("store_at", (Func & (Func::*)(const Func &, const Var &)) & Func::store_at, py::arg("f"), py::arg("var")) - .def("store_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::store_at, py::arg("f"), py::arg("var")) + .def("store_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::store_at, py::arg("f"), py::arg("rvar")) .def("store_at", (Func & (Func::*)(LoopLevel)) & Func::store_at, py::arg("loop_level")) .def("async_", &Func::async) + .def("bound_storage", &Func::bound_storage) .def("memoize", &Func::memoize) .def("compute_inline", &Func::compute_inline) .def("compute_root", &Func::compute_root) .def("store_root", &Func::store_root) + .def("hoist_storage", (Func & (Func::*)(const Func &f, const Var &var)) & Func::hoist_storage, py::arg("f"), py::arg("var")) + .def("hoist_storage", (Func & (Func::*)(const Func &f, const RVar &rvar)) & Func::hoist_storage, py::arg("f"), py::arg("rvar")) + .def("hoist_storage", (Func & (Func::*)(LoopLevel)) & Func::hoist_storage, py::arg("loop_level")) + .def("hoist_storage_root", &Func::hoist_storage_root) + .def("store_in", &Func::store_in, py::arg("memory_type")) .def( diff --git a/python_bindings/src/halide/halide_/PyScheduleMethods.h b/python_bindings/src/halide/halide_/PyScheduleMethods.h index 9086bbafc5c0..2c8c00a98f4e 100644 --- a/python_bindings/src/halide/halide_/PyScheduleMethods.h +++ b/python_bindings/src/halide/halide_/PyScheduleMethods.h @@ -33,6 +33,15 @@ HALIDE_NEVER_INLINE void add_schedule_methods(PythonClass &class_instance) { .def("fuse", &T::fuse, py::arg("inner"), py::arg("outer"), py::arg("fused")) + .def("partition", (T & (T::*)(const VarOrRVar &var, Partition partition_policy)) & T::partition, + py::arg("var"), py::arg("partition_policy")) + .def("never_partition_all", &T::never_partition_all) + .def("never_partition", (T & (T::*)(const std::vector &vars)) & T::never_partition, + py::arg("vars")) + .def("always_partition_all", &T::always_partition_all) + .def("always_partition", (T & (T::*)(const std::vector &vars)) & T::always_partition, + py::arg("vars")) + .def("serial", &T::serial, py::arg("var")) diff --git a/python_bindings/src/halide/halide_/PyStage.cpp b/python_bindings/src/halide/halide_/PyStage.cpp index e84c6fcc7189..b412a6f2b39e 100644 --- a/python_bindings/src/halide/halide_/PyStage.cpp +++ b/python_bindings/src/halide/halide_/PyStage.cpp @@ -17,7 +17,9 @@ void define_stage(py::module &m) { .def("rfactor", (Func(Stage::*)(std::vector>)) & Stage::rfactor, py::arg("preserved")) .def("rfactor", (Func(Stage::*)(const RVar &, const Var &)) & Stage::rfactor, - py::arg("r"), py::arg("v")); + py::arg("r"), py::arg("v")) + + .def("unscheduled", &Stage::unscheduled); py::implicitly_convertible(); diff --git a/python_bindings/test/correctness/boundary_conditions.py b/python_bindings/test/correctness/boundary_conditions.py index 716b0cdfd6a1..32abd12ff0e6 100644 --- a/python_bindings/test/correctness/boundary_conditions.py +++ b/python_bindings/test/correctness/boundary_conditions.py @@ -4,19 +4,22 @@ test_min = -25 test_extent = 100 -x, y = 
hl.Var(), hl.Var() +x, y = hl.vars("x y") def expect_eq(actual, expected): assert expected == actual, "Failed: expected %d, actual %d" % (expected, actual) -def schedule_test(f, vector_width, target): +def schedule_test(f, vector_width, target, partition_policy): if vector_width != 1: f.vectorize(x, vector_width) + f.partition(x, partition_policy); + f.partition(y, partition_policy); + if target.has_gpu_feature() and vector_width <= 16: - xo, yo, xi, yi = hl.Var(), hl.Var(), hl.Var(), hl.Var() + xo, yo, xi, yi = hl.vars("xo yo xi yi") f.gpu_tile(x, y, xo, yo, xi, yi, 2, 2) @@ -30,11 +33,12 @@ def realize_and_check( test_extent_y, vector_width, target, + partition_policy, ): result = hl.Buffer(hl.UInt(8), [test_extent_x, test_extent_y]) result.set_min([test_min_x, test_min_y]) f2 = hl.lambda_func(x, y, f[x, y]) - schedule_test(f2, vector_width, target) + schedule_test(f2, vector_width, target, partition_policy) f2.realize(result, target) result.copy_to_host() for r in range(test_min_y, test_min_y + test_extent_y): @@ -91,8 +95,8 @@ def check_mirror_interior(input, result, c, r): expect_eq(result[c, r], input[mapped_x, mapped_y]) -def test_all(vector_width, target): - # print("target is %s " % str(target)) +def test_all(vector_width, target, partition_policy): + # print("target is %s, partition_policy is %s " % (str(target), str(partition_policy))) W = 32 H = 32 @@ -137,6 +141,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**image_input_args), @@ -148,6 +153,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**undef_min_args), @@ -159,6 +165,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**undef_max_args), @@ -170,6 +177,7 @@ def test_all(vector_width, target): H, vector_width, target, + partition_policy, ) realize_and_check( bc(**implicit_bounds_args), @@ -181,6 +189,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) @@ -189,11 +198,13 @@ def test_all(vector_width, target): vector_width_power_max = 6 # https://github.com/halide/Halide/issues/2148 - if target.has_feature(hl.TargetFeature.Metal) or target.has_feature( - hl.TargetFeature.D3D12Compute - ): - vector_width_power_max = 3 + if target.has_feature(hl.TargetFeature.Metal) or \ + target.has_feature(hl.TargetFeature.Vulkan) or \ + target.has_feature(hl.TargetFeature.OpenGLCompute) or \ + target.has_feature(hl.TargetFeature.D3D12Compute): + vector_width_power_max = 2 for i in range(0, vector_width_power_max): vector_width = 1 << i - test_all(vector_width, target) + test_all(vector_width, target, hl.Partition.Auto) + test_all(vector_width, target, hl.Partition.Never) diff --git a/python_bindings/test/correctness/realize_warnings.py b/python_bindings/test/correctness/realize_warnings.py index a76e5727c93e..f182538c3c2b 100644 --- a/python_bindings/test/correctness/realize_warnings.py +++ b/python_bindings/test/correctness/realize_warnings.py @@ -27,6 +27,30 @@ def test_warnings(): for line in stdout_lines: assert line == expected_warning +def test_unscheduled(suppress): + x = hl.Var() + f = hl.Func("f_%s" % str(suppress)) + f[x] = 0 + f[x] += 5 + f.vectorize(x, 8) + if suppress: + f.update(0).unscheduled() + + buffer = io.StringIO() + with contextlib.redirect_stdout(buffer): + f.realize([1024]) + + buffer.seek(0) + stdout_lines = buffer.readlines() + if suppress: + assert 
len(stdout_lines) == 0 + else: + expected_warning = "Warning: Update definition 0 of function f_False has not been scheduled" + assert len(stdout_lines) > 0 + for line in stdout_lines: + assert line.startswith(expected_warning), "\n%s\n%s" % (line, expected_warning) if __name__ == "__main__": test_warnings() + test_unscheduled(True) + test_unscheduled(False) From 6bcb6955a9b24ad34f63e0749acced8609e90741 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 14 Dec 2023 16:27:56 -0800 Subject: [PATCH 020/186] Update Halide version in setup.py to 17.0.0 (#8010) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fa88f382a122..4939b88a3151 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="halide", - version='16.0.0', + version='17.0.0', author="The Halide team", author_email="halide-dev@lists.csail.mit.edu", description="Halide is a programming language designed to make it easier " From 61b8d384b2b799cd47634e4a3b67aa7c7f580a46 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Tue, 19 Dec 2023 14:14:05 -0800 Subject: [PATCH 021/186] Scheduling directive to support ring buffering (#7967) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Half-plumbed * Revert "Half-plumbed" This reverts commit eb9dd02c6c607f0b49c95258ae67f58fe583ff44. * Interface for double buffer * Update Provides, Calls and Realizes for double buffering * Proper sync for double buffering * Use proper name for the semaphor and use correct initial value * Rename the class * Pass expression for index * Adds storage for double buffering index * Use a separate index to go through the double buffer * Failing test * Better handling of hoisted storage in all of the async-related passes * New test and clean-up the generated IR * More tests * Allow double buffering without async and add corresponding test * Filter out incorrect double_buffer schedules * Add tests to the cmake files * Clean up * Update the comment * Clean up * Clean up * Update serialization * complete_x86_target() should enable F16C and FMA when AVX2 is present (#7971) All known AVX2-enabled architectures definitely have these features. 
* Add two new tail strategies for update definitions (#7949) * Add two new tail strategies for update definitions * Stop printing asm * Update expected number of partitions for Partition::Always * Add a comment explaining why the blend safety check is per dimension * Add serialization support for the new tail strategies * trigger buildbots * Add comment --------- Co-authored-by: Steven Johnson * Add appropriate mattrs for arm-32 extensions (#7978) * Add appropriate mattrs for arm-32 extensions Fixes #7976 * Pull clauses out of if * Move canonical version numbers into source, not build system (#7980) (#7981) * Move canonical version numbers into source, not build system (#7980) * Fixes * Silence useless "Insufficient parallelism" autoscheduler warning (#7990) * Add a notebook with a visualization of the aprrox_* functions and their errors (#7974) * Add a notebook with a visualization of the aprrox_* functions and their errors * Fix spelling error * Make narrowing float->int casts on wasm go via wider ints (#7973) Fixes #7972 * Fix handling of assert statements whose conditions get vectorized (#7989) * Fix handling of assert statements whose conditions get vectorized * Fix test name * Fix all "unscheduled update()" warnings in our code (#7991) * Fix all "unscheduled update()" warnings in our code And also fix the Mullapudi scheduler to explicitly touch all update stages. This allows us to mark this warning as an error if we so choose. * fixes * fixes * Update recursive_box_filters.cpp * Silence useless 'Outer dim vectorization of var' warning in Mullapudi… (#7992) Silence useless 'Outer dim vectorization of var' warning in Mullapudi scheduler * Add a tutorial for async and double_buffer * Renamed double_buffer to ring_buffer * ring_buffer() now expects an extent Expr * Actually use extent for ring_buffer() * Address some of the comments * Provide an example of the code structure for producer-consumer async example * Comments updates * Fix clang-format and clang-tidy * Add Python binding for Func::ring_buffer() * Don't use a separate index for ring buffer + add a new test * Rename the tests * Clean up the old name * Add & * Move test to the right folder * Move expr * Add comments for InjectRingBuffering * Improve ring_buffer doc * Fix comments * Comments * A better error message * Mention that extent is expected to be a positive integer * Add another code structure and explain how the indices for ring buffer are computed * Expand test comments * Fix spelling --------- Co-authored-by: Steven Johnson Co-authored-by: Andrew Adams --- python_bindings/src/halide/halide_/PyFunc.cpp | 1 + src/AsyncProducers.cpp | 297 +++++++++++-- src/Deserialization.cpp | 2 + src/Func.cpp | 6 + src/Func.h | 15 + src/Schedule.cpp | 11 + src/Schedule.h | 3 + src/ScheduleFunctions.cpp | 4 + src/Serialization.cpp | 3 +- src/StorageFlattening.cpp | 9 +- src/halide_ir.fbs | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/ring_buffer.cpp | 414 ++++++++++++++++++ test/error/CMakeLists.txt | 1 + test/error/bad_ring_buffer.cpp | 23 + tutorial/CMakeLists.txt | 3 +- tutorial/lesson_24_async.cpp | 299 +++++++++++++ 17 files changed, 1045 insertions(+), 48 deletions(-) create mode 100644 test/correctness/ring_buffer.cpp create mode 100644 test/error/bad_ring_buffer.cpp create mode 100644 tutorial/lesson_24_async.cpp diff --git a/python_bindings/src/halide/halide_/PyFunc.cpp b/python_bindings/src/halide/halide_/PyFunc.cpp index b7e82900a6cf..bcc889b6d9ce 100644 --- a/python_bindings/src/halide/halide_/PyFunc.cpp +++ 
b/python_bindings/src/halide/halide_/PyFunc.cpp @@ -213,6 +213,7 @@ void define_func(py::module &m) { .def("store_at", (Func & (Func::*)(LoopLevel)) & Func::store_at, py::arg("loop_level")) .def("async_", &Func::async) + .def("ring_buffer", &Func::ring_buffer) .def("bound_storage", &Func::bound_storage) .def("memoize", &Func::memoize) .def("compute_inline", &Func::compute_inline) diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index f633409cce65..783f00dd35b1 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -73,6 +73,15 @@ class NoOpCollapsingMutator : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const Allocate *op) override { Stmt body = mutate(op->body); if (is_no_op(body)) { @@ -198,6 +207,9 @@ class GenerateProducerBody : public NoOpCollapsingMutator { if (starts_with(op->name, func + ".folding_semaphore.") && ends_with(op->name, ".head")) { // This is a counter associated with the producer side of a storage-folding semaphore. Keep it. return op; + } else if (starts_with(op->name, func + ".ring_buffer.")) { + // This is a counter associated with the producer side of a ring buffering. + return op; } else { return Evaluate::make(0); } @@ -243,8 +255,42 @@ class GenerateProducerBody : public NoOpCollapsingMutator { return op; } + Stmt visit(const Allocate *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, body, + op->new_expr, op->free_function, op->padding); + } + } + + Stmt visit(const Realize *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + inner_realizes.insert(op->name); + return Realize::make(op->name, op->types, op->memory_type, + op->bounds, op->condition, body); + } + } + + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else if (inner_realizes.count(op->name) == 0) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + map> &cloned_acquires; set inner_semaphores; + set inner_realizes; public: GenerateProducerBody(const string &f, const vector &s, map> &a) @@ -363,57 +409,78 @@ class ForkAsyncProducers : public IRMutator { const map &env; map> cloned_acquires; - - Stmt visit(const Realize *op) override { - auto it = env.find(op->name); - internal_assert(it != env.end()); - Function f = it->second; - if (f.schedule().async()) { - Stmt body = op->body; - - // Make two copies of the body, one which only does the - // producer, and one which only does the consumer. Inject - // synchronization to preserve dependencies. Put them in a - // task-parallel block. - - // Make a semaphore per consume node - CountConsumeNodes consumes(op->name); - body.accept(&consumes); - - vector sema_names; - vector sema_vars; - for (int i = 0; i < consumes.count; i++) { - sema_names.push_back(op->name + ".semaphore_" + std::to_string(i)); - sema_vars.push_back(Variable::make(type_of(), sema_names.back())); + std::set hoisted_storages; + + Stmt process_body(const string &name, Stmt body) { + // Make two copies of the body, one which only does the + // producer, and one which only does the consumer. Inject + // synchronization to preserve dependencies. Put them in a + // task-parallel block. 
+ + // Make a semaphore per consume node + CountConsumeNodes consumes(name); + body.accept(&consumes); + + vector sema_names; + vector sema_vars; + for (int i = 0; i < consumes.count; i++) { + sema_names.push_back(name + ".semaphore_" + std::to_string(i)); + sema_vars.push_back(Variable::make(type_of(), sema_names.back())); + } + + Stmt producer = GenerateProducerBody(name, sema_vars, cloned_acquires).mutate(body); + Stmt consumer = GenerateConsumerBody(name, sema_vars).mutate(body); + + // Recurse on both sides + producer = mutate(producer); + consumer = mutate(consumer); + + // Run them concurrently + body = Fork::make(producer, consumer); + + for (const string &sema_name : sema_names) { + // Make a semaphore on the stack + Expr sema_space = Call::make(type_of(), "halide_make_semaphore", + {0}, Call::Extern); + + // If there's a nested async producer, we may have + // recursively cloned this semaphore inside the mutation + // of the producer and consumer. + const vector &clones = cloned_acquires[sema_name]; + for (const auto &i : clones) { + body = CloneAcquire(sema_name, i).mutate(body); + body = LetStmt::make(i, sema_space, body); } - Stmt producer = GenerateProducerBody(op->name, sema_vars, cloned_acquires).mutate(body); - Stmt consumer = GenerateConsumerBody(op->name, sema_vars).mutate(body); - - // Recurse on both sides - producer = mutate(producer); - consumer = mutate(consumer); - - // Run them concurrently - body = Fork::make(producer, consumer); + body = LetStmt::make(sema_name, sema_space, body); + } - for (const string &sema_name : sema_names) { - // Make a semaphore on the stack - Expr sema_space = Call::make(type_of(), "halide_make_semaphore", - {0}, Call::Extern); + return body; + } - // If there's a nested async producer, we may have - // recursively cloned this semaphore inside the mutation - // of the producer and consumer. - const vector &clones = cloned_acquires[sema_name]; - for (const auto &i : clones) { - body = CloneAcquire(sema_name, i).mutate(body); - body = LetStmt::make(i, sema_space, body); - } + Stmt visit(const HoistedStorage *op) override { + hoisted_storages.insert(op->name); + Stmt body = op->body; - body = LetStmt::make(sema_name, sema_space, body); - } + auto it = env.find(op->name); + internal_assert(it != env.end()); + Function f = it->second; + if (f.schedule().async() && f.schedule().ring_buffer().defined()) { + body = process_body(op->name, body); + } else { + body = mutate(body); + } + hoisted_storages.erase(op->name); + return HoistedStorage::make(op->name, body); + } + Stmt visit(const Realize *op) override { + auto it = env.find(op->name); + internal_assert(it != env.end()); + Function f = it->second; + if (f.schedule().async() && hoisted_storages.count(op->name) == 0) { + Stmt body = op->body; + body = process_body(op->name, body); return Realize::make(op->name, op->types, op->memory_type, op->bounds, op->condition, body); } else { @@ -592,6 +659,117 @@ class TightenProducerConsumerNodes : public IRMutator { } }; +// Update indices to add ring buffer. 
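// (More precisely: UpdateIndices appends the computed ring-buffer slice index
// as an extra trailing coordinate on every Provide to, and every Halide Call
// of, the given Func, matching the extra storage dimension that
// InjectRingBuffering adds to the Realize bounds below.)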
+class UpdateIndices : public IRMutator { + using IRMutator::visit; + + Stmt visit(const Provide *op) override { + if (op->name == func_name) { + std::vector args = op->args; + args.push_back(ring_buffer_index); + return Provide::make(op->name, op->values, args, op->predicate); + } + return IRMutator::visit(op); + } + + Expr visit(const Call *op) override { + if (op->call_type == Call::Halide && op->name == func_name) { + std::vector args = op->args; + args.push_back(ring_buffer_index); + return Call::make(op->type, op->name, args, op->call_type, op->func, op->value_index, op->image, op->param); + } + return IRMutator::visit(op); + } + + std::string func_name; + Expr ring_buffer_index; + +public: + UpdateIndices(const string &fn, Expr di) + : func_name(fn), ring_buffer_index(std::move(di)) { + } +}; + +// Inject ring buffering. +class InjectRingBuffering : public IRMutator { + using IRMutator::visit; + + struct Loop { + std::string name; + Expr min; + Expr extent; + + Loop(std::string n, Expr m, Expr e) + : name(std::move(n)), min(std::move(m)), extent(std::move(e)) { + } + }; + + const map &env; + std::vector loops; + std::map hoist_storage_loop_index; + + Stmt visit(const Realize *op) override { + Stmt body = mutate(op->body); + Function f = env.find(op->name)->second; + Region bounds = op->bounds; + if (f.schedule().ring_buffer().defined()) { + // For the ring buffering we expand the storage by adding another dimension of + // the range of [0, ring_buffer.extent]. + bounds.emplace_back(0, f.schedule().ring_buffer()); + // Build an index for accessing ring buffer as a linear combination of all + // loop variables between the storage location (defined by the HoistStorage loop level) + // and corresponding Realize node. + int loop_index = hoist_storage_loop_index[op->name] + 1; + Expr current_index = Variable::make(Int(32), loops[loop_index].name); + while (++loop_index < (int)loops.size()) { + current_index = current_index * + (loops[loop_index].extent - loops[loop_index].min) + + Variable::make(Int(32), loops[loop_index].name); + } + current_index = current_index % f.schedule().ring_buffer(); + // Adds an extra index for to the all of the references of f. + body = UpdateIndices(op->name, current_index).mutate(body); + Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); + Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); + Stmt release = Evaluate::make(release_producer); + body = Block::make(body, release); + body = Acquire::make(sema_var, 1, body); + } + + return Realize::make(op->name, op->types, op->memory_type, bounds, op->condition, body); + } + + Stmt visit(const HoistedStorage *op) override { + // Store the index of the last loop we encountered. 
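        // (Illustrative sketch, not taken from the original patch: with a
        // schedule along the lines of
        //     producer.compute_at(consumer, x)
        //             .hoist_storage(consumer, xo)
        //             .ring_buffer(2)
        //             .async();
        // where the consumer's loop has been split into xo and x, the only loop
        // between this HoistedStorage node and the producer's Realize node is x,
        // so the index built in visit(Realize) above reduces to x % 2, and
        // successive iterations alternate between the two slices of the expanded
        // storage. With more loops in between it becomes a linear combination,
        // e.g. (outer * inner_extent + inner) % 2.)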
+ hoist_storage_loop_index[op->name] = loops.size() - 1; + Function f = env.find(op->name)->second; + + Stmt mutated = mutate(op->body); + mutated = HoistedStorage::make(op->name, mutated); + + if (f.schedule().ring_buffer().defined()) { + // Make a semaphore on the stack + Expr sema_space = Call::make(type_of(), "halide_make_semaphore", + {2}, Call::Extern); + mutated = LetStmt::make(f.name() + std::string(".folding_semaphore.ring_buffer"), sema_space, mutated); + } + hoist_storage_loop_index.erase(op->name); + return mutated; + } + + Stmt visit(const For *op) override { + loops.emplace_back(op->name, op->min, op->extent); + Stmt mutated = IRMutator::visit(op); + loops.pop_back(); + return mutated; + } + +public: + InjectRingBuffering(const map &e) + : env(e) { + } +}; + // Broaden the scope of acquire nodes to pack trailing work into the // same task and to potentially reduce the nesting depth of tasks. class ExpandAcquireNodes : public IRMutator { @@ -639,6 +817,18 @@ class ExpandAcquireNodes : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (const Acquire *a = body.as()) { + // Don't do the allocation until we have the + // semaphore. Reduces peak memory use. + return Acquire::make(a->semaphore, a->count, + mutate(HoistedStorage::make(op->name, a->body))); + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const LetStmt *op) override { Stmt orig = op; Stmt body; @@ -693,6 +883,9 @@ class TightenForkNodes : public IRMutator { const LetStmt *lr = rest.as(); const Realize *rf = first.as(); const Realize *rr = rest.as(); + const HoistedStorage *hf = first.as(); + const HoistedStorage *hr = rest.as(); + if (lf && lr && lf->name == lr->name && equal(lf->value, lr->value)) { @@ -707,6 +900,10 @@ class TightenForkNodes : public IRMutator { } else if (rr && !stmt_uses_var(first, rr->name)) { return Realize::make(rr->name, rr->types, rr->memory_type, rr->bounds, rr->condition, make_fork(first, rr->body)); + } else if (hf && !stmt_uses_var(rest, hf->name)) { + return HoistedStorage::make(hf->name, make_fork(rf->body, rest)); + } else if (hr && !stmt_uses_var(first, hr->name)) { + return HoistedStorage::make(hr->name, make_fork(first, hr->body)); } else { return Fork::make(first, rest); } @@ -740,6 +937,15 @@ class TightenForkNodes : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (in_fork && !stmt_uses_var(body, op->name)) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const LetStmt *op) override { Stmt body = mutate(op->body); if (in_fork && !stmt_uses_var(body, op->name)) { @@ -758,6 +964,7 @@ class TightenForkNodes : public IRMutator { Stmt fork_async_producers(Stmt s, const map &env) { s = TightenProducerConsumerNodes(env).mutate(s); + s = InjectRingBuffering(env).mutate(s); s = ForkAsyncProducers(env).mutate(s); s = ExpandAcquireNodes().mutate(s); s = TightenForkNodes().mutate(s); diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 90590d6f15af..33fa3b36e78e 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1017,6 +1017,7 @@ FuncSchedule Deserializer::deserialize_func_schedule(const Serialize::FuncSchedu const auto memory_type = deserialize_memory_type(func_schedule->memory_type()); const auto memoized = func_schedule->memoized(); const auto async = func_schedule->async(); + const auto ring_buffer = deserialize_expr(func_schedule->ring_buffer_type(), 
func_schedule->ring_buffer()); const auto memoize_eviction_key = deserialize_expr(func_schedule->memoize_eviction_key_type(), func_schedule->memoize_eviction_key()); auto hl_func_schedule = FuncSchedule(); hl_func_schedule.store_level() = store_level; @@ -1029,6 +1030,7 @@ FuncSchedule Deserializer::deserialize_func_schedule(const Serialize::FuncSchedu hl_func_schedule.memory_type() = memory_type; hl_func_schedule.memoized() = memoized; hl_func_schedule.async() = async; + hl_func_schedule.ring_buffer() = ring_buffer; hl_func_schedule.memoize_eviction_key() = memoize_eviction_key; return hl_func_schedule; } diff --git a/src/Func.cpp b/src/Func.cpp index 8f46e7316531..978d2b19a436 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -2398,6 +2398,12 @@ Func &Func::async() { return *this; } +Func &Func::ring_buffer(Expr extent) { + invalidate_cache(); + func.schedule().ring_buffer() = std::move(extent); + return *this; +} + Stage Func::specialize(const Expr &c) { invalidate_cache(); return Stage(func, func.definition(), 0).specialize(c); diff --git a/src/Func.h b/src/Func.h index ccadef338c29..d4074ee18cc6 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2281,6 +2281,21 @@ class Func { */ Func &async(); + /** Expands the storage of the function by an extra dimension + * to enable ring buffering. For this to be useful the storage + * of the function has to be hoisted to an upper loop level using + * \ref Func::hoist_storage. The index for the new ring buffer dimension + * is calculated implicitly based on a linear combination of the all of + * the loop variables between hoist_storage and compute_at/store_at + * loop levels. Scheduling a function with ring_buffer increases the + * amount of memory required for this function by an *extent* times. + * ring_buffer is especially useful in combination with \ref Func::async, + * but can be used without it. + * + * The extent is expected to be a positive integer. + */ + Func &ring_buffer(Expr extent); + /** Bound the extent of a Func's storage, but not extent of its * compute. This can be useful for forcing a function's allocation * to be a fixed size, which often means it can go on the stack. diff --git a/src/Schedule.cpp b/src/Schedule.cpp index 4ebcccd5e1d8..a2a34f34862e 100644 --- a/src/Schedule.cpp +++ b/src/Schedule.cpp @@ -241,6 +241,8 @@ struct FuncScheduleContents { MemoryType memory_type = MemoryType::Auto; bool memoized = false; bool async = false; + // This is an extent of the ring buffer and expected to be a positive integer. + Expr ring_buffer; Expr memoize_eviction_key; FuncScheduleContents() @@ -362,6 +364,7 @@ FuncSchedule FuncSchedule::deep_copy( copy.contents->memoized = contents->memoized; copy.contents->memoize_eviction_key = contents->memoize_eviction_key; copy.contents->async = contents->async; + copy.contents->ring_buffer = contents->ring_buffer; // Deep-copy wrapper functions. 
for (const auto &iter : contents->wrappers) { @@ -405,6 +408,14 @@ bool FuncSchedule::async() const { return contents->async; } +Expr &FuncSchedule::ring_buffer() { + return contents->ring_buffer; +} + +Expr &FuncSchedule::ring_buffer() const { + return contents->ring_buffer; +} + std::vector &FuncSchedule::storage_dims() { return contents->storage_dims; } diff --git a/src/Schedule.h b/src/Schedule.h index 32a654228673..f32ce2265a0f 100644 --- a/src/Schedule.h +++ b/src/Schedule.h @@ -624,6 +624,9 @@ class FuncSchedule { bool &async(); bool async() const; + Expr &ring_buffer(); + Expr &ring_buffer() const; + /** The list and order of dimensions used to store this * function. The first dimension in the vector corresponds to the * innermost dimension for storage (i.e. which dimension is diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 9c5ca9095575..9525c9a07308 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -2249,6 +2249,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_ return true; } + if (f.schedule().ring_buffer().defined() && store_at == hoist_storage_at) { + user_error << "Func \"" << f.name() << "\" is scheduled with ring_buffer(), but has matching store_at and hoist_storage levels. Add an explicit hoist_storage directive to the schedule to fix the issue.\n"; + } + vector &sites = legal.sites_allowed; int store_idx = -1, compute_idx = -1, hoist_storage_idx = -1; for (size_t i = 0; i < sites.size(); i++) { diff --git a/src/Serialization.cpp b/src/Serialization.cpp index a9342d95ba6d..f8be69271ff0 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1117,6 +1117,7 @@ Offset Serializer::serialize_func_schedule(FlatBufferBu const Serialize::MemoryType memory_type = serialize_memory_type(func_schedule.memory_type()); const auto memoized = func_schedule.memoized(); const auto async = func_schedule.async(); + const auto ring_buffer = serialize_expr(builder, func_schedule.ring_buffer()); const auto memoize_eviction_key_serialized = serialize_expr(builder, func_schedule.memoize_eviction_key()); return Serialize::CreateFuncSchedule(builder, store_level_serialized, compute_level_serialized, hoist_storage_level_serialized, @@ -1124,7 +1125,7 @@ Offset Serializer::serialize_func_schedule(FlatBufferBu builder.CreateVector(bounds_serialized), builder.CreateVector(estimates_serialized), builder.CreateVector(wrappers_serialized), - memory_type, memoized, async, + memory_type, memoized, async, ring_buffer.first, ring_buffer.second, memoize_eviction_key_serialized.first, memoize_eviction_key_serialized.second); } diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 5d16d02d7ab4..223a33837c7a 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -217,10 +217,12 @@ class FlattenDimensions : public IRMutator { vector allocation_extents(extents.size()); vector storage_permutation; vector bound_asserts; + bool is_ring_buffered = false; { auto iter = env.find(op->name); internal_assert(iter != env.end()) << "Realize node refers to function not in environment.\n"; Function f = iter->second.first; + is_ring_buffered = f.schedule().ring_buffer().defined(); const vector &storage_dims = f.schedule().storage_dims(); const vector &args = f.args(); for (size_t i = 0; i < storage_dims.size(); i++) { @@ -251,6 +253,10 @@ class FlattenDimensions : public IRMutator { } internal_assert(storage_permutation.size() == i + 1); } + if (is_ring_buffered) { + 
storage_permutation.push_back(storage_dims.size()); + allocation_extents[storage_dims.size()] = extents[storage_dims.size()]; + } } internal_assert(storage_permutation.size() == op->bounds.size()); @@ -279,13 +285,13 @@ class FlattenDimensions : public IRMutator { builder.host = Variable::make(Handle(), op->name); builder.type = op->types[0]; builder.dimensions = dims; + for (int i = 0; i < dims; i++) { builder.mins.push_back(min_var[i]); builder.extents.push_back(extent_var[i]); builder.strides.push_back(stride_var[i]); } stmt = LetStmt::make(op->name + ".buffer", builder.build(), stmt); - if (hoisted_storages_map.count(op->name) > 0) { HoistedStorageData &hoisted_storage_data = hoisted_storages[hoisted_storages_map[op->name]]; vector bounded_extents; @@ -336,6 +342,7 @@ class FlattenDimensions : public IRMutator { stmt = LetStmt::make(min_name[i - 1], op->bounds[i - 1].min, stmt); stmt = LetStmt::make(extent_name[i - 1], extents[i - 1], stmt); } + return stmt; } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index fe52231ffc49..e5855e301d1e 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -521,6 +521,7 @@ table FuncSchedule { memory_type: MemoryType = Auto; memoized: bool; async: bool; + ring_buffer: Expr; memoize_eviction_key: Expr; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4ee9f57480dc..07921a347425 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -84,6 +84,7 @@ tests(GROUPS correctness dilate3x3.cpp div_by_zero.cpp div_round_to_zero.cpp + ring_buffer.cpp dynamic_allocation_in_gpu_kernel.cpp dynamic_reduction_bounds.cpp early_out.cpp diff --git a/test/correctness/ring_buffer.cpp b/test/correctness/ring_buffer.cpp new file mode 100644 index 000000000000..4cb6eb9ac4e0 --- /dev/null +++ b/test/correctness/ring_buffer.cpp @@ -0,0 +1,414 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + + // Double-buffer a tile of producer computed as async. + { + Func producer("producer"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async, but the storage moved to the outside. 
+ { + Func producer("producer"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage_root() + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async with multiple intermediate consumers. + { + Func producer("producer"), consumer("consumer"), interm1("interm1"), interm2("interm2"), interm3("interm3"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + interm1(x, y) = producer(x - 1, y - 1); + interm2(x, y) = producer(x, y); + interm3(x, y) = producer(x + 1, y + 1); + + consumer(x, y) = interm1(x, y) + interm2(x, y) + interm3(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + interm1 + .compute_at(consumer, xo); + interm2 + .compute_at(consumer, xo); + interm3 + .compute_at(consumer, xo); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async with multiple intermediate consumers and output consumer. + { + Func producer("producer"), consumer("consumer"), interm1("interm1"), interm2("interm2"), interm3("interm3"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + interm1(x, y) = producer(x - 1, y - 1); + interm2(x, y) = producer(x, y); + interm3(x, y) = producer(x + 1, y + 1); + + consumer(x, y) = interm1(x, y) + interm2(x, y) + interm3(x, y) + producer(x, y + 2); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + interm1 + .compute_at(consumer, xo); + interm2 + .compute_at(consumer, xo); + interm3 + .compute_at(consumer, xo); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y) + x + y + 2; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with double buffering and one consumer. 
+ { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x * y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y) + x * y; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with double buffering at different storage levels and one consumer. + { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x * y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + producer1 + .compute_at(consumer, xo) + .hoist_storage_root() + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y) + x * y; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with ring buffers and two consumers. + { + Func producer1("producer1"), producer2("producer2"), interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(5) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Three async producers with ring buffers and two consumers. 
+ { + Func producer1("producer1"), producer2("producer2"), producer3("producer3"); + Func interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + producer3(x, y) = x * y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y) + producer3(x - 1, y - 1); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1) + producer3(x + 1, y + 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo) + // Let's hoist storage of this consumer to make it more complicated. + .hoist_storage(consumer, yo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(3) + .async(); + + producer3 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(4) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y) + ((x - 1) * (y - 1)) + ((x + 1) * (y + 1)); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two non-async ring-buffered producers and two consumers. + { + Func producer1("producer1"), producer2("producer2"), producer3("producer3"); + Func interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + producer3(x, y) = x * y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y) + producer3(x - 1, y - 1); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1) + producer3(x + 1, y + 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo) + // Let's hoist storage of this consumer to make it more complicated. + .hoist_storage(consumer, yo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(3); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2); + + producer3 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(4); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y) + ((x - 1) * (y - 1)) + ((x + 1) * (y + 1)); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Chain of two async double-buffered producers and consumer. 
+ { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y) + x * y; + consumer(x, y) = producer2(x, y) * 2; + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y + x * y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + printf("Success!\n"); + return 0; +} \ No newline at end of file diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index ef4f5ffea614..52a2a01cd65e 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -20,6 +20,7 @@ tests(GROUPS error bad_const_cast.cpp bad_device_api.cpp bad_dimensions.cpp + bad_ring_buffer.cpp bad_extern_split.cpp bad_fold.cpp bad_host_alignment.cpp diff --git a/test/error/bad_ring_buffer.cpp b/test/error/bad_ring_buffer.cpp new file mode 100644 index 000000000000..ffd06ef9d075 --- /dev/null +++ b/test/error/bad_ring_buffer.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + Func f("f"), g("g"), h("h"); + Var x("x"), y("y"); + + f(x) = x; + g(x) = f(x); + h(x, y) = g(x); + + g.compute_at(h, y); + + // ring_buffer() requires an explicit hoist_storage(). + f.compute_root().ring_buffer(2); + + h.realize({10, 10}); + + printf("Success!\n"); + return 0; +} diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index 862db3db6bd3..ee81fcb7a545 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -210,6 +210,7 @@ if (TARGET Halide::Mullapudi2016) set_tests_properties(tutorial_lesson_21_auto_scheduler_run PROPERTIES LABELS "tutorial;multithreaded") endif () -# Lessons 22-23 +# Lessons 22-24 add_tutorial(lesson_22_jit_performance.cpp) add_tutorial(lesson_23_serialization.cpp WITH_IMAGE_IO) +add_tutorial(lesson_24_async.cpp) diff --git a/tutorial/lesson_24_async.cpp b/tutorial/lesson_24_async.cpp new file mode 100644 index 000000000000..191350cf5012 --- /dev/null +++ b/tutorial/lesson_24_async.cpp @@ -0,0 +1,299 @@ +// Halide tutorial lesson 24: Async execution + +// This lesson demonstrates how to asynchronously execute a function +// using scheduling directives 'async' and 'ring_buffer'. + +// On linux, you can compile and run it like so: +// g++ lesson_24*.cpp -g -I -L -lHalide -lpthread -ldl -o lesson_24 -std=c++17 +// LD_LIBRARY_PATH= ./lesson_24 + +// On os x: +// g++ lesson_24*.cpp -g -I -L -lHalide -o lesson_24 -std=c++17 +// DYLD_LIBRARY_PATH= ./lesson_24 + +// If you have the entire Halide source tree, you can also build it by +// running: +// make tutorial_lesson_24_async +// in a shell with the current directory at the top of the halide +// source tree. + +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + // Declare some Vars to use below. + Var x("x"), y("y"), c("c"), xo("xo"), yo("yo"), xi("xi"), yi("yi"), tile("tile"); + + { + // In this example we simply tell Halide to run `producer` in a + // separate thread. This is not very useful on its own, but is a good start + // for the next examples. 
+ Func producer("producer"), consumer("consumer"); + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer.compute_root(); + // Use async() to produce `producer` in a separate thread. + producer.compute_root().async(); + + // The high-level structure of the generated code will be: + // { + // allocate producer[...] + // thread #1 { + // produce producer { + // ... + // } + // signal that data is ready + // } + // thread #2 { + // consume producer { + // block until producer data is ready + // produce consumer { + // ... + // } + // } + // } + // } + consumer.realize({128, 128}); + } + + { + // Now let's use async() to execute two different producers simultaneously. + // This could be useful in various scenarios when you want to overlap + // computations of different functions in time. For example, you could execute + // producer1 and producer2 on different devices in parallel (e.g producer1 on CPU + // and producer2 on GPU). + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + // With the schedule below, `producer1` and `producer2` computations will be each + // launched in separate threads. Since `consumer` depends on both of them, and producers + // are scheduled as compute_root(), `consumer` will have to wait until `producer1` and + // `producer2` fully completed their work. The required synchronization primitives + // will be added between producers and `consumer` to ensure that it's safe for `consumer` + // to start its work and input data is fully ready. + consumer.compute_root(); + producer1.compute_root().async(); + producer2.compute_root().async(); + + // The high-level structure of the generated code will be: + // { + // allocate producer1[...] + // allocate producer2[...] + // thread #1 { + // produce producer1 { + // ... + // } + // signal that producer1 data is ready + // } + // thread #2 { + // produce producer2 { + // ... + // } + // signal that producer2 data is ready + // } + // thread #3 { + // consume producer1 { + // consume producer2 { + // block until producer1 data is ready + // block until producer2 data is ready + // produce consumer { + // ... + // } + // } + // } + // } + // } + consumer.realize({128, 128}); + } + + { + // In the previous example, we managed to run two producers in parallel, but `consumer` had + // to wait until the data is fully ready. Wouldn't it be great if we could overlap computations + // of `producer` and `consumer` too? This computational pattern is known as 'double buffering' and + // can be critical for achieving good performance in certain scenarios. The high-level idea is that + // producer is allowed to run ahead and do the next chunk of work without waiting while consumer + // is processing the current chunk. The obvious drawback of this method is that it requires twice + // as much memory for `producer`. + Func producer("producer"), consumer("consumer"); + + producer(x, y, c) = (x + y) * (c + 1); + consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c); + + consumer.compute_root(); + + // In this example the planes are processed separately, so producer can run ahead + // and start producing plane `c + 1`, while `consumer` consumes already produced plane `c`. 
+        // One way to express this with a Halide schedule is very similar to how sliding window
+        // schedules are expressed (see lesson_8 for details). There are indeed a lot of commonalities
+        // between the two, because both of them rely on a circular buffer as the underlying data structure.
+        producer
+            .async()
+            .compute_at(consumer, c)
+            // fold_storage requires store_at, which is separate from compute_at.
+            .store_at(consumer, Var::outermost())
+            // An explicit fold_storage is required here, because otherwise Halide will infer that only
+            // one plane of `producer` is necessary for `consumer`, but for the purposes of this
+            // example we want at least 2.
+            // Please note that adding fold_storage(c, 2) will double the amount of storage allocated
+            // for `producer`.
+            .fold_storage(c, 2);
+
+        // The high-level structure of the generated code will be:
+        // {
+        //   allocate producer1[extent.x, extent.y, 2]
+        //   // In this case there are two semaphores, because the producer can run ahead, so we need
+        //   // to track how much was consumed and produced separately.
+        //   // This semaphore indicates how much the producer has produced.
+        //   producer1.semaphore = 0
+        //   // This semaphore indicates how much `space` for the producer is available.
+        //   producer1.folding_semaphore = 2
+        //   thread #1 {
+        //     loop over c {
+        //       // Acquire a semaphore or block until the space to produce to is available.
+        //       // The semaphore is released by the consumer thread when the data has been fully
+        //       // consumed.
+        //       acquire(producer1.folding_semaphore, 1)
+        //       produce producer1 {
+        //         // Produce the next plane of producer1 and store it at index c % 2.
+        //         producer1[_, _, c % 2] = ...
+        //         // Release a semaphore to indicate that the plane was produced; the consumer will
+        //         // acquire this semaphore in the other thread.
+        //         release(producer1.semaphore)
+        //       }
+        //     }
+        //   }
+        //   thread #2 {
+        //     loop over c {
+        //       // Acquire a semaphore or block until the data from the producer is ready.
+        //       // The semaphore is released by the producer thread when the data has been fully
+        //       // produced.
+        //       acquire(producer1.semaphore, 1)
+        //       consume producer1 {
+        //         consumer[_, _, c] = ...
+        //         // Release a semaphore to indicate that the plane was consumed; the producer will
+        //         // acquire this semaphore in the other thread.
+        //         release(producer1.folding_semaphore)
+        //       }
+        //     }
+        //   }
+        // }
+        consumer.realize({128, 128, 4});
+    }
+
+    {
+        // In the previous example, we relied on storage folding to express the double-buffering
+        // technique, but there is another, more direct way to do that.
+        Func producer("producer"), consumer("consumer");
+
+        producer(x, y, c) = (x + y) * (c + 1);
+        consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
+
+        consumer.compute_root();
+
+        // As mentioned in the previous example, the planes are processed separately, so the producer can
+        // run ahead and start producing plane `c + 1`, while `consumer` consumes the already produced
+        // plane `c`. A more direct way to express this would be to hoist the storage of `producer`
+        // outside of the loop `c` over planes, double its size, and add the necessary indices to flip
+        // the planes. The first part can be achieved with the `hoist_storage` directive and the rest is
+        // done with `ring_buffer`.
+        // Please note that it's enough to provide only the extent of the ring buffer; there is no
+        // need to specify an explicit loop level to tie the ring buffer to, because the index for the
+        // ring buffer will be implicitly computed based on a linear combination of loop variables
+        // between the storage and compute_at/store_at levels.
+        producer
+            .async()
+            .compute_at(consumer, c)
+            .hoist_storage(consumer, Var::outermost())
+            // Similarly to the previous example, the amount of storage is doubled here.
+            .ring_buffer(2);
+
+        // The high-level structure of the generated code will be very similar to the previous example.
+        consumer.realize({128, 128, 4});
+    }
+
+    {
+        // The advantage of the `hoist_storage` + `ring_buffer` approach is that it can be applied to
+        // fairly arbitrary loop splits and tilings. For example, in the following schedule, instead of
+        // double buffering over whole planes, we double buffer over sub-regions or tiles of the planes.
+        // This is not possible to achieve with fold_storage, because it works over the *storage*
+        // dimensions of the function and not the loop splits.
+        Func producer("producer"), consumer("consumer");
+
+        producer(x, y, c) = (x + y) * (c + 1);
+        consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
+
+        consumer.compute_root()
+            .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::Auto);
+
+        producer
+            .async()
+            .compute_at(consumer, xo)
+            .hoist_storage(consumer, Var::outermost())
+            .ring_buffer(2);
+
+        // The high-level structure of the generated code will be:
+        // {
+        //   // The size of the tile (16, 16, 1) + extra to accommodate a 3x3 filter. The fourth
+        //   // dimension is added by the ring_buffer() directive.
+        //   allocate producer1[18, 18, 1, 2]
+        //   // In this case there are two semaphores, because the producer can run ahead, so we need
+        //   // to track how much was consumed and produced separately.
+        //   // This semaphore indicates how much the producer has produced.
+        //   producer1.semaphore = 0
+        //   // This semaphore indicates how much `space` for the producer is available.
+        //   producer1.folding_semaphore.ring_buffer = 2
+        //   thread #1 {
+        //     loop over c {
+        //       loop over yo {
+        //         loop over xo {
+        //           // Acquire a semaphore or block until the space to produce to is available.
+        //           // The semaphore is released by the consumer thread when the data has been fully
+        //           // consumed.
+        //           acquire(producer1.folding_semaphore.ring_buffer, 1)
+        //           produce producer1 {
+        //             // The index into the ring buffer is computed as a linear combination of all the
+        //             // loop variables up to the storage level.
+        //             ring_buffer_index = (linear combination of loop variables) % 2
+        //             // Produce the next tile of producer1 and store it at index ring_buffer_index.
+        //             producer1[x, y, 0, ring_buffer_index % 2] = ...
+        //             // Release a semaphore to indicate that the tile was produced; the consumer will
+        //             // acquire this semaphore in the other thread.
+        //             release(producer1.semaphore)
+        //           }
+        //         }
+        //       }
+        //     }
+        //   }
+        //   thread #2 {
+        //     loop over c {
+        //       loop over yo {
+        //         loop over xo {
+        //           // Acquire a semaphore or block until the data from the producer is ready.
+        //           // The semaphore is released by the producer thread when the data has been fully
+        //           // produced.
+        //           acquire(producer1.semaphore, 1)
+        //           consume producer1 {
+        //             ring_buffer_index = (linear combination of loop variables) % 2
+        //             consumer[_, _, c] = ...
+        //             // Release a semaphore to indicate that the tile was consumed; the producer will
+        //             // acquire this semaphore in the other thread.
+        //             release(producer1.folding_semaphore.ring_buffer)
+        //           }
+        //         }
+        //       }
+        //     }
+        //   }
+        // }
+        consumer.realize({128, 128, 4});
+    }
+
+    printf("Success!\n");
+
+    return 0;
+}

From 6f26b044276083f172d8319fb9876d2eb80d2acd Mon Sep 17 00:00:00 2001
From: Tyler Hou
Date: Tue, 2 Jan 2024 13:27:51 -0500
Subject: [PATCH 022/186] Change startswith -> starts_with (#8013)

startswith was deprecated in llvm/llvm-project#75491, which means that Halide
fails to compile using LLVM 18 (deprecation warning).
---
 src/CodeGen_LLVM.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 18e70dfb3d87..f319f204de9f 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1231,13 +1231,13 @@ void CodeGen_LLVM::optimize_module() {
         // Do not annotate any of Halide's low-level synchronization code as it has
         // tsan interface calls to mark its behavior and is much faster if
         // it is not analyzed instruction by instruction.
-        if (!(function.getName().startswith("_ZN6Halide7Runtime8Internal15Synchronization") ||
+        if (!(function.getName().starts_with("_ZN6Halide7Runtime8Internal15Synchronization") ||
              // TODO: this is a benign data race that re-initializes the detected features;
             // we should really fix it properly inside the implementation, rather than disabling
              // it here as a band-aid.
-             function.getName().startswith("halide_default_can_use_target_features") ||
-             function.getName().startswith("halide_mutex_") ||
-             function.getName().startswith("halide_cond_"))) {
+             function.getName().starts_with("halide_default_can_use_target_features") ||
+             function.getName().starts_with("halide_mutex_") ||
+             function.getName().starts_with("halide_cond_"))) {
             function.addFnAttr(Attribute::SanitizeThread);
         }
     }

From 8024bdc9050c52b13e901355c6944fc26aa27874 Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko
Date: Tue, 2 Jan 2024 14:52:53 -0800
Subject: [PATCH 023/186] Don't add ring_buffer semaphores if the function is not scheduled as async (#8015)

Don't add ring_buffer semaphores if the function is not scheduled as async

Co-authored-by: Steven Johnson
---
 src/AsyncProducers.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp
index 783f00dd35b1..92012ccfe4c1 100644
--- a/src/AsyncProducers.cpp
+++ b/src/AsyncProducers.cpp
@@ -729,11 +729,14 @@ class InjectRingBuffering : public IRMutator {
             current_index = current_index % f.schedule().ring_buffer();
             // Adds an extra index for to the all of the references of f.
body = UpdateIndices(op->name, current_index).mutate(body); - Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); - Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); - Stmt release = Evaluate::make(release_producer); - body = Block::make(body, release); - body = Acquire::make(sema_var, 1, body); + + if (f.schedule().async()) { + Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); + Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); + Stmt release = Evaluate::make(release_producer); + body = Block::make(body, release); + body = Acquire::make(sema_var, 1, body); + } } return Realize::make(op->name, op->types, op->memory_type, bounds, op->condition, body); @@ -747,7 +750,7 @@ class InjectRingBuffering : public IRMutator { Stmt mutated = mutate(op->body); mutated = HoistedStorage::make(op->name, mutated); - if (f.schedule().ring_buffer().defined()) { + if (f.schedule().async() && f.schedule().ring_buffer().defined()) { // Make a semaphore on the stack Expr sema_space = Call::make(type_of(), "halide_make_semaphore", {2}, Call::Extern); From d2da00705ceb511fe69837cceab848f124d957ec Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 3 Jan 2024 20:05:37 +0000 Subject: [PATCH 024/186] Fix for top-of-tree LLVM (Fix #8017) (#8018) Fix for top-of-tree LLVM --- src/CodeGen_LLVM.cpp | 4 ++++ src/CodeGen_PTX_Dev.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f319f204de9f..7b9eecd3d74e 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1244,7 +1244,11 @@ void CodeGen_LLVM::optimize_module() { } if (tm) { +#if LLVM_VERSION >= 180 + tm->registerPassBuilderCallbacks(pb, /*PopulateClassToPassNames=*/false); +#else tm->registerPassBuilderCallbacks(pb); +#endif } mpm = pb.buildPerModuleDefaultPipeline(level, debug_pass_manager); diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 2a47e591c503..6be2f1b7e988 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -700,7 +700,11 @@ vector CodeGen_PTX_Dev::compile_to_src() { using OptimizationLevel = llvm::OptimizationLevel; OptimizationLevel level = OptimizationLevel::O3; +#if LLVM_VERSION >= 180 + target_machine->registerPassBuilderCallbacks(pb, /*PopulateClassToPassNames=*/false); +#else target_machine->registerPassBuilderCallbacks(pb); +#endif mpm = pb.buildPerModuleDefaultPipeline(level, debug_pass_manager); mpm.run(*module, mam); From b661c8d79fa92c7deb99e4611dc4f536ea435102 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Wed, 3 Jan 2024 17:49:56 -0800 Subject: [PATCH 025/186] Quick fix for crash that is occurring in SVE2 tests. (#8020) Broken out into separate PR for ease of review and isolated test/tracking. 
--- test/correctness/simd_op_check.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index d97f2c72b90c..7b1057b7f3ea 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -202,7 +202,8 @@ class SimdOpCheckTest { void visit(const Internal::Call *op) override { if (op->call_type == Internal::Call::Halide) { Internal::Function f(op->func); - if (f.has_update_definition()) { + if (f.has_update_definition() && + f.update(0).schedule().rvars().size() > 0) { inline_reduction = f; result = true; } From daf011d9739d1318fd4b10250583cf15ffc611d4 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Jan 2024 17:04:18 +0000 Subject: [PATCH 026/186] Don't use variable-length arrays (#8021) There was a rogue use of VLAs (an extension we don't want to use) in one of the runtime tests. Fixed the test. I'll follow up with a separate PR to ensure this warning is enabled everywhere to flush out other usages. --- test/runtime/memory_arena.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtime/memory_arena.cpp b/test/runtime/memory_arena.cpp index f4d4b853eddf..3189b401c6b0 100644 --- a/test/runtime/memory_arena.cpp +++ b/test/runtime/memory_arena.cpp @@ -44,7 +44,7 @@ int main(int argc, char **argv) { MemoryArena::Config config = {sizeof(double), 32, 0}; MemoryArena *arena = MemoryArena::create(user_context, config, test_allocator); - size_t count = 4 * 1024; + constexpr size_t count = 4 * 1024; void *pointers[count]; for (size_t n = 0; n < count; ++n) { pointers[n] = arena->reserve(user_context, true); @@ -75,7 +75,7 @@ int main(int argc, char **argv) { arena.destroy(user_context); - size_t count = 4 * 1024; + constexpr size_t count = 4 * 1024; void *pointers[count]; for (size_t n = 0; n < count; ++n) { pointers[n] = arena.reserve(user_context, true); From 21accaddc5718830f77ec2ea1afa5a624edd08b0 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Jan 2024 17:04:34 +0000 Subject: [PATCH 027/186] Set warnings on tests as well as src (#8022) * Don't use variable-length arrays There was a rogue use of VLAs (an extension we don't want to use) in one of the runtime tests. Fixed the test. I'll follow up with a separate PR to ensure this warning is enabled everywhere to flush out other usages. 
* Set warnings on tests as well as src --- CMakeLists.txt | 74 +++++++++++++++++++++++++++++++++++ cmake/HalideTestHelpers.cmake | 1 + src/CMakeLists.txt | 68 +------------------------------- test/runtime/CMakeLists.txt | 1 + 4 files changed, 77 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a916bba26f3..6be8ece13282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,80 @@ if (TARGET_VULKAN) set(TARGET_SPIRV ON) # required endif() +# Helper function to set C++ compiler warnings in a sane way +function(set_halide_compiler_warnings NAME) + target_compile_options( + ${NAME} + PRIVATE + $<$:-Wall> + + # variable length arrays in C++ are a Clang extension, we don't want to use them + $<$:-Wvla-extension> + + $<$:-Wcast-qual> + $<$:-Wignored-qualifiers> + $<$:-Woverloaded-virtual> + + $<$:-Wsuggest-override> + + $<$:-Winconsistent-missing-destructor-override> + $<$:-Winconsistent-missing-override> + $<$:-Wdeprecated-declarations> + + $<$:-Wno-double-promotion> + $<$:-Wno-float-conversion> + $<$:-Wno-float-equal> + $<$:-Wno-missing-field-initializers> + $<$:-Wno-old-style-cast> + $<$:-Wno-shadow> + $<$:-Wno-sign-conversion> + $<$:-Wno-switch-enum> + $<$:-Wno-undef> + $<$:-Wno-unused-function> + $<$:-Wno-unused-macros> + $<$:-Wno-unused-parameter> + + $<$:-Wno-c++98-compat-pedantic> + $<$:-Wno-c++98-compat> + $<$:-Wno-cast-align> + $<$:-Wno-comma> + $<$:-Wno-covered-switch-default> + $<$:-Wno-documentation-unknown-command> + $<$:-Wno-documentation> + $<$:-Wno-exit-time-destructors> + $<$:-Wno-global-constructors> + $<$:-Wno-implicit-float-conversion> + $<$:-Wno-implicit-int-conversion> + $<$:-Wno-implicit-int-float-conversion> + $<$:-Wno-missing-prototypes> + $<$:-Wno-nonportable-system-include-path> + $<$:-Wno-reserved-id-macro> + $<$:-Wno-shadow-field-in-constructor> + $<$:-Wno-shadow-field> + $<$:-Wno-shorten-64-to-32> + $<$:-Wno-undefined-func-template> + $<$:-Wno-unused-member-function> + $<$:-Wno-unused-template> + + # This warning was removed in Clang 13 + $<$,$,13.0>>:-Wno-return-std-move-in-c++11> + + $<$:/W3> + $<$:/wd4018> # 4018: disable "signed/unsigned mismatch" + $<$:/wd4141> # 4141: 'inline' used more than once + $<$:/wd4146> # 4146: unary minus applied to unsigned type + $<$:/wd4244> # 4244: conversion, possible loss of data + $<$:/wd4267> # 4267: conversion from 'size_t' to 'int', possible loss of data + $<$:/wd4291> # 4291: No matching operator delete found + $<$:/wd4503> # 4503: disable "decorated name length exceeded, name was truncated" + $<$:/wd4800> # 4800: forcing value to bool 'true' or 'false' (performance warning) + + # No: enable deprecation warnings + # $<$:/wd4996> # 4996: compiler encountered deprecated declaration + ) +endfunction() + + ## # Import dependencies ## diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index e938d11d53ec..b6b9b70551ff 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -54,6 +54,7 @@ function(add_halide_test TARGET) add_test(NAME ${TARGET} COMMAND ${args_COMMAND} ${args_ARGS} WORKING_DIRECTORY "${args_WORKING_DIRECTORY}") + set_halide_compiler_warnings(${TARGET}) # We can't add Halide::TerminateHandler here, because it requires Halide::Error # and friends to be present in the final linkage, but some callers of add_halide_test() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 390fee9a64e5..cfb092d29bf0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -575,73 +575,7 @@ endif () ## # Set compiler options for libHalide 
## - -target_compile_options( - Halide - PRIVATE - $<$:-Wall> - - $<$:-Wcast-qual> - $<$:-Wignored-qualifiers> - $<$:-Woverloaded-virtual> - - $<$:-Wsuggest-override> - - $<$:-Winconsistent-missing-destructor-override> - $<$:-Winconsistent-missing-override> - $<$:-Wdeprecated-declarations> - - $<$:-Wno-double-promotion> - $<$:-Wno-float-conversion> - $<$:-Wno-float-equal> - $<$:-Wno-missing-field-initializers> - $<$:-Wno-old-style-cast> - $<$:-Wno-shadow> - $<$:-Wno-sign-conversion> - $<$:-Wno-switch-enum> - $<$:-Wno-undef> - $<$:-Wno-unused-function> - $<$:-Wno-unused-macros> - $<$:-Wno-unused-parameter> - - $<$:-Wno-c++98-compat-pedantic> - $<$:-Wno-c++98-compat> - $<$:-Wno-cast-align> - $<$:-Wno-comma> - $<$:-Wno-covered-switch-default> - $<$:-Wno-documentation-unknown-command> - $<$:-Wno-documentation> - $<$:-Wno-exit-time-destructors> - $<$:-Wno-global-constructors> - $<$:-Wno-implicit-float-conversion> - $<$:-Wno-implicit-int-conversion> - $<$:-Wno-implicit-int-float-conversion> - $<$:-Wno-missing-prototypes> - $<$:-Wno-nonportable-system-include-path> - $<$:-Wno-reserved-id-macro> - $<$:-Wno-shadow-field-in-constructor> - $<$:-Wno-shadow-field> - $<$:-Wno-shorten-64-to-32> - $<$:-Wno-undefined-func-template> - $<$:-Wno-unused-member-function> - $<$:-Wno-unused-template> - - # This warning was removed in Clang 13 - $<$,$,13.0>>:-Wno-return-std-move-in-c++11> - - $<$:/W3> - $<$:/wd4018> # 4018: disable "signed/unsigned mismatch" - $<$:/wd4141> # 4141: 'inline' used more than once - $<$:/wd4146> # 4146: unary minus applied to unsigned type - $<$:/wd4244> # 4244: conversion, possible loss of data - $<$:/wd4267> # 4267: conversion from 'size_t' to 'int', possible loss of data - $<$:/wd4291> # 4291: No matching operator delete found - $<$:/wd4503> # 4503: disable "decorated name length exceeded, name was truncated" - $<$:/wd4800> # 4800: forcing value to bool 'true' or 'false' (performance warning) - - # No: enable deprecation warnings - # $<$:/wd4996> # 4996: compiler encountered deprecated declaration -) +set_halide_compiler_warnings(Halide) if (CMAKE_GENERATOR MATCHES "Visual Studio") # We could expose the /MP flag to all targets, but that might end up saturating the build diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt index 44ebf4c39d9d..b432b4299804 100644 --- a/test/runtime/CMakeLists.txt +++ b/test/runtime/CMakeLists.txt @@ -15,6 +15,7 @@ function(_set_target_options NAME) COMPILING_HALIDE_RUNTIME COMPILING_HALIDE_RUNTIME_TESTS ) + set_halide_compiler_warnings(${NAME}) endfunction() function(halide_define_runtime_internal_test NAME) From cdebeb8ce81f82be022bae1ecda50a09d6d8fa9e Mon Sep 17 00:00:00 2001 From: Tom Westerhout <14264576+twesterhout@users.noreply.github.com> Date: Tue, 9 Jan 2024 02:33:08 +0100 Subject: [PATCH 028/186] Fix -Wstrict-prototype warnings in HalideRuntime.h (#8027) When HalideRuntime.h is included in a C file, funtions that are declared with `()` instead of `(void)` for their arguments change meaning. These may cause issues downstream because different code is generated. 
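For readers less familiar with this corner of C, a minimal sketch of the distinction (hypothetical declarations, not taken from the header): in C, an empty parameter list leaves the parameter types unspecified, whereas `(void)` declares a function that takes no arguments; C++ treats both forms as taking no arguments, which is why the mismatch only surfaces when the header is compiled as C.

    /* In a C translation unit: */
    extern int shutdown_pool();      /* not a prototype: parameter types unspecified */
    extern int shutdown_pool2(void); /* prototype: takes no arguments */
    /* shutdown_pool(42) compiles without a diagnostic; shutdown_pool2(42) is an error. */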
--- src/runtime/HalideRuntime.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index eea4faf7b073..b61b13041b8e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -214,7 +214,7 @@ typedef int (*halide_task_t)(void *user_context, int task_number, uint8_t *closu extern int halide_do_par_for(void *user_context, halide_task_t task, int min, int size, uint8_t *closure); -extern void halide_shutdown_thread_pool(); +extern void halide_shutdown_thread_pool(void); //@} /** Set a custom method for performing a parallel for loop. Returns @@ -751,7 +751,7 @@ extern int halide_get_trace_file(void *user_context); /** If tracing is writing to a file. This call closes that file * (flushing the trace). Returns zero on success. */ -extern int halide_shutdown_trace(); +extern int halide_shutdown_trace(void); /** All Halide GPU or device backend implementations provide an * interface to be used with halide_device_malloc, etc. This is @@ -1005,7 +1005,7 @@ extern void halide_memoization_cache_release(void *user_context, void *host); /** Free all memory and resources associated with the memoization cache. * Must be called at a time when no other threads are accessing the cache. */ -extern void halide_memoization_cache_cleanup(); +extern void halide_memoization_cache_cleanup(void); /** Verify that a given range of memory has been initialized; only used when Target::MSAN is enabled. * @@ -1911,7 +1911,7 @@ enum { /** Get a pointer to the global profiler state for programmatic * inspection. Lock it before using to pause the profiler. */ -extern struct halide_profiler_state *halide_profiler_get_state(); +extern struct halide_profiler_state *halide_profiler_get_state(void); /** Get a pointer to the pipeline state associated with pipeline_name. * This function grabs the global profiler state's lock on entry. */ @@ -1930,14 +1930,14 @@ extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *pre * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock. */ -extern void halide_profiler_reset(); +extern void halide_profiler_reset(void); /** Reset all profiler state. * WARNING: Do NOT call this method while any halide pipeline is * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock. */ -void halide_profiler_shutdown(); +void halide_profiler_shutdown(void); /** Print out timing statistics for everything run since the last * reset. Also happens at process exit. */ @@ -1946,12 +1946,12 @@ extern void halide_profiler_report(void *user_context); /** For timer based profiling, this routine starts the timer chain running. * halide_get_profiler_state can be called to get the current timer interval. 
*/ -extern void halide_start_timer_chain(); +extern void halide_start_timer_chain(void); /** These routines are called to temporarily disable and then reenable * timer interuppts for profiling */ //@{ -extern void halide_disable_timer_interrupt(); -extern void halide_enable_timer_interrupt(); +extern void halide_disable_timer_interrupt(void); +extern void halide_enable_timer_interrupt(void); //@} /// \name "Float16" functions From 91b063dfb30d531bccd03a2e0958951c2c394436 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Mon, 8 Jan 2024 20:57:15 -0800 Subject: [PATCH 029/186] Stronger chain detection in LoopCarry pass (#8016) * Stronger chain detection in LoopCarry * Make sure that types are the same * Add a comment * Run CSE before calling can_prove * Test for loop carry * clang-tidy * Add missing override * Update comments --- src/LoopCarry.cpp | 32 +++++++++++++++-- test/correctness/CMakeLists.txt | 1 + test/correctness/loop_carry.cpp | 64 +++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 test/correctness/loop_carry.cpp diff --git a/src/LoopCarry.cpp b/src/LoopCarry.cpp index 5f4d7bb519d3..050cdfbfc8d9 100644 --- a/src/LoopCarry.cpp +++ b/src/LoopCarry.cpp @@ -283,11 +283,34 @@ class LoopCarryOverLoop : public IRMutator { // For each load, move the load index forwards by one loop iteration vector indices, next_indices, predicates, next_predicates; + // CSE-d versions of the above, so can_prove can be safely used on them. + vector indices_csed, next_indices_csed, predicates_csed, next_predicates_csed; for (const vector &v : loads) { indices.push_back(v[0]->index); next_indices.push_back(step_forwards(v[0]->index, linear)); predicates.push_back(v[0]->predicate); next_predicates.push_back(step_forwards(v[0]->predicate, linear)); + + if (indices.back().defined()) { + indices_csed.push_back(common_subexpression_elimination(indices.back())); + } else { + indices_csed.emplace_back(); + } + if (next_indices.back().defined()) { + next_indices_csed.push_back(common_subexpression_elimination(next_indices.back())); + } else { + next_indices_csed.emplace_back(); + } + if (predicates.back().defined()) { + predicates_csed.push_back(common_subexpression_elimination(predicates.back())); + } else { + predicates_csed.emplace_back(); + } + if (next_predicates.back().defined()) { + next_predicates_csed.push_back(common_subexpression_elimination(next_predicates.back())); + } else { + next_predicates_csed.emplace_back(); + } } // Find loads done on this loop iteration that will be @@ -299,11 +322,16 @@ class LoopCarryOverLoop : public IRMutator { if (i == j) { continue; } + // can_prove is stronger than graph_equal, because it doesn't require index expressions to be + // exactly the same, but evaluate to the same value. We keep the graph_equal check, because + // it's faster and should be executed before the more expensive check. 
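// (For intuition, an illustrative pair of indices, not taken from a real pipeline: expressions such as
// (x + 1) + y * 16 and x + (y * 16 + 1) are not graph_equal because their expression trees differ,
// but after CSE the simplifier can_prove them equal, so the carried-load chain is still detected.)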
if (loads[i][0]->name == loads[j][0]->name && next_indices[j].defined() && - graph_equal(indices[i], next_indices[j]) && + (graph_equal(indices[i], next_indices[j]) || + ((indices[i].type() == next_indices[j].type()) && can_prove(indices_csed[i] == next_indices_csed[j]))) && next_predicates[j].defined() && - graph_equal(predicates[i], next_predicates[j])) { + (graph_equal(predicates[i], next_predicates[j]) || + ((predicates[i].type() == next_predicates[j].type()) && can_prove(predicates_csed[i] == next_predicates_csed[j])))) { chains.push_back({j, i}); debug(3) << "Found carried value:\n" << i << ": -> " << Expr(loads[i][0]) << "\n" diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 07921a347425..cd66f21a346e 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -199,6 +199,7 @@ tests(GROUPS correctness likely.cpp load_library.cpp logical.cpp + loop_carry.cpp loop_invariant_extern_calls.cpp loop_level_generator_param.cpp lossless_cast.cpp diff --git a/test/correctness/loop_carry.cpp b/test/correctness/loop_carry.cpp new file mode 100644 index 000000000000..4cfba7d25f3f --- /dev/null +++ b/test/correctness/loop_carry.cpp @@ -0,0 +1,64 @@ +#include "Halide.h" +#include + +using namespace Halide; +using namespace Halide::Internal; + +// Wrapper class to call loop_carry on a given statement. +class LoopCarryWrapper : public IRMutator { + using IRMutator::visit; + + int register_count_; + Stmt mutate(const Stmt &stmt) override { + return simplify(loop_carry(stmt, register_count_)); + } + +public: + LoopCarryWrapper(int register_count) + : register_count_(register_count) { + } +}; + +int main(int argc, char **argv) { + Func input; + Func g; + Func h; + Func f; + Var x, y, xo, yo, xi, yi; + + input(x, y) = x + y; + + Expr sum_expr = 0; + for (int ix = -100; ix <= 100; ix++) { + // Generate two chains of sums, but only one of them will be carried. + sum_expr += input(x, y + ix); + sum_expr += input(x + 13, y + 2 * ix); + } + g(x, y) = sum_expr; + h(x, y) = g(x, y) + 12; + f(x, y) = h(x, y); + + // Make a maximum number of the carried values very large for the purpose + // of this test. 
+ constexpr int kMaxRegisterCount = 1024; + f.add_custom_lowering_pass(new LoopCarryWrapper(kMaxRegisterCount)); + + const int size = 128; + f.compute_root() + .bound(x, 0, size) + .bound(y, 0, size); + + h.compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + g.compute_at(h, xo) + .reorder(y, x) + .vectorize(x, 4); + + input.compute_root(); + + f.realize({size, size}); + + printf("Success!\n"); + return 0; +} From 8d3c12e632d0e85687feec37084cca71ab32753a Mon Sep 17 00:00:00 2001 From: Mike Woodworth Date: Tue, 16 Jan 2024 10:55:53 -0800 Subject: [PATCH 030/186] adds mappings for f16 variants of halide float math (#8029) * adds mappings for f16 variants of halide float math * fix clang format errors * trigger buildbots --------- Co-authored-by: Steven Johnson --- src/CodeGen_Metal_Dev.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 471b76b925ee..69d47279e9ae 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -795,6 +795,31 @@ void CodeGen_Metal_Dev::init_module() { << "#define tanh_f32 tanh\n" << "#define atanh_f32 atanh\n" << "#define fast_inverse_sqrt_f32 rsqrt\n" + << "#define is_nan_f16 isnan\n" + << "#define is_inf_f16 isinf\n" + << "#define is_finite_f16 isfinite\n" + << "#define sqrt_f16 sqrt\n" + << "#define sin_f16 sin\n" + << "#define cos_f16 cos\n" + << "#define exp_f16 exp\n" + << "#define log_f16 log\n" + << "#define abs_f16 fabs\n" + << "#define floor_f16 floor\n" + << "#define ceil_f16 ceil\n" + << "#define trunc_f16 trunc\n" + << "#define pow_f16 pow\n" + << "#define asin_f16 asin\n" + << "#define acos_f16 acos\n" + << "#define tan_f16 tan\n" + << "#define atan_f16 atan\n" + << "#define atan2_f16 atan2\n" + << "#define sinh_f16 sinh\n" + << "#define asinh_f16 asinh\n" + << "#define cosh_f16 cosh\n" + << "#define acosh_f16 acosh\n" + << "#define tanh_f16 tanh\n" + << "#define atanh_f16 atanh\n" + << "#define fast_inverse_sqrt_f16 rsqrt\n" // This is quite annoying: even though the MSL docs claim // all versions of Metal support the same memory fence // names, the truth is that 1.0 does not. From d2eed57d224b2de7d7b4349025eb06606bccf773 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 16 Jan 2024 20:00:36 +0000 Subject: [PATCH 031/186] Fix build breakage for wasm targets (#8031) Update HalideTestHelpers.cmake --- cmake/HalideTestHelpers.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index b6b9b70551ff..8f39cec026a4 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -54,7 +54,9 @@ function(add_halide_test TARGET) add_test(NAME ${TARGET} COMMAND ${args_COMMAND} ${args_ARGS} WORKING_DIRECTORY "${args_WORKING_DIRECTORY}") - set_halide_compiler_warnings(${TARGET}) + if (NOT Halide_TARGET MATCHES "wasm") + set_halide_compiler_warnings(${TARGET}) + endif () # We can't add Halide::TerminateHandler here, because it requires Halide::Error # and friends to be present in the final linkage, but some callers of add_halide_test() From 3a7720492e777b7509f1be60d0cb93389d6fe44e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 17 Jan 2024 15:35:07 +0000 Subject: [PATCH 032/186] Require LLVM >= 16.0 (#8003) * Require LLVM >= 16.0 Per policy, we only support top-of-tree LLVM, plus two versions back; let's update to require LLVM >= 16, and drop workarounds for older versions. 
* LLVM_VERSION < 170 --- dependencies/llvm/CMakeLists.txt | 2 +- src/CodeGen_ARM.cpp | 4 +--- src/CodeGen_LLVM.cpp | 20 -------------------- src/CodeGen_RISCV.cpp | 2 -- src/JITModule.cpp | 12 ------------ src/LLVM_Headers.h | 4 ++-- src/LLVM_Runtime_Linker.cpp | 4 ---- test/correctness/simd_op_check_arm.cpp | 3 +-- test/correctness/simd_op_check_wasm.cpp | 15 ++++----------- 9 files changed, 9 insertions(+), 57 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 8ab9fa3d2506..48b8642494dd 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -20,7 +20,7 @@ message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") message(STATUS "Using ClangConfig.cmake in: ${Clang_DIR}") -if (LLVM_PACKAGE_VERSION VERSION_LESS 15.0) +if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) message(FATAL_ERROR "LLVM version must be 15.0 or newer") endif () diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 4cf1dc597ab4..9c6525703f16 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1144,10 +1144,8 @@ void CodeGen_ARM::visit(const Store *op) { llvm::Type *intrin_llvm_type = llvm_type_of(intrin_type); #if LLVM_VERSION >= 170 const bool is_opaque = true; -#elif LLVM_VERSION >= 150 - const bool is_opaque = llvm::PointerType::get(intrin_llvm_type, 0)->isOpaque(); #else - const bool is_opaque = false; + const bool is_opaque = llvm::PointerType::get(intrin_llvm_type, 0)->isOpaque(); #endif if (target.bits == 32) { instr << "llvm.arm.neon.vst" diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7b9eecd3d74e..a5c32cf83cc7 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1177,35 +1177,19 @@ void CodeGen_LLVM::optimize_module() { if (get_target().os == Target::OS::Linux) { sanitizercoverage_options.StackDepth = true; } -#if LLVM_VERSION >= 160 mpm.addPass(SanitizerCoveragePass(sanitizercoverage_options)); -#else - mpm.addPass(ModuleSanitizerCoveragePass(sanitizercoverage_options)); -#endif }); } if (get_target().has_feature(Target::ASAN)) { -#if LLVM_VERSION >= 150 - // Nothing, ASanGlobalsMetadataAnalysis no longer exists -#else - pb.registerPipelineStartEPCallback([&](ModulePassManager &mpm, OptimizationLevel) { - mpm.addPass(RequireAnalysisPass()); - }); -#endif pb.registerPipelineStartEPCallback([](ModulePassManager &mpm, OptimizationLevel) { AddressSanitizerOptions asan_options; // default values are good... 
asan_options.UseAfterScope = true; // ...except this one constexpr bool use_global_gc = false; constexpr bool use_odr_indicator = true; constexpr auto destructor_kind = AsanDtorKind::Global; -#if LLVM_VERSION >= 160 mpm.addPass(AddressSanitizerPass( asan_options, use_global_gc, use_odr_indicator, destructor_kind)); -#else - mpm.addPass(ModuleAddressSanitizerPass( - asan_options, use_global_gc, use_odr_indicator, destructor_kind)); -#endif }); } @@ -2046,11 +2030,7 @@ void CodeGen_LLVM::add_tbaa_metadata(llvm::Instruction *inst, string buffer, con } void CodeGen_LLVM::function_does_not_access_memory(llvm::Function *fn) { -#if LLVM_VERSION >= 160 fn->addFnAttr("memory(none)"); -#else - fn->addFnAttr(llvm::Attribute::ReadNone); -#endif } void CodeGen_LLVM::visit(const Load *op) { diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index a702baff78a2..6bbc38532ecf 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -174,11 +174,9 @@ string CodeGen_RISCV::mattrs() const { if (target.has_feature(Target::RVV)) { attrs.emplace_back("+v"); -#if LLVM_VERSION >= 160 if (target.vector_bits != 0) { attrs.push_back("+zvl" + std::to_string(target.vector_bits) + "b"); } -#endif } return join_strings(attrs, ","); } diff --git a/src/JITModule.cpp b/src/JITModule.cpp index eb274bc6c59d..0d37c07284c3 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -225,11 +225,7 @@ JITModule::Symbol compile_and_get_function(llvm::orc::LLJIT &JIT, const string & auto addr = JIT.lookup(name); internal_assert(addr) << llvm::toString(addr.takeError()) << "\n"; -#if LLVM_VERSION >= 150 void *f = (void *)addr->getValue(); -#else - void *f = (void *)addr->getAddress(); -#endif if (!f) { internal_error << "Compiling " << name << " returned nullptr\n"; } @@ -1014,20 +1010,12 @@ JITModule &make_module(llvm::Module *for_module, Target target, } uint64_t arg_addr = llvm::cantFail(runtime.jit_module->JIT->lookup("halide_jit_module_argument")) -#if LLVM_VERSION >= 150 .getValue(); -#else - .getAddress(); -#endif internal_assert(arg_addr != 0); *((void **)arg_addr) = runtime.jit_module.get(); uint64_t fun_addr = llvm::cantFail(runtime.jit_module->JIT->lookup("halide_jit_module_adjust_ref_count")) -#if LLVM_VERSION >= 150 .getValue(); -#else - .getAddress(); -#endif internal_assert(fun_addr != 0); *(void (**)(void *arg, int32_t count))fun_addr = &adjust_module_ref_count; } diff --git a/src/LLVM_Headers.h b/src/LLVM_Headers.h index ad3f25365577..6b5013b72cf0 100644 --- a/src/LLVM_Headers.h +++ b/src/LLVM_Headers.h @@ -1,10 +1,10 @@ #ifndef HALIDE_LLVM_HEADERS_H #define HALIDE_LLVM_HEADERS_H -#if LLVM_VERSION >= 140 +#if LLVM_VERSION >= 160 // We're good to go #else -#error "Compiling Halide requires LLVM 14.0 or newer" +#error "Compiling Halide requires LLVM 16.0 or newer" #endif // No msvc warnings from llvm headers please diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index c946faad4850..0531f7bc3365 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -402,11 +402,7 @@ llvm::DataLayout get_data_layout_for_target(Target target) { if (target.bits == 32) { return llvm::DataLayout("e-m:e-p:32:32-i64:64-n32-S128"); } else { -#if LLVM_VERSION >= 160 return llvm::DataLayout("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); -#else - return llvm::DataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128"); -#endif } } else { // Return empty data layout. Must be set later. 
diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 68fbf91a0081..acc3edcc4a8a 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -948,8 +948,7 @@ class SimdOpCheckARM : public SimdOpCheckTest { // LLVM15 emits UZP2 if the shift amount is half the width of the vector element. const auto shrn_or_uzp2 = [&](int element_width, int shift_amt, int vector_width) { constexpr int simd_vector_bits = 128; - if (Halide::Internal::get_llvm_version() >= 150 && - ((vector_width * element_width) % (simd_vector_bits * 2)) == 0 && + if (((vector_width * element_width) % (simd_vector_bits * 2)) == 0 && shift_amt == element_width / 2) { return "uzp2"; } diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 6b6898c82b85..89aad9e5c389 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -388,17 +388,10 @@ class SimdOpCheckWASM : public SimdOpCheckTest { // check("v128.load64_zero", 2 * w, in_u64(0)); // Load vector with identical lanes generates *.splat. - if (Halide::Internal::get_llvm_version() >= 160) { - check("i8x16.splat", 16 * w, in_u8(0)); - check("i16x8.splat", 8 * w, in_u16(0)); - check("i32x4.splat", 4 * w, in_u32(0)); - check("i64x2.splat", 2 * w, in_u64(0)); - } else { - check("v128.load8_splat", 16 * w, in_u8(0)); - check("v128.load16_splat", 8 * w, in_u16(0)); - check("v128.load32_splat", 4 * w, in_u32(0)); - check("v128.load64_splat", 2 * w, in_u64(0)); - } + check("i8x16.splat", 16 * w, in_u8(0)); + check("i16x8.splat", 8 * w, in_u16(0)); + check("i32x4.splat", 4 * w, in_u32(0)); + check("i64x2.splat", 2 * w, in_u64(0)); // Load Lane // TODO: does Halide have any idiom that obviously generates these? From 22f9bb9247b3e384bbd9d8e7ff96501a29b49265 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 17 Jan 2024 16:26:43 +0000 Subject: [PATCH 033/186] Add test for #8029 (#8032) Tweak correctness_float16_t so that it uses one of the transcendal functions (sqrt) that were missing in Metal. --- test/correctness/float16_t.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp index 2c10f3e81ae6..d135e8108fa7 100644 --- a/test/correctness/float16_t.cpp +++ b/test/correctness/float16_t.cpp @@ -236,14 +236,14 @@ int run_test() { Param mul("mul"); Func output; - output(x, y) = x * y * (input(x, y) * mul); + output(x, y) = x * y * (sqrt(input(x, y)) * mul); Var xi, yi; output.gpu_tile(x, y, xi, yi, 8, 8); mul.set(float16_t(2.0f)); Buffer in(8, 8); - in.fill(float16_t(0.25f)); + in.fill(float16_t(0.0625f)); input.set(in); Buffer buf = output.realize({8, 8}); for (int y = 0; y < 8; y++) { From e0e9f637635c29f92f40890c4ba0c539b32141cf Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 22 Jan 2024 21:43:00 +0000 Subject: [PATCH 034/186] Tweak the Printer code in runtime for smaller code (#8023) * Tweak the Printer code in runtime for smaller code TL;DR: template expansion meant that we had more replicated code than expected from the inline expansion of code in Printer and friends. Restructured and added NEVER_INLINE to try to make the call sites as small as possible. It's a modest code-size savings but nonzero... e.g., the linux-x86-64 .o output from correct_cross_compilation drops from 164280 bytes to 162936 bytes. 
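As an illustrative aside, the gist of the restructuring described above is: move the formatting operators out of the class template into a plain (non-template) base class and mark them NEVER_INLINE, so every template instantiation shares one copy of the formatting code and each call site compiles down to a call. A minimal, hypothetical sketch of that pattern follows; the names (SketchPrinterBase, SketchPrinter) are invented for illustration and this is not the actual runtime code, which also manages its own buffer and error/print dispatch.

#include <cstdio>

#define NEVER_INLINE __attribute__((noinline))

// Non-template base: the streaming operators live out of line here, so every
// instantiation of the template wrapper below shares a single copy of them,
// and NEVER_INLINE keeps each call site down to argument setup plus one call.
class SketchPrinterBase {
public:
    NEVER_INLINE SketchPrinterBase &operator<<(const char *s) {
        std::fputs(s, stderr);
        return *this;
    }
    NEVER_INLINE SketchPrinterBase &operator<<(long long v) {
        std::fprintf(stderr, "%lld", v);
        return *this;
    }
};

// The template layer only carries what genuinely varies (e.g. a buffer size),
// so template expansion no longer replicates the formatting code.
template<int buffer_length>
class SketchPrinter : public SketchPrinterBase {};

int main() {
    SketchPrinter<256> p;
    p << "value = " << 42LL << "\n";  // two out-of-line calls, no inlined formatting
    return 0;
}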
* Update printer.h * debug * Update HalideTestHelpers.cmake * Update printer.h * fixes --- src/runtime/d3d12compute.cpp | 4 +- src/runtime/posix_error_handler.cpp | 20 ++- src/runtime/printer.h | 226 ++++++++++++++-------------- src/runtime/runtime_internal.h | 2 + src/runtime/to_string.cpp | 15 ++ src/runtime/tracing.cpp | 2 +- 6 files changed, 140 insertions(+), 129 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index adae690800cc..f4f85180a56e 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -98,11 +98,11 @@ static constexpr uint64_t trace_buf_size = 4096; WEAK char trace_buf[trace_buf_size] = {}; WEAK int trace_indent = 0; -struct trace : public BasicPrinter { +struct trace : public PrinterBase { ScopedMutexLock lock; explicit trace(void *user_context = nullptr) - : BasicPrinter(user_context, trace_buf), + : PrinterBase(user_context, trace_buf, trace_buf_size), lock(&trace_lock) { for (int i = 0; i < trace_indent; i++) { *this << " "; diff --git a/src/runtime/posix_error_handler.cpp b/src/runtime/posix_error_handler.cpp index d40790fad15d..27bcc1f5b28f 100644 --- a/src/runtime/posix_error_handler.cpp +++ b/src/runtime/posix_error_handler.cpp @@ -7,18 +7,16 @@ extern "C" { extern void abort(); WEAK void halide_default_error(void *user_context, const char *msg) { - char buf[4096]; - char *dst = halide_string_to_string(buf, buf + 4094, "Error: "); - dst = halide_string_to_string(dst, dst + 4094, msg); - // We still have one character free. Add a newline if there - // isn't one already. - if (dst[-1] != '\n') { - dst[0] = '\n'; - dst[1] = 0; - dst += 1; + // Can't use StackBasicPrinter here because it limits size to 256 + constexpr int buf_size = 4096; + char buf[buf_size]; + PrinterBase dst(user_context, buf, buf_size); + dst << "Error: " << msg; + const char *d = dst.str(); + if (d && *d && d[strlen(d) - 1] != '\n') { + dst << "\n"; } - (void)halide_msan_annotate_memory_is_initialized(user_context, buf, dst - buf + 1); - halide_print(user_context, buf); + halide_print(user_context, dst.str()); abort(); } } diff --git a/src/runtime/printer.h b/src/runtime/printer.h index be3620020824..6a379561dbe5 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -41,179 +41,174 @@ constexpr uint64_t default_printer_buffer_length = 1024; // Then remember the print only happens when the debug object leaves // scope, which may print at a confusing time. -namespace { -template -class Printer { - char *buf, *dst, *end; - void *user_context; - bool own_mem; +class PrinterBase { +protected: + char *dst; + char *const end; + char *const start; + void *const user_context; + + NEVER_INLINE void allocation_error() const { + halide_error(user_context, "Printer buffer allocation failed.\n"); + } public: - explicit Printer(void *ctx, char *mem = nullptr) - : user_context(ctx), own_mem(mem == nullptr) { - if (mem != nullptr) { - buf = mem; - } else { - buf = (char *)malloc(buffer_length); + // This class will stream text into the range [start, start + size - 1]. + // It does *not* assume any ownership of the memory; it assumes + // the memory will remain valid for its lifespan, and doesn't + // attempt to free any allocations. It also doesn't do any sanity + // checking of the pointers, so if you pass in a null or bogus value, + // it will attempt to use it. + NEVER_INLINE PrinterBase(void *user_context_, char *start_, uint64_t size_) + : dst(start_), + // (If start is null, set end = start to ensure no writes are done) + end(start_ ? 
start_ + size_ - 1 : start_), + start(start_), + user_context(user_context_) { + if (end > start) { + // null-terminate the final byte to ensure string isn't $ENDLESS + *end = 0; } + } + + NEVER_INLINE const char *str() { + (void)halide_msan_annotate_memory_is_initialized(user_context, start, dst - start + 1); + return start; + } + + uint64_t size() const { + halide_debug_assert(user_context, dst >= start); + return (uint64_t)(dst - start); + } + + uint64_t capacity() const { + halide_debug_assert(user_context, end >= start); + return (uint64_t)(end - start); + } - dst = buf; + NEVER_INLINE void clear() { + dst = start; if (dst) { - end = buf + (buffer_length - 1); - *end = 0; - } else { - // Pointers equal ensures no writes to buffer via formatting code - end = dst; + dst[0] = 0; } + } -#if HALIDE_RUNTIME_PRINTER_LOG_THREADID - uint64_t tid; - pthread_threadid_np(0, &tid); - *this << "(TID:" << tid << ")"; -#endif + NEVER_INLINE void erase(int n) { + if (dst) { + dst -= n; + if (dst < start) { + dst = start; + } + dst[0] = 0; + } } - // Not movable, not copyable - Printer(const Printer ©) = delete; - Printer &operator=(const Printer &) = delete; - Printer(Printer &&) = delete; - Printer &operator=(Printer &&) = delete; + struct Float16Bits { + uint16_t bits; + }; - Printer &operator<<(const char *arg) { + // These are NEVER_INLINE because Clang will aggressively inline + // all of them, but the code size of calling out-of-line here is slightly + // smaller, and we ~always prefer smaller code size when using Printer + // in the runtime (it's a modest but nonzero difference). + NEVER_INLINE PrinterBase &operator<<(const char *arg) { dst = halide_string_to_string(dst, end, arg); return *this; } - Printer &operator<<(int64_t arg) { + NEVER_INLINE PrinterBase &operator<<(int64_t arg) { dst = halide_int64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(int32_t arg) { + NEVER_INLINE PrinterBase &operator<<(int32_t arg) { dst = halide_int64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(uint64_t arg) { + NEVER_INLINE PrinterBase &operator<<(uint64_t arg) { dst = halide_uint64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(uint32_t arg) { + NEVER_INLINE PrinterBase &operator<<(uint32_t arg) { dst = halide_uint64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(double arg) { + NEVER_INLINE PrinterBase &operator<<(double arg) { dst = halide_double_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(float arg) { + NEVER_INLINE PrinterBase &operator<<(float arg) { dst = halide_double_to_string(dst, end, arg, 0); return *this; } - Printer &operator<<(const void *arg) { - dst = halide_pointer_to_string(dst, end, arg); + NEVER_INLINE PrinterBase &operator<<(Float16Bits arg) { + double value = halide_float16_bits_to_double(arg.bits); + dst = halide_double_to_string(dst, end, value, 1); return *this; } - Printer &write_float16_from_bits(const uint16_t arg) { - double value = halide_float16_bits_to_double(arg); - dst = halide_double_to_string(dst, end, value, 1); + NEVER_INLINE PrinterBase &operator<<(const void *arg) { + dst = halide_pointer_to_string(dst, end, arg); return *this; } - Printer &operator<<(const halide_type_t &t) { + NEVER_INLINE PrinterBase &operator<<(const halide_type_t &t) { dst = halide_type_to_string(dst, end, &t); return *this; } - Printer &operator<<(const halide_buffer_t &buf) { + NEVER_INLINE PrinterBase &operator<<(const halide_buffer_t &buf) { dst = halide_buffer_to_string(dst, end, &buf); 
return *this; } - template - void append(const T &value) { - *this << value; - } - - template - void append(const First &first, const Second &second, const Rest &...rest) { - append(first); - append(second, rest...); - } - - // Use it like a stringstream. - const char *str() { - if (buf) { - if (printer_type == StringStreamPrinterType) { - msan_annotate_is_initialized(); - } - return buf; - } else { - return allocation_error(); - } - } - - // Clear it. Useful for reusing a stringstream. - void clear() { - dst = buf; - if (dst) { - dst[0] = 0; - } + template + void append(const Args &...args) { + ((*this << args), ...); } - // Returns the number of characters in the buffer - uint64_t size() const { - return (uint64_t)(dst - buf); - } + // Not movable, not copyable + PrinterBase(const PrinterBase ©) = delete; + PrinterBase &operator=(const PrinterBase &) = delete; + PrinterBase(PrinterBase &&) = delete; + PrinterBase &operator=(PrinterBase &&) = delete; +}; - uint64_t capacity() const { - return buffer_length; - } +namespace { - // Delete the last N characters - void erase(int n) { - if (dst) { - dst -= n; - if (dst < buf) { - dst = buf; - } - dst[0] = 0; +template +class HeapPrinter : public PrinterBase { +public: + NEVER_INLINE explicit HeapPrinter(void *user_context) + : PrinterBase(user_context, (char *)malloc(buffer_length), buffer_length) { + if (!start) { + allocation_error(); } - } - const char *allocation_error() { - return "Printer buffer allocation failed.\n"; - } - - void msan_annotate_is_initialized() { - (void)halide_msan_annotate_memory_is_initialized(user_context, buf, dst - buf + 1); +#if HALIDE_RUNTIME_PRINTER_LOG_THREADID + uint64_t tid; + pthread_threadid_np(0, &tid); + *this << "(TID:" << tid << ")"; +#endif } - ~Printer() { - if (!buf) { - halide_error(user_context, allocation_error()); + NEVER_INLINE ~HeapPrinter() { + if (printer_type == ErrorPrinterType) { + halide_error(user_context, str()); + } else if (printer_type == BasicPrinterType) { + halide_print(user_context, str()); } else { - msan_annotate_is_initialized(); - if (printer_type == ErrorPrinterType) { - halide_error(user_context, buf); - } else if (printer_type == BasicPrinterType) { - halide_print(user_context, buf); - } else { - // It's a stringstream. Do nothing. - } + // It's a stringstream. Do nothing. } - if (own_mem) { - free(buf); - } + free(start); } }; - // A class that supports << with all the same types as Printer, but // does nothing and should compile to a no-op. class SinkPrinter { @@ -227,13 +222,13 @@ ALWAYS_INLINE SinkPrinter operator<<(const SinkPrinter &s, T) { } template -using BasicPrinter = Printer; +using BasicPrinter = HeapPrinter; template -using ErrorPrinter = Printer; +using ErrorPrinter = HeapPrinter; template -using StringStreamPrinter = Printer; +using StringStreamPrinter = HeapPrinter; using print = BasicPrinter<>; using error = ErrorPrinter<>; @@ -244,17 +239,16 @@ using debug = BasicPrinter<>; #else using debug = SinkPrinter; #endif -} // namespace // A Printer that automatically reserves stack space for the printer buffer, rather than malloc. // Note that this requires an explicit buffer_length, and it (generally) should be <= 256. 
template -class StackPrinter : public Printer { +class StackPrinter : public PrinterBase { char scratch[buffer_length]; public: - explicit StackPrinter(void *ctx) - : Printer(ctx, scratch) { + explicit StackPrinter(void *user_context) + : PrinterBase(user_context, scratch, buffer_length) { static_assert(buffer_length <= 256, "StackPrinter is meant only for small buffer sizes; you are probably making a mistake."); } }; @@ -268,6 +262,8 @@ using StackErrorPrinter = StackPrinter; template using StackStringStreamPrinter = StackPrinter; +} // namespace + } // namespace Internal } // namespace Runtime } // namespace Halide diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index 57dfe0b1087a..027ae5c4f500 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -51,6 +51,8 @@ typedef ptrdiff_t ssize_t; #define WEAK __attribute__((weak)) +#define NEVER_INLINE __attribute__((noinline)) + // Note that ALWAYS_INLINE should *always* also be `inline`. #define ALWAYS_INLINE inline __attribute__((always_inline)) diff --git a/src/runtime/to_string.cpp b/src/runtime/to_string.cpp index 71d537609e83..1200ca5c07d9 100644 --- a/src/runtime/to_string.cpp +++ b/src/runtime/to_string.cpp @@ -1,8 +1,11 @@ #include "HalideRuntime.h" +#include "runtime_internal.h" extern "C" { WEAK char *halide_string_to_string(char *dst, char *end, const char *arg) { + halide_debug_assert(nullptr, dst <= end); + if (dst >= end) { return dst; } @@ -25,6 +28,8 @@ WEAK char *halide_string_to_string(char *dst, char *end, const char *arg) { } WEAK char *halide_uint64_to_string(char *dst, char *end, uint64_t arg, int min_digits) { + halide_debug_assert(nullptr, dst <= end); + // 32 is more than enough chars to contain any 64-bit int. char buf[32]; buf[31] = 0; @@ -43,6 +48,8 @@ WEAK char *halide_uint64_to_string(char *dst, char *end, uint64_t arg, int min_d } WEAK char *halide_int64_to_string(char *dst, char *end, int64_t arg, int min_digits) { + halide_debug_assert(nullptr, dst <= end); + if (arg < 0 && dst < end) { *dst++ = '-'; arg = -arg; @@ -51,6 +58,8 @@ WEAK char *halide_int64_to_string(char *dst, char *end, int64_t arg, int min_dig } WEAK char *halide_double_to_string(char *dst, char *end, double arg, int scientific) { + halide_debug_assert(nullptr, dst <= end); + uint64_t bits = 0; memcpy(&bits, &arg, sizeof(double)); @@ -234,6 +243,8 @@ WEAK char *halide_double_to_string(char *dst, char *end, double arg, int scienti } WEAK char *halide_pointer_to_string(char *dst, char *end, const void *arg) { + halide_debug_assert(nullptr, dst <= end); + const char *hex_digits = "0123456789abcdef"; char buf[20] = {0}; char *buf_ptr = buf + 18; @@ -251,6 +262,8 @@ WEAK char *halide_pointer_to_string(char *dst, char *end, const void *arg) { } WEAK char *halide_type_to_string(char *dst, char *end, const halide_type_t *t) { + halide_debug_assert(nullptr, dst <= end); + const char *code_name = nullptr; switch (t->code) { case halide_type_int: @@ -282,6 +295,8 @@ WEAK char *halide_type_to_string(char *dst, char *end, const halide_type_t *t) { } WEAK char *halide_buffer_to_string(char *dst, char *end, const halide_buffer_t *buf) { + halide_debug_assert(nullptr, dst <= end); + if (buf == nullptr) { return halide_string_to_string(dst, end, "nullptr"); } diff --git a/src/runtime/tracing.cpp b/src/runtime/tracing.cpp index 8e8769e2ad12..93a12c7d90a4 100644 --- a/src/runtime/tracing.cpp +++ b/src/runtime/tracing.cpp @@ -308,7 +308,7 @@ WEAK int32_t halide_default_trace(void *user_context, const 
halide_trace_event_t if (print_bits == 32) { ss << ((float *)(e->value))[i]; } else if (print_bits == 16) { - ss.write_float16_from_bits(((uint16_t *)(e->value))[i]); + ss << PrinterBase::Float16Bits{((uint16_t *)(e->value))[i]}; } else { ss << ((double *)(e->value))[i]; } From 90e909d8e56e2894d5b63e9efab2e97e058887ee Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 18:44:47 +0000 Subject: [PATCH 035/186] Allow LLVM 19 in CMake (#8041) --- dependencies/llvm/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 48b8642494dd..a4aef94b08de 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -24,8 +24,8 @@ if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) message(FATAL_ERROR "LLVM version must be 15.0 or newer") endif () -if (LLVM_PACKAGE_VERSION VERSION_GREATER 18.0) - message(WARNING "Halide is not tested on LLVM versions beyond 18.0") +if (LLVM_PACKAGE_VERSION VERSION_GREATER 19.0) + message(WARNING "Halide is not tested on LLVM versions beyond 19.0") endif () # LLVM_DEFINITIONS is a space-separated list instead of a more typical From 9b9dfaff070653954dda3c4a872a02644e2464e3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 25 Jan 2024 06:12:17 +1100 Subject: [PATCH 036/186] Update Makefile for llvm 19 (#8040) --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b24dfdc2d80d..39358e03ef18 100644 --- a/Makefile +++ b/Makefile @@ -2280,6 +2280,10 @@ ifneq (,$(findstring clang version 18.0,$(CLANG_VERSION))) CLANG_OK=yes endif +ifneq (,$(findstring clang version 19.0,$(CLANG_VERSION))) +CLANG_OK=yes +endif + ifneq (,$(findstring Apple LLVM version 5.0,$(CLANG_VERSION))) CLANG_OK=yes endif @@ -2300,7 +2304,7 @@ $(BUILD_DIR)/clang_ok: @exit 1 endif -ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 160 170 180)) +ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 160 170 180 190)) LLVM_OK=yes endif From 6177e519b49d4e674ddb33c8d3ae0a1a4e839b9e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 20:04:19 +0000 Subject: [PATCH 037/186] Update Halide version to 18 (#8043) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be8ece13282..6b6fb85841c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.22...3.23) project(Halide - VERSION 17.0.0 + VERSION 18.0.0 DESCRIPTION "Halide compiler and libraries" HOMEPAGE_URL "https://halide-lang.org") From c1923f3691ff1ac2964a33dc599b47a88eada5b5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 23:53:28 +0000 Subject: [PATCH 038/186] HALIDE_VERSION_MAJOR -> 18 (#8044) --- src/runtime/HalideRuntime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index b61b13041b8e..7b84e44f6928 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -23,7 +23,7 @@ // our CMake build, so that we ensure that the in-build metadata (eg soversion) // matches, but keeping the canonical version here makes it easier to keep // downstream build systems (eg Blaze/Bazel) properly in sync with the source. 
-#define HALIDE_VERSION_MAJOR 17 +#define HALIDE_VERSION_MAJOR 18 #define HALIDE_VERSION_MINOR 0 #define HALIDE_VERSION_PATCH 0 From 4590a095a857d07232b2407b1b5a3fdeaa327cc2 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 26 Jan 2024 12:07:40 +1100 Subject: [PATCH 039/186] Fix for llvm trunk: Force-include more runtime types (#8045) * Fix for llvm trunk: Force-include more runtime types * Include the force-include-types module first * Fix comment * Expand comment --- src/LLVM_Runtime_Linker.cpp | 15 ++++++++++++--- src/runtime/force_include_types.cpp | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 0531f7bc3365..ad65bdc2ebc2 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -782,6 +782,7 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c // things that are 'alwaysinline' can be included here but are unnecessary. vector> modules; modules.push_back(std::move(extra_module)); + modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); modules.push_back(get_initmod_fake_thread_pool(c, bits_64, debug)); modules.push_back(get_initmod_posix_aligned_alloc(c, bits_64, debug)); modules.push_back(get_initmod_posix_allocator(c, bits_64, debug)); @@ -796,7 +797,6 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c modules.push_back(get_initmod_alignment_32(c, bits_64, debug)); modules.push_back(get_initmod_fopen(c, bits_64, debug)); modules.push_back(get_initmod_device_interface(c, bits_64, debug)); - modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); modules.push_back(get_initmod_float16_t(c, bits_64, debug)); modules.push_back(get_initmod_errors(c, bits_64, debug)); modules.push_back(get_initmod_msan_stubs(c, bits_64, debug)); @@ -843,6 +843,17 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM vector> modules; + // Start with the module that defines our struct types. This must be + // included first, because when parsing modules, if two structs are + // encountered with the same fields, they are deduped, and the first name + // wins. + // + // If in the future these names become unpredictable, an alternative + // strategy is to make this module include a global variable of each type we + // care about, recover the struct types from those named globals, and then + // delete the globals in link_modules. 
+ modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); + const auto add_allocator = [&]() { modules.push_back(get_initmod_posix_aligned_alloc(c, bits_64, debug)); modules.push_back(get_initmod_posix_allocator(c, bits_64, debug)); @@ -1277,8 +1288,6 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM modules.push_back(get_initmod_runtime_api(c, bits_64, debug)); } - modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); - link_modules(modules, t); if (t.os == Target::Windows && diff --git a/src/runtime/force_include_types.cpp b/src/runtime/force_include_types.cpp index f5eeda611180..99a2dea821fc 100644 --- a/src/runtime/force_include_types.cpp +++ b/src/runtime/force_include_types.cpp @@ -6,10 +6,19 @@ namespace Runtime { namespace Internal { struct AllTheTypes { - halide_filter_metadata_t a; - halide_filter_argument_t b; - halide_scalar_value_t c; - halide_semaphore_t d; + halide_buffer_t a; + halide_device_interface_t b; + halide_dimension_t c; + halide_filter_argument_t d; + halide_filter_metadata_t e; + halide_parallel_task_t f; + halide_pseudostack_slot_t g; + halide_scalar_value_t h; + halide_semaphore_acquire_t i; + halide_semaphore_t j; + halide_trace_event_t k; + halide_trace_packet_t l; + halide_type_t m; }; WEAK void halide_unused_force_include_types() { From 3657cf5f363fd64aeaf06432e62e3960800927b0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 27 Jan 2024 04:26:12 +1100 Subject: [PATCH 040/186] Fix bounds_of_nested_lanes (#8039) * Fix bounds_of_nested_lanes bounds_of_nested_lanes assumed that one layer of nested vectorization could be removed at a time. When faced with the expression: min(ramp(x8(a), x8(b), 5), x40(27)) It panicked, because on the left hand side it reduced the bounds to x8(a) ... x8(a) + x8(b) * 4, and on the right hand side it reduced the bounds to 27. It then attempted to take a min of mismatched types. In general we can't assume that binary operators on nested vectors have the same nesting structure on both sides, so I just rewrote it to reduce directly to a scalar. Fixes #8038 --- src/VectorizeLoops.cpp | 140 ++++++++++++++++------------- test/correctness/fuzz_schedule.cpp | 19 ++++ 2 files changed, 95 insertions(+), 64 deletions(-) diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 89c4f020af51..1c3ec57f3fb7 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -29,103 +29,128 @@ Expr get_lane(const Expr &e, int l) { return Shuffle::make_slice(e, l, 0, 1); } -/** Find the exact max and min lanes of a vector expression. Not - * conservative like bounds_of_expr, but uses similar rules for some - * common node types where it can be exact. If e is a nested vector, - * the result will be the bounds of the vectors in each lane. */ -Interval bounds_of_nested_lanes(const Expr &e) { +/** A helper like .as(), but unwraps arbitrarily many layers of + * nested broadcasts. Guaranteed to return either a broadcast of a scalar or + * nullptr. */ +const Broadcast *as_scalar_broadcast(const Expr &e) { + const Broadcast *b = e.as(); + if (b && b->value.type().is_scalar()) { + return b; + } else if (b) { + return as_scalar_broadcast(b->value); + } else { + return nullptr; + } +}; + +/** Find the exact scalar max and min lanes of a vector expression. Not + * conservative like bounds_of_expr, but uses similar rules for some common node + * types where it can be exact. Always returns a scalar, even in the case of + * nested vectorization. 
*/ +Interval bounds_of_lanes(const Expr &e) { + if (e.type().is_scalar()) { + return {e, e}; + } + if (const Add *add = e.as()) { - if (const Broadcast *b = add->b.as()) { - Interval ia = bounds_of_nested_lanes(add->a); + if (const Broadcast *b = as_scalar_broadcast(add->b)) { + Interval ia = bounds_of_lanes(add->a); return {ia.min + b->value, ia.max + b->value}; - } else if (const Broadcast *b = add->a.as()) { - Interval ia = bounds_of_nested_lanes(add->b); + } else if (const Broadcast *b = as_scalar_broadcast(add->a)) { + Interval ia = bounds_of_lanes(add->b); return {b->value + ia.min, b->value + ia.max}; } } else if (const Sub *sub = e.as()) { - if (const Broadcast *b = sub->b.as()) { - Interval ia = bounds_of_nested_lanes(sub->a); + if (const Broadcast *b = as_scalar_broadcast(sub->b)) { + Interval ia = bounds_of_lanes(sub->a); return {ia.min - b->value, ia.max - b->value}; - } else if (const Broadcast *b = sub->a.as()) { - Interval ia = bounds_of_nested_lanes(sub->b); - return {b->value - ia.max, b->value - ia.max}; + } else if (const Broadcast *b = as_scalar_broadcast(sub->a)) { + Interval ia = bounds_of_lanes(sub->b); + return {b->value - ia.max, b->value - ia.min}; } } else if (const Mul *mul = e.as()) { - if (const Broadcast *b = mul->b.as()) { + if (const Broadcast *b = as_scalar_broadcast(mul->b)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->a); + Interval ia = bounds_of_lanes(mul->a); return {ia.min * b->value, ia.max * b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->a); + Interval ia = bounds_of_lanes(mul->a); return {ia.max * b->value, ia.min * b->value}; } - } else if (const Broadcast *b = mul->a.as()) { + } else if (const Broadcast *b = as_scalar_broadcast(mul->a)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->b); + Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.min, b->value * ia.max}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->b); + Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.max, b->value * ia.min}; } } } else if (const Div *div = e.as
()) { - if (const Broadcast *b = div->b.as()) { + if (const Broadcast *b = as_scalar_broadcast(div->b)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(div->a); + Interval ia = bounds_of_lanes(div->a); return {ia.min / b->value, ia.max / b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(div->a); + Interval ia = bounds_of_lanes(div->a); return {ia.max / b->value, ia.min / b->value}; } } } else if (const And *and_ = e.as()) { - if (const Broadcast *b = and_->b.as()) { - Interval ia = bounds_of_nested_lanes(and_->a); + if (const Broadcast *b = as_scalar_broadcast(and_->b)) { + Interval ia = bounds_of_lanes(and_->a); return {ia.min && b->value, ia.max && b->value}; - } else if (const Broadcast *b = and_->a.as()) { - Interval ia = bounds_of_nested_lanes(and_->b); + } else if (const Broadcast *b = as_scalar_broadcast(and_->a)) { + Interval ia = bounds_of_lanes(and_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Or *or_ = e.as()) { - if (const Broadcast *b = or_->b.as()) { - Interval ia = bounds_of_nested_lanes(or_->a); + if (const Broadcast *b = as_scalar_broadcast(or_->b)) { + Interval ia = bounds_of_lanes(or_->a); return {ia.min && b->value, ia.max && b->value}; - } else if (const Broadcast *b = or_->a.as()) { - Interval ia = bounds_of_nested_lanes(or_->b); + } else if (const Broadcast *b = as_scalar_broadcast(or_->a)) { + Interval ia = bounds_of_lanes(or_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Min *min = e.as()) { - if (const Broadcast *b = min->b.as()) { - Interval ia = bounds_of_nested_lanes(min->a); + if (const Broadcast *b = as_scalar_broadcast(min->b)) { + Interval ia = bounds_of_lanes(min->a); + // ia and b->value have both had one nesting layer of vectorization + // peeled off, but that doesn't make them the same type. return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; - } else if (const Broadcast *b = min->a.as()) { - Interval ia = bounds_of_nested_lanes(min->b); + } else if (const Broadcast *b = as_scalar_broadcast(min->a)) { + Interval ia = bounds_of_lanes(min->b); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } } else if (const Max *max = e.as()) { - if (const Broadcast *b = max->b.as()) { - Interval ia = bounds_of_nested_lanes(max->a); + if (const Broadcast *b = as_scalar_broadcast(max->b)) { + Interval ia = bounds_of_lanes(max->a); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; - } else if (const Broadcast *b = max->a.as()) { - Interval ia = bounds_of_nested_lanes(max->b); + } else if (const Broadcast *b = as_scalar_broadcast(max->a)) { + Interval ia = bounds_of_lanes(max->b); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } } else if (const Not *not_ = e.as()) { - Interval ia = bounds_of_nested_lanes(not_->a); + Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); - if (is_positive_const(r->stride)) { - return {r->base, r->base + last_lane_idx * r->stride}; - } else if (is_negative_const(r->stride)) { - return {r->base + last_lane_idx * r->stride, r->base}; + Interval ib = bounds_of_lanes(r->base); + const Broadcast *b = as_scalar_broadcast(r->stride); + Expr stride = b ? 
b->value : r->stride; + if (stride.type().is_scalar()) { + if (is_positive_const(stride)) { + return {ib.min, ib.max + last_lane_idx * stride}; + } else if (is_negative_const(stride)) { + return {ib.min + last_lane_idx * stride, ib.max}; + } } } else if (const LE *le = e.as()) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. - Interval ia = bounds_of_nested_lanes(le->a); - Interval ib = bounds_of_nested_lanes(le->b); + Interval ia = bounds_of_lanes(le->a); + Interval ib = bounds_of_lanes(le->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max <= ib.min, ia.min <= ib.max}; } @@ -133,17 +158,17 @@ Interval bounds_of_nested_lanes(const Expr &e) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. - Interval ia = bounds_of_nested_lanes(lt->a); - Interval ib = bounds_of_nested_lanes(lt->b); + Interval ia = bounds_of_lanes(lt->a); + Interval ib = bounds_of_lanes(lt->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max < ib.min, ia.min < ib.max}; } - } else if (const Broadcast *b = e.as()) { + } else if (const Broadcast *b = as_scalar_broadcast(e)) { return {b->value, b->value}; } else if (const Let *let = e.as()) { - Interval ia = bounds_of_nested_lanes(let->value); - Interval ib = bounds_of_nested_lanes(let->body); + Interval ia = bounds_of_lanes(let->value); + Interval ib = bounds_of_lanes(let->body); if (expr_uses_var(ib.min, let->name)) { ib.min = Let::make(let->name, let->value, ib.min); } @@ -166,19 +191,6 @@ Interval bounds_of_nested_lanes(const Expr &e) { } }; -/** Similar to bounds_of_nested_lanes, but it recursively reduces - * the bounds of nested vectors to scalars. */ -Interval bounds_of_lanes(const Expr &e) { - Interval bounds = bounds_of_nested_lanes(e); - if (!bounds.min.type().is_scalar()) { - bounds.min = bounds_of_lanes(bounds.min).min; - } - if (!bounds.max.type().is_scalar()) { - bounds.max = bounds_of_lanes(bounds.max).max; - } - return bounds; -} - // A ramp with the lanes repeated inner_repetitions times, and then // the whole vector repeated outer_repetitions times. // E.g: <0 0 2 2 4 4 6 6 0 0 2 2 4 4 6 6>. 
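As an illustrative aside, here is how the reworked bounds_of_lanes above behaves on an expression shaped like the one in the commit message, using a hypothetical constant stride so the result is easy to state (a is assumed to be a scalar Int(32) variable; the expression is chosen only to exercise the nested-broadcast paths):

    e = min(ramp(x8(a), x8(2), 5), x40(27))
    bounds_of_lanes(x8(a))                  = {a, a}                      // as_scalar_broadcast peels the broadcast
    bounds_of_lanes(ramp(x8(a), x8(2), 5))  = {a, a + 4*2} = {a, a + 8}   // base bounds plus (lanes - 1) * scalar stride
    bounds_of_lanes(e)                      = {min(a, 27), min(a + 8, 27)}

Both bounds come out as scalars, so no min over mismatched vector types is ever constructed, which is the failure mode the commit message describes for the old bounds_of_nested_lanes.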
diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index 07f940ed82e3..9f0f86e3854b 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -183,6 +183,25 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/8038 + { + Func input("input"); + Func local_sum("local_sum"); + Func blurry("blurry"); + Var x("x"), y("y"), yi("yi"), yo("yo"), xi("xi"), xo("xo"), yofxi("yofxi"), yofxio("yofxio"), yofxii("yofxii"), yofxiifyi("yofxiifyi"), yofxioo("yofxioo"), yofxioi("yofxioi"); + input(x, y) = 2 * x + 5 * y; + RDom r(-2, 5, -2, 5, "rdom_r"); + local_sum(x, y) = 0; + local_sum(x, y) += input(x + r.x, y + r.y); + blurry(x, y) = cast(local_sum(x, y) / 25); + local_sum.split(y, yi, yo, 2, TailStrategy::GuardWithIf).split(x, xi, xo, 5, TailStrategy::Predicate).fuse(yo, xi, yofxi).split(yofxi, yofxio, yofxii, 8, TailStrategy::ShiftInwards).fuse(yofxii, yi, yofxiifyi).split(yofxio, yofxioo, yofxioi, 5, TailStrategy::ShiftInwards).vectorize(yofxiifyi).vectorize(yofxioi); + local_sum.update(0).unscheduled(); + blurry.split(x, xo, xi, 5, TailStrategy::Auto); + Pipeline p({blurry}); + auto buf = p.realize({32, 32}); + check_blur_output(buf, correct); + } + printf("Success!\n"); return 0; } From 45d78509df9c69ebb3d805d547cf6e54859379c4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 27 Jan 2024 07:01:41 +1100 Subject: [PATCH 041/186] Track whether or not let expressions failed to solve in solver (#7982) * Track whether or not let expressions failed to solve in solver After mutating an expression, the solver needs to know two things: 1) Did the expression contain the variable we're solving for 2) Was the expression successfully "solved" for the variable. I.e. the variable only appears once in the leftmost position. We need to know this to know property 1 of any subexpressions (i.e. does the right child of the expression contain the variable). This drives what transformations we do in ways that are guaranteed to terminate and not take exponential time. We were tracking property 1 through lets but not property 2, and this meant we were doing unhelpful transformations in some cases. I found a case in the wild where this made a pipeline take > 1 hour to compile (I killed it after an hour). It may have been in an infinite transformation loop, or it might have just been exponential. Not sure. * Remove surplus comma * Fix use of uninitialized value that could cause bad transformation --- src/ModulusRemainder.h | 6 ++++-- src/Solve.cpp | 35 ++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/ModulusRemainder.h b/src/ModulusRemainder.h index c0341b75abf6..cbcdce10b98c 100644 --- a/src/ModulusRemainder.h +++ b/src/ModulusRemainder.h @@ -7,6 +7,8 @@ #include +#include "Util.h" + namespace Halide { struct Expr; @@ -83,8 +85,8 @@ ModulusRemainder modulus_remainder(const Expr &e, const Scope /** Reduce an expression modulo some integer. Returns true and assigns * to remainder if an answer could be found. 
*/ ///@{ -bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder); -bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope &scope); +HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder); +HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope &scope); ///@} void modulus_remainder_test(); diff --git a/src/Solve.cpp b/src/Solve.cpp index a08eedadbd27..22bd14e44412 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -44,18 +44,22 @@ class SolveExpression : public IRMutator { map::iterator iter = cache.find(e); if (iter == cache.end()) { // Not in the cache, call the base class version. - debug(4) << "Mutating " << e << " (" << uses_var << ")\n"; + debug(4) << "Mutating " << e << " (" << uses_var << ", " << failed << ")\n"; bool old_uses_var = uses_var; uses_var = false; + bool old_failed = failed; + failed = false; Expr new_e = IRMutator::mutate(e); - CacheEntry entry = {new_e, uses_var}; + CacheEntry entry = {new_e, uses_var, failed}; uses_var = old_uses_var || uses_var; + failed = old_failed || failed; cache[e] = entry; - debug(4) << "(Miss) Rewrote " << e << " -> " << new_e << " (" << uses_var << ")\n"; + debug(4) << "(Miss) Rewrote " << e << " -> " << new_e << " (" << uses_var << ", " << failed << ")\n"; return new_e; } else { // Cache hit. uses_var = uses_var || iter->second.uses_var; + failed = failed || iter->second.failed; debug(4) << "(Hit) Rewrote " << e << " -> " << iter->second.expr << " (" << uses_var << ")\n"; return iter->second.expr; } @@ -75,7 +79,7 @@ class SolveExpression : public IRMutator { // stateless, so we can cache everything. struct CacheEntry { Expr expr; - bool uses_var; + bool uses_var, failed; }; map cache; @@ -388,16 +392,25 @@ class SolveExpression : public IRMutator { const Mul *mul_a = a.as(); Expr expr; if (a_uses_var && !b_uses_var) { + const int64_t *ib = as_const_int(b); + auto is_multiple_of_b = [&](const Expr &e) { + if (ib) { + int64_t r = 0; + return reduce_expr_modulo(e, *ib, &r) && r == 0; + } else { + return can_prove(e / b * b == e); + } + }; if (add_a && !a_failed && - can_prove(add_a->a / b * b == add_a->a)) { + is_multiple_of_b(add_a->a)) { // (f(x) + a) / b -> f(x) / b + a / b expr = mutate(simplify(add_a->a / b) + add_a->b / b); } else if (sub_a && !a_failed && - can_prove(sub_a->a / b * b == sub_a->a)) { + is_multiple_of_b(sub_a->a)) { // (f(x) - a) / b -> f(x) / b - a / b expr = mutate(simplify(sub_a->a / b) - sub_a->b / b); } else if (mul_a && !a_failed && no_overflow_int(op->type) && - can_prove(mul_a->b / b * b == mul_a->b)) { + is_multiple_of_b(mul_a->b)) { // (f(x) * a) / b -> f(x) * (a / b) expr = mutate(mul_a->a * (mul_a->b / b)); } @@ -776,6 +789,7 @@ class SolveExpression : public IRMutator { } else if (scope.contains(op->name)) { CacheEntry e = scope.get(op->name); uses_var = uses_var || e.uses_var; + failed = failed || e.failed; return e.expr; } else if (external_scope.contains(op->name)) { Expr e = external_scope.get(op->name); @@ -790,11 +804,14 @@ class SolveExpression : public IRMutator { Expr visit(const Let *op) override { bool old_uses_var = uses_var; + bool old_failed = failed; uses_var = false; + failed = false; Expr value = mutate(op->value); - CacheEntry e = {value, uses_var}; - + CacheEntry e = {value, uses_var, failed}; uses_var = old_uses_var; + failed = old_failed; + ScopedBinding bind(scope, op->name, e); return mutate(op->body); } From 
4b2d21154c5eda4e3ece657e4886d45fa78069f1 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sat, 27 Jan 2024 00:33:24 +0000 Subject: [PATCH 042/186] Upgrade clang-format and clang-tidy to use LLVM 17 (#8042) * Upgrade clang-format and clang-tidy to use LLVM 17 * trigger buildbots * trigger buildbots * trigger buildbots * trigger buildbots --- .clang-tidy | 24 ++++++++++++++-- .github/workflows/presubmit.yml | 14 +++++----- apps/hannk/interpreter/allocation_planner.cpp | 2 +- apps/hannk/interpreter/interpreter.cpp | 2 +- apps/hannk/util/error_util.cpp | 6 ++-- apps/hannk/util/model_runner.cpp | 4 +-- run-clang-format.sh | 14 +++++----- run-clang-tidy.sh | 14 +++++----- src/Associativity.cpp | 2 +- src/AutoScheduleUtils.cpp | 2 +- src/Bounds.cpp | 2 +- src/Buffer.h | 28 +++++++++---------- src/Deinterleave.cpp | 2 +- src/Function.cpp | 11 ++++---- src/Generator.cpp | 2 +- src/Generator.h | 10 +++---- src/IRMatch.cpp | 2 +- src/LLVM_Output.cpp | 6 ++-- src/Monotonic.cpp | 2 +- src/Reduction.cpp | 2 +- src/Scope.h | 2 +- src/SpirvIR.cpp | 2 +- src/StmtToHTML.cpp | 2 +- src/Target.cpp | 2 +- src/UniquifyVariableNames.cpp | 2 +- src/autoschedulers/adams2019/AutoSchedule.cpp | 2 +- .../anderson2021/AutoSchedule.cpp | 20 ++++++------- .../anderson2021/SearchSpace.cpp | 2 +- src/autoschedulers/anderson2021/State.h | 2 +- src/autoschedulers/common/cmdline.h | 14 +++++----- .../li2018/GradientAutoscheduler.cpp | 2 +- src/runtime/cuda.cpp | 2 +- src/runtime/mini_d3d12.h | 22 +++++++++------ src/runtime/mini_vulkan.h | 2 +- src/runtime/opencl.cpp | 2 +- src/runtime/runtime_internal.h | 2 +- test/correctness/unroll_dynamic_loop.cpp | 2 +- tools/regexp_replace.cpp | 2 +- 38 files changed, 132 insertions(+), 105 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 04cf50c915ec..815ccd3339a2 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -19,6 +19,7 @@ Checks: > bugprone-dangling-handle, bugprone-dynamic-static-initializers, -bugprone-easily-swappable-parameters, + -bugprone-empty-catch, # TODO: consider enabling -bugprone-exception-escape, bugprone-fold-init-type, bugprone-forward-declaration-namespace, @@ -35,8 +36,10 @@ Checks: > bugprone-misplaced-pointer-arithmetic-in-alloc, bugprone-misplaced-widening-cast, bugprone-move-forwarding-reference, + bugprone-multiple-new-in-one-expression, bugprone-multiple-statement-macro, - -bugprone-narrowing-conversions,, + -bugprone-narrowing-conversions, + bugprone-non-zero-enum-to-bool-conversion, bugprone-no-escape, bugprone-not-null-terminated-result, bugprone-parent-virtual-call, @@ -63,6 +66,7 @@ Checks: > bugprone-suspicious-semicolon, bugprone-suspicious-string-compare, bugprone-swapped-arguments, + -bugprone-switch-missing-default-case, # TODO: consider enabling bugprone-terminating-continue, bugprone-throw-keyword-missing, bugprone-too-small-loop-variable, @@ -71,6 +75,8 @@ Checks: > bugprone-undelegated-constructor, bugprone-unhandled-exception-at-new, bugprone-unhandled-self-assignment, + bugprone-unique-ptr-array-mismatch, + bugprone-unsafe-functions, bugprone-unused-raii, bugprone-unused-return-value, bugprone-use-after-move, @@ -78,9 +84,16 @@ Checks: > clang-diagnostic-shadow-field, + cppcoreguidelines-avoid-capturing-lambda-coroutines, + cppcoreguidelines-misleading-capture-default-by-value, + -cppcoreguidelines-missing-std-forward, # TODO: consider enabling + cppcoreguidelines-rvalue-reference-param-not-moved, + misc-confusable-identifiers, -misc-const-correctness, misc-definitions-in-headers, + misc-header-include-cycle, + 
-misc-include-cleaner, # TODO: consider enabling misc-misleading-bidirectional, misc-misleading-identifier, misc-misplaced-const, @@ -115,6 +128,7 @@ Checks: > -modernize-replace-random-shuffle, -modernize-return-braced-init-list, -modernize-shrink-to-fit, + -modernize-type-traits, # TODO: consider enabling -modernize-unary-static-assert, -modernize-use-auto, modernize-use-bool-literals, @@ -126,11 +140,13 @@ Checks: > -modernize-use-noexcept, modernize-use-nullptr, modernize-use-override, + -modernize-use-std-print, -modernize-use-trailing-return-type, -modernize-use-transparent-functors, -modernize-use-uncaught-exceptions, - -modernize-use-using + -modernize-use-using, + performance-avoid-endl, performance-faster-string-find, performance-for-range-copy, performance-implicit-conversion-in-loop, @@ -141,13 +157,16 @@ Checks: > performance-move-constructor-init, performance-no-automatic-move, -performance-no-int-to-ptr, + performance-noexcept-destructor, performance-noexcept-move-constructor, + performance-noexcept-swap, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization, performance-unnecessary-value-param, readability-avoid-const-params-in-decls, + -readability-avoid-unconditional-preprocessor-if, readability-braces-around-statements, readability-const-return-type, -readability-container-contains, @@ -170,6 +189,7 @@ Checks: > readability-misplaced-array-index, -readability-named-parameter, -readability-non-const-parameter, + -readability-operators-representation, readability-qualified-auto, readability-redundant-access-specifiers, readability-redundant-control-flow, diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 045a313cb23c..e30a606bd8d0 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -18,11 +18,11 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - - uses: DoozyX/clang-format-lint-action@v0.16.2 + - uses: DoozyX/clang-format-lint-action@v0.17 with: source: '.' 
extensions: 'h,c,cpp' - clangFormatVersion: 16 + clangFormatVersion: 17 # As of Aug 2023, the macOS runners have more RAM (14GB vs 7GB) and CPU (3 cores vs 2) # than the Linux and Windows runners, so let's use those instead, since clang-tidy is # a bit of a sluggard @@ -36,14 +36,14 @@ jobs: # from apt.llvm.org # wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 15CF4D18AF4F7421 - sudo apt-add-repository "deb https://apt.llvm.org/$(lsb_release -sc)/ llvm-toolchain-$(lsb_release -sc)-16 main" + sudo apt-add-repository "deb https://apt.llvm.org/$(lsb_release -sc)/ llvm-toolchain-$(lsb_release -sc)-17 main" sudo apt-get update - sudo apt-get install llvm-16 clang-16 liblld-16-dev libclang-16-dev clang-tidy-16 ninja-build + sudo apt-get install llvm-17 clang-17 liblld-17-dev libclang-17-dev clang-tidy-17 ninja-build - name: Run clang-tidy run: | - export CC=clang-16 - export CXX=clang++-16 - export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-16 + export CC=clang-17 + export CXX=clang++-17 + export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-17 export CMAKE_GENERATOR=Ninja ./run-clang-tidy.sh check_cmake_file_lists: diff --git a/apps/hannk/interpreter/allocation_planner.cpp b/apps/hannk/interpreter/allocation_planner.cpp index caa9bd4b2664..a037846bcb2b 100644 --- a/apps/hannk/interpreter/allocation_planner.cpp +++ b/apps/hannk/interpreter/allocation_planner.cpp @@ -250,7 +250,7 @@ void AllocationPlanner::dump(std::ostream &o) { } } line[kLineWidth] = 0; - o << "t=" << std::setfill('0') << std::setw(3) << t << ": " << line << '\n'; + o << "t=" << std::setfill('0') << std::setw(3) << t << ": " << line << "\n"; } } diff --git a/apps/hannk/interpreter/interpreter.cpp b/apps/hannk/interpreter/interpreter.cpp index a6ee64514efa..902a4a0db807 100644 --- a/apps/hannk/interpreter/interpreter.cpp +++ b/apps/hannk/interpreter/interpreter.cpp @@ -120,7 +120,7 @@ std::unique_ptr allocate_tensors(const Op *root, const InterpreterOption if (options.verbosity >= 1) { std::ostringstream oss; - oss << "Arena memory needed: " << planner.memory_needed() << '\n'; + oss << "Arena memory needed: " << planner.memory_needed() << "\n"; oss << " Offsets:"; for (int i = 0; i < planner.block_count(); i++) { oss << ' ' << planner.get_block_offset(i); diff --git a/apps/hannk/util/error_util.cpp b/apps/hannk/util/error_util.cpp index 3348faeafc0a..e9e21186b18c 100644 --- a/apps/hannk/util/error_util.cpp +++ b/apps/hannk/util/error_util.cpp @@ -45,7 +45,7 @@ Logger::Logger(LogSeverity severity) void Logger::finish() noexcept(false) { if (!msg.str().empty() && msg.str().back() != '\n') { - msg << '\n'; + msg << "\n"; } hannk_log(severity, msg.str().c_str()); @@ -62,12 +62,12 @@ Logger::~Logger() noexcept(false) { Checker::Checker(const char *condition_string) : logger(FATAL) { - logger.msg << " Condition Failed: " << condition_string << '\n'; + logger.msg << " Condition Failed: " << condition_string << "\n"; } Checker::Checker(const char *file, int line, const char *condition_string) : logger(FATAL, file, line) { - logger.msg << " Condition Failed: " << condition_string << '\n'; + logger.msg << " Condition Failed: " << condition_string << "\n"; } Checker::~Checker() noexcept(false) { diff --git a/apps/hannk/util/model_runner.cpp b/apps/hannk/util/model_runner.cpp index 0e0bf2e4e72b..76bbbeeaa37e 100644 --- a/apps/hannk/util/model_runner.cpp +++ b/apps/hannk/util/model_runner.cpp @@ -636,7 +636,7 @@ void ModelRunner::run(const std::string &filename) { 
std::cout << ',' << RunNames[i] << "_matches_tflite"; } } - std::cout << '\n'; + std::cout << "\n"; } } @@ -724,7 +724,7 @@ void ModelRunner::run(const std::string &filename) { } if (csv_output) { - std::cout << '\n'; + std::cout << "\n"; } } diff --git a/run-clang-format.sh b/run-clang-format.sh index 7f852b5c419d..9b5712c5e56a 100755 --- a/run-clang-format.sh +++ b/run-clang-format.sh @@ -4,23 +4,23 @@ set -e ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# We are currently standardized on using LLVM/Clang16 for this script. +# We are currently standardized on using LLVM/Clang17 for this script. # Note that this is totally independent of the version of LLVM that you -# are using to build Halide itself. If you don't have LLVM16 installed, +# are using to build Halide itself. If you don't have LLVM17 installed, # you can usually install what you need easily via: # -# sudo apt-get install llvm-16 clang-16 libclang-16-dev clang-tidy-16 -# export CLANG_FORMAT_LLVM_INSTALL_DIR=/usr/lib/llvm-16 +# sudo apt-get install llvm-17 clang-17 libclang-17-dev clang-tidy-17 +# export CLANG_FORMAT_LLVM_INSTALL_DIR=/usr/lib/llvm-17 [ -z "$CLANG_FORMAT_LLVM_INSTALL_DIR" ] && echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM installation dir for this script." && exit echo CLANG_FORMAT_LLVM_INSTALL_DIR = ${CLANG_FORMAT_LLVM_INSTALL_DIR} VERSION=$(${CLANG_FORMAT_LLVM_INSTALL_DIR}/bin/clang-format --version) -if [[ ${VERSION} =~ .*version\ 16.* ]] +if [[ ${VERSION} =~ .*version\ 17.* ]] then - echo "clang-format version 16 found." + echo "clang-format version 17 found." else - echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM 16 install!" + echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM 17 install!" exit 1 fi diff --git a/run-clang-tidy.sh b/run-clang-tidy.sh index d876c2da7292..1b4fc808a0a9 100755 --- a/run-clang-tidy.sh +++ b/run-clang-tidy.sh @@ -30,23 +30,23 @@ if [ -n "${FIX}" ]; then echo "Operating in -fix mode!" fi -# We are currently standardized on using LLVM/Clang16 for this script. +# We are currently standardized on using LLVM/Clang17 for this script. # Note that this is totally independent of the version of LLVM that you -# are using to build Halide itself. If you don't have LLVM16 installed, +# are using to build Halide itself. If you don't have LLVM17 installed, # you can usually install what you need easily via: # -# sudo apt-get install llvm-16 clang-16 libclang-16-dev clang-tidy-16 -# export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-16 +# sudo apt-get install llvm-17 clang-17 libclang-17-dev clang-tidy-17 +# export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-17 [ -z "$CLANG_TIDY_LLVM_INSTALL_DIR" ] && echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM installation dir for this script." && exit echo CLANG_TIDY_LLVM_INSTALL_DIR = ${CLANG_TIDY_LLVM_INSTALL_DIR} VERSION=$(${CLANG_TIDY_LLVM_INSTALL_DIR}/bin/clang-tidy --version) -if [[ ${VERSION} =~ .*version\ 16.* ]] +if [[ ${VERSION} =~ .*version\ 17.* ]] then - echo "clang-tidy version 16 found." + echo "clang-tidy version 17 found." else - echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM 16 install!" + echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM 17 install!" 
exit 1 fi diff --git a/src/Associativity.cpp b/src/Associativity.cpp index 794113413451..39a0011391a6 100644 --- a/src/Associativity.cpp +++ b/src/Associativity.cpp @@ -781,7 +781,7 @@ void associativity_test() { true)); } - std::cout << "Associativity test passed" << std::endl; + std::cout << "Associativity test passed\n"; } } // namespace Internal diff --git a/src/AutoScheduleUtils.cpp b/src/AutoScheduleUtils.cpp index 85a0b7e17979..5dcd9183db57 100644 --- a/src/AutoScheduleUtils.cpp +++ b/src/AutoScheduleUtils.cpp @@ -405,7 +405,7 @@ void propagate_estimate_test() { check(img.dim(0).min() + img.dim(1).min() + x, x + 2); check(img.dim(0).extent() + img.dim(1).min() + img.dim(1).extent() * x, 55 * x + 38); - std::cout << "Propagate estimate test passed" << std::endl; + std::cout << "Propagate estimate test passed\n"; } } // namespace Internal diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 0ba1f5440056..a08bb0b9ad61 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -3919,7 +3919,7 @@ void bounds_test() { internal_assert(in.is_single_point()); } - std::cout << "Bounds test passed" << std::endl; + std::cout << "Bounds test passed\n"; } } // namespace Internal diff --git a/src/Buffer.h b/src/Buffer.h index 637ca2900f65..304a1bd197ab 100644 --- a/src/Buffer.h +++ b/src/Buffer.h @@ -394,18 +394,18 @@ class Buffer { // @} // We forward numerous methods from the underlying Buffer -#define HALIDE_BUFFER_FORWARD_CONST(method) \ - template \ - auto method(Args &&...args) const->decltype(std::declval>().method(std::forward(args)...)) { \ - user_assert(defined()) << "Undefined buffer calling const method " #method "\n"; \ - return get()->method(std::forward(args)...); \ +#define HALIDE_BUFFER_FORWARD_CONST(method) \ + template \ + auto method(Args &&...args) const -> decltype(std::declval>().method(std::forward(args)...)) { \ + user_assert(defined()) << "Undefined buffer calling const method " #method "\n"; \ + return get()->method(std::forward(args)...); \ } -#define HALIDE_BUFFER_FORWARD(method) \ - template \ - auto method(Args &&...args)->decltype(std::declval>().method(std::forward(args)...)) { \ - user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ - return get()->method(std::forward(args)...); \ +#define HALIDE_BUFFER_FORWARD(method) \ + template \ + auto method(Args &&...args) -> decltype(std::declval>().method(std::forward(args)...)) { \ + user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ + return get()->method(std::forward(args)...); \ } // This is a weird-looking but effective workaround for a deficiency in "perfect forwarding": @@ -418,10 +418,10 @@ class Buffer { // and forward it as is, we can just use ... to allow an arbitrary number of commas, // then use __VA_ARGS__ to forward the mess as-is, and while it looks horrible, it // works. -#define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...) \ - inline auto method(const __VA_ARGS__ &a)->decltype(std::declval>().method(a)) { \ - user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ - return get()->method(a); \ +#define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...) 
\ + inline auto method(const __VA_ARGS__ &a) -> decltype(std::declval>().method(a)) { \ + user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ + return get()->method(a); \ } /** Does the same thing as the equivalent Halide::Runtime::Buffer method */ diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 0b30cefaa292..c43159893838 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -836,7 +836,7 @@ void deinterleave_vector_test() { Shuffle::make({vec_x, vec_y}, {0, 2, 4, 3, 1, 3}), Shuffle::make({vec_x, vec_y}, {4, 6, 2, 7, 2, 4})); - std::cout << "deinterleave_vector test passed" << std::endl; + std::cout << "deinterleave_vector test passed\n"; } } // namespace Internal diff --git a/src/Function.cpp b/src/Function.cpp index 3000817ecb2c..795d18136843 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -922,13 +922,14 @@ void Function::define_extern(const std::string &function_name, contents->func_schedule.storage_dims().clear(); contents->init_def.schedule().dims().clear(); for (size_t i = 0; i < args.size(); i++) { - contents->func_schedule.storage_dims().push_back(StorageDim{arg_names[i]}); - contents->init_def.schedule().dims().push_back( - Dim{arg_names[i], ForType::Extern, DeviceAPI::None, DimType::PureVar}); + StorageDim sd = {arg_names[i]}; + contents->func_schedule.storage_dims().push_back(sd); + Dim d = {arg_names[i], ForType::Extern, DeviceAPI::None, DimType::PureVar}; + contents->init_def.schedule().dims().push_back(d); } // Add the dummy outermost dim - contents->init_def.schedule().dims().push_back( - Dim{Var::outermost().name(), ForType::Serial, DeviceAPI::None, DimType::PureVar}); + Dim d = {Var::outermost().name(), ForType::Serial, DeviceAPI::None, DimType::PureVar}; + contents->init_def.schedule().dims().push_back(d); } void Function::accept(IRVisitor *visitor) const { diff --git a/src/Generator.cpp b/src/Generator.cpp index 8b633b777dd0..8719b2f2adae 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -2247,7 +2247,7 @@ void generator_test() { // Verify that Tuple parameter-pack variants can convert GeneratorParam to Expr Tuple t(gp, gp, gp); - std::cout << "Generator test passed" << std::endl; + std::cout << "Generator test passed\n"; } } // namespace Internal diff --git a/src/Generator.h b/src/Generator.h index 4d00a0fec574..99d106056842 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -1648,15 +1648,15 @@ class GeneratorInputImpl : public GeneratorInputBase { // types in question satisfy the property of copies referring to the same underlying // structure (returning references is just an optimization). 
Since this is verbose // and used in several places, we'll use a helper macro: -#define HALIDE_FORWARD_METHOD(Class, Method) \ - template \ - inline auto Method(Args &&...args)->typename std::remove_reference().Method(std::forward(args)...))>::type { \ - return this->template as().Method(std::forward(args)...); \ +#define HALIDE_FORWARD_METHOD(Class, Method) \ + template \ + inline auto Method(Args &&...args) -> typename std::remove_reference().Method(std::forward(args)...))>::type { \ + return this->template as().Method(std::forward(args)...); \ } #define HALIDE_FORWARD_METHOD_CONST(Class, Method) \ template \ - inline auto Method(Args &&...args) const-> \ + inline auto Method(Args &&...args) const -> \ typename std::remove_reference().Method(std::forward(args)...))>::type { \ this->check_gio_access(); \ return this->template as().Method(std::forward(args)...); \ diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 55dc02dcd553..3e5d95d787e6 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -48,7 +48,7 @@ void expr_match_test() { internal_assert(expr_match(vec_wild * 3, Ramp::make(x, y, 4) * 3, matches)); - std::cout << "expr_match test passed" << std::endl; + std::cout << "expr_match test passed\n"; } namespace { diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp index 4952fc981877..6b54aeef0e97 100644 --- a/src/LLVM_Output.cpp +++ b/src/LLVM_Output.cpp @@ -60,18 +60,18 @@ void emit_big_endian_u32(std::ostream &out, uint32_t value) { out << static_cast((value >> 24) & 0xff) << static_cast((value >> 16) & 0xff) << static_cast((value >> 8) & 0xff) - << static_cast((value)&0xff); + << static_cast((value) & 0xff); } void emit_little_endian_u32(std::ostream &out, uint32_t value) { - out << static_cast((value)&0xff) + out << static_cast((value) & 0xff) << static_cast((value >> 8) & 0xff) << static_cast((value >> 16) & 0xff) << static_cast((value >> 24) & 0xff); } void emit_little_endian_u16(std::ostream &out, uint16_t value) { - out << static_cast((value)&0xff) + out << static_cast((value) & 0xff) << static_cast((value >> 8) & 0xff); } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index 2e2aa554e31f..dd8e17d5b177 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -768,7 +768,7 @@ void is_monotonic_test() { check_unknown(select(0 < x, max(min(x, 4), 3), 4)); - std::cout << "is_monotonic test passed" << std::endl; + std::cout << "is_monotonic test passed\n"; } } // namespace Internal diff --git a/src/Reduction.cpp b/src/Reduction.cpp index c04d11dfed7b..bacd79ac4869 100644 --- a/src/Reduction.cpp +++ b/src/Reduction.cpp @@ -88,7 +88,7 @@ void split_predicate_test() { check((x < y) && ((w == 1) || ((x == 10) && (y == z))), expected); } - std::cout << "Split predicate test passed" << std::endl; + std::cout << "Split predicate test passed\n"; } struct ReductionDomainContents { diff --git a/src/Scope.h b/src/Scope.h index 1838d14c7799..9d1cc43e1164 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -243,7 +243,7 @@ class Scope { return const_iterator(table.end()); } - void swap(Scope &other) { + void swap(Scope &other) noexcept { table.swap(other.table); std::swap(containing_scope, other.containing_scope); } diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 21bf5a1e696f..761865d76b14 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -3951,7 +3951,7 @@ void spirv_ir_test() { binary.clear(); builder.encode(binary); - std::cout << "SpirV IR test passed" << std::endl; + std::cout << "SpirV IR test passed\n"; #else std::cout << "SpirV IR test *disabled*" << std::endl; #endif diff 
--git a/src/StmtToHTML.cpp b/src/StmtToHTML.cpp index 7c8c9f9c03c7..9c317ba35525 100644 --- a/src/StmtToHTML.cpp +++ b/src/StmtToHTML.cpp @@ -1124,7 +1124,7 @@ class HTMLCodePrinter : public IRVisitor { // Prints newline to stream void print_ln() { - stream << '\n'; + stream << "\n"; } // Prints a variable to stream diff --git a/src/Target.cpp b/src/Target.cpp index 49011348544f..c824fea1c928 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -1550,7 +1550,7 @@ void target_test() { internal_assert(with_vector_bits.vector_bits == 512) << "Vector bits not populated in constructor.\n"; internal_assert(Target(with_vector_bits.to_string()).vector_bits == 512) << "Vector bits not round tripped properly.\n"; - std::cout << "Target test passed" << std::endl; + std::cout << "Target test passed\n"; } } // namespace Internal diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 781ba9256257..26689ec34633 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -248,7 +248,7 @@ void uniquify_variable_names_test() { {{x, Let::make(y.name(), 3, y)}, {x_1, Let::make(y.name(), 4, y)}}); - std::cout << "uniquify_variable_names test passed" << std::endl; + std::cout << "uniquify_variable_names test passed\n"; } } // namespace Internal diff --git a/src/autoschedulers/adams2019/AutoSchedule.cpp b/src/autoschedulers/adams2019/AutoSchedule.cpp index dd147465d791..083626a82423 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.cpp +++ b/src/autoschedulers/adams2019/AutoSchedule.cpp @@ -193,7 +193,7 @@ class StateQueue { return sz; } - void swap(StateQueue &other) { + void swap(StateQueue &other) noexcept { storage.swap(other.storage); std::swap(sz, other.sz); } diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 8165979f90fb..e670fe7d8734 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -651,12 +651,12 @@ void generate_schedule(const std::vector &outputs, } } - aslog(1) << "Number of states added: " << stats.num_states_added << '\n'; - aslog(1) << "Number of featurizations computed: " << stats.num_featurizations << '\n'; - aslog(1) << "Number of memoization hits: " << stats.num_memoization_hits << '\n'; - aslog(1) << "Number of memoization misses: " << stats.num_memoization_misses << '\n'; - aslog(1) << "Number of block memoization hits: " << stats.num_block_memoization_hits << '\n'; - aslog(1) << "Number of block memoization misses: " << stats.num_block_memoization_misses << '\n'; + aslog(1) << "Number of states added: " << stats.num_states_added << "\n"; + aslog(1) << "Number of featurizations computed: " << stats.num_featurizations << "\n"; + aslog(1) << "Number of memoization hits: " << stats.num_memoization_hits << "\n"; + aslog(1) << "Number of memoization misses: " << stats.num_memoization_misses << "\n"; + aslog(1) << "Number of block memoization hits: " << stats.num_block_memoization_hits << "\n"; + aslog(1) << "Number of block memoization misses: " << stats.num_block_memoization_misses << "\n"; aslog(1) << "Total featurization time (ms): " << stats.total_featurization_time() << "\n"; aslog(1) << "Average featurization time (ms): " << stats.average_featurization_time() << "\n"; aslog(1) << "Total enqueue time (ms): " << stats.total_enqueue_time() << "\n"; @@ -667,14 +667,14 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "Total filter thread tiles time (ms): " << 
stats.total_filter_thread_tiles_time() << "\n"; aslog(1) << "Total filter parallel tiles time (ms): " << stats.total_filter_parallel_tiles_time() << "\n"; - aslog(1) << "Number of schedules evaluated by cost model: " << stats.num_schedules_enqueued << '\n'; - aslog(1) << "Number of tilings generated: " << stats.num_tilings_generated << '\n'; - aslog(1) << "Number of tilings accepted: " << stats.num_tilings_accepted << '\n'; + aslog(1) << "Number of schedules evaluated by cost model: " << stats.num_schedules_enqueued << "\n"; + aslog(1) << "Number of tilings generated: " << stats.num_tilings_generated << "\n"; + aslog(1) << "Number of tilings accepted: " << stats.num_tilings_accepted << "\n"; aslog(1) << "Total cost model evaluation time (ms): " << stats.total_cost_model_evaluation_time() << "\n"; aslog(1) << "Average cost model evaluation time (ms): " << stats.average_cost_model_evaluation_time() << "\n"; std::chrono::duration total_time = timer.elapsed(); aslog(1) << "Time taken for autoscheduler (s): " - << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; + << std::chrono::duration_cast(total_time).count() / 1000.0 << "\n"; } struct Anderson2021 { diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index bad8972435ce..938a039a29ec 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -303,7 +303,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, aslog(1) << " " << e2->producer->func.name() << "\n"; } } - internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << '\n'; + internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << "\n"; } int num_children = 0; diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index c2b0371dce3f..53ef12a33eb4 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -270,7 +270,7 @@ class StateQueue { return sz; } - void swap(StateQueue &other) { + void swap(StateQueue &other) noexcept { storage.swap(other.storage); std::swap(sz, other.sz); } diff --git a/src/autoschedulers/common/cmdline.h b/src/autoschedulers/common/cmdline.h index 29783dbbd2cb..1158eb151c01 100644 --- a/src/autoschedulers/common/cmdline.h +++ b/src/autoschedulers/common/cmdline.h @@ -489,7 +489,7 @@ class parser { } for (auto &arg : args) { - std::cout << "\"" << arg << "\"" << std::endl; + std::cout << "\"" << arg << "\"\n"; } return parse(args); @@ -635,7 +635,7 @@ class parser { std::string error_full() const { std::ostringstream oss; for (const auto &error : errors) { - oss << error << std::endl; + oss << error << "\n"; } return oss.str(); } @@ -649,8 +649,8 @@ class parser { } } - oss << "[options] ... " << ftr << std::endl; - oss << "options:" << std::endl; + oss << "[options] ... 
" << ftr << "\n"; + oss << "options:\n"; size_t max_width = 0; for (const auto &o : ordered) { @@ -667,7 +667,7 @@ class parser { for (size_t j = o->name().length(); j < max_width + 4; j++) { oss << ' '; } - oss << o->description() << std::endl; + oss << o->description() << "\n"; } return oss.str(); } @@ -680,7 +680,7 @@ class parser { } if (!ok) { - std::cerr << error() << std::endl + std::cerr << error() << "\n" << usage(); exit(1); } @@ -813,7 +813,7 @@ class parser { actual = read(value); has = true; } catch (const std::exception &e) { - std::cout << "Exception was caught: " << e.what() << std::endl; + std::cout << "Exception was caught: " << e.what() << "\n"; return false; } return true; diff --git a/src/autoschedulers/li2018/GradientAutoscheduler.cpp b/src/autoschedulers/li2018/GradientAutoscheduler.cpp index db8a81a634ab..709e13b2ead5 100644 --- a/src/autoschedulers/li2018/GradientAutoscheduler.cpp +++ b/src/autoschedulers/li2018/GradientAutoscheduler.cpp @@ -37,7 +37,7 @@ std::vector get_int_bounds(const Box &bounds) { std::vector int_bounds; int_bounds.reserve(bounds.size()); for (int i = 0; i < (int)bounds.size(); i++) { - Interval interval = bounds[i]; + const Interval &interval = bounds[i]; Expr extent = simplify(interval.max - interval.min + 1); extent = simplify(substitute_var_estimates(extent)); const int64_t *extent_int = as_const_int(extent); diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 6e5f0e82eff2..a5170c55d256 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -76,7 +76,7 @@ WEAK int load_libcuda(void *user_context) { halide_abort_if_false(user_context, cuInit == nullptr); halide_error_code_t result; -// clang-format off + // clang-format off #define CUDA_FN(ret, fn, args) result = get_cuda_symbol(user_context, #fn, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CUDA_FN_OPTIONAL(ret, fn, args) result = get_cuda_symbol(user_context, #fn, true, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CUDA_FN_3020(ret, fn, fn_3020, args) result = get_cuda_symbol(user_context, #fn_3020, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) diff --git a/src/runtime/mini_d3d12.h b/src/runtime/mini_d3d12.h index 3fe30d1dddd1..7b179fe58760 100644 --- a/src/runtime/mini_d3d12.h +++ b/src/runtime/mini_d3d12.h @@ -635,8 +635,12 @@ typedef struct _RPC_MESSAGE { #define THIS void #define DECLARE_INTERFACE(iface) interface DECLSPEC_NOVTABLE iface #define DECLARE_INTERFACE_(iface, baseiface) interface DECLSPEC_NOVTABLE iface : public baseiface -#define DECLARE_INTERFACE_IID(iface, iid) interface DECLSPEC_UUID(iid) DECLSPEC_NOVTABLE iface -#define DECLARE_INTERFACE_IID_(iface, baseiface, iid) interface DECLSPEC_UUID(iid) DECLSPEC_NOVTABLE iface : public baseiface +#define DECLARE_INTERFACE_IID(iface, iid) \ + interface DECLSPEC_UUID(iid) \ + DECLSPEC_NOVTABLE iface +#define DECLARE_INTERFACE_IID_(iface, baseiface, iid) \ + interface DECLSPEC_UUID(iid) \ + DECLSPEC_NOVTABLE iface : public baseiface #define IFACEMETHOD(method) __override STDMETHOD(method) #define IFACEMETHOD_(type, method) __override STDMETHOD_(type, method) @@ -715,7 +719,8 @@ _Post_equal_to_(pp) _Post_satisfies_(return == pp) void **IID_PPV_ARGS_Helper(T #define DECLARE_INTERFACE(iface) \ typedef interface iface { \ const struct iface##Vtbl FAR *lpVtbl; \ - } iface; \ + } \ + iface; \ typedef const struct iface##Vtbl iface##Vtbl; \ const struct iface##Vtbl #else @@ -724,7 +729,8 @@ _Post_equal_to_(pp) 
_Post_satisfies_(return == pp) void **IID_PPV_ARGS_Helper(T #define DECLARE_INTERFACE(iface) \ typedef interface iface { \ struct iface##Vtbl FAR *lpVtbl; \ - } iface; \ + } \ + iface; \ typedef struct iface##Vtbl iface##Vtbl; \ struct iface##Vtbl #endif @@ -2299,10 +2305,10 @@ typedef enum D3D12_SHADER_COMPONENT_MAPPING { #define D3D12_SHADER_COMPONENT_MAPPING_MASK 0x7 #define D3D12_SHADER_COMPONENT_MAPPING_SHIFT 3 #define D3D12_SHADER_COMPONENT_MAPPING_ALWAYS_SET_BIT_AVOIDING_ZEROMEM_MISTAKES (1 << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 4)) -#define D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(Src0, Src1, Src2, Src3) ((((Src0)&D3D12_SHADER_COMPONENT_MAPPING_MASK) | \ - (((Src1)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << D3D12_SHADER_COMPONENT_MAPPING_SHIFT) | \ - (((Src2)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 2)) | \ - (((Src3)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 3)) | \ +#define D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(Src0, Src1, Src2, Src3) ((((Src0) & D3D12_SHADER_COMPONENT_MAPPING_MASK) | \ + (((Src1) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << D3D12_SHADER_COMPONENT_MAPPING_SHIFT) | \ + (((Src2) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 2)) | \ + (((Src3) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 3)) | \ D3D12_SHADER_COMPONENT_MAPPING_ALWAYS_SET_BIT_AVOIDING_ZEROMEM_MISTAKES)) #define D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(ComponentToExtract, Mapping) \ ((D3D12_SHADER_COMPONENT_MAPPING)((Mapping) >> (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * (ComponentToExtract)) & D3D12_SHADER_COMPONENT_MAPPING_MASK)) diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 184282f9a878..1eff0ad7310b 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -74,7 +74,7 @@ typedef uint32_t VkSampleMask; // Provided by VK_VERSION_1_0 #define VK_API_VERSION_MAJOR(version) (((uint32_t)(version) >> 22) & 0x7FU) #define VK_API_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3FFU) -#define VK_API_VERSION_PATCH(version) ((uint32_t)(version)&0xFFFU) +#define VK_API_VERSION_PATCH(version) ((uint32_t)(version) & 0xFFFU) #define VK_MAKE_API_VERSION(variant, major, minor, patch) \ ((((uint32_t)(variant)) << 29) | (((uint32_t)(major)) << 22) | (((uint32_t)(minor)) << 12) | ((uint32_t)(patch))) #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index fac8ff41fbfc..8aaba7f6a707 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -68,7 +68,7 @@ WEAK int load_libopencl(void *user_context) { halide_abort_if_false(user_context, clCreateContext == nullptr); halide_error_code_t result; -// clang-format off + // clang-format off #define CL_FN(ret, fn, args) result = get_cl_symbol(user_context, #fn, true, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CL_12_FN(ret, fn, args) result = get_cl_symbol(user_context, #fn, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #include "cl_functions.h" diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index 027ae5c4f500..8df9dcb8eb2c 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -222,7 +222,7 @@ ALWAYS_INLINE T is_power_of_two(T value) { namespace { template -ALWAYS_INLINE void swap(T &a, T &b) { +ALWAYS_INLINE void swap(T &a, T &b) noexcept { T t = a; a = b; b = t; diff --git 
a/test/correctness/unroll_dynamic_loop.cpp b/test/correctness/unroll_dynamic_loop.cpp index e43412b0c6c1..a31ca78dcf3f 100644 --- a/test/correctness/unroll_dynamic_loop.cpp +++ b/test/correctness/unroll_dynamic_loop.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Buffer in(100); in.for_each_element([&](int x) { in(x) = x * 2.0f; }); - f(x) = in(x)*3; + f(x) = in(x) * 3; g(x) = f(x) * 2; Var xo, xi; diff --git a/tools/regexp_replace.cpp b/tools/regexp_replace.cpp index 956a67030a92..c0d8311db279 100644 --- a/tools/regexp_replace.cpp +++ b/tools/regexp_replace.cpp @@ -19,7 +19,7 @@ int main(int argc, const char **argv) { while (std::getline(std::cin, line)) { std::regex_replace(std::ostreambuf_iterator(std::cout), line.begin(), line.end(), re, argv[2]); - std::cout << std::endl; + std::cout << "\n"; } return 0; } From 47378ee5bd7cb304be9d61e0a636982c8a2623d0 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 29 Jan 2024 01:28:13 +0000 Subject: [PATCH 043/186] Enable `bugprone-switch-missing-default-case` (#8048) * Upgrade clang-format and clang-tidy to use LLVM 17 * trigger buildbots * trigger buildbots * trigger buildbots * trigger buildbots * Enable `bugprone-switch-missing-default-case` ...and fix existing warnings. * Update .clang-tidy * Update Parameter.cpp * Update .clang-tidy * Update .clang-tidy * Update .clang-tidy * Update .clang-tidy * Update CPlusPlusMangle.cpp --- .clang-tidy | 8 ++++---- src/CPlusPlusMangle.cpp | 20 ++++++++++++-------- src/CodeGen_OpenCL_Dev.cpp | 6 ++++++ src/HexagonOptimize.cpp | 4 ++++ src/Parameter.cpp | 8 ++++++++ src/runtime/openglcompute.cpp | 2 ++ tools/halide_image_io.h | 3 +++ 7 files changed, 39 insertions(+), 12 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 815ccd3339a2..283acd5f9bd3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -19,7 +19,7 @@ Checks: > bugprone-dangling-handle, bugprone-dynamic-static-initializers, -bugprone-easily-swappable-parameters, - -bugprone-empty-catch, # TODO: consider enabling + -bugprone-empty-catch, -bugprone-exception-escape, bugprone-fold-init-type, bugprone-forward-declaration-namespace, @@ -66,7 +66,7 @@ Checks: > bugprone-suspicious-semicolon, bugprone-suspicious-string-compare, bugprone-swapped-arguments, - -bugprone-switch-missing-default-case, # TODO: consider enabling + bugprone-switch-missing-default-case, bugprone-terminating-continue, bugprone-throw-keyword-missing, bugprone-too-small-loop-variable, @@ -93,7 +93,7 @@ Checks: > -misc-const-correctness, misc-definitions-in-headers, misc-header-include-cycle, - -misc-include-cleaner, # TODO: consider enabling + -misc-include-cleaner, misc-misleading-bidirectional, misc-misleading-identifier, misc-misplaced-const, @@ -128,7 +128,7 @@ Checks: > -modernize-replace-random-shuffle, -modernize-return-braced-init-list, -modernize-shrink-to-fit, - -modernize-type-traits, # TODO: consider enabling + -modernize-type-traits, -modernize-unary-static-assert, -modernize-use-auto, modernize-use-bool-literals, diff --git a/src/CPlusPlusMangle.cpp b/src/CPlusPlusMangle.cpp index 05c9d552e68f..b5c30b4fcb65 100644 --- a/src/CPlusPlusMangle.cpp +++ b/src/CPlusPlusMangle.cpp @@ -246,9 +246,10 @@ MangledNamePart mangle_type(const Type &type, const Target &target, PreviousDecl return "H"; case 64: return "_J"; + default: + internal_error << "Unexpected integer size: " << type.bits() << ".\n"; + return ""; } - internal_error << "Unexpected integer size: " << type.bits() << ".\n"; - return ""; } else if (type.is_uint()) { switch (type.bits()) { case 1: @@ 
-261,9 +262,10 @@ MangledNamePart mangle_type(const Type &type, const Target &target, PreviousDecl return "I"; case 64: return "_K"; + default: + internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; + return ""; } - internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; - return ""; } else if (type.is_float()) { if (type.bits() == 32) { return "M"; @@ -546,9 +548,10 @@ std::string mangle_type(const Type &type, const Target &target, PrevPrefixes &pr } else { return "l"; } + default: + internal_error << "Unexpected integer size: " << type.bits() << ".\n"; + return ""; } - internal_error << "Unexpected integer size: " << type.bits() << ".\n"; - return ""; } else if (type.is_uint()) { switch (type.bits()) { case 1: @@ -571,9 +574,10 @@ std::string mangle_type(const Type &type, const Target &target, PrevPrefixes &pr } else { return "m"; } + default: + internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; + return ""; } - internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; - return ""; } else if (type.is_float()) { if (type.bits() == 32) { return "f"; diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 5712c1ea0fe9..52feed53f9e0 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -389,6 +389,9 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { rhs << "(int4)(" << coord[0] << idx << ", " << coord[1] << idx << ", " << coord[2] << idx << ", 0)).s0"; break; + default: + internal_error << "Unsupported dims"; + break; } print_assignment(op->type.with_bits(32).with_lanes(1), rhs.str()); results[i] = id; @@ -448,6 +451,9 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { write_image << "(int4)(" << coord[0] << idx << ", " << coord[1] << idx << ", " << coord[2] << idx << ", 0)"; break; + default: + internal_error << "Unsupported dims"; + break; } write_image << ", (" << print_type(value_type.with_bits(32).with_lanes(4)) << ")(" << value << idx << ", 0, 0, 0));\n"; diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 3e19915e98cf..b76a9eb1cfef 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -91,6 +91,8 @@ string type_suffix(Type type, bool signed_variants) { return prefix + "h"; case 32: return prefix + "w"; + default: + break; } } else if (type.is_uint()) { switch (type.bits()) { @@ -100,6 +102,8 @@ string type_suffix(Type type, bool signed_variants) { return prefix + "uh"; case 32: return prefix + "uw"; + default: + break; } } internal_error << "Unsupported HVX type: " << type << "\n"; diff --git a/src/Parameter.cpp b/src/Parameter.cpp index d9616b5bebf8..41353871fd0d 100644 --- a/src/Parameter.cpp +++ b/src/Parameter.cpp @@ -142,6 +142,8 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.f32); case 64: return Expr(sv.u.f64); + default: + break; } } else if (t.is_int()) { switch (t.bits()) { @@ -153,6 +155,8 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.i32); case 64: return Expr(sv.u.i64); + default: + break; } } else if (t.is_uint()) { switch (t.bits()) { @@ -166,12 +170,16 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.u32); case 64: return Expr(sv.u.u64); + default: + break; } } else if (t.is_handle()) { // handles are always uint64 internally. 
switch (t.bits()) { case 64: return Expr(sv.u.u64); + default: + break; } } internal_error << "Unsupported type " << t << " in scalar_expr\n"; diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp index 27397e2c008e..edb1327d90a9 100644 --- a/src/runtime/openglcompute.cpp +++ b/src/runtime/openglcompute.cpp @@ -88,6 +88,8 @@ WEAK const char *gl_error_name(int32_t err) { case 0x8031: return "GL_TABLE_TOO_LARGE"; break; + default: + break; } return ""; } diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index a9f312168b92..e039f7c2e798 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -1455,6 +1455,9 @@ bool load_mat(const std::string &filename, ImageType *im) { case miDOUBLE: type = halide_type_of(); break; + default: + check(false, "Unknown header"); + return false; } *im = ImageType(type, extents); From e2448fe535db057b18f7ca16d1c878cd045902e9 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 1 Feb 2024 09:46:10 -0800 Subject: [PATCH 044/186] Fix type error in VectorizeLoops (#8055) --- src/VectorizeLoops.cpp | 3 +- test/correctness/fuzz_schedule.cpp | 68 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 1c3ec57f3fb7..6d10d2e9d5f3 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -134,7 +134,7 @@ Interval bounds_of_lanes(const Expr &e) { Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { - Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); + Expr last_lane_idx = make_const(r->base.type().element_of(), r->lanes - 1); Interval ib = bounds_of_lanes(r->base); const Broadcast *b = as_scalar_broadcast(r->stride); Expr stride = b ? b->value : r->stride; @@ -875,6 +875,7 @@ class VectorSubs : public IRMutator { // generating a scalar condition that checks if // the least-true lane is true. 
Expr all_true = bounds_of_lanes(likely->args[0]).min; + internal_assert(all_true.type() == Bool()); // Wrap it in the same flavor of likely all_true = Call::make(Bool(), likely->name, {all_true}, Call::PureIntrinsic); diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index 9f0f86e3854b..a774335a07bf 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -202,6 +202,74 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/8054 + { + ImageParam input(Float(32), 2, "input"); + const float r_sigma = 0.1; + const int s_sigma = 8; + Func bilateral_grid{"bilateral_grid"}; + + Var x("x"), y("y"), z("z"), c("c"); + + // Add a boundary condition + Func clamped = Halide::BoundaryConditions::repeat_edge(input); + + // Construct the bilateral grid + RDom r(0, s_sigma, 0, s_sigma); + Expr val = clamped(x * s_sigma + r.x - s_sigma / 2, y * s_sigma + r.y - s_sigma / 2); + val = clamp(val, 0.0f, 1.0f); + + Expr zi = cast(val * (1.0f / r_sigma) + 0.5f); + + Func histogram("histogram"); + histogram(x, y, z, c) = 0.0f; + histogram(x, y, zi, c) += mux(c, {val, 1.0f}); + + // Blur the grid using a five-tap filter + Func blurx("blurx"), blury("blury"), blurz("blurz"); + blurz(x, y, z, c) = (histogram(x, y, z - 2, c) + + histogram(x, y, z - 1, c) * 4 + + histogram(x, y, z, c) * 6 + + histogram(x, y, z + 1, c) * 4 + + histogram(x, y, z + 2, c)); + blurx(x, y, z, c) = (blurz(x - 2, y, z, c) + + blurz(x - 1, y, z, c) * 4 + + blurz(x, y, z, c) * 6 + + blurz(x + 1, y, z, c) * 4 + + blurz(x + 2, y, z, c)); + blury(x, y, z, c) = (blurx(x, y - 2, z, c) + + blurx(x, y - 1, z, c) * 4 + + blurx(x, y, z, c) * 6 + + blurx(x, y + 1, z, c) * 4 + + blurx(x, y + 2, z, c)); + + // Take trilinear samples to compute the output + val = clamp(input(x, y), 0.0f, 1.0f); + Expr zv = val * (1.0f / r_sigma); + zi = cast(zv); + Expr zf = zv - zi; + Expr xf = cast(x % s_sigma) / s_sigma; + Expr yf = cast(y % s_sigma) / s_sigma; + Expr xi = x / s_sigma; + Expr yi = y / s_sigma; + Func interpolated("interpolated"); + interpolated(x, y, c) = + lerp(lerp(lerp(blury(xi, yi, zi, c), blury(xi + 1, yi, zi, c), xf), + lerp(blury(xi, yi + 1, zi, c), blury(xi + 1, yi + 1, zi, c), xf), yf), + lerp(lerp(blury(xi, yi, zi + 1, c), blury(xi + 1, yi, zi + 1, c), xf), + lerp(blury(xi, yi + 1, zi + 1, c), blury(xi + 1, yi + 1, zi + 1, c), xf), yf), + zf); + + // Normalize + bilateral_grid(x, y) = interpolated(x, y, 0) / interpolated(x, y, 1); + Pipeline p({bilateral_grid}); + + Var v6, zo, vzi; + + blury.compute_root().split(x, x, v6, 6, TailStrategy::GuardWithIf).split(z, zo, vzi, 8, TailStrategy::GuardWithIf).reorder(y, x, c, vzi, zo, v6).vectorize(vzi).vectorize(v6); + p.compile_to_module({input}, "bilateral_grid", {Target("host")}); + } + printf("Success!\n"); return 0; } From 80e2081153361a7e0d3671290c383b1ba891286c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 5 Feb 2024 14:25:05 -0800 Subject: [PATCH 045/186] Update makefile to use test/common/terminate_handler.cpp (#8066) This means we actually print error messages when using exceptions and the makefile --- Makefile | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 39358e03ef18..04fc41fa4167 100644 --- a/Makefile +++ b/Makefile @@ -277,7 +277,7 @@ LLVM_SHARED_LIBS = -Wl,-rpath=$(LLVM_LIBDIR) -L $(LLVM_LIBDIR) -lLLVM LLVM_LIBS_FOR_SHARED_LIBHALIDE=$(if 
$(WITH_LLVM_INSIDE_SHARED_LIBHALIDE),$(LLVM_STATIC_LIBS),$(LLVM_SHARED_LIBS)) -TUTORIAL_CXX_FLAGS ?= -std=c++17 -g -fno-omit-frame-pointer $(RTTI_CXX_FLAGS) -I $(ROOT_DIR)/tools $(SANITIZER_FLAGS) $(LLVM_CXX_FLAGS_LIBCPP) +TUTORIAL_CXX_FLAGS ?= -std=c++17 -g -fno-omit-frame-pointer $(RTTI_CXX_FLAGS) -I $(ROOT_DIR)/tools $(SANITIZER_FLAGS) $(LLVM_CXX_FLAGS_LIBCPP) $(EXCEPTIONS_CXX_FLAGS) # The tutorials contain example code with warnings that we don't want # to be flagged as errors, so the test flags are the tutorial flags # plus our warning flags. @@ -951,6 +951,14 @@ INITIAL_MODULES = $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_32.o) \ $(RUNTIME_LL_COMPONENTS:%=$(BUILD_DIR)/initmod.%_ll.o) \ $(PTX_DEVICE_INITIAL_MODULES:libdevice.%.bc=$(BUILD_DIR)/initmod_ptx.%_ll.o) +TEST_DEPS = $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +ifneq (,$(WITH_EXCEPTIONS)) +# The tests will link libHalide, but also the object file that +# installs a global exception handler. +TEST_DEPS += $(BUILD_DIR)/terminate_handler.o +TEST_LD_FLAGS += $(BUILD_DIR)/terminate_handler.o +endif + # Add the Hexagon simulator to the rpath on Linux. (Not supported elsewhere, so no else cases.) ifeq ($(UNAME), Linux) ifneq (,$(WITH_HEXAGON)) @@ -1220,6 +1228,10 @@ $(BUILD_DIR)/Simplify_%.o: $(SRC_DIR)/Simplify_%.cpp $(SRC_DIR)/Simplify_Interna @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/Simplify_$*.d -MT $@ +$(BUILD_DIR)/terminate_handler.o: $(ROOT_DIR)/test/common/terminate_handler.cpp + @mkdir -p $(@D) + $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o + .PHONY: clean clean: rm -rf $(LIB_DIR) @@ -1380,7 +1392,7 @@ $(BIN_DIR)/%/runtime.a: $(BIN_DIR)/runtime.generator @mkdir -p $(@D) $(CURDIR)/$< -r runtime -o $(CURDIR)/$(BIN_DIR)/$* target=$* -$(BIN_DIR)/test_internal: $(ROOT_DIR)/test/internal.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) +$(BIN_DIR)/test_internal: $(ROOT_DIR)/test/internal.cpp $(TEST_DEPS) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $< -I$(SRC_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1395,7 +1407,7 @@ $(BUILD_DIR)/halide_ir.fbs.h: $(SRC_DIR)/halide_ir.fbs flatc --cpp --cpp-std C++17 --no-union-value-namespacing --keep-prefix --filename-suffix ".fbs" -o $(BUILD_DIR) $^ # Correctness test that link against libHalide -$(BIN_DIR)/correctness_%: $(ROOT_DIR)/test/correctness/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +$(BIN_DIR)/correctness_%: $(ROOT_DIR)/test/correctness/%.cpp $(TEST_DEPS) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1410,7 +1422,7 @@ $(BIN_DIR)/correctness_halide_buffer: $(ROOT_DIR)/test/correctness/halide_buffer # The image_io test additionally needs to link to libpng and # libjpeg. -$(BIN_DIR)/correctness_image_io: $(ROOT_DIR)/test/correctness/image_io.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +$(BIN_DIR)/correctness_image_io: $(ROOT_DIR)/test/correctness/image_io.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # OpenCL runtime correctness test requires runtime.a to be linked. 
@@ -1418,14 +1430,14 @@ $(BIN_DIR)/$(TARGET)/correctness_opencl_runtime: $(ROOT_DIR)/test/correctness/op @mkdir -p $(@D) $(CXX) $(BIN_DIR)/$(TARGET)/runtime.a $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/performance_%: $(ROOT_DIR)/test/performance/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/performance_%: $(ROOT_DIR)/test/performance/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE) $< -I$(INCLUDE_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(TEST_LD_FLAGS) -o $@ # Error tests that link against libHalide -$(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # Runtime tests that test internals @@ -1452,13 +1464,13 @@ $(BIN_DIR)/runtime_%: $(ROOT_DIR)/test/runtime/%.cpp $(BIN_DIR)/runtime_internal $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $^ $(COMMON_LD_FLAGS) -o $@ # Auto schedule tests that link against libHalide -$(BIN_DIR)/mullapudi2016_%: $(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/mullapudi2016_%: $(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/li2018_%: $(ROOT_DIR)/test/autoschedulers/li2018/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/li2018_%: $(ROOT_DIR)/test/autoschedulers/li2018/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/adams2019_%: $(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/adams2019_%: $(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # TODO(srj): this doesn't auto-delete, why not? @@ -1471,7 +1483,7 @@ $(BUILD_DIR)/%_generator.o: $(ROOT_DIR)/test/generator/%_generator.cpp $(INCLUDE @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(INCLUDE_DIR) -I$(CURDIR)/$(FILTERS_DIR) -c $< -o $@ -$(BIN_DIR)/%.generator: $(BUILD_DIR)/GenGen.o $(BIN_DIR)/libHalide.$(SHARED_EXT) $(BUILD_DIR)/%_generator.o +$(BIN_DIR)/%.generator: $(BUILD_DIR)/GenGen.o $(TEST_DEPS) $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) $(filter %.cpp %.o %.a,$^) $(TEST_LD_FLAGS) -o $@ @@ -1787,7 +1799,7 @@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_define_extern_opencl: $(ROOT_DIR)/test/gen $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) -o $@ # By default, %_jittest.cpp depends on libHalide, plus the stubs for the Generator. These are external tests that use the JIT. 
-$(BIN_DIR)/generator_jit_%: $(ROOT_DIR)/test/generator/%_jittest.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(FILTERS_DIR)/%.stub.h $(BUILD_DIR)/%_generator.o +$(BIN_DIR)/generator_jit_%: $(ROOT_DIR)/test/generator/%_jittest.cpp $(TEST_DEPS) $(FILTERS_DIR)/%.stub.h $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) -g $(TEST_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) -I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I $(ROOT_DIR)/apps/support $(TEST_LD_FLAGS) -o $@ @@ -1922,7 +1934,7 @@ $(FILTERS_DIR)/multi_rungen2: $(BUILD_DIR)/RunGenMain.o $(BIN_DIR)/$(TARGET)/run @mkdir -p $(@D) $(CXX) -std=c++17 -I$(FILTERS_DIR) $^ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ -$(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(INCLUDE_DIR)/HalideRuntime.h +$(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(TEST_DEPS) @ if [[ $@ == *_run ]]; then \ export TUTORIAL=$* ;\ export LESSON=`echo $${TUTORIAL} | cut -b1-9`; \ @@ -1934,7 +1946,7 @@ $(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(BIN_DIR)/libHalide.$(SHARED_ -I$(INCLUDE_DIR) -I$(ROOT_DIR)/tools $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@;\ fi -$(BIN_DIR)/tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @@ -1945,7 +1957,7 @@ tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators_usage.s PATH="$${PATH}:$(CURDIR)/$(BIN_DIR)" source $(ROOT_DIR)/tutorial/lesson_15_generators_usage.sh @-echo -$(BIN_DIR)/tutorial_lesson_16_rgb_generate: $(ROOT_DIR)/tutorial/lesson_16_rgb_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_16_rgb_generate: $(ROOT_DIR)/tutorial/lesson_16_rgb_generate.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @@ -1962,7 +1974,7 @@ $(BIN_DIR)/tutorial_lesson_16_rgb_run: $(ROOT_DIR)/tutorial/lesson_16_rgb_run.cp -lHalide $(TEST_LD_FLAGS) $(COMMON_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @-echo -$(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_generate.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ From 93bff95c52e6599f9f779c99604002ff955d276e Mon Sep 17 00:00:00 2001 From: Teo Date: Tue, 6 Feb 2024 18:34:02 -0500 Subject: [PATCH 046/186] add unsafe_promise_clamped (#8071) add unsafe_promise_clamp --- python_bindings/src/halide/halide_/PyIROperator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/python_bindings/src/halide/halide_/PyIROperator.cpp b/python_bindings/src/halide/halide_/PyIROperator.cpp index ce9a0ef5fec1..81a51398bb51 100644 --- a/python_bindings/src/halide/halide_/PyIROperator.cpp +++ b/python_bindings/src/halide/halide_/PyIROperator.cpp @@ -44,6 
+44,7 @@ void define_operators(py::module &m) { }); m.def("clamp", &clamp); + m.def("unsafe_promise_clamped", &unsafe_promise_clamped); m.def("abs", &abs); m.def("absd", &absd); From 665804c752cba9e7b673d3778d83d58a19628948 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 6 Feb 2024 23:34:29 +0000 Subject: [PATCH 047/186] Don't require Halide_WebGPU when using wasm (#8063) (#8065) * Don't require Halide_WebGPU when using wasm (#8063) * trigger buildbots --- cmake/HalideGeneratorHelpers.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index f62da88b1f7b..d45341536422 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -739,7 +739,7 @@ function(_Halide_target_link_gpu_libs TARGET VISIBILITY) target_link_libraries(${TARGET} ${VISIBILITY} "${FOUNDATION_LIBRARY}" "${METAL_LIBRARY}") endif () - if ("${ARGN}" MATCHES "webgpu") + if ("${ARGN}" MATCHES "webgpu" AND NOT "${ARGN}" MATCHES "wasm") find_package(Halide_WebGPU REQUIRED) target_link_libraries(${TARGET} ${VISIBILITY} Halide::WebGPU) endif () From 84fe5655ee569680ce116497724e28e3c3575fe5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 7 Feb 2024 17:41:21 +0000 Subject: [PATCH 048/186] Outsmart the LLVM optimizer (#8073) The old definitions of bool_1, bool_2, bool_3 in simd_op_check_x86 (etc) all referred to the same entry in in_f32; as of https://github.com/llvm/llvm-project/pull/76367, the LLVM optimizer is smart enough to realize that (eg) bool1 != bool2 by construction, and optimizes away the code that tests their conditions, such as the one for andps and orps. Initing them from different locations is enough to outsmart the compiler. (bug was only noticed in the x86 test, but I updated the other tests to guard against future improvements there too.) --- test/correctness/simd_op_check_arm.cpp | 2 +- test/correctness/simd_op_check_hvx.cpp | 2 +- test/correctness/simd_op_check_powerpc.cpp | 2 +- test/correctness/simd_op_check_wasm.cpp | 2 +- test/correctness/simd_op_check_x86.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index acc3edcc4a8a..e8762a6ea2d8 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -37,7 +37,7 @@ class SimdOpCheckARM : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // Table copied from the Cortex-A9 TRM. 
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 2832f1bc8ede..450ef3f06fe6 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -45,7 +45,7 @@ class SimdOpCheckHVX : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); constexpr int hvx_width = 128; diff --git a/test/correctness/simd_op_check_powerpc.cpp b/test/correctness/simd_op_check_powerpc.cpp index 2dccd72735f3..fdf28f3641a5 100644 --- a/test/correctness/simd_op_check_powerpc.cpp +++ b/test/correctness/simd_op_check_powerpc.cpp @@ -36,7 +36,7 @@ class SimdOpCheckPowerPC : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - // Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + // Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // Basic AltiVec SIMD instructions. for (int w = 1; w <= 4; w++) { diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 89aad9e5c389..56e2e4231876 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -37,7 +37,7 @@ class SimdOpCheckWASM : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); check("f32.sqrt", 1, sqrt(f32_1)); check("f32.min", 1, min(f32_1, f32_2)); diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index 51d4a0b18ccb..990e4e886307 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -57,7 +57,7 @@ class SimdOpCheckX86 : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // MMX and SSE1 (in 64 and 128 bits) for (int w = 1; w <= 4; w++) { From 78a076220a4aefdcef13d3ab7b3afa7faf8917f7 Mon Sep 17 00:00:00 2001 From: Prasoon Mishra <132343640+prasmish@users.noreply.github.com> Date: Wed, 7 Feb 2024 23:11:51 +0530 Subject: [PATCH 049/186] Add hexagon_benchmarks app for CMake builds (#8069) * Add hexagon_benchmarks app for CMake builds * Removed unnecessary -lc++abi flag from GCC build --- apps/CMakeLists.txt | 2 +- apps/hexagon_benchmarks/CMakeLists.txt | 44 ++++++++++++++++++++++++++ apps/hexagon_benchmarks/process.cpp | 7 ++-- 3 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 
apps/hexagon_benchmarks/CMakeLists.txt diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 149f6a610b5c..1f6abcdc6e64 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -45,7 +45,7 @@ add_app(depthwise_separable_conv) add_app(fft) add_app(hannk) add_app(harris) -# add_app(hexagon_benchmarks) # TODO(#5374): missing CMake build +add_app(hexagon_benchmarks) # add_app(hexagon_dma) # TODO(#5374): missing CMake build add_app(hist) add_app(iir_blur) diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt new file mode 100644 index 000000000000..9cbcc541b76a --- /dev/null +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.22) +project(hexagon_benchmarks) + +enable_testing() + +# Set up language settings +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS NO) +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +# Find Halide +find_package(Halide REQUIRED) + +macro(add_generator_and_library FILTER_NAME) + set(GENERATOR_EXE ${FILTER_NAME}.generator) + set(GENERATOR_SRC ${FILTER_NAME}_generator.cpp) + add_halide_generator(${GENERATOR_EXE} SOURCES ${GENERATOR_SRC}) + add_halide_library(${FILTER_NAME} FROM ${GENERATOR_EXE}) +endmacro() + +add_generator_and_library(dilate3x3) +add_generator_and_library(gaussian5x5) +add_generator_and_library(median3x3) + +# Main executable +add_executable(process process.cpp) +target_compile_options(process PRIVATE $<$:-O2>) +if (Halide_TARGET MATCHES "hvx") + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) +else() + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) +endif() +target_link_libraries(process + PRIVATE + Halide::Tools + dilate3x3 gaussian5x5 median3x3) + +# Test that the app actually works! +add_test(NAME hexagon_benchmarks COMMAND process -n 1) +set_tests_properties(hexagon_benchmarks PROPERTIES + LABELS hexagon_benchmarks + PASS_REGULAR_EXPRESSION "Success!" 
+ SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 975bf8aa2da4..87a492c577d1 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -3,6 +3,10 @@ #include #include +#ifdef TARGET_HAS_HVX +#include "HalideRuntimeHexagonHost.h" +#endif + #include "halide_benchmark.h" #include "process.h" @@ -39,11 +43,10 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); - SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { From 37153a95d0d2d0b7b8c51c92c4a94c8cc11f8f7b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 7 Feb 2024 09:43:58 -0800 Subject: [PATCH 050/186] Fix bool conversion bug in Vulkan code generator (#8067) * Fix bug in Vulkan code generator that was incorrectly passing the address of a byte vector, instead of its contents to builder.declare_constant() * Add bool_predicate_cast correctness test to verify bool conversion for Vulkan codegen works as expected --------- Co-authored-by: Derek Gerstmann --- src/CodeGen_Vulkan_Dev.cpp | 7 +++-- test/correctness/CMakeLists.txt | 1 + test/correctness/bool_predicate_cast.cpp | 39 ++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 test/correctness/bool_predicate_cast.cpp diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 7e06447a27fc..b86c99f9269e 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -548,6 +548,9 @@ void fill_bytes_with_value(uint8_t *bytes, int count, int value) { } SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type value_type, SpvId value_id) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(): casting from value type '" + << value_type << "' to target type '" << target_type << "' for value id '" << value_id << "' !\n"; + if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } @@ -590,8 +593,8 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type SpvId result_id = builder.reserve_id(SpvResultId); SpvId target_type_id = builder.declare_type(target_type); - SpvId true_value_id = builder.declare_constant(target_type, &true_data); - SpvId false_value_id = builder.declare_constant(target_type, &false_data); + SpvId true_value_id = builder.declare_constant(target_type, &true_data[0]); + SpvId false_value_id = builder.declare_constant(target_type, &false_data[0]); builder.append(SpvFactory::select(target_type_id, result_id, value_id, true_value_id, false_value_id)); return result_id; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index cd66f21a346e..5960e7922658 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -14,6 +14,7 @@ tests(GROUPS correctness bit_counting.cpp bitwise_ops.cpp bool_compute_root_vectorize.cpp + bool_predicate_cast.cpp bound.cpp bound_small_allocations.cpp bound_storage.cpp diff --git a/test/correctness/bool_predicate_cast.cpp b/test/correctness/bool_predicate_cast.cpp new file mode 100644 index 
000000000000..1043f329b76c --- /dev/null +++ b/test/correctness/bool_predicate_cast.cpp @@ -0,0 +1,39 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + // Test explicit casting of a predicate to an integer as part of a reduction + // NOTE: triggers a convert_to_bool in Vulkan for a SelectOp + Target target = get_jit_target_from_environment(); + Var x("x"), y("y"); + + Func input("input"); + input(x, y) = cast(x + y); + + Func test("test"); + test(x, y) = cast(UInt(8), input(x, y) >= 32); + + if (target.has_gpu_feature()) { + Var xi("xi"), yi("yi"); + test.gpu_tile(x, y, xi, yi, 8, 8); + } + + Realization result = test.realize({96, 96}); + Buffer a = result[0]; + for (int y = 0; y < a.height(); y++) { + for (int x = 0; x < a.width(); x++) { + uint8_t correct_a = ((x + y) >= 32) ? 1 : 0; + if (a(x, y) != correct_a) { + printf("result(%d, %d) = (%d) instead of (%d)\n", + x, y, a(x, y), correct_a); + return 1; + } + } + } + + printf("Success!\n"); + return 0; +} From 39e5c08a88ac59ef1e848e7b7e40f2056c792b08 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 7 Feb 2024 09:49:06 -0800 Subject: [PATCH 051/186] Better validation of gpu schedules (#8068) * Update makefile to use test/common/terminate_handler.cpp This means we actually print error messages when using exceptions with the Makefile build * Better validation of GPU schedules GPU loop constraints were checked in two different places. Checking them in ScheduleFunctions was incorrect because it didn't consider update definitions and specializations. Checking them in FuseGPUThreadLoops was too late, because the Var names have gone (they've been renamed to things like __thread_id_x). Furthermore, some problems were internal errors or runtime errors when they should have been user errors. We allowed 4d thread and block dimensions, but then hit an internal error. This PR centralizes checking of GPU loop structure in CanonicalizeGPUVars and adds more helpful error messages that print the problematic loop structure. E.g.: ``` Error: GPU thread loop over f$8.s0.v0 is inside three other GPU thread loops. The maximum number of nested GPU thread loops is 3.
The loop nest is: compute_at for g$8: for g$8.s0.v7: for g$8.s0.v6: for g$8.s0.v5: for g$8.s0.v4: gpu_block g$8.s0.v3: gpu_block g$8.s0.v2: gpu_thread g$8.s0.v1: gpu_thread g$8.s0.v0: store_at for f$8: compute_at for f$8: gpu_thread f$8.s0.v1: gpu_thread f$8.s0.v0: ``` Fixes the bug found in #7946 * Delete dead code * Actually clear the ostringstream --- src/CanonicalizeGPUVars.cpp | 222 +++++++++++++++++++- src/FuseGPUThreadLoops.cpp | 40 ---- src/ScheduleFunctions.cpp | 39 ---- test/correctness/CMakeLists.txt | 3 +- test/correctness/gpu_error_1.cpp | 47 ----- test/correctness/gpu_error_2.cpp | 46 ---- test/correctness/invalid_gpu_loop_nests.cpp | 103 +++++++++ 7 files changed, 317 insertions(+), 183 deletions(-) delete mode 100644 test/correctness/gpu_error_1.cpp delete mode 100644 test/correctness/gpu_error_2.cpp create mode 100644 test/correctness/invalid_gpu_loop_nests.cpp diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp index f399a995ef50..7e993d7a72c1 100644 --- a/src/CanonicalizeGPUVars.cpp +++ b/src/CanonicalizeGPUVars.cpp @@ -15,16 +15,16 @@ using std::string; using std::vector; namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"}; +string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"}; +string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"}; string get_thread_name(int index) { - internal_assert(index >= 0 && index < 4); + internal_assert(index >= 0 && index < 3); return thread_names[index]; } string get_block_name(int index) { - internal_assert(index >= 0 && index < 4); + internal_assert(index >= 0 && index < 3); return block_names[index]; } @@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator { CountGPUBlocksThreads counter; op->body.accept(&counter); - internal_assert(counter.nblocks <= 4) - << op->name << " can only have maximum of 4 block dimensions\n"; - internal_assert(counter.nthreads <= 4) - << op->name << " can only have maximum of 4 thread dimensions\n"; if (op->for_type == ForType::GPUBlock) { name += "." + get_block_name(counter.nblocks); @@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator { name += "." + get_thread_name(counter.nthreads); debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n"; } else if (op->for_type == ForType::GPULane) { - user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n"; name += "." 
+ get_thread_name(0); } @@ -190,9 +185,218 @@ class CanonicalizeGPUVars : public IRMutator { } }; +std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) { + class Summary : public IRVisitor { + public: + std::vector stack; + Summary(const IRNode *target) + : target(target) { + } + + protected: + const IRNode *target; + bool done = false; + + using IRVisitor::visit; + + void visit(const For *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << op->for_type << " " << op->name; + if (op == target) { + done = true; + } else { + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + } + + void visit(const Realize *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << "store_at for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + + void visit(const HoistedStorage *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << "hoisted storage for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + + void visit(const ProducerConsumer *op) override { + if (done) { + return; + } + if (op->is_producer) { + stack.emplace_back(); + stack.back() << "compute_at for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } else { + IRVisitor::visit(op); + } + } + } summary{target}; + + root->accept(&summary); + + std::ostringstream result; + std::string prefix = ""; + result << "The loop nest is:\n"; + for (const auto &str : summary.stack) { + result << prefix << str.str() << ":\n"; + prefix += " "; + } + return result.str(); +}; + +// Check the user's GPU schedule is valid. Throws an error if it is not, so no +// return value required. +class ValidateGPUSchedule : public IRVisitor { + + using IRVisitor::visit; + + const IRNode *root = nullptr; + + int in_blocks = 0; + int in_threads = 0; + int in_lanes = 0; + + std::string innermost_blocks_loop, innermost_threads_loop; + std::ostringstream blocks_not_ok_reason; + + void clear_blocks_not_ok_reason() { + std::ostringstream empty; + blocks_not_ok_reason.swap(empty); + } + + void visit(const For *op) override { + if (!root) { + root = op; + } + bool should_clear = false; + if (in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << op->for_type << " loop over " << op->name; + should_clear = true; + } + if (op->for_type == ForType::GPUBlock) { + user_assert(blocks_not_ok_reason.tellp() == 0) + << blocks_not_ok_reason.str() << " is inside GPU block loop over " + << innermost_blocks_loop << " but outside GPU block loop over " << op->name + << ". Funcs cannot be scheduled in between GPU block loops. " + << loop_nest_summary_to_node(root, op); + user_assert(in_blocks < 3) + << "GPU block loop over " << op->name << " is inside three other GPU block loops. " + << "The maximum number of nested GPU block loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_threads == 0) + << "GPU block loop over " << op->name << " is inside GPU thread loop over " + << innermost_threads_loop << ". " + << loop_nest_summary_to_node(root, op); + in_blocks++; + ScopedValue s(innermost_blocks_loop, op->name); + IRVisitor::visit(op); + in_blocks--; + } else if (op->for_type == ForType::GPUThread) { + user_assert(in_lanes == 0) + << "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. " + << "GPU thread loops must be outside any GPU lane loop. 
" + << loop_nest_summary_to_node(root, op); + user_assert(in_threads < 3) + << "GPU thread loop over " << op->name << " is inside three other GPU thread loops. " + << "The maximum number of nested GPU thread loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_blocks) + << "GPU thread loop over " << op->name << " must be inside a GPU block loop. " + << loop_nest_summary_to_node(root, op); + in_threads++; + ScopedValue s(innermost_threads_loop, op->name); + IRVisitor::visit(op); + in_threads--; + } else if (op->for_type == ForType::GPULane) { + user_assert(in_threads < 3) + << "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. " + << "The maximum number of nested GPU thread or lane loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_lanes == 0) + << "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops " + << "may not be nested. " + << loop_nest_summary_to_node(root, op); + in_lanes++; + ScopedValue s(innermost_threads_loop, op->name); + IRVisitor::visit(op); + in_lanes--; + } else { + IRVisitor::visit(op); + } + if (should_clear) { + clear_blocks_not_ok_reason(); + } + } + + void visit(const Realize *op) override { + if (!root) { + root = op; + } + if (in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "store_at location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } + + void visit(const ProducerConsumer *op) override { + if (!root) { + root = op; + } + if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "compute_at location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } + + void visit(const HoistedStorage *op) override { + if (!root) { + root = op; + } + if (in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "hoist_storage location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } +}; + } // anonymous namespace Stmt canonicalize_gpu_vars(Stmt s) { + ValidateGPUSchedule validator; + s.accept(&validator); CanonicalizeGPUVars canonicalizer; s = canonicalizer.mutate(s); return s; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index 906963059cff..cd59fd470d38 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator { ZeroGPULoopMins() = default; }; -class ValidateGPULoopNesting : public IRVisitor { - int gpu_block_depth = 0, gpu_thread_depth = 0; - string innermost_block_var, innermost_thread_var; - - using IRVisitor::visit; - - void visit(const For *op) override { - ScopedValue old_innermost_block_var(innermost_block_var); - ScopedValue old_innermost_thread_var(innermost_thread_var); - ScopedValue old_gpu_block_depth(gpu_block_depth); - ScopedValue old_gpu_thread_depth(gpu_thread_depth); - - for (int i = 1; i <= 4; i++) { - if (ends_with(op->name, block_names[4 - i])) { - user_assert(i > gpu_block_depth) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_block_var << "\n"; - user_assert(gpu_thread_depth == 0) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_thread_var << "\n"; - innermost_block_var = op->name; - gpu_block_depth = i; - } - if (ends_with(op->name, thread_names[4 - i])) { 
- user_assert(i > gpu_thread_depth) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_thread_var << "\n"; - user_assert(gpu_block_depth > 0) - << "Invalid schedule: Loop over " << op->name - << " must be inside a loop over gpu blocks\n"; - innermost_thread_var = op->name; - gpu_thread_depth = i; - } - } - IRVisitor::visit(op); - } -}; - } // namespace // Also used by InjectImageIntrinsics @@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator { } // namespace Stmt fuse_gpu_thread_loops(Stmt s) { - ValidateGPULoopNesting validate; - s.accept(&validate); // NormalizeIfStatements pushes the predicates between GPU blocks // into the innermost GPU block. FuseGPUThreadLoops would then // merge the predicate into the merged GPU thread. diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 9525c9a07308..c575cd47477d 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -2269,49 +2269,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_ std::ostringstream err; - // If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself - const auto has_gpu_blocks = [&]() { - for (const Dim &d : f.definition().schedule().dims()) { - if (d.for_type == ForType::GPUBlock) { - return true; - } - } - return false; - }; - const auto all_ok = [&]() { return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0; }; - if (all_ok() && has_gpu_blocks()) { - for (int i = 0; i <= compute_idx; i++) { - if (sites[i].is_gpu_block) { - string site_fname = sites[i].loop_level.func(); - user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, " - << "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n"; - } - } - } - - // If you're compute_at() a var marked as a gpu block var, it must be the innermost one - if (all_ok() && sites[compute_idx].is_gpu_block) { - string compute_at_fname = sites[compute_idx].loop_level.func(); - int possibly_invalid_idx = compute_idx; - for (int i = compute_idx + 1; i < (int)sites.size(); i++) { - if (!sites[i].is_gpu_block) { - continue; - } - string site_fname = sites[i].loop_level.func(); - if (site_fname == compute_at_fname) { - err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n"; - sites.erase(sites.begin() + possibly_invalid_idx); - // This one will also be invalid if we find a subsequent loop from the same func - possibly_invalid_idx = i; - store_idx = compute_idx = hoist_storage_idx = -1; - } - } - } - // Check there isn't a parallel loop between the compute_at and the store_at if (all_ok()) { for (int i = store_idx + 1; i <= compute_idx; i++) { diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 5960e7922658..3b946edda6d9 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -132,8 +132,6 @@ tests(GROUPS correctness gpu_data_flows.cpp gpu_different_blocks_threads_dimensions.cpp gpu_dynamic_shared.cpp - gpu_error_1.cpp - gpu_error_2.cpp gpu_free_sync.cpp gpu_give_input_buffers_device_allocations.cpp gpu_jit_explicit_copy_to_device.cpp @@ -187,6 +185,7 @@ tests(GROUPS correctness interval.cpp intrinsics.cpp introspection.cpp + invalid_gpu_loop_nests.cpp inverse.cpp isnan.cpp issue_3926.cpp diff --git a/test/correctness/gpu_error_1.cpp b/test/correctness/gpu_error_1.cpp deleted file mode 100644 index 
d3fafb72f8ba..000000000000 --- a/test/correctness/gpu_error_1.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -class MyCompileTimeErrorReporter : public CompileTimeErrorReporter { -public: - void warning(const char *msg) override { - std::cerr << "Should not see any warnings in this test, but saw: " << msg << "\n"; - exit(1); - } - - void error(const char *msg) override { - std::string m = msg; - if (!strstr(msg, "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops")) { - std::cerr << "Did not see expected error, instead saw: (" << msg << ")\n"; - exit(1); - } - - std::cout << "Success!\n"; - exit(0); - } -}; - -int main(int argc, char **argv) { - static MyCompileTimeErrorReporter reporter; - set_custom_compile_time_error_reporter(&reporter); - - ImageParam im(Float(32), 2); - - Func a("a"), b("b"); - Var x("x"), y("y"); - - a(x, y) = im(x, y); - b(x, y) = a(x, y); - - // Verify that attempting to schedule such that we would have nested gpu-blocks for different - // functions produces a useful error message. - Var xi, yi; - b.gpu_tile(x, y, xi, yi, 4, 4); - a.compute_at(b, x).gpu_tile(x, xi, 4); - - b.realize({32, 32}, Target("host-metal")); - - std::cerr << "Failure, did not see error!\n"; - return 1; -} diff --git a/test/correctness/gpu_error_2.cpp b/test/correctness/gpu_error_2.cpp deleted file mode 100644 index 50a51330d145..000000000000 --- a/test/correctness/gpu_error_2.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -class MyCompileTimeErrorReporter : public CompileTimeErrorReporter { -public: - void warning(const char *msg) override { - std::cerr << "Should not see any warnings in this test, but saw: " << msg << "\n"; - exit(1); - } - - void error(const char *msg) override { - if (!strstr(msg, "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.")) { - std::cerr << "Did not see expected error, instead saw: (" << msg << ")\n"; - exit(1); - } - - std::cout << "Saw expected error message.\n"; - std::cout << "Success!\n"; - exit(0); - } -}; - -int main(int argc, char **argv) { - static MyCompileTimeErrorReporter reporter; - set_custom_compile_time_error_reporter(&reporter); - - ImageParam im(Float(32), 2); - - Func a("a"), b("b"); - Var x("x"), y("y"); - - a(x, y) = im(x, y); - a(x, y) += 1; - b(x, y) = a(x, y); - - Var xi, yi; - b.gpu_tile(x, y, xi, yi, 4, 4); - a.compute_at(b, y); - - b.realize({32, 32}, Target("host-metal")); - - std::cerr << "Failure, did not see error!\n"; - return 1; -} diff --git a/test/correctness/invalid_gpu_loop_nests.cpp b/test/correctness/invalid_gpu_loop_nests.cpp new file mode 100644 index 000000000000..551fe4a8eb54 --- /dev/null +++ b/test/correctness/invalid_gpu_loop_nests.cpp @@ -0,0 +1,103 @@ +#include "Halide.h" +#include + +using namespace Halide; + +void check_error(bool error) { + if (!error) { + std::cout << "There was supposed to be an error!\n"; + exit(1); + } +} + +int main(int argc, char **argv) { +#if HALIDE_WITH_EXCEPTIONS + if (!Halide::exceptions_enabled()) { + std::cout << "[SKIP] Halide was compiled without exceptions.\n"; + return 0; + } + + Target t = get_jit_target_from_environment(); + if (!t.has_gpu_feature()) { + std::cout << "[SKIP] No GPU target enabled.\n"; + return 0; + } + + Var v0, v1, v2, v3, v4, v5, v6, v7; + Param p; + for (int i = 0;; i++) { + Func f{"f"}, g{"g"}; + f(v0, v1, v2, v3, v4, v5, v6, v7) = v0; + g(v0, v1, v2, v3, v4, v5, 
v6, v7) = f(v0, v1, v2, v3, v4, v5, v6, v7); + switch (i) { + case 0: + // Threads but no blocks on an output Func + g.gpu_threads(v0); + break; + case 1: + // Threads but no blocks on a compute_root non-output Func + f.compute_root().gpu_threads(v0); + g.gpu_blocks(v1).gpu_threads(v0); + break; + case 2: + // Too many blocks loops + g.gpu_blocks(v0, v1).gpu_blocks(v2, v3); + break; + case 3: + // Too many threads loops + g.gpu_threads(v0, v1).gpu_threads(v2, v3).gpu_blocks(v4); + break; + case 4: + // Threads outside of blocks + g.gpu_blocks(v0).gpu_threads(v1); + break; + case 5: + // Something with a blocks loop compute_at inside something else with a blocks loop + g.gpu_blocks(v0); + f.compute_at(g, v0).gpu_blocks(v0); + break; + case 6: + // Something compute_at between two gpu_blocks loops + g.gpu_blocks(v0, v1); + f.compute_at(g, v1); + break; + case 7: + // Something with too many threads loops once nesting is taken into account + g.gpu_threads(v0, v1).gpu_blocks(v2, v3); + f.compute_at(g, v0).gpu_threads(v0, v1); + break; + case 8: + // The same, but only in a specialization + g.gpu_threads(v0, v1).gpu_blocks(v2, v3); + f.compute_at(g, v0).gpu_threads(v0).specialize(p).gpu_threads(v1); + break; + case 9: + // A serial loop in between two gpu blocks loops + g.gpu_blocks(v5, v7); + break; + default: + std::cout << "Success!\n"; + return 0; + } + + bool error = false; + try { + g.compile_jit(); + } catch (const Halide::CompileError &e) { + error = true; + std::cout << "Expected compile error:\n" + << e.what() << "\n"; + } + + if (!error) { + printf("There should have been an error\n"); + return 1; + } + } + + // unreachable +#else + std::cout << "[SKIP] Halide was compiled without exceptions.\n"; + return 0; +#endif +} From 55dfa397c2c6bac0c0394c4d3d802b79e21559be Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Wed, 7 Feb 2024 10:23:46 -0800 Subject: [PATCH 052/186] Add an easy way to print vectors in debug output. (#8072) * Add helper to print containers, or at least vectors, in debug info. * Add documentation comments. * Formatting. * Name change. --- src/Debug.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/Debug.h b/src/Debug.h index fadb5b4066ac..9f47a5aebeb6 100644 --- a/src/Debug.h +++ b/src/Debug.h @@ -65,6 +65,60 @@ class debug { static int debug_level(); }; +/** Allow easily printing the contents of containers, or std::vector-like containers, + * in debug output. Used like so: + * std::vector arg_types; + * debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n"; + * Which results in output like "arg_types: { uint8x8, uint8x8 }" on one line. */ +template +struct PrintSpan { + const T &span; + PrintSpan(const T &span) + : span(span) { + } +}; + +template +inline StreamT &operator<<(StreamT &stream, const PrintSpan &wrapper) { + stream << "{ "; + const char *sep = ""; + for (const auto &e : wrapper.span) { + stream << sep << e; + sep = ", "; + } + stream << " }"; + return stream; +} + +/** Allow easily printing the contents of spans, or std::vector-like spans, + * in debug output. Used like so: + * std::vector arg_types; + * debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n"; + * Which results in output like: + * arg_types: + * { + * uint8x8, + * uint8x8, + * } + * Indentation uses a tab character. 
*/ +template +struct PrintSpanLn { + const T &span; + PrintSpanLn(const T &span) + : span(span) { + } +}; + +template +inline StreamT &operator<<(StreamT &stream, const PrintSpanLn &wrapper) { + stream << "\n{\n"; + for (const auto &e : wrapper.span) { + stream << "\t" << e << ",\n"; + } + stream << "}\n"; + return stream; +} + } // namespace Internal } // namespace Halide From de8e39dbcd2d60a47e5465303bd5aa7f30d404d7 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 9 Feb 2024 16:55:00 +0000 Subject: [PATCH 053/186] Bump serialization version to 18.0.0 (#8080) * Bump serialization version to 18.0.0 As a matter of policy, we should probably bump the version of the serialization format for every version of Halide -- even if changes are minimal-to-nonexistent -- to reinforce the fact that this isn't intended in any way as a long-term archival format. This PR suggests that we bump the major version to match the main Halide version, but I'm open for other suggestions. * Update halide_ir.fbs --- src/halide_ir.fbs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index e5855e301d1e..d91222d62f65 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -7,10 +7,12 @@ file_identifier "HLDE"; file_extension "hlpipe"; enum SerializationVersionMajor: int { - Value = 0 + Value = 18 } enum SerializationVersionMinor: int { - Value = 1 + // 0 = Unstable + // 1 = First stable version + Value = 0 } enum SerializationVersionPatch: int { Value = 0 From a3baa5de2b1064fa2930b94d9a49b11676457cbb Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 9 Feb 2024 13:39:21 -0500 Subject: [PATCH 054/186] [WebGPU] Update to latest native headers (#8081) * [WebGPU] Update to latest native headers * Remove #ifdef for `requiredFeature[s]Count` * Pass nullptr to wgpuCreateInstance * Emscripten currently requires this * Dawn accepts it too * Use nullptr for another wgpuCreateInstance call --- src/runtime/mini_webgpu.h | 490 +++++++++++++++++++++++++------------- src/runtime/webgpu.cpp | 4 +- test/common/gpu_context.h | 9 +- 3 files changed, 323 insertions(+), 180 deletions(-) diff --git a/src/runtime/mini_webgpu.h b/src/runtime/mini_webgpu.h index 5a766d1a80c3..3d6bf862f0b7 100644 --- a/src/runtime/mini_webgpu.h +++ b/src/runtime/mini_webgpu.h @@ -75,7 +75,7 @@ #define WGPU_ARRAY_LAYER_COUNT_UNDEFINED UINT32_MAX #define WGPU_COPY_STRIDE_UNDEFINED UINT32_MAX -#define WGPU_DEPTH_SLICE_UNDEFINED (0xffffffffUL) +#define WGPU_DEPTH_SLICE_UNDEFINED UINT32_MAX #define WGPU_LIMIT_U32_UNDEFINED UINT32_MAX #define WGPU_LIMIT_U64_UNDEFINED UINT64_MAX #define WGPU_MIP_LEVEL_COUNT_UNDEFINED UINT32_MAX @@ -115,6 +115,7 @@ typedef struct WGPUTextureViewImpl* WGPUTextureView WGPU_OBJECT_ATTRIBUTE; // Structure forward declarations struct WGPUAdapterProperties; +struct WGPUAdapterPropertiesD3D; struct WGPUBindGroupEntry; struct WGPUBlendComponent; struct WGPUBufferBindingLayout; @@ -128,9 +129,13 @@ struct WGPUCompilationMessage; struct WGPUComputePassTimestampWrites; struct WGPUConstantEntry; struct WGPUCopyTextureForBrowserOptions; +struct WGPUCreateComputePipelineAsyncCallbackInfo; +struct WGPUCreateRenderPipelineAsyncCallbackInfo; +struct WGPUDawnWGSLBlocklist; struct WGPUDawnAdapterPropertiesPowerPreference; struct WGPUDawnBufferDescriptorErrorInfoFromWireClient; struct WGPUDawnCacheDeviceDescriptor; +struct WGPUDawnComputePipelineFullSubgroups; struct WGPUDawnEncoderInternalUsageDescriptor; struct WGPUDawnExperimentalSubgroupLimits; struct 
WGPUDawnMultisampleStateRenderToSingleSampled; @@ -138,6 +143,7 @@ struct WGPUDawnRenderPassColorAttachmentRenderToSingleSampled; struct WGPUDawnShaderModuleSPIRVOptionsDescriptor; struct WGPUDawnTextureInternalUsageDescriptor; struct WGPUDawnTogglesDescriptor; +struct WGPUDawnWireWGSLControl; struct WGPUDepthStencilStateDepthWriteDefinedDawn; struct WGPUExtent2D; struct WGPUExtent3D; @@ -146,6 +152,7 @@ struct WGPUExternalTextureBindingLayout; struct WGPUFuture; struct WGPUInstanceFeatures; struct WGPULimits; +struct WGPUMemoryHeapInfo; struct WGPUMultisampleState; struct WGPUOrigin2D; struct WGPUOrigin3D; @@ -163,31 +170,32 @@ struct WGPURenderPassDescriptorMaxDrawCount; struct WGPURenderPassTimestampWrites; struct WGPURequestAdapterCallbackInfo; struct WGPURequestAdapterOptions; +struct WGPURequestDeviceCallbackInfo; struct WGPUSamplerBindingLayout; struct WGPUSamplerDescriptor; -struct WGPUShaderModuleDescriptor; struct WGPUShaderModuleSPIRVDescriptor; struct WGPUShaderModuleWGSLDescriptor; -struct WGPUSharedFenceDescriptor; +struct WGPUShaderModuleDescriptor; struct WGPUSharedFenceDXGISharedHandleDescriptor; struct WGPUSharedFenceDXGISharedHandleExportInfo; -struct WGPUSharedFenceExportInfo; struct WGPUSharedFenceMTLSharedEventDescriptor; struct WGPUSharedFenceMTLSharedEventExportInfo; +struct WGPUSharedFenceDescriptor; +struct WGPUSharedFenceExportInfo; struct WGPUSharedFenceVkSemaphoreOpaqueFDDescriptor; struct WGPUSharedFenceVkSemaphoreOpaqueFDExportInfo; struct WGPUSharedFenceVkSemaphoreSyncFDDescriptor; struct WGPUSharedFenceVkSemaphoreSyncFDExportInfo; struct WGPUSharedFenceVkSemaphoreZirconHandleDescriptor; struct WGPUSharedFenceVkSemaphoreZirconHandleExportInfo; +struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor; +struct WGPUSharedTextureMemoryEGLImageDescriptor; +struct WGPUSharedTextureMemoryIOSurfaceDescriptor; struct WGPUSharedTextureMemoryAHardwareBufferDescriptor; struct WGPUSharedTextureMemoryBeginAccessDescriptor; struct WGPUSharedTextureMemoryDescriptor; -struct WGPUSharedTextureMemoryDmaBufDescriptor; -struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor; -struct WGPUSharedTextureMemoryEGLImageDescriptor; +struct WGPUSharedTextureMemoryDmaBufPlane; struct WGPUSharedTextureMemoryEndAccessState; -struct WGPUSharedTextureMemoryIOSurfaceDescriptor; struct WGPUSharedTextureMemoryOpaqueFDDescriptor; struct WGPUSharedTextureMemoryVkDedicatedAllocationDescriptor; struct WGPUSharedTextureMemoryVkImageLayoutBeginState; @@ -200,8 +208,8 @@ struct WGPUSurfaceDescriptorFromAndroidNativeWindow; struct WGPUSurfaceDescriptorFromCanvasHTMLSelector; struct WGPUSurfaceDescriptorFromMetalLayer; struct WGPUSurfaceDescriptorFromWaylandSurface; -struct WGPUSurfaceDescriptorFromWindowsCoreWindow; struct WGPUSurfaceDescriptorFromWindowsHWND; +struct WGPUSurfaceDescriptorFromWindowsCoreWindow; struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel; struct WGPUSurfaceDescriptorFromXlibWindow; struct WGPUSwapChainDescriptor; @@ -210,6 +218,7 @@ struct WGPUTextureBindingViewDimensionDescriptor; struct WGPUTextureDataLayout; struct WGPUTextureViewDescriptor; struct WGPUVertexAttribute; +struct WGPUAdapterPropertiesMemoryHeaps; struct WGPUBindGroupDescriptor; struct WGPUBindGroupLayoutEntry; struct WGPUBlendState; @@ -227,6 +236,7 @@ struct WGPUProgrammableStageDescriptor; struct WGPURenderPassColorAttachment; struct WGPURenderPassStorageAttachment; struct WGPURequiredLimits; +struct WGPUSharedTextureMemoryDmaBufDescriptor; struct WGPUSharedTextureMemoryProperties; struct 
WGPUSharedTextureMemoryVkImageDescriptor; struct WGPUSupportedLimits; @@ -242,25 +252,40 @@ struct WGPUVertexState; struct WGPUFragmentState; struct WGPURenderPipelineDescriptor; +typedef enum WGPUWGSLFeatureName { + WGPUWGSLFeatureName_Undefined = 0x00000000, + WGPUWGSLFeatureName_ReadonlyAndReadwriteStorageTextures = 0x00000001, + WGPUWGSLFeatureName_Packed4x8IntegerDotProduct = 0x00000002, + WGPUWGSLFeatureName_UnrestrictedPointerParameters = 0x00000003, + WGPUWGSLFeatureName_PointerCompositeAccess = 0x00000004, + WGPUWGSLFeatureName_ChromiumTestingUnimplemented = 0x000003E8, + WGPUWGSLFeatureName_ChromiumTestingUnsafeExperimental = 0x000003E9, + WGPUWGSLFeatureName_ChromiumTestingExperimental = 0x000003EA, + WGPUWGSLFeatureName_ChromiumTestingShippedWithKillswitch = 0x000003EB, + WGPUWGSLFeatureName_ChromiumTestingShipped = 0x000003EC, + WGPUWGSLFeatureName_Force32 = 0x7FFFFFFF +} WGPUWGSLFeatureName WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUAdapterType { - WGPUAdapterType_DiscreteGPU = 0x00000000, - WGPUAdapterType_IntegratedGPU = 0x00000001, - WGPUAdapterType_CPU = 0x00000002, - WGPUAdapterType_Unknown = 0x00000003, + WGPUAdapterType_DiscreteGPU = 0x00000001, + WGPUAdapterType_IntegratedGPU = 0x00000002, + WGPUAdapterType_CPU = 0x00000003, + WGPUAdapterType_Unknown = 0x00000004, WGPUAdapterType_Force32 = 0x7FFFFFFF } WGPUAdapterType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUAddressMode { - WGPUAddressMode_Repeat = 0x00000000, - WGPUAddressMode_MirrorRepeat = 0x00000001, - WGPUAddressMode_ClampToEdge = 0x00000002, + WGPUAddressMode_Undefined = 0x00000000, + WGPUAddressMode_ClampToEdge = 0x00000001, + WGPUAddressMode_Repeat = 0x00000002, + WGPUAddressMode_MirrorRepeat = 0x00000003, WGPUAddressMode_Force32 = 0x7FFFFFFF } WGPUAddressMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUAlphaMode { - WGPUAlphaMode_Premultiplied = 0x00000000, - WGPUAlphaMode_Unpremultiplied = 0x00000001, - WGPUAlphaMode_Opaque = 0x00000002, + WGPUAlphaMode_Opaque = 0x00000001, + WGPUAlphaMode_Premultiplied = 0x00000002, + WGPUAlphaMode_Unpremultiplied = 0x00000003, WGPUAlphaMode_Force32 = 0x7FFFFFFF } WGPUAlphaMode WGPU_ENUM_ATTRIBUTE; @@ -278,32 +303,34 @@ typedef enum WGPUBackendType { } WGPUBackendType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBlendFactor { - WGPUBlendFactor_Zero = 0x00000000, - WGPUBlendFactor_One = 0x00000001, - WGPUBlendFactor_Src = 0x00000002, - WGPUBlendFactor_OneMinusSrc = 0x00000003, - WGPUBlendFactor_SrcAlpha = 0x00000004, - WGPUBlendFactor_OneMinusSrcAlpha = 0x00000005, - WGPUBlendFactor_Dst = 0x00000006, - WGPUBlendFactor_OneMinusDst = 0x00000007, - WGPUBlendFactor_DstAlpha = 0x00000008, - WGPUBlendFactor_OneMinusDstAlpha = 0x00000009, - WGPUBlendFactor_SrcAlphaSaturated = 0x0000000A, - WGPUBlendFactor_Constant = 0x0000000B, - WGPUBlendFactor_OneMinusConstant = 0x0000000C, - WGPUBlendFactor_Src1 = 0x0000000D, - WGPUBlendFactor_OneMinusSrc1 = 0x0000000E, - WGPUBlendFactor_Src1Alpha = 0x0000000F, - WGPUBlendFactor_OneMinusSrc1Alpha = 0x00000010, + WGPUBlendFactor_Undefined = 0x00000000, + WGPUBlendFactor_Zero = 0x00000001, + WGPUBlendFactor_One = 0x00000002, + WGPUBlendFactor_Src = 0x00000003, + WGPUBlendFactor_OneMinusSrc = 0x00000004, + WGPUBlendFactor_SrcAlpha = 0x00000005, + WGPUBlendFactor_OneMinusSrcAlpha = 0x00000006, + WGPUBlendFactor_Dst = 0x00000007, + WGPUBlendFactor_OneMinusDst = 0x00000008, + WGPUBlendFactor_DstAlpha = 0x00000009, + WGPUBlendFactor_OneMinusDstAlpha = 0x0000000A, + WGPUBlendFactor_SrcAlphaSaturated = 0x0000000B, + WGPUBlendFactor_Constant = 0x0000000C, + 
WGPUBlendFactor_OneMinusConstant = 0x0000000D, + WGPUBlendFactor_Src1 = 0x0000000E, + WGPUBlendFactor_OneMinusSrc1 = 0x0000000F, + WGPUBlendFactor_Src1Alpha = 0x00000010, + WGPUBlendFactor_OneMinusSrc1Alpha = 0x00000011, WGPUBlendFactor_Force32 = 0x7FFFFFFF } WGPUBlendFactor WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBlendOperation { - WGPUBlendOperation_Add = 0x00000000, - WGPUBlendOperation_Subtract = 0x00000001, - WGPUBlendOperation_ReverseSubtract = 0x00000002, - WGPUBlendOperation_Min = 0x00000003, - WGPUBlendOperation_Max = 0x00000004, + WGPUBlendOperation_Undefined = 0x00000000, + WGPUBlendOperation_Add = 0x00000001, + WGPUBlendOperation_Subtract = 0x00000002, + WGPUBlendOperation_ReverseSubtract = 0x00000003, + WGPUBlendOperation_Min = 0x00000004, + WGPUBlendOperation_Max = 0x00000005, WGPUBlendOperation_Force32 = 0x7FFFFFFF } WGPUBlendOperation WGPU_ENUM_ATTRIBUTE; @@ -317,21 +344,22 @@ typedef enum WGPUBufferBindingType { typedef enum WGPUBufferMapAsyncStatus { WGPUBufferMapAsyncStatus_Success = 0x00000000, - WGPUBufferMapAsyncStatus_ValidationError = 0x00000001, - WGPUBufferMapAsyncStatus_Unknown = 0x00000002, - WGPUBufferMapAsyncStatus_DeviceLost = 0x00000003, - WGPUBufferMapAsyncStatus_DestroyedBeforeCallback = 0x00000004, - WGPUBufferMapAsyncStatus_UnmappedBeforeCallback = 0x00000005, - WGPUBufferMapAsyncStatus_MappingAlreadyPending = 0x00000006, - WGPUBufferMapAsyncStatus_OffsetOutOfRange = 0x00000007, - WGPUBufferMapAsyncStatus_SizeOutOfRange = 0x00000008, + WGPUBufferMapAsyncStatus_InstanceDropped = 0x00000001, + WGPUBufferMapAsyncStatus_ValidationError = 0x00000002, + WGPUBufferMapAsyncStatus_Unknown = 0x00000003, + WGPUBufferMapAsyncStatus_DeviceLost = 0x00000004, + WGPUBufferMapAsyncStatus_DestroyedBeforeCallback = 0x00000005, + WGPUBufferMapAsyncStatus_UnmappedBeforeCallback = 0x00000006, + WGPUBufferMapAsyncStatus_MappingAlreadyPending = 0x00000007, + WGPUBufferMapAsyncStatus_OffsetOutOfRange = 0x00000008, + WGPUBufferMapAsyncStatus_SizeOutOfRange = 0x00000009, WGPUBufferMapAsyncStatus_Force32 = 0x7FFFFFFF } WGPUBufferMapAsyncStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBufferMapState { - WGPUBufferMapState_Unmapped = 0x00000000, - WGPUBufferMapState_Pending = 0x00000001, - WGPUBufferMapState_Mapped = 0x00000002, + WGPUBufferMapState_Unmapped = 0x00000001, + WGPUBufferMapState_Pending = 0x00000002, + WGPUBufferMapState_Mapped = 0x00000003, WGPUBufferMapState_Force32 = 0x7FFFFFFF } WGPUBufferMapState WGPU_ENUM_ATTRIBUTE; @@ -346,44 +374,47 @@ typedef enum WGPUCompareFunction { WGPUCompareFunction_Undefined = 0x00000000, WGPUCompareFunction_Never = 0x00000001, WGPUCompareFunction_Less = 0x00000002, - WGPUCompareFunction_LessEqual = 0x00000003, - WGPUCompareFunction_Greater = 0x00000004, - WGPUCompareFunction_GreaterEqual = 0x00000005, - WGPUCompareFunction_Equal = 0x00000006, - WGPUCompareFunction_NotEqual = 0x00000007, + WGPUCompareFunction_Equal = 0x00000003, + WGPUCompareFunction_LessEqual = 0x00000004, + WGPUCompareFunction_Greater = 0x00000005, + WGPUCompareFunction_NotEqual = 0x00000006, + WGPUCompareFunction_GreaterEqual = 0x00000007, WGPUCompareFunction_Always = 0x00000008, WGPUCompareFunction_Force32 = 0x7FFFFFFF } WGPUCompareFunction WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCompilationInfoRequestStatus { WGPUCompilationInfoRequestStatus_Success = 0x00000000, - WGPUCompilationInfoRequestStatus_Error = 0x00000001, - WGPUCompilationInfoRequestStatus_DeviceLost = 0x00000002, - WGPUCompilationInfoRequestStatus_Unknown = 0x00000003, + 
WGPUCompilationInfoRequestStatus_InstanceDropped = 0x00000001, + WGPUCompilationInfoRequestStatus_Error = 0x00000002, + WGPUCompilationInfoRequestStatus_DeviceLost = 0x00000003, + WGPUCompilationInfoRequestStatus_Unknown = 0x00000004, WGPUCompilationInfoRequestStatus_Force32 = 0x7FFFFFFF } WGPUCompilationInfoRequestStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCompilationMessageType { - WGPUCompilationMessageType_Error = 0x00000000, - WGPUCompilationMessageType_Warning = 0x00000001, - WGPUCompilationMessageType_Info = 0x00000002, + WGPUCompilationMessageType_Error = 0x00000001, + WGPUCompilationMessageType_Warning = 0x00000002, + WGPUCompilationMessageType_Info = 0x00000003, WGPUCompilationMessageType_Force32 = 0x7FFFFFFF } WGPUCompilationMessageType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCreatePipelineAsyncStatus { WGPUCreatePipelineAsyncStatus_Success = 0x00000000, - WGPUCreatePipelineAsyncStatus_ValidationError = 0x00000001, - WGPUCreatePipelineAsyncStatus_InternalError = 0x00000002, - WGPUCreatePipelineAsyncStatus_DeviceLost = 0x00000003, - WGPUCreatePipelineAsyncStatus_DeviceDestroyed = 0x00000004, - WGPUCreatePipelineAsyncStatus_Unknown = 0x00000005, + WGPUCreatePipelineAsyncStatus_InstanceDropped = 0x00000001, + WGPUCreatePipelineAsyncStatus_ValidationError = 0x00000002, + WGPUCreatePipelineAsyncStatus_InternalError = 0x00000003, + WGPUCreatePipelineAsyncStatus_DeviceLost = 0x00000004, + WGPUCreatePipelineAsyncStatus_DeviceDestroyed = 0x00000005, + WGPUCreatePipelineAsyncStatus_Unknown = 0x00000006, WGPUCreatePipelineAsyncStatus_Force32 = 0x7FFFFFFF } WGPUCreatePipelineAsyncStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCullMode { - WGPUCullMode_None = 0x00000000, - WGPUCullMode_Front = 0x00000001, - WGPUCullMode_Back = 0x00000002, + WGPUCullMode_Undefined = 0x00000000, + WGPUCullMode_None = 0x00000001, + WGPUCullMode_Front = 0x00000002, + WGPUCullMode_Back = 0x00000003, WGPUCullMode_Force32 = 0x7FFFFFFF } WGPUCullMode WGPU_ENUM_ATTRIBUTE; @@ -394,9 +425,9 @@ typedef enum WGPUDeviceLostReason { } WGPUDeviceLostReason WGPU_ENUM_ATTRIBUTE; typedef enum WGPUErrorFilter { - WGPUErrorFilter_Validation = 0x00000000, - WGPUErrorFilter_OutOfMemory = 0x00000001, - WGPUErrorFilter_Internal = 0x00000002, + WGPUErrorFilter_Validation = 0x00000001, + WGPUErrorFilter_OutOfMemory = 0x00000002, + WGPUErrorFilter_Internal = 0x00000003, WGPUErrorFilter_Force32 = 0x7FFFFFFF } WGPUErrorFilter WGPU_ENUM_ATTRIBUTE; @@ -434,7 +465,6 @@ typedef enum WGPUFeatureName { WGPUFeatureName_DawnInternalUsages = 0x000003EA, WGPUFeatureName_DawnMultiPlanarFormats = 0x000003EB, WGPUFeatureName_DawnNative = 0x000003EC, - WGPUFeatureName_ChromiumExperimentalDp4a = 0x000003ED, WGPUFeatureName_ChromiumExperimentalTimestampQueryInsidePasses = 0x000003EE, WGPUFeatureName_ImplicitDeviceSynchronization = 0x000003EF, WGPUFeatureName_SurfaceCapabilities = 0x000003F0, @@ -455,6 +485,8 @@ typedef enum WGPUFeatureName { WGPUFeatureName_MultiPlanarFormatNv12a = 0x00000400, WGPUFeatureName_FramebufferFetch = 0x00000401, WGPUFeatureName_BufferMapExtendedUsages = 0x00000402, + WGPUFeatureName_AdapterPropertiesMemoryHeaps = 0x00000403, + WGPUFeatureName_AdapterPropertiesD3D = 0x00000404, WGPUFeatureName_SharedTextureMemoryVkDedicatedAllocation = 0x0000044C, WGPUFeatureName_SharedTextureMemoryAHardwareBuffer = 0x0000044D, WGPUFeatureName_SharedTextureMemoryDmaBuf = 0x0000044E, @@ -473,14 +505,16 @@ typedef enum WGPUFeatureName { } WGPUFeatureName WGPU_ENUM_ATTRIBUTE; typedef enum WGPUFilterMode { - WGPUFilterMode_Nearest = 0x00000000, - 
WGPUFilterMode_Linear = 0x00000001, + WGPUFilterMode_Undefined = 0x00000000, + WGPUFilterMode_Nearest = 0x00000001, + WGPUFilterMode_Linear = 0x00000002, WGPUFilterMode_Force32 = 0x7FFFFFFF } WGPUFilterMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUFrontFace { - WGPUFrontFace_CCW = 0x00000000, - WGPUFrontFace_CW = 0x00000001, + WGPUFrontFace_Undefined = 0x00000000, + WGPUFrontFace_CCW = 0x00000001, + WGPUFrontFace_CW = 0x00000002, WGPUFrontFace_Force32 = 0x7FFFFFFF } WGPUFrontFace WGPU_ENUM_ATTRIBUTE; @@ -499,16 +533,17 @@ typedef enum WGPULoadOp { } WGPULoadOp WGPU_ENUM_ATTRIBUTE; typedef enum WGPULoggingType { - WGPULoggingType_Verbose = 0x00000000, - WGPULoggingType_Info = 0x00000001, - WGPULoggingType_Warning = 0x00000002, - WGPULoggingType_Error = 0x00000003, + WGPULoggingType_Verbose = 0x00000001, + WGPULoggingType_Info = 0x00000002, + WGPULoggingType_Warning = 0x00000003, + WGPULoggingType_Error = 0x00000004, WGPULoggingType_Force32 = 0x7FFFFFFF } WGPULoggingType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUMipmapFilterMode { - WGPUMipmapFilterMode_Nearest = 0x00000000, - WGPUMipmapFilterMode_Linear = 0x00000001, + WGPUMipmapFilterMode_Undefined = 0x00000000, + WGPUMipmapFilterMode_Nearest = 0x00000001, + WGPUMipmapFilterMode_Linear = 0x00000002, WGPUMipmapFilterMode_Force32 = 0x7FFFFFFF } WGPUMipmapFilterMode WGPU_ENUM_ATTRIBUTE; @@ -520,47 +555,51 @@ typedef enum WGPUPowerPreference { } WGPUPowerPreference WGPU_ENUM_ATTRIBUTE; typedef enum WGPUPresentMode { - WGPUPresentMode_Fifo = 0x00000000, - WGPUPresentMode_Immediate = 0x00000002, - WGPUPresentMode_Mailbox = 0x00000003, + WGPUPresentMode_Fifo = 0x00000001, + WGPUPresentMode_Immediate = 0x00000003, + WGPUPresentMode_Mailbox = 0x00000004, WGPUPresentMode_Force32 = 0x7FFFFFFF } WGPUPresentMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUPrimitiveTopology { - WGPUPrimitiveTopology_PointList = 0x00000000, - WGPUPrimitiveTopology_LineList = 0x00000001, - WGPUPrimitiveTopology_LineStrip = 0x00000002, - WGPUPrimitiveTopology_TriangleList = 0x00000003, - WGPUPrimitiveTopology_TriangleStrip = 0x00000004, + WGPUPrimitiveTopology_Undefined = 0x00000000, + WGPUPrimitiveTopology_PointList = 0x00000001, + WGPUPrimitiveTopology_LineList = 0x00000002, + WGPUPrimitiveTopology_LineStrip = 0x00000003, + WGPUPrimitiveTopology_TriangleList = 0x00000004, + WGPUPrimitiveTopology_TriangleStrip = 0x00000005, WGPUPrimitiveTopology_Force32 = 0x7FFFFFFF } WGPUPrimitiveTopology WGPU_ENUM_ATTRIBUTE; typedef enum WGPUQueryType { - WGPUQueryType_Occlusion = 0x00000000, - WGPUQueryType_Timestamp = 0x00000001, + WGPUQueryType_Occlusion = 0x00000001, + WGPUQueryType_Timestamp = 0x00000002, WGPUQueryType_Force32 = 0x7FFFFFFF } WGPUQueryType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUQueueWorkDoneStatus { WGPUQueueWorkDoneStatus_Success = 0x00000000, - WGPUQueueWorkDoneStatus_Error = 0x00000001, - WGPUQueueWorkDoneStatus_Unknown = 0x00000002, - WGPUQueueWorkDoneStatus_DeviceLost = 0x00000003, + WGPUQueueWorkDoneStatus_InstanceDropped = 0x00000001, + WGPUQueueWorkDoneStatus_Error = 0x00000002, + WGPUQueueWorkDoneStatus_Unknown = 0x00000003, + WGPUQueueWorkDoneStatus_DeviceLost = 0x00000004, WGPUQueueWorkDoneStatus_Force32 = 0x7FFFFFFF } WGPUQueueWorkDoneStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPURequestAdapterStatus { WGPURequestAdapterStatus_Success = 0x00000000, - WGPURequestAdapterStatus_Unavailable = 0x00000001, - WGPURequestAdapterStatus_Error = 0x00000002, - WGPURequestAdapterStatus_Unknown = 0x00000003, + WGPURequestAdapterStatus_InstanceDropped = 0x00000001, + 
WGPURequestAdapterStatus_Unavailable = 0x00000002, + WGPURequestAdapterStatus_Error = 0x00000003, + WGPURequestAdapterStatus_Unknown = 0x00000004, WGPURequestAdapterStatus_Force32 = 0x7FFFFFFF } WGPURequestAdapterStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPURequestDeviceStatus { WGPURequestDeviceStatus_Success = 0x00000000, - WGPURequestDeviceStatus_Error = 0x00000001, - WGPURequestDeviceStatus_Unknown = 0x00000002, + WGPURequestDeviceStatus_InstanceDropped = 0x00000001, + WGPURequestDeviceStatus_Error = 0x00000002, + WGPURequestDeviceStatus_Unknown = 0x00000003, WGPURequestDeviceStatus_Force32 = 0x7FFFFFFF } WGPURequestDeviceStatus WGPU_ENUM_ATTRIBUTE; @@ -599,6 +638,11 @@ typedef enum WGPUSType { WGPUSType_PipelineLayoutPixelLocalStorage = 0x000003F8, WGPUSType_BufferHostMappedPointer = 0x000003F9, WGPUSType_DawnExperimentalSubgroupLimits = 0x000003FA, + WGPUSType_AdapterPropertiesMemoryHeaps = 0x000003FB, + WGPUSType_AdapterPropertiesD3D = 0x000003FC, + WGPUSType_DawnComputePipelineFullSubgroups = 0x000003FD, + WGPUSType_DawnWireWGSLControl = 0x000003FE, + WGPUSType_DawnWGSLBlocklist = 0x000003FF, WGPUSType_SharedTextureMemoryVkImageDescriptor = 0x0000044C, WGPUSType_SharedTextureMemoryVkDedicatedAllocationDescriptor = 0x0000044D, WGPUSType_SharedTextureMemoryAHardwareBufferDescriptor = 0x0000044E, @@ -645,14 +689,15 @@ typedef enum WGPUSharedFenceType { } WGPUSharedFenceType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUStencilOperation { - WGPUStencilOperation_Keep = 0x00000000, - WGPUStencilOperation_Zero = 0x00000001, - WGPUStencilOperation_Replace = 0x00000002, - WGPUStencilOperation_Invert = 0x00000003, - WGPUStencilOperation_IncrementClamp = 0x00000004, - WGPUStencilOperation_DecrementClamp = 0x00000005, - WGPUStencilOperation_IncrementWrap = 0x00000006, - WGPUStencilOperation_DecrementWrap = 0x00000007, + WGPUStencilOperation_Undefined = 0x00000000, + WGPUStencilOperation_Keep = 0x00000001, + WGPUStencilOperation_Zero = 0x00000002, + WGPUStencilOperation_Replace = 0x00000003, + WGPUStencilOperation_Invert = 0x00000004, + WGPUStencilOperation_IncrementClamp = 0x00000005, + WGPUStencilOperation_DecrementClamp = 0x00000006, + WGPUStencilOperation_IncrementWrap = 0x00000007, + WGPUStencilOperation_DecrementWrap = 0x00000008, WGPUStencilOperation_Force32 = 0x7FFFFFFF } WGPUStencilOperation WGPU_ENUM_ATTRIBUTE; @@ -672,19 +717,21 @@ typedef enum WGPUStoreOp { } WGPUStoreOp WGPU_ENUM_ATTRIBUTE; typedef enum WGPUTextureAspect { - WGPUTextureAspect_All = 0x00000000, - WGPUTextureAspect_StencilOnly = 0x00000001, - WGPUTextureAspect_DepthOnly = 0x00000002, - WGPUTextureAspect_Plane0Only = 0x00000003, - WGPUTextureAspect_Plane1Only = 0x00000004, - WGPUTextureAspect_Plane2Only = 0x00000005, + WGPUTextureAspect_Undefined = 0x00000000, + WGPUTextureAspect_All = 0x00000001, + WGPUTextureAspect_StencilOnly = 0x00000002, + WGPUTextureAspect_DepthOnly = 0x00000003, + WGPUTextureAspect_Plane0Only = 0x00000004, + WGPUTextureAspect_Plane1Only = 0x00000005, + WGPUTextureAspect_Plane2Only = 0x00000006, WGPUTextureAspect_Force32 = 0x7FFFFFFF } WGPUTextureAspect WGPU_ENUM_ATTRIBUTE; typedef enum WGPUTextureDimension { - WGPUTextureDimension_1D = 0x00000000, - WGPUTextureDimension_2D = 0x00000001, - WGPUTextureDimension_3D = 0x00000002, + WGPUTextureDimension_Undefined = 0x00000000, + WGPUTextureDimension_1D = 0x00000001, + WGPUTextureDimension_2D = 0x00000002, + WGPUTextureDimension_3D = 0x00000003, WGPUTextureDimension_Force32 = 0x7FFFFFFF } WGPUTextureDimension WGPU_ENUM_ATTRIBUTE; @@ -855,9 +902,10 @@ typedef 
enum WGPUVertexFormat { } WGPUVertexFormat WGPU_ENUM_ATTRIBUTE; typedef enum WGPUVertexStepMode { - WGPUVertexStepMode_Vertex = 0x00000000, - WGPUVertexStepMode_Instance = 0x00000001, - WGPUVertexStepMode_VertexBufferNotUsed = 0x00000002, + WGPUVertexStepMode_Undefined = 0x00000000, + WGPUVertexStepMode_VertexBufferNotUsed = 0x00000001, + WGPUVertexStepMode_Vertex = 0x00000002, + WGPUVertexStepMode_Instance = 0x00000003, WGPUVertexStepMode_Force32 = 0x7FFFFFFF } WGPUVertexStepMode WGPU_ENUM_ATTRIBUTE; @@ -898,6 +946,17 @@ typedef enum WGPUColorWriteMask { } WGPUColorWriteMask WGPU_ENUM_ATTRIBUTE; typedef WGPUFlags WGPUColorWriteMaskFlags WGPU_ENUM_ATTRIBUTE; +typedef enum WGPUHeapProperty { + WGPUHeapProperty_Undefined = 0x00000000, + WGPUHeapProperty_DeviceLocal = 0x00000001, + WGPUHeapProperty_HostVisible = 0x00000002, + WGPUHeapProperty_HostCoherent = 0x00000004, + WGPUHeapProperty_HostUncached = 0x00000008, + WGPUHeapProperty_HostCached = 0x00000010, + WGPUHeapProperty_Force32 = 0x7FFFFFFF +} WGPUHeapProperty WGPU_ENUM_ATTRIBUTE; +typedef WGPUFlags WGPUHeapPropertyFlags WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUMapMode { WGPUMapMode_None = 0x00000000, WGPUMapMode_Read = 0x00000001, @@ -933,6 +992,8 @@ typedef void (*WGPUCallback)(void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, struct WGPUCompilationInfo const * compilationInfo, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef size_t (*WGPUDawnLoadCacheDataFunction)(void const * key, size_t keySize, void * value, size_t valueSize, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUDawnStoreCacheDataFunction)(void const * key, size_t keySize, void const * value, size_t valueSize, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; @@ -964,6 +1025,12 @@ typedef struct WGPUAdapterProperties { WGPUBool compatibilityMode; } WGPUAdapterProperties WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUAdapterProperties +typedef struct WGPUAdapterPropertiesD3D { + WGPUChainedStructOut chain; + uint32_t shaderModel; +} WGPUAdapterPropertiesD3D WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUBindGroupEntry { WGPUChainedStruct const * nextInChain; uint32_t binding; @@ -1064,6 +1131,27 @@ typedef struct WGPUCopyTextureForBrowserOptions { WGPUBool internalUsage; } WGPUCopyTextureForBrowserOptions WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUCreateComputePipelineAsyncCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPUCreateComputePipelineAsyncCallback callback; + void * userdata; +} WGPUCreateComputePipelineAsyncCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUCreateRenderPipelineAsyncCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + 
WGPUCreateRenderPipelineAsyncCallback callback; + void * userdata; +} WGPUCreateRenderPipelineAsyncCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUInstanceDescriptor +typedef struct WGPUDawnWGSLBlocklist { + WGPUChainedStruct chain; + size_t blocklistedFeatureCount; + const char* const * blocklistedFeatures; +} WGPUDawnWGSLBlocklist WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUAdapterProperties typedef struct WGPUDawnAdapterPropertiesPowerPreference { WGPUChainedStructOut chain; @@ -1080,8 +1168,17 @@ typedef struct WGPUDawnBufferDescriptorErrorInfoFromWireClient { typedef struct WGPUDawnCacheDeviceDescriptor { WGPUChainedStruct chain; char const * isolationKey; + WGPUDawnLoadCacheDataFunction loadDataFunction; + WGPUDawnStoreCacheDataFunction storeDataFunction; + void * functionUserdata; } WGPUDawnCacheDeviceDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUComputePipelineDescriptor +typedef struct WGPUDawnComputePipelineFullSubgroups { + WGPUChainedStruct chain; + WGPUBool requiresFullSubgroups; +} WGPUDawnComputePipelineFullSubgroups WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUCommandEncoderDescriptor typedef struct WGPUDawnEncoderInternalUsageDescriptor { WGPUChainedStruct chain; @@ -1130,6 +1227,14 @@ typedef struct WGPUDawnTogglesDescriptor { const char* const * disabledToggles; } WGPUDawnTogglesDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUInstanceDescriptor +typedef struct WGPUDawnWireWGSLControl { + WGPUChainedStruct chain; + WGPUBool enableExperimental; + WGPUBool enableUnsafe; + WGPUBool enableTesting; +} WGPUDawnWireWGSLControl WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUDepthStencilState typedef struct WGPUDepthStencilStateDepthWriteDefinedDawn { WGPUChainedStruct chain; @@ -1203,6 +1308,11 @@ typedef struct WGPULimits { uint32_t maxComputeWorkgroupsPerDimension; } WGPULimits WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUMemoryHeapInfo { + WGPUHeapPropertyFlags properties; + uint64_t size; +} WGPUMemoryHeapInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUMultisampleState { WGPUChainedStruct const * nextInChain; uint32_t count; @@ -1323,6 +1433,13 @@ typedef struct WGPURequestAdapterOptions { WGPUBool compatibilityMode; } WGPURequestAdapterOptions WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPURequestDeviceCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPURequestDeviceCallback callback; + void * userdata; +} WGPURequestDeviceCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUSamplerBindingLayout { WGPUChainedStruct const * nextInChain; WGPUSamplerBindingType type; @@ -1343,11 +1460,6 @@ typedef struct WGPUSamplerDescriptor { uint16_t maxAnisotropy; } WGPUSamplerDescriptor WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUShaderModuleDescriptor { - WGPUChainedStruct const * nextInChain; - WGPU_NULLABLE char const * label; -} WGPUShaderModuleDescriptor WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleSPIRVDescriptor { WGPUChainedStruct chain; @@ -1361,10 +1473,10 @@ typedef struct WGPUShaderModuleWGSLDescriptor { char const * code; } WGPUShaderModuleWGSLDescriptor WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUSharedFenceDescriptor { +typedef struct WGPUShaderModuleDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; -} WGPUSharedFenceDescriptor WGPU_STRUCTURE_ATTRIBUTE; +} WGPUShaderModuleDescriptor WGPU_STRUCTURE_ATTRIBUTE; // Can be chained in WGPUSharedFenceDescriptor 
typedef struct WGPUSharedFenceDXGISharedHandleDescriptor { @@ -1378,11 +1490,6 @@ typedef struct WGPUSharedFenceDXGISharedHandleExportInfo { void * handle; } WGPUSharedFenceDXGISharedHandleExportInfo WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUSharedFenceExportInfo { - WGPUChainedStructOut * nextInChain; - WGPUSharedFenceType type; -} WGPUSharedFenceExportInfo WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSharedFenceDescriptor typedef struct WGPUSharedFenceMTLSharedEventDescriptor { WGPUChainedStruct chain; @@ -1395,6 +1502,16 @@ typedef struct WGPUSharedFenceMTLSharedEventExportInfo { void * sharedEvent; } WGPUSharedFenceMTLSharedEventExportInfo WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUSharedFenceDescriptor { + WGPUChainedStruct const * nextInChain; + WGPU_NULLABLE char const * label; +} WGPUSharedFenceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUSharedFenceExportInfo { + WGPUChainedStructOut * nextInChain; + WGPUSharedFenceType type; +} WGPUSharedFenceExportInfo WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSharedFenceDescriptor typedef struct WGPUSharedFenceVkSemaphoreOpaqueFDDescriptor { WGPUChainedStruct chain; @@ -1431,6 +1548,24 @@ typedef struct WGPUSharedFenceVkSemaphoreZirconHandleExportInfo { uint32_t handle; } WGPUSharedFenceVkSemaphoreZirconHandleExportInfo WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor { + WGPUChainedStruct chain; + void * handle; +} WGPUSharedTextureMemoryDXGISharedHandleDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryEGLImageDescriptor { + WGPUChainedStruct chain; + void * image; +} WGPUSharedTextureMemoryEGLImageDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryIOSurfaceDescriptor { + WGPUChainedStruct chain; + void * ioSurface; +} WGPUSharedTextureMemoryIOSurfaceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSharedTextureMemoryDescriptor typedef struct WGPUSharedTextureMemoryAHardwareBufferDescriptor { WGPUChainedStruct chain; @@ -1439,6 +1574,7 @@ typedef struct WGPUSharedTextureMemoryAHardwareBufferDescriptor { typedef struct WGPUSharedTextureMemoryBeginAccessDescriptor { WGPUChainedStruct const * nextInChain; + WGPUBool concurrentRead; WGPUBool initialized; size_t fenceCount; WGPUSharedFence const * fences; @@ -1450,28 +1586,11 @@ typedef struct WGPUSharedTextureMemoryDescriptor { WGPU_NULLABLE char const * label; } WGPUSharedTextureMemoryDescriptor WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryDmaBufDescriptor { - WGPUChainedStruct chain; - int memoryFD; - uint64_t allocationSize; - uint64_t drmModifier; - size_t planeCount; - uint64_t const * planeOffsets; - uint32_t const * planeStrides; -} WGPUSharedTextureMemoryDmaBufDescriptor WGPU_STRUCTURE_ATTRIBUTE; - -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor { - WGPUChainedStruct chain; - void * handle; -} WGPUSharedTextureMemoryDXGISharedHandleDescriptor WGPU_STRUCTURE_ATTRIBUTE; - -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryEGLImageDescriptor { - WGPUChainedStruct chain; - void * image; -} WGPUSharedTextureMemoryEGLImageDescriptor WGPU_STRUCTURE_ATTRIBUTE; +typedef struct 
WGPUSharedTextureMemoryDmaBufPlane { + int fd; + uint64_t offset; + uint32_t stride; +} WGPUSharedTextureMemoryDmaBufPlane WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUSharedTextureMemoryEndAccessState { WGPUChainedStructOut * nextInChain; @@ -1481,17 +1600,14 @@ typedef struct WGPUSharedTextureMemoryEndAccessState { uint64_t const * signaledValues; } WGPUSharedTextureMemoryEndAccessState WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryIOSurfaceDescriptor { - WGPUChainedStruct chain; - void * ioSurface; -} WGPUSharedTextureMemoryIOSurfaceDescriptor WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSharedTextureMemoryDescriptor typedef struct WGPUSharedTextureMemoryOpaqueFDDescriptor { WGPUChainedStruct chain; + void const * vkImageCreateInfo; int memoryFD; + uint32_t memoryTypeIndex; uint64_t allocationSize; + WGPUBool dedicatedAllocation; } WGPUSharedTextureMemoryOpaqueFDDescriptor WGPU_STRUCTURE_ATTRIBUTE; // Can be chained in WGPUSharedTextureMemoryDescriptor @@ -1565,12 +1681,6 @@ typedef struct WGPUSurfaceDescriptorFromWaylandSurface { void * surface; } WGPUSurfaceDescriptorFromWaylandSurface WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSurfaceDescriptor -typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { - WGPUChainedStruct chain; - void * coreWindow; -} WGPUSurfaceDescriptorFromWindowsCoreWindow WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsHWND { WGPUChainedStruct chain; @@ -1578,6 +1688,12 @@ typedef struct WGPUSurfaceDescriptorFromWindowsHWND { void * hwnd; } WGPUSurfaceDescriptorFromWindowsHWND WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSurfaceDescriptor +typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { + WGPUChainedStruct chain; + void * coreWindow; +} WGPUSurfaceDescriptorFromWindowsCoreWindow WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel { WGPUChainedStruct chain; @@ -1639,6 +1755,13 @@ typedef struct WGPUVertexAttribute { uint32_t shaderLocation; } WGPUVertexAttribute WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUAdapterProperties +typedef struct WGPUAdapterPropertiesMemoryHeaps { + WGPUChainedStructOut chain; + size_t heapCount; + WGPUMemoryHeapInfo const * heapInfo; +} WGPUAdapterPropertiesMemoryHeaps WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUBindGroupDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; @@ -1701,6 +1824,7 @@ typedef struct WGPUExternalTextureDescriptor { float const * dstTransferFunctionParameters; float const * gamutConversionMatrix; WGPUBool flipY; + WGPUBool mirrored; WGPUExternalTextureRotation rotation; } WGPUExternalTextureDescriptor WGPU_STRUCTURE_ATTRIBUTE; @@ -1775,6 +1899,16 @@ typedef struct WGPURequiredLimits { WGPULimits limits; } WGPURequiredLimits WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryDmaBufDescriptor { + WGPUChainedStruct chain; + WGPUExtent3D size; + uint32_t drmFormat; + uint64_t drmModifier; + size_t planeCount; + WGPUSharedTextureMemoryDmaBufPlane const * planes; +} WGPUSharedTextureMemoryDmaBufDescriptor WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUSharedTextureMemoryProperties { WGPUChainedStructOut * nextInChain; WGPUTextureUsageFlags usage; @@ -1903,6 +2037,7 @@ extern "C" { #if !defined(WGPU_SKIP_PROCS) typedef void 
(*WGPUProcAdapterPropertiesFreeMembers)(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUProcAdapterPropertiesMemoryHeapsFreeMembers)(WGPUAdapterPropertiesMemoryHeaps value) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcGetInstanceFeatures)(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; @@ -1916,6 +2051,7 @@ typedef WGPUBool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedL typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties * properties) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcAdapterHasFeature)(WGPUAdapter adapter, WGPUFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * descriptor, WGPURequestDeviceCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcAdapterRequestDeviceF)(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * options, WGPURequestDeviceCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterReference)(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterRelease)(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; @@ -1995,6 +2131,7 @@ typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDe typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPU_NULLABLE WGPUCommandEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcDeviceCreateComputePipelineAsyncF)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUExternalTexture (*WGPUProcDeviceCreateErrorExternalTexture)(WGPUDevice device) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUShaderModule (*WGPUProcDeviceCreateErrorShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor, char const * errorMessage) WGPU_FUNCTION_ATTRIBUTE; @@ -2005,6 +2142,7 @@ typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuer typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcDeviceCreateRenderPipelineAsyncF)(WGPUDevice device, WGPURenderPipelineDescriptor const * 
descriptor, WGPUCreateRenderPipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPU_NULLABLE WGPUSamplerDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface, WGPUSwapChainDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; @@ -2041,6 +2179,8 @@ typedef void (*WGPUProcExternalTextureRelease)(WGPUExternalTexture externalTextu // Procs of Instance typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +typedef size_t (*WGPUProcInstanceEnumerateWGSLLanguageFeatures)(WGPUInstance instance, WGPUWGSLFeatureName * features) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcInstanceHasWGSLLanguageFeature)(WGPUInstance instance, WGPUWGSLFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceProcessEvents)(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUFuture (*WGPUProcInstanceRequestAdapterF)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; @@ -2148,11 +2288,13 @@ typedef WGPUBool (*WGPUProcSharedTextureMemoryBeginAccess)(WGPUSharedTextureMemo typedef WGPUTexture (*WGPUProcSharedTextureMemoryCreateTexture)(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcSharedTextureMemoryEndAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryGetProperties)(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcSharedTextureMemoryIsDeviceLost)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemorySetLabel)(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryReference)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryRelease)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; // Procs of Surface +typedef WGPUTextureFormat (*WGPUProcSurfaceGetPreferredFormat)(WGPUSurface surface, WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSurfaceReference)(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSurfaceRelease)(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; @@ -2164,6 +2306,7 @@ typedef void (*WGPUProcSwapChainReference)(WGPUSwapChain swapChain) WGPU_FUNCTIO typedef void (*WGPUProcSwapChainRelease)(WGPUSwapChain swapChain) WGPU_FUNCTION_ATTRIBUTE; // Procs of Texture +typedef WGPUTextureView (*WGPUProcTextureCreateErrorView)(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUTextureView 
(*WGPUProcTextureCreateView)(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcTextureDestroy)(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; typedef uint32_t (*WGPUProcTextureGetDepthOrArrayLayers)(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; @@ -2189,6 +2332,7 @@ typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView) WGPU_FUN #if !defined(WGPU_SKIP_DECLARATIONS) WGPU_EXPORT void wgpuAdapterPropertiesFreeMembers(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT void wgpuAdapterPropertiesMemoryHeapsFreeMembers(WGPUAdapterPropertiesMemoryHeaps value) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPU_NULLABLE WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuGetInstanceFeatures(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPU_NULLABLE WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; @@ -2202,6 +2346,7 @@ WGPU_EXPORT WGPUBool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimi WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties * properties) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuAdapterHasFeature(WGPUAdapter adapter, WGPUFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * descriptor, WGPURequestDeviceCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuAdapterRequestDeviceF(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * options, WGPURequestDeviceCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterReference(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterRelease(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; @@ -2281,6 +2426,7 @@ WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescr WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPU_NULLABLE WGPUCommandEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuDeviceCreateComputePipelineAsyncF(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateErrorExternalTexture(WGPUDevice device) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateErrorShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor, char const * errorMessage) WGPU_FUNCTION_ATTRIBUTE; @@ -2291,6 +2437,7 @@ WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySe WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPURenderPipeline 
wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuDeviceCreateRenderPipelineAsyncF(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPU_NULLABLE WGPUSamplerDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface, WGPUSwapChainDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; @@ -2327,6 +2474,8 @@ WGPU_EXPORT void wgpuExternalTextureRelease(WGPUExternalTexture externalTexture) // Methods of Instance WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT size_t wgpuInstanceEnumerateWGSLLanguageFeatures(WGPUInstance instance, WGPUWGSLFeatureName * features) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuInstanceHasWGSLLanguageFeature(WGPUInstance instance, WGPUWGSLFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceProcessEvents(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUFuture wgpuInstanceRequestAdapterF(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; @@ -2434,11 +2583,13 @@ WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryBeginAccess(WGPUSharedTextureMemory WGPU_EXPORT WGPUTexture wgpuSharedTextureMemoryCreateTexture(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryEndAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryGetProperties(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryIsDeviceLost(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemorySetLabel(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryReference(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryRelease(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; // Methods of Surface +WGPU_EXPORT WGPUTextureFormat wgpuSurfaceGetPreferredFormat(WGPUSurface surface, WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSurfaceReference(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSurfaceRelease(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; @@ -2450,6 +2601,7 @@ WGPU_EXPORT 
void wgpuSwapChainReference(WGPUSwapChain swapChain) WGPU_FUNCTION_A WGPU_EXPORT void wgpuSwapChainRelease(WGPUSwapChain swapChain) WGPU_FUNCTION_ATTRIBUTE; // Methods of Texture +WGPU_EXPORT WGPUTextureView wgpuTextureCreateErrorView(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUTextureView wgpuTextureCreateView(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuTextureDestroy(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT uint32_t wgpuTextureGetDepthOrArrayLayers(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; diff --git a/src/runtime/webgpu.cpp b/src/runtime/webgpu.cpp index b889ed5e7385..aa4e3fb5a71f 100644 --- a/src/runtime/webgpu.cpp +++ b/src/runtime/webgpu.cpp @@ -328,9 +328,7 @@ WEAK int create_webgpu_context(void *user_context) { << "WGPU: create_webgpu_context (user_context: " << user_context << ")\n"; - WGPUInstanceDescriptor desc{}; - desc.nextInChain = nullptr; - global_instance = wgpuCreateInstance(&desc); + global_instance = wgpuCreateInstance(nullptr); debug(user_context) << "WGPU: wgpuCreateInstance produces: " << global_instance << ")\n"; diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index ffcbd1c603c0..474e837a91f4 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -186,9 +186,7 @@ inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapt bool success = true; } results; - WGPUInstanceDescriptor desc{}; - desc.nextInChain = nullptr; - results.instance = wgpuCreateInstance(&desc); + results.instance = wgpuCreateInstance(nullptr); auto request_adapter_callback = [](WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata) { auto *results = (Results *)userdata; @@ -234,12 +232,7 @@ inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapt WGPUDeviceDescriptor desc{}; desc.nextInChain = nullptr; desc.label = nullptr; -#if defined(__EMSCRIPTEN__) - // ...sigh, really? 
- desc.requiredFeaturesCount = 0; -#else desc.requiredFeatureCount = 0; -#endif desc.requiredFeatures = nullptr; desc.requiredLimits = &requestedLimits; desc.deviceLostCallback = device_lost_callback; From 22581bfc8a3059954045dc5cae33f08b833df57e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 11 Feb 2024 18:40:09 +0000 Subject: [PATCH 055/186] Remove OpenGLCompute (#8077) * Remove OpenGLCompute This was supposed to be removed in Halide 17 (oops), removing for Halide 18 * Update dynamic_allocation_in_gpu_kernel.cpp * Update dynamic_allocation_in_gpu_kernel.cpp * Update halide_ir.fbs --- Makefile | 13 - README.md | 2 +- README_cmake.md | 18 - apps/CMakeLists.txt | 1 - apps/openglcompute/AndroidManifest.xml | 22 - apps/openglcompute/Makefile | 99 -- apps/openglcompute/build.sh | 9 - apps/openglcompute/build.xml | 20 - apps/openglcompute/jni/Android.mk | 69 -- apps/openglcompute/jni/Application.mk | 7 - apps/openglcompute/jni/oglc_run.cpp | 250 ---- .../jni/oglc_two_kernels_run.cpp | 89 -- .../res/drawable-hdpi/ic_launcher.png | Bin 9397 -> 0 bytes .../res/drawable-ldpi/ic_launcher.png | Bin 2729 -> 0 bytes .../res/drawable-mdpi/ic_launcher.png | Bin 5237 -> 0 bytes .../res/drawable-xhdpi/ic_launcher.png | Bin 14383 -> 0 bytes apps/openglcompute/res/layout/main.xml | 15 - apps/openglcompute/res/values/strings.xml | 4 - .../HalideOpenGLComputeActivity.java | 30 - apps/openglcompute/test_oglc_avg.cpp | 59 - apps/openglcompute/test_two_kernels.cpp | 40 - cmake/HalideGeneratorHelpers.cmake | 16 - packaging/common/Description.txt | 2 +- .../src/halide/halide_/PyEnums.cpp | 2 - .../test/correctness/boundary_conditions.py | 1 - python_bindings/test/correctness/target.py | 3 +- src/BoundSmallAllocations.cpp | 11 +- src/CMakeLists.txt | 7 - src/CodeGen_C.cpp | 4 - src/CodeGen_Internal.cpp | 2 - src/CodeGen_OpenGLCompute_Dev.cpp | 1029 ----------------- src/CodeGen_OpenGLCompute_Dev.h | 23 - src/CodeGen_Vulkan_Dev.cpp | 2 - src/Deserialization.cpp | 2 - src/DeviceAPI.h | 2 - src/DeviceInterface.cpp | 7 - src/FuseGPUThreadLoops.cpp | 7 +- src/IRPrinter.cpp | 3 - src/JITModule.cpp | 63 - src/LLVM_Runtime_Linker.cpp | 21 - src/Lower.cpp | 9 +- src/Module.cpp | 6 +- src/OffloadGPULoops.cpp | 6 +- src/Pipeline.cpp | 4 - src/SelectGPUAPI.h | 2 +- src/Serialization.cpp | 2 - src/StorageFlattening.cpp | 4 +- src/Target.cpp | 15 - src/Target.h | 6 +- src/halide_ir.fbs | 1 - src/runtime/CMakeLists.txt | 5 - src/runtime/HalideRuntime.h | 2 - src/runtime/HalideRuntimeOpenGLCompute.h | 76 -- src/runtime/device_interface.cpp | 6 +- src/runtime/mini_opengl.h | 221 ---- src/runtime/opengl_egl_context.cpp | 181 --- src/runtime/opengl_glx_context.cpp | 156 --- src/runtime/openglcompute.cpp | 990 ---------------- src/runtime/osx_opengl_context.cpp | 118 -- src/runtime/runtime_api.cpp | 7 - test/correctness/async_copy_chain.cpp | 6 - test/correctness/async_device_copy.cpp | 6 - test/correctness/boundary_conditions.cpp | 8 +- test/correctness/device_buffer_copy.cpp | 5 - test/correctness/device_crop.cpp | 5 - test/correctness/device_slice.cpp | 5 - .../dynamic_allocation_in_gpu_kernel.cpp | 4 +- test/correctness/gpu_allocation_cache.cpp | 25 +- test/correctness/gpu_dynamic_shared.cpp | 5 - .../gpu_jit_explicit_copy_to_device.cpp | 2 +- test/correctness/gpu_large_alloc.cpp | 2 +- test/correctness/gpu_mixed_dimensionality.cpp | 2 +- test/correctness/gpu_multi_device.cpp | 8 - test/correctness/gpu_multi_kernel.cpp | 2 +- test/correctness/gpu_reuse_shared_memory.cpp | 4 +- test/correctness/logical.cpp | 20 +- 
test/correctness/math.cpp | 8 +- test/correctness/mul_div_mod.cpp | 2 - test/correctness/newtons_method.cpp | 3 +- test/correctness/parallel_gpu_nested.cpp | 2 +- test/correctness/plain_c_includes.c | 1 - test/correctness/target.cpp | 4 +- .../correctness/vectorized_gpu_allocation.cpp | 6 - 83 files changed, 44 insertions(+), 3862 deletions(-) delete mode 100644 apps/openglcompute/AndroidManifest.xml delete mode 100644 apps/openglcompute/Makefile delete mode 100755 apps/openglcompute/build.sh delete mode 100644 apps/openglcompute/build.xml delete mode 100644 apps/openglcompute/jni/Android.mk delete mode 100644 apps/openglcompute/jni/Application.mk delete mode 100644 apps/openglcompute/jni/oglc_run.cpp delete mode 100644 apps/openglcompute/jni/oglc_two_kernels_run.cpp delete mode 100644 apps/openglcompute/res/drawable-hdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-ldpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-mdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-xhdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/layout/main.xml delete mode 100644 apps/openglcompute/res/values/strings.xml delete mode 100644 apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java delete mode 100644 apps/openglcompute/test_oglc_avg.cpp delete mode 100644 apps/openglcompute/test_two_kernels.cpp delete mode 100644 src/CodeGen_OpenGLCompute_Dev.cpp delete mode 100644 src/CodeGen_OpenGLCompute_Dev.h delete mode 100644 src/runtime/HalideRuntimeOpenGLCompute.h delete mode 100644 src/runtime/mini_opengl.h delete mode 100644 src/runtime/opengl_egl_context.cpp delete mode 100644 src/runtime/opengl_glx_context.cpp delete mode 100644 src/runtime/openglcompute.cpp delete mode 100644 src/runtime/osx_opengl_context.cpp diff --git a/Makefile b/Makefile index 04fc41fa4167..e1457ea161e2 100644 --- a/Makefile +++ b/Makefile @@ -126,7 +126,6 @@ WITH_WEBASSEMBLY ?= $(findstring webassembly, $(LLVM_COMPONENTS)) WITH_AMDGPU ?= $(findstring amdgpu, $(LLVM_COMPONENTS)) WITH_OPENCL ?= not-empty WITH_METAL ?= not-empty -WITH_OPENGLCOMPUTE ?= not-empty WITH_D3D12 ?= not-empty WITH_VULKAN ?= not-empty WITH_SPIRV ?= not-empty @@ -163,8 +162,6 @@ OPENCL_LLVM_CONFIG_LIB=$(if $(WITH_OPENCL), , ) METAL_CXX_FLAGS=$(if $(WITH_METAL), -DWITH_METAL, ) METAL_LLVM_CONFIG_LIB=$(if $(WITH_METAL), , ) -OPENGLCOMPUTE_CXX_FLAGS=$(if $(WITH_OPENGLCOMPUTE), -DWITH_OPENGLCOMPUTE, ) - D3D12_CXX_FLAGS=$(if $(WITH_D3D12), -DWITH_D3D12, ) D3D12_LLVM_CONFIG_LIB=$(if $(WITH_D3D12), , ) @@ -218,7 +215,6 @@ CXX_FLAGS += $(AARCH64_CXX_FLAGS) CXX_FLAGS += $(X86_CXX_FLAGS) CXX_FLAGS += $(OPENCL_CXX_FLAGS) CXX_FLAGS += $(METAL_CXX_FLAGS) -CXX_FLAGS += $(OPENGLCOMPUTE_CXX_FLAGS) CXX_FLAGS += $(D3D12_CXX_FLAGS) CXX_FLAGS += $(WEBGPU_CXX_FLAGS) CXX_FLAGS += $(POWERPC_CXX_FLAGS) @@ -345,7 +341,6 @@ endif ifneq ($(TEST_VULKAN), ) VULKAN_LD_FLAGS ?= -lvulkan endif -OPENGL_LD_FLAGS ?= -lGL HOST_OS=linux endif @@ -364,7 +359,6 @@ endif ifneq ($(TEST_METAL), ) METAL_LD_FLAGS ?= -framework Metal -framework Foundation endif -OPENGL_LD_FLAGS ?= -framework OpenGL HOST_OS=os_x endif @@ -476,7 +470,6 @@ SOURCE_FILES = \ CodeGen_Metal_Dev.cpp \ CodeGen_OpenCL_Dev.cpp \ CodeGen_Vulkan_Dev.cpp \ - CodeGen_OpenGLCompute_Dev.cpp \ CodeGen_Posix.cpp \ CodeGen_PowerPC.cpp \ CodeGen_PTX_Dev.cpp \ @@ -670,7 +663,6 @@ HEADER_FILES = \ CodeGen_Metal_Dev.h \ CodeGen_OpenCL_Dev.h \ CodeGen_Vulkan_Dev.h \ - CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ CodeGen_PTX_Dev.h \ 
CodeGen_PyTorch.h \ @@ -854,13 +846,9 @@ RUNTIME_CPP_COMPONENTS = \ msan \ msan_stubs \ opencl \ - opengl_egl_context \ - opengl_glx_context \ - openglcompute \ osx_clock \ osx_get_symbol \ osx_host_cpu_count \ - osx_opengl_context \ osx_yield \ posix_aligned_alloc \ posix_allocator \ @@ -931,7 +919,6 @@ RUNTIME_EXPORTED_INCLUDES = $(INCLUDE_DIR)/HalideRuntime.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonDma.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonHost.h \ $(INCLUDE_DIR)/HalideRuntimeOpenCL.h \ - $(INCLUDE_DIR)/HalideRuntimeOpenGLCompute.h \ $(INCLUDE_DIR)/HalideRuntimeMetal.h \ $(INCLUDE_DIR)/HalideRuntimeQurt.h \ $(INCLUDE_DIR)/HalideRuntimeVulkan.h \ diff --git a/README.md b/README.md index 6ebe04107159..c5dfe5507a8b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ currently targets: - CPU architectures: X86, ARM, Hexagon, PowerPC, RISC-V - Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT -- GPU Compute APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Microsoft +- GPU Compute APIs: CUDA, OpenCL, Apple Metal, Microsoft Direct X 12, Vulkan Rather than being a standalone programming language, Halide is embedded in C++. diff --git a/README_cmake.md b/README_cmake.md index 49e2f2feb3f7..3908920450a1 100644 --- a/README_cmake.md +++ b/README_cmake.md @@ -500,23 +500,6 @@ If the CMake version is lower than 3.18, the deprecated [`FindCUDA`][findcuda] module will be used instead. It reads the variable `CUDA_TOOLKIT_ROOT_DIR` instead of `CUDAToolkit_ROOT` above. -TODO(https://github.com/halide/Halide/issues/5633): update this section for OpenGLCompute, which needs some (but maybe not all) of this. - -When targeting OpenGL, the [`FindOpenGL`][findopengl] and [`FindX11`][findx11] -modules will be used to link AOT generated binaries. These modules can be -overridden by setting the following variables: - -| Variable | Description | -|-------------------------|----------------------------------| -| `OPENGL_egl_LIBRARY` | Path to the EGL library. | -| `OPENGL_glu_LIBRARY` | Path to the GLU library. | -| `OPENGL_glx_LIBRARY` | Path to the GLVND GLX library. | -| `OPENGL_opengl_LIBRARY` | Path to the GLVND OpenGL library | -| `OPENGL_gl_LIBRARY` | Path to the OpenGL library. | - -The OpenGL paths will need to be set if you intend to use OpenGL with X11 on -macOS. - Halide also searches for `libpng` and `libjpeg-turbo` through the [`FindPNG`][findpng] and [`FindJPEG`][findjpeg] modules, respectively. They can be overridden by setting the following variables. @@ -1395,7 +1378,6 @@ guidelines you should follow when writing a new app. 
[finddoxygen]: https://cmake.org/cmake/help/latest/module/FindDoxygen.html [findjpeg]: https://cmake.org/cmake/help/latest/module/FindJPEG.html [findopencl]: https://cmake.org/cmake/help/latest/module/FindOpenCL.html -[findopengl]: https://cmake.org/cmake/help/latest/module/FindOpenGL.html [findpng]: https://cmake.org/cmake/help/latest/module/FindPNG.html [findpython3]: https://cmake.org/cmake/help/latest/module/FindPython3.html [findx11]: https://cmake.org/cmake/help/latest/module/FindX11.html diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 1f6abcdc6e64..13d73167e865 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -58,7 +58,6 @@ add_app(max_filter) add_app(nl_means) # add_app(nn_ops) # TODO(#5374): missing CMake build # add_app(onnx) # TODO(#5374): missing CMake build -# add_app(openglcompute) # TODO(#5374): missing CMake build add_app(resize) # add_app(resnet_50) # TODO(#5374): missing CMake build # add_app(simd_op_check) # TODO(#5374): missing CMake build diff --git a/apps/openglcompute/AndroidManifest.xml b/apps/openglcompute/AndroidManifest.xml deleted file mode 100644 index e809beefd0ea..000000000000 --- a/apps/openglcompute/AndroidManifest.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/apps/openglcompute/Makefile b/apps/openglcompute/Makefile deleted file mode 100644 index 4bda85258364..000000000000 --- a/apps/openglcompute/Makefile +++ /dev/null @@ -1,99 +0,0 @@ -include ../support/Makefile.inc - -CXX ?= c++ - -TOP := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..) -.PHONY: all $(TOP) -all: run run-two -HALIDE_LIB := $(TOP)/$(LIBHALIDE_LDFLAGS) -$(HALIDE_LIB): $(TOP) - $(MAKE) -C $(TOP) - -test_%: test_%.cpp - $(CXX) -std=c++17 -I ../../include/ $< -L ../../bin/ -lHalide $(HALIDE_SYSTEM_LIBS) -o $@ -g - -avg_filter_uint32t.o avg_filter_uint32t.h avg_filter_float.o avg_filter_float.h: test_oglc_avg - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s-openglcompute ./$< - -avg_filter_uint32t_arm.o avg_filter_uint32t_arm.h avg_filter_float_arm.o avg_filter_float_arm.h: test_oglc_avg - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s ./$< "_arm" - -AVG_FILTER_SRC = jni/oglc_run.cpp \ - avg_filter_uint32t.o avg_filter_uint32t.h \ - avg_filter_uint32t_arm.o avg_filter_uint32t_arm.h \ - avg_filter_float.o avg_filter_float.h \ - avg_filter_float_arm.o avg_filter_float_arm.h - -libs/armeabi-v7a/oglc_run: $(HALIDE_LIB) $(AVG_FILTER_SRC) - ndk-build libs/armeabi-v7a/oglc_run - -two_kernels_filter.o two_kernels_filter.h: test_two_kernels - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s-openglcompute ./$< - -TWO_KERNELS_SRC = jni/oglc_two_kernels_run.cpp \ - two_kernels_filter.o two_kernels_filter.h - -libs/armeabi-v7a/oglc_two_kernels_run: $(HALIDE_LIB) $(TWO_KERNELS_SRC) - ndk-build libs/armeabi-v7a/oglc_two_kernels_run libs/armeabi-v7a/liboglc_two_kernels.so - -jni-libs: $(HALIDE_LIB) $(AVG_FILTER_SRC) $(TWO_KERNELS_SRC) - ndk-build libs/armeabi-v7a/liboglc_two_kernels.so libs/armeabi-v7a/liboglc.so - -deploy: libs/armeabi-v7a/oglc_run - adb push libs/armeabi-v7a/oglc_run /mnt/sdcard/ - -define RUN_STEPS -su -mkdir -p /data/tmp -rm -rf /data/tmp/oglc -mkdir /data/tmp/oglc -cd /data/tmp/oglc -pwd -cp /mnt/sdcard/oglc_run . -chmod 777 /data/tmp/oglc/oglc_run -LD_LIBRARY_PATH=. 
./oglc_run -exit -exit -endef -export RUN_STEPS - - -run: deploy - adb logcat -c - sh -c 'echo "$$RUN_STEPS" | adb shell' - adb logcat -d | grep "I oglc" - echo "Done" - -deploy-two: libs/armeabi-v7a/oglc_two_kernels_run - adb push libs/armeabi-v7a/oglc_two_kernels_run /mnt/sdcard/ - - -define RUN_TWO_STEPS -su -mkdir /data/tmp -cd /data/tmp -pwd -cp /mnt/sdcard/oglc_two_kernels_run . -chmod 777 /data/tmp/oglc_two_kernels_run -LD_LIBRARY_PATH=. ./oglc_two_kernels_run -exit -exit -endef -export RUN_TWO_STEPS - -run-two: deploy-two - adb logcat -c - sh -c 'echo "$$RUN_TWO_STEPS" | adb shell' - adb logcat -d | grep "I oglc" - echo "Done" - -clean: - rm -f test_oglc_avg - rm -rf test_oglc_avg.dSYM/ - rm -f avg_filter* - rm -f test_two_kernels - rm -rf test_two_kernels.dSYM/ - rm -rf libs/ - rm -rf obj/ - rm -rf bin/ - rm -rf gen/ diff --git a/apps/openglcompute/build.sh b/apps/openglcompute/build.sh deleted file mode 100755 index e00ac542386f..000000000000 --- a/apps/openglcompute/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -e -android update project -p . --target android-21 -make jni-libs -ant debug -adb install -r bin/HelloHalideOpenGLCompute-debug.apk -adb logcat -c -adb shell am start -n com.example.hellohalideopenglcompute/.HalideOpenGLComputeActivity -adb logcat | grep "^I/oglc" diff --git a/apps/openglcompute/build.xml b/apps/openglcompute/build.xml deleted file mode 100644 index 1773fe07f123..000000000000 --- a/apps/openglcompute/build.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - - - - - - - diff --git a/apps/openglcompute/jni/Android.mk b/apps/openglcompute/jni/Android.mk deleted file mode 100644 index 232e91e208cc..000000000000 --- a/apps/openglcompute/jni/Android.mk +++ /dev/null @@ -1,69 +0,0 @@ -LOCAL_PATH:= $(call my-dir) - -# === oglc_run === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_run -LOCAL_SRC_FILES := oglc_run.cpp -LOCAL_STATIC_LIBRARIES := android_native_app_glue -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 avg_filter_uint32t.o avg_filter_uint32t_arm.o avg_filter_float.o avg_filter_float_arm.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_EXECUTABLE) - -# === oglc library === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc -LOCAL_SRC_FILES := oglc_run.cpp -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 avg_filter_uint32t.o avg_filter_uint32t_arm.o avg_filter_float.o avg_filter_float_arm.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_SHARED_LIBRARY) - -# === oglc_two_kernels_run === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_two_kernels_run -LOCAL_SRC_FILES := oglc_two_kernels_run.cpp -LOCAL_STATIC_LIBRARIES := android_native_app_glue -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 two_kernels_filter.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_EXECUTABLE) - -# === oglc_two_kernels library === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_two_kernels -LOCAL_SRC_FILES := oglc_two_kernels_run.cpp -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 two_kernels_filter.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_SHARED_LIBRARY) - -$(call 
import-module,android/native_app_glue) diff --git a/apps/openglcompute/jni/Application.mk b/apps/openglcompute/jni/Application.mk deleted file mode 100644 index 88a9ea14cc27..000000000000 --- a/apps/openglcompute/jni/Application.mk +++ /dev/null @@ -1,7 +0,0 @@ -# TODO(aam): Confirm that application builds and runs for all supported targets: -# APP_ABI := armeabi armeabi-v7a arm64-v8a x86_64 x86 -APP_ABI := armeabi-v7a -APP_PLATFORM := android-17 - -APP_STL := c++_static -LOCAL_C_INCLUDES += ${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.8/include diff --git a/apps/openglcompute/jni/oglc_run.cpp b/apps/openglcompute/jni/oglc_run.cpp deleted file mode 100644 index 3378ab555dd1..000000000000 --- a/apps/openglcompute/jni/oglc_run.cpp +++ /dev/null @@ -1,250 +0,0 @@ -#include "avg_filter_float.h" -#include "avg_filter_float_arm.h" -#include "avg_filter_uint32t.h" -#include "avg_filter_uint32t_arm.h" -#include -#include -#include -#include -#include - -#include "HalideBuffer.h" -#include "HalideRuntimeOpenGLCompute.h" - -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "oglc_run", __VA_ARGS__) -#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "oglc_run", __VA_ARGS__) - -using Halide::Runtime::Buffer; - -typedef int (*filter_t)(halide_buffer_t *, halide_buffer_t *); - -struct timing { - filter_t filter; - Buffer<> *input; - Buffer<> *output; - double worst_t = 0; - int worst_rep = 0; - double best_t = DBL_MAX; - int best_rep = 0; - - template - timing(filter_t filter, Buffer *input, Buffer *output) - : filter(filter), input(&input->template as()), output(&output->template as()) { - } - - int run(int n_reps, bool with_copying) { - timeval t1, t2; - for (int i = 0; i < n_reps; i++) { - input->set_host_dirty(); - gettimeofday(&t1, NULL); - int error = filter(*input, *output); - output->device_sync(); - - if (with_copying) { - output->copy_to_host(); - } - gettimeofday(&t2, NULL); - if (error) { - return error; - } - double t = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; - if (t < best_t) { - best_t = t; - best_rep = i; - } - if (t > worst_t) { - worst_t = t; - worst_rep = i; - } - } - return 0; - } -}; - -template -class Tester; - -template -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return false; // This abstract implementation should never be called -} - -template -bool doCopy(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return false; // This abstract implementation should never be called -} - -template -class Tester { - int debug_level; - -public: - Tester(int _debug_level = 0) - : debug_level(_debug_level) { - } - -private: - bool validate(Buffer actual, Buffer expected) { - int count_mismatches = 0; - actual.for_each_element([&](int x, int y, int c) { - T actual_value = actual(x, y, c); - T expected_value = expected(x, y, c); - const float EPSILON = 0.00001f; - if (abs((double((actual_value - expected_value)) > EPSILON))) { - if (count_mismatches < 100) { - std::ostringstream str; - str << "actual and expected results differ at " - << "(" << x << ", " << y << ", " << c << "):" - << +actual_value << " != " << +expected_value - << "\n"; - LOGI("%s", str.str().c_str()); - } - count_mismatches++; - } - }); - - return count_mismatches == 0; - } - - void print(Buffer buf) { - for (int j = 0; j < std::min(buf.height(), 10); j++) { - std::stringstream oss; - for (int i = 0; i < std::min(buf.width(), 10); i++) { - oss << " ["; - for (int k = 0; k < buf.channels(); k++) { - 
oss << std::fixed << std::setprecision(1); - if (k > 0) { - oss << std::setw(4); - } - oss << +buf(i, j, k); - } - oss << "]"; - } - LOGI("%s", oss.str().c_str()); - } - } - -public: - bool test(Buffer input, - Buffer output, - Buffer output_arm, - filter_t avg_filter, - filter_t avg_filter_arm) { - - // Performance check - input.set_host_dirty(); - timing openglcompute(avg_filter, &input, &output); - input.set_host_dirty(); - timing openglcompute_with_copying(avg_filter, &input, &output); - input.set_host_dirty(); - timing arm(avg_filter_arm, &input, &output_arm); - - const int N_REPS = 10; - arm.run(N_REPS, false); - openglcompute.run(N_REPS, false); - openglcompute_with_copying.run(N_REPS, true); - - LOGI("Out of %d runs best times are:\n" - "openglcompute: %fms(@%d)\n" - "openglcompute(with copy): %fms(@%d)\n" - "ARM: %fms(@%d)\n", - N_REPS, - openglcompute.best_t, openglcompute.best_rep, - openglcompute_with_copying.best_t, openglcompute_with_copying.best_rep, - arm.best_t, arm.best_rep); - LOGI("Out of %d runs worst times are:\n" - "openglcompute: %fms(@%d)\n" - "openglcompute(with copy): %fms(@%d)\n" - "ARM: %fms(@%d)\n", - N_REPS, - openglcompute.worst_t, openglcompute.worst_rep, - openglcompute_with_copying.worst_t, openglcompute_with_copying.worst_rep, - arm.worst_t, arm.worst_rep); - - // Data correctness check - input.set_host_dirty(); - avg_filter(input, output); - LOGI("Filter is done."); - output.device_sync(); - LOGI("Sync is done"); - output.copy_to_host(); - - LOGI("Output arm:"); - print(output_arm); - LOGI("Output openglcompute:"); - print(output); - - bool matches = validate(output, output_arm); - LOGI(matches ? "Test passed.\n" : "Test failed.\n"); - - return matches; - } - - void runTest() { - int width = 4096; - int height = 2048; - int channels = 4; - - auto input = Buffer::make_interleaved(width, height, channels); - LOGI("Allocated memory for %dx%dx%d image", width, height, channels); - - input.for_each_element([&](int i, int j, int k) { - input(i, j, k) = ((i + j) % 2) * 6; - }); - - LOGI("Input :\n"); - print(input); - - auto output = Buffer::make_interleaved(width, height, channels); - auto output_arm = Buffer::make_interleaved(width, height, channels); - - doBlur(this, input, output, output_arm); - } -}; - -template<> -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return tester->test(bt_input, - bt_output, bt_output_arm, - avg_filter_float, - avg_filter_float_arm); -} - -template<> -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return tester->test(bt_input, - bt_output, bt_output_arm, - avg_filter_uint32t, - avg_filter_uint32t_arm); -} - -int main(int argc, char **argv) { - LOGI("\nvvvv vvvv vvvv"); - LOGI("\nTesting uint32_t...\n"); - (new Tester())->runTest(); - LOGI("---- ---- ----"); - LOGI("\nTesting float...\n"); - (new Tester())->runTest(); - - halide_device_release(NULL, halide_openglcompute_device_interface()); - - LOGI("^^^^ ^^^^ ^^^^\n"); -} - -extern "C" { -JNIEXPORT void JNICALL Java_com_example_hellohalideopenglcompute_HalideOpenGLComputeActivity_runTest(JNIEnv *env, jobject obj) { - main(0, NULL); -} -} diff --git a/apps/openglcompute/jni/oglc_two_kernels_run.cpp b/apps/openglcompute/jni/oglc_two_kernels_run.cpp deleted file mode 100644 index 6574de25ae39..000000000000 --- a/apps/openglcompute/jni/oglc_two_kernels_run.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include "two_kernels_filter.h" -#include -#include -#include -#include -#include - -#include 
"HalideBuffer.h" -#include "HalideRuntimeOpenGLCompute.h" - -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "oglc_run", __VA_ARGS__) -#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "oglc_run", __VA_ARGS__) - -template -void print(Halide::Runtime::Buffer buf) { - for (int j = 0; j < std::min(buf.height(), 10); j++) { - std::stringstream oss; - for (int i = 0; i < std::min(buf.width(), 10); i++) { - oss << " ["; - for (int k = 0; k < buf.channels(); k++) { - oss << std::fixed << std::setprecision(1); - if (k > 0) { - oss << std::setw(4); - } - oss << +buf(i, j, k); - } - oss << "]"; - } - LOGI("%s", oss.str().c_str()); - } -} - -int main(int argc, char **argv) { - LOGI("\nvvvv vvvv vvvv"); - - int width = 128; - int height = 128; - int channels = 4; - - auto input = Halide::Runtime::Buffer::make_interleaved(width, height, channels); - LOGI("Allocated memory for %dx%dx%d image", width, height, channels); - - input.for_each_element([&](int i, int j, int k) { - input(i, j, k) = ((i + j) % 2) * 6; - }); - - LOGI("Input :\n"); - print(input); - - auto output = Halide::Runtime::Buffer::make_interleaved(width, height, channels); - - two_kernels_filter(input, output); - LOGI("Filter is done."); - output.device_sync(); - LOGI("Sync is done"); - output.copy_to_host(); - - LOGI("Output :\n"); - print(output); - - int count_mismatches = 0; - output.for_each_element([&](int i, int j, int k) { - int32_t output_value = output(i, j, k); - int32_t input_value = input(i, j, k); - if (output_value != input_value) { - if (count_mismatches < 100) { - std::ostringstream str; - str << "output and input results differ at " - << "(" << i << ", " << j << ", " << k << "):" - << output_value << " != " << input_value - << "\n"; - LOGI("%s", str.str().c_str()); - } - count_mismatches++; - } - }); - - LOGI(count_mismatches == 0 ? "Test passed.\n" : "Test failed.\n"); - - halide_device_release(NULL, halide_openglcompute_device_interface()); - - LOGI("^^^^ ^^^^ ^^^^\n"); -} - -extern "C" { -JNIEXPORT void JNICALL Java_com_example_hellohalideopenglcompute_HalideOpenGLComputeActivity_runTwoKernelsTest(JNIEnv *env, jobject obj) { - main(0, NULL); -} -} diff --git a/apps/openglcompute/res/drawable-hdpi/ic_launcher.png b/apps/openglcompute/res/drawable-hdpi/ic_launcher.png deleted file mode 100644 index 96a442e5b8e9394ccf50bab9988cb2316026245d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9397 zcmV;mBud+fP)L`9r|n3#ts(U@pVoQ)(ZPc(6i z8k}N`MvWQ78F(rhG(?6FnFXYo>28{yZ}%O}TvdDT_5P?j=iW=V`8=UNc_}`JbG!ST zs@lK(TWkH+P**sB$A`cEY%Y53cQ}1&6`x-M$Cz&{o9bLU^M-%^mY?+vedlvt$RT-^ zu|w7}IaWaljBq#|I%Mpo!Wc2bbZF3KF9|D%wZe{YFM=hJAv$>j>nhx`=Wis#KG!cJA5x!4)f) zezMz1?Vn$GnZNjbFXH(pK83nn!^3=+^*kTTs5rV9Dq^XS(IKO!mKt5!dSmb3IVCxZ z8TTk5IE)F1V29$G7v#j9d-hy&_pdg8?kT4)zqr>?`}I%W>(?GO%*C&}?Fp|bI*~2&KZ$%^B6R&1~2kA{`CWy+>F-x=z-f{_&vyu_3yp{jtw(*syi% zu3t2|4{c~LJXRt2m>rMg2V_kLltCZ<`m>qcI?BPP?6hf``|e!rZEFszeYQ3f-*nAS zZ+h1$mFwy+7156lkB(k6)!1fUbJCxgIBK38$jj5cC$r&YXN)nr#PY=tJaLc?C_o?j+8H3Q>891JJ9&$l-r+-SG#q)*;r52% z@nlKflb65o%s*Jt)!pw1k{vIoQIvoJ0Y&Msiw0X!qJ)_47G*?aJ6bJFLh_4b$5&1k5wN>du*>6#i7R9T8; z7>EHOV=ue7mo77SJPwER4(A+s?n0JjYK)b}Om6n>ke?0JR=jTI+RFBg_iwb7k%n*2 zR_M0DJ9x+0zxba4(B1y^JQ_Nj6dlP5PGXvSq8fF#mxrFYj3d9(V#jJwt+IqU9+8+D z6C6Us1OI$d8OF!3+Hm1 zW5in zXV^%U35HooOpSmeqlG6e0kUMYNonKp1vr|My9}4-WO+uOxe_c-o&}%voNYHkqtle% z5yQ_^oozSUUNu30EQSAl!Q%(%3G1NXENSMjCL*Vx-Td2~rk(}d z8pT!HZe>1r5EGuz`pgsg@^yQEi=BIa#meLq0!?{TZ}q#}=7UC9_l=w|wv+pP!g4#! 
[GIT binary patch data omitted for the deleted launcher icons under apps/openglcompute/res/ (ic_launcher.png at hdpi, ldpi, mdpi, and xhdpi densities, as listed in the diffstat above); the base85-encoded payloads carry no readable content.]
zu|e$lr34M$iU-{w?Joo(Y{qhgD4~QIkSM}}!O$?MLZbI-s18e=OF&ai&7-M0rh0zYyI+(=47^@pK8?@?t)yRhO zzs%pSswcJ+l9+kcqH%0n*9V;dpM3NE&pVBFsSjxAt=MWGLVz-sxL2ty_6bwL*y%l( z^9>+yo3UI7lth3j7{MAa0$2!WSj1?ejxkiQ4K<7-K?@ef2cKYAaNFUg(T{h&499@8 zfO7ildBY909A~mi5d(n62vetXrh7` z4HzV;U3Zyv?>JqX@EIcrL17PGz;pl_gtaW`qV2(}?K z7!zhaTCssiN~pzE)ZG|bt^v&&Iw!VCuMKp5YG@e$;~cE9-qBhIYucx?3~Lx{30fye zS{fl{!|4FcxRUz?fTWbfM0}x+#ep9=eVP@JqE)w;wWx(pTzXQP1!_hCDgS-E@^?9S!F42HJ_S_#uc_5Su zs5YV8=8;EdD(d~XBf)i7k@eOjOu}f!6L8G}mPQ{ykK7Z1=*K{C7^dQQG~*hqW*BXt zwShMNOtkjDYl9@w(22=Uqtnw^7;U{qm`pPmt+!FL;E8XQ{Y&G*#ZExj-eADv1EkRiA9p=HbW9mXn&pE zx6s<=(T*{$-anb}*Q^f2@NW}!Ypi#4-44eZ5;wFGR z2l-#ffa_PC34p;4_~V9Ch1H=Mop@k2T=ZsZ95ER2~w$V2Qwf@K~R83 zvJIQ6w*fXxCEOy(CETXcuAvj1GDN3@H|;ZhZ>JU*V<1q%=E-}pVf-!#5kQI%P6I0* zTLpFk*7~tCJ3&MYqC=<6ZM^c6Z@7>dv20Zp<}9uM?_~fH0U)$$1VND)+d76o^q=A^ zEr^rEHJg*7*_`x*)CPi!7_L8n$2VUEYYnzlmg6rQKZCm73TFhg)~N(r7^9)J_GT#Y z=E!J+L>qrUGe4>H>r4xD=7=p^O5i)6{5&4r@Eg=yoNE;R%JeoxjiXN3-XX0XM8Z3x+2kseod+K#}a>@yV^%M}^*#iQp1F zAst%zV+r1|H5(QIra@x@LRv&YFN9=BDFGr7sAH&E#DX-22b|;do=c^e;n;zlgR|aA zyY$*QZ{k|5CRq1iVqyY?LIkChclb`g8G$6Wu3oE&%0x0;uh6maSl?4UGb=(U=b9CT zAAD)W^Fp)dRRgSbAYouM5g5E}`|w<2-3dk;YPD)2(M=f5sbl0cDunQcOk3Ku&N5x^1FSJ=M3mZon=-*VILENo0tgU=eUPES)PX*zAoL7o z=^+bdICcU=mYo}9XOEjc^IkZoMNjft0EE-uvH$-*2E<7n^$EZlD+Y?kfE~ZUXxp14 zEf*&Z@EgTT(Y7k=$iK(SA|BR=ybI5Z(;@VwCMZ!$sa_=8wT7h@fN5QG4U zvlvfCab)odtTZ3MLn~IoCYzzuBK6l5SDPdEd-X-eRX!@EFbu5#2NG>lLPR;HL-}yh z`_wi&MC5}HqLgS1BLC{41#goav%lv!HA~s6mwsoR&nay7yEk7xf5)QejjzT(&AaOVO#?>xa{z!6%4qPn@N-<8|7}ThG@fYqze_s}1$89iq|O`10Jds> zYaEiem4=mV>361M;_0g=f=i>8)OmJ>lG;J1CPwF4k%DWP#OL>1TN^ShV9rgEXOi~~ zo@v>AmuiBAwT9R;XvwTawOIhrs)H{7(gpbBM@FC!BA{L{Kms92D$+oBAOK+VhGBg7 zc3)5U{+-ADeGFL39|7~7nBW-O`9f^QpHak8ybYhG0{W>$Q)!!B3u9_nx2~CC?^LgC zw{LpU1qHTp&{+jz9CbniodoVWt?PyotcB^iXFaoWV!JN0<83{suyab>OdC2+=C-z^ z*N%~DOvW?==a`rY)^SNHJ^KfD&w!Ai3aa?hC9_FWO<7cBACBb`&gR+lG2YO;P7w)N z$40Dvd?O~u8W0k=P_IuBrh5qCR6NJtRo;Uu{YcZwM}hWjy#XVYoCUvLpd zn?q7ah~9Dw)-ffue$<-Vr!$MGYy)F7V6=nL-sT&_xx^dO37}>6x)aZ_usS8a%cMPf zzwKh0F>OY;)b6|VyE8_(G-_&JBaQvN3G>W?H+4=hAT(PCWA*%fj=K_LBQ@Gqt;@M| z0ZT|@FlvE~(|`wNGT+_rM8!xctgZCX?71^U5PB0x1YCU0kH~j9c;9A zYgg6?07kd90N`nW-cG@|S^K;O3l@!{FPe@H@;ShX>*$mw_$j6^H?+9E=;4JzVe!A@_?7{ll9hUq1mbgaVweTVAJ>>5RxDy zfyg`1+@W^8a!MHF63fmz-L`Zicf>A}NqK&zoP2oG6*0z51&Nt7Xq#*6oY5hmlvF>Uo>Ti(<_Xtp)F~;ksPsCeiHJgq7 zn$5=R4m)V>q0WihPCt1@ef7GAsEk=IlmzNki#xB|p40kiCCT4D^jduClFfL-Sv@e^ zq6;hk={{Bbz?2dOzty0|8!a3{^g%#iL_dXUZG5(F%43_g;A~0i{de7X?|+~1_Lqu} z|7ndFoN~|&f4=+SEz(T;R$MDCC9*6F4U%CCGKx{`Arwmi!h%2$3aF4ga|D3|00Km= zqm;J_I=921Ib{Opzk;3UNYv8Prgq*kOu|TFhq%dTH7uHSz{U}59Kkd~#0`PT>R4;r z*3qB6=(O->fBDloG%$^<-m+w9!-M}_oKl}V(7!?8r*DX#7%u# zqiRa;J8#t~r@W!xW`h%=JMerO17z636 z>Mb-fJc&3q&`AQ4jHsXxMuey+Q78!%N`#<5P)Z>xNCcroSP&p$2q6&!5-MaMt^Vc| zPeWE~7&-y0wP4542_uOu;-<%xlGq|?IJ|60S##{G0sLlSv?cqe2e#FWpP2z*0cQeKM=O$hoZYsudfZqvbY?RiHsquN31R{S z0>CNg*igOhM72^+CdV655EMRErtjZ%@l}86Iq1lP-m}kvi!p0H>ql3u3HDgW*t#yn z)(sXTTY<6dEliBY7#@kytXt?9ND{yq_^zwxbnKYQFtUpAP7eV{38;XeLZDCx5EUhQ z`T~@D6^gwAJ^dOzQ=dY)M{-|ZKNTkJ85`G@zCy6ewr-p}R9j}CAtu5EK^OvzHZ~P& zv|0v9lWAf^^R`XRg8}?z+r}m>+`HE&c+bRu=EMLn8`!d8f@lwkiS6ouM!Z2XVnZZ} zg!InY5u5{zwn$nAjYgtc4ab!+w-}&k-kf6x*RNUKSE+8n)c*Nu!QvU%V{eOMG!^U^ z^=1XFra|0vXw`w*q(;4(pjowO)HLd~1dUpPxMh*F99k`pjQY$u%^949O_Q+9JP83v zMUYBBDFGFD^A;5(!h-Z#6%nF>M4==R6@+I-Kv03VcSd^?Rj)d7Y^-%mlES^`(fP~X z`^AHcjk>1VWK1eFkTUTo1_RDGXzjddYd9n=qGp}>?Ju|ouQ_`GKKQD?;zM6O@R=Fl zbO;b5X+)SoAHa`qeOsYf6CCRVQYe6QZgVrcYP3V#vZz-yRmNighLdVfZ>5UU7AU}H@0rcd5CEg?Gc!Pt!ZA}W!(}(TI#qBn!3=VaL7hz@xpV7?oe3bJ zdJa5tR(}-sRpORy7`8oOBALjM3)zi_o|!!u`^Dj6v?Eq9p-V)oXiw-F^3s( 
zGX_Y(8W2ebDg9`PDDC6-s_6;lnFH5NW$#Km9BhYhfe8eO#59oT7@;ad$pDTmIw`?u z19cu|KzBaC$g^SR+Cs(-IW&>YlaNb@;PybeXpvLjKQB`Nk&PJuv}<(Jc}K$MQ>Gn| z$j(4JpIye)lw2u7sf`AlXgf>mCCs`G>9a1yW_B=TopzMlh^Axq!)1v$X<=+~8x#*> z-jo->B!r2|b{Jy-R_(+sBeLrzen!~LbaDsrokMPDIlX2NOL%&ue{6q$N8;E;CZA#w zaXtGW05mJzGXFnoKn@VMO;}oV$|Z`snBY<(k#9wosn*!G84wn5zQ5Mn^z?hY4@jTm z+FIb!=Tn-Mwc{J2UW1DA?tu3mx$H*`L^tI?Z91X>{FLJiu_yR&#Cwa5{Qs25|buw&r+a zojE^m|EX=`vJ8(D3BP!vJblLWa-a&W_FxFPjn3@1OY0pXv$fncA!a}d1?L=MU4hmH z1LeJN+<~vh{tHh=Pia~%2s5VciBpgLERGs~6PB<3Z#=sGT1+;!BMM6hgJMd2(`B1G zCAU+_^WY|py4pS^P4t{`%*u!2sbEo;eeC!O-<3yz@6H1}2KFo(&|%a3@0C;vsQnCX zzb};*4=WJ>mMS1Aq-4&K#Y{ajtx0_W5yE!VDZ{PF;$ZANesHv+rAR|EeqT*t+X5T3LfYMTmlO%4pjaGG=pN&O+S| zMsyICJZwfp6nV*ZkR4H2Zk*HWP9M^FIM;pe=}?3SQi=9Bog~@tlSH0yWISNUd4!S) z2{Tyhn4Pu649X_!Z6KweNkh-{b0j3?N1!?Da?|o37v?^|T#kh>!=~ zUj1WZoFtOH{yC1AWgdBTa-i*yI|7N!S>st4(B@EHIuvcKXb&N-H!g^JRGvOpLO^F|o(F{~cf1z(-Y(%2 zIFgPtZS5lWj)P}*sTax1NZK z6_m6>1a0l;kd}PHOh`-<{iOw1IQT+b^!>Ns%y%A!>;Lc@z)46U(~gGc42^aj)>#k{ zq*SO^8~DLbzkyTE+zXfe_>0(Q?kSKc!dQdOfFf;8L=g0#RG6NVh#>LU(5>X0>7I92 zMvR=HnWJ{8>B(MgHx#t9k|bmL)J0xB0T3t#$Z?KMba1{SBkYj6Ac$1ZzS*5McNWBv zI^7xl2jC4SeG?a5a4qI7nTpSU`*k?yBQM2Wci-$WAt6#mSUlU20dUL=DJ1Ik27YtZ z6?oHm$KaAHK7gZ+J_J50^Tlr|C9HAy{Y_Wm zSJz&Qr#9b%Lk>I!A9>$ZIPS1hA%wtWWgPXYfeYFhaCd@5I}DR}-Npw)A_}u`)@SBf zCeUFOoC6R*$*?2(Nyp3G<9-?g-uR-+ap6y2;E_lGBs!em4){nH@zV)p4N&L`gR?9& zjhHe%r0_yBo&*3`XAr0eFFxu`IO@QE#!bt9u>+An5<56z-;4V+ z3C)tn6uTmcdOXoX5arHbvK_{DV2IPJub;JAZdhnw&H4z9oLyZGouSK;XW z-+;HA@nI}kvZw#7wZ4fLz+aZ#fh&IXpLlfbAF#(>3-G~rei<)1;*A*SpOrI>h;pE@ zv$&r})|o>S?SV3bo#j|c(FO&&61G&xkY&~kcs+I6#Ib+2;SSn7GXwg2r)496ps>M= zI)J{6xw$lVG9pt{-(^4mEC8FosUyiD+3mnOQBNO9wHYxubs^4t`4@4*p>M)X_kIW0 z-E;-s@$sMIWk;WbH=KSh7A{w#>;o zN+}=20uVx2fUFPAkcVM;5u`%}DXmsXNdiCuxOz6X9A4QWjN3`Jz5^qCb~|^*zIf{^ zFUE<7zZKWtekrcH;hVT^*_Bv4=TQ9h;Tth9vw#nr_bI&mgnz}%X^XogUW)&DJ$jCa zb_hSa)S|$*!XWiIl;xzkx8|JaT|&mlg{a+%p9M9~;sg94+Tj$7E=07WD$^DFrbJ@^ zLQ$!dt3y|I$UePy+>!P0(_-UpMx@zo%7}%t55c)-eiyGe;a&LNl^?^hzg~;ePk$rM zKI@AZoH{QhssWMABf0`z++;^%uafT zm}kV@W7=tFoDd?X4~aCx$`Gbbsofz=aE_UX5EY^V5rI2805Ubrq^%3YdJcIOrP;7! 
z3u85w%sm`0I^th2cX0`?dBr&xoH`H2Bw%(BLOm_xeERpbr8PgSc0 zr0O1Mra4`5n1OlOrSlwXW4=3LzdM_x5RhpK9)&%1BGf4j>pN?qS?2+zgUudntxx-; z2)ca*x79vpBA$~1>~JuMgl~&63@NEyxqA+u1%Otofkva|%@lX~HqL!nXVFPW!Oo>E z8qYB9_MAM(Xmr*vmc4e9e5VZPTpWQk3T~I&IOlYyA8l6$JpKQBskgK1zm0pelY8Fa2xLiE_7`ioC6%Bo zLCq`xfE~cb6q;iJfOQh3~E(;W$QhLqV%s3Q#Pd=|I0WrxYP z{m9>^18IQ$_kEnuZjVWCWOEWE(V?pVV488gW)ddnI+4hoJf5?%E5TXT8qyPXR6fXP4Cm>~aQT~4j z8T^cv|JtYelpFKR-nQA^q8;*?1Gx4Y8y>s7AOR5*)4CvSmvGFs)m^mjC_2 z(^0QKOGy#{nstk!801$Rf4EeYqKzB0-dRD;S!bQi2;DJ5z%e_c8F7>AI;QmiP>6aM zP{Dw2}f>-}+^|?~^CtC%^tW>h&t5^x5olDZ)IH8OjJRrNZ`+E%^H7pTOB4 zd>L-N`!^^Si@t^+(BX_TEXQM8k?IE=u~JgC^q7X}`E;Wy!Dc{(G*b)iw{X1QFST{U2Bp$xAj>lInhY-&J4ZZj7hcNxrSt!yX_njL)g!;Jp z>g0s@X9!sigGg)J63+QGw8juyExB0>s5)t7qvpPS)G;$3zWJ(ED3zw#vY7_s>hL=q zrZ@@OOS8egIcv$%`Pj5>3_rg56ZqrpKfxLQ{9e5L#s7k0v6xoT9Au8|WKMYJqMt1{ zl~O`Vh0(F?xcc`$!f&ttE+*@nF=N&M=Jw7(5F$lqvj*f8OUN-Sh7vun7E~w%4Anr= zto=$BsaTuTUo3}n=9Ef)Pq`#XP}3FY=A^WVS=WpwKODw;-F)t+PY{>?$6a=^au67d zD0&VWaLq68#@+YbjHm~0*#mbHK=(E)!CB+m-L~3jIdJv)GM*R|wb6c2AMKOX;j*et zkZ4rRw>Phz_>>b<6#yuyxWBvrf&yf%dU@1}4!a3PSYXUuI2DH;y#%U%8!r3R`|!R` zy#jx_?YACb71F~U&UK0W4l!1WfcmOfv(>=QfBS8md;ZDz@$Wu|zCn!x4q1qqb9+$g zZ!gH$5tO1GmOruMdZXE>UGVV_!3igw!xi=B@QK4?YtEmn4FA5>sy(W8^ATfOH&|Ey z=t%v+7dk_~?U`8<{pFbs0M32Wr6?9kxb5l<&#nRQIsbJ0||h!8Pz&|T}y%N2P2E8mafjyef|-+GMNnIb?L7UiI1 zfFy}=Q$4R`fm%d zeLdXL!=wW9DnY&f`RQ}6x@e!*Lrw1o?)omw`!76^ozqYe$-Va8!*1HR38%h&0bY3Q z3wNrmJJoNat{I(=7_D2kO@LaNTG1co!8*pkG&FK`~JDG;YJ*A=mN}`-3J*m zWI%rTQa}g-0j2!91V(2Ucsn`+$aisrw<2F zz(N2Z3n47#FPee<4w;4Z{yQXJ7XL(^U#w+TVe)CAma7wwnA&` zNEq|A-|fw(op>-#J7IrRDn~F0ZP*45>`>~nSTg+}%$dFiuDo<;r*wYCH0J#OJQcSt zy8(MI+7HD-8A53M*B9=`8RyO=Ye51bw22vE%&s;S);TO$v?mtru~68!=z`E3;AH*& zYP?n%H!6h827}nA{zB3uKmd>TzJ`AaMa-k;?_UkDrOJvbK_zCGqG zS_LkU%CBS;J1kY&ktmtD%F}%AScAn1!`rH8H4Wx0=*Pr(4Xvs`-_#<6wCM`TZ0%Xc zGcvoL<}P`1$bR{h)*8e`L~=G@3Z`1Es%^t-Rwx;~xY`;XE(e1!PIGm#g`0n~>A8^Z zS&zRHO5FLeeB0%??zeX$Dg6~Lp5Mj_)1LKZ3X`Rw+)CR1vh9DUz34tQm3ct0m>)7j`{o*_J`~IhWHtD(n@@Liu zIJfs&uKV^1Yquf(mfpYqG4sR>4^bYXo%SD_(3%E{zF1W8SQ#SnDmYJ(pMhr_w6?cnyrMj9+v}s zdu(OaS81acCULxf94EpU$AU`~1yd2KUJyrMr@*WL4&ZD`C|1a`X_f#Kh!uzeND4s| zK!^~6B1joRsRATLkTQax2!sL%5r`rXhX99Qr{J7|(*o8guu~3BS#4X=*qQ+8$AU0? 
z%kc2J-wEmyM;vj2tJfdHjVmfR<&b~DPcOaYd866$zIE{}*FTIGzIX zSQwP#o{JW_&%XCsocNlB*mrOaEXMKhJS=J!VWPSbjxDB7St7QL zuB38tx;^Q*vuECT>rYp09eupF+#7IM2&owLAPW0Y2>PH@(RW6BY|`UFWWjJCB1Z&H zyY$mMK&0y#gdk*#yJbgdwG)G~a8AS67>TZPyTsKTCFNtdIGT-hjvvsZUMqUN&zJUgsK2R0ZCC1 zp(;?IN))ORML~%IRiHvtLaA6rp-@B=MF^t+Dj*2u;JAf2nMAcViqX-n*tBs2#Cmj8MC|07kNe(W+0 z$d2>B{7TH3GaqB46PPl!k3R6`%lVJXzB~Q)yRLm=<*NIqwHlV2bwf$)7i*C4n`{J; zL=Z`Yp@32fg<=s>f%~VH?+-#XDM(EbLKcM}_Bn-O9lIrsMy+IxL!y&>3*#g+3ui(IzkR{wpI^Sq=(EfJ zhs>8gdL6#`%d_!+-uDZ9``70J0KzDAK_s|XR#1u%MgltBpTQ)))uh#MXjVDhhMo}x z7Ol8pbwj>u`8}KOKmH7arD@<0ply@je?RlTrd)mfFK>SA$p;T4NGAjdAMPrTiYf^y zebf|20x}?k5s_d{65FZ|&KR&O?p=+s%~NpjOCnS^7ZAtIT}pglH~kwcsnS&bTbS2@EKBEdP1Bn0PBgumxA@4T2xe)}9)BAIuB z`>yAoU4F-Iqsea3fD8i2@b^|SPErX{fj|_c8z~hf3h7zuktp^kL`5&LA_dWe^hEsn z$Nmbf8IB9+EzII`PP&GcF4?yZLL&v*Sf&}V3R3hl5(o|k;nk!v?nz)7gBm@m5MkF0!SIyT4SR6 z+ViGBn--t;wncE%0#EU+9-Y~5?gPSQ2=9tbG}TKf6@A2H8% z>^2`zES69#^kHb|N%;0vvVw?h+QdlA;B5aOmu_urvpO*#IYJ;E*ITP%1OTH9KtU?v z*PgPEWOhzU)d~W|5RQXTLInaUkRG&{{iLudV|?5HV-I`rAPkF$qB07F9z=z*D@46$ z#^V&*;ct_`q_IY9cqHcj8M~GKyEhZ=Db7bweU05~;Tkbz8g3t6MgPu>i~DmseyDp`}_M6@#}p zXMfV)Gjmp{)C=okM?$bv3W5}@WzneDMI{*#QpBGh-n{vHhaI+`KtbF6j_*gSx_c9W z-KGIj5=JH-!%=)57S4Ey+p=XuY#)2#8;yGF)x*PEme(qpgc(o)&r$);PznPIt{}8d zwiw%Ze^OlW?nYeT-o65yW$q~~M%-$`I*lZ0V%4fgU92aBl;S24Brj?tTYeNL6SXib zik{Md>?ux@g|Jr=gt4x5j}xuaO{4tjB}?}cebXhMwDcWVH#C7;ezj${GGLd((VfRt zk9-#Q-SPlV*!Ln_bI+U5)Z1lTW81Xb3Xz(2VlkR}Tp{XTq+}==Zd0OL_f1xZZYqaM z$80m8n72X(f|FK)sZ-~pS{cEdh5fK@9HXNXsMa@O!Mwwz3}Rcbi!oxB&F?QSIIdWj zx>(6VaVGmk*5<(bg6N3tnEv$EiVjmlm zKuU#5Wh;L1&Bp-%AN|S+IN+dtu>8SW;MiEQQXoi>G#VR3kNlOA0hCa%=}ubL{Rw#g z8>O^z*aor(V1b*ij4|}&n%zkb0KoqRbb1&ct<2Ko0000bbVXQnWMOn=I%9HWVRU5x zGB7bQEigGPGBQ*!IXW{kIx{jYFgH3dFsPDZ%m4rYC3HntbYx+4WjbwdWNBu305UK! pF)c7TEipD!FgH3fH###mEigAaFfey&@l*f+002ovPDHLkV1iQC3p)S+ diff --git a/apps/openglcompute/res/layout/main.xml b/apps/openglcompute/res/layout/main.xml deleted file mode 100644 index 5a8da6d73556..000000000000 --- a/apps/openglcompute/res/layout/main.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/openglcompute/res/values/strings.xml b/apps/openglcompute/res/values/strings.xml deleted file mode 100644 index 3a57a5288983..000000000000 --- a/apps/openglcompute/res/values/strings.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - HelloHalideAndroidOpenGLCompute - diff --git a/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java b/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java deleted file mode 100644 index b9cfb2f2f969..000000000000 --- a/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.example.hellohalideopenglcompute; - -import android.app.Activity; -import android.os.Bundle; -import android.hardware.Camera; -import android.util.Log; -import android.widget.FrameLayout; -import android.view.SurfaceView; - -public class HalideOpenGLComputeActivity extends Activity { - private static final String TAG = "HalideOpenGLComputeActivity"; - - static { - System.loadLibrary("oglc"); - System.loadLibrary("oglc_two_kernels"); - } - private static native void runTest(); - private static native void runTwoKernelsTest(); - - @Override - public void onCreate(Bundle b) { - super.onCreate(b); - Log.d(TAG, "Starting the tests:"); - runTest(); - Log.d(TAG, "Done with first test"); - runTwoKernelsTest(); - Log.d(TAG, "Done"); - finish(); - } -} diff --git a/apps/openglcompute/test_oglc_avg.cpp 
b/apps/openglcompute/test_oglc_avg.cpp deleted file mode 100644 index 346b7e9f7d72..000000000000 --- a/apps/openglcompute/test_oglc_avg.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "Halide.h" - -using namespace Halide; - -void blur(std::string suffix, ImageParam input) { - input.dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Var x("x"), y("y"), c("c"); - - Func clamped("clamped"); - clamped = BoundaryConditions::repeat_edge(input); - - Func blur_x("blur_x"); - blur_x(x, y, c) = (clamped(x - 1, y, c) + - clamped(x, y, c) + - clamped(x + 1, y, c)) / - 3; - - Func result("avg_filter"); - result(x, y, c) = (blur_x(x, y - 1, c) + - blur_x(x, y, c) + - blur_x(x, y + 1, c)) / - 3; - - result.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Target target = get_target_from_environment(); - result.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - Var xi("xi"), yi("yi"); - result.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } else { - Var yi("yi"); - result - .unroll(c) - .split(y, y, yi, 32) - .parallel(y) - .vectorize(x, 4); - blur_x.store_at(result, y) - .compute_at(result, yi) - .reorder(c, x, y) - .unroll(c) - .vectorize(x, 4); - } - - std::string fn_name = std::string("avg_filter") + suffix; - result.compile_to_file(fn_name, {input}, fn_name); -} - -int main(int argc, char **argv) { - ImageParam input_uint32(UInt(32), 3, "input"); - blur(std::string("_uint32t") + (argc > 1 ? argv[1] : ""), input_uint32); - - ImageParam input_float(Float(32), 3, "input"); - blur(std::string("_float") + (argc > 1 ? argv[1] : ""), input_float); -} diff --git a/apps/openglcompute/test_two_kernels.cpp b/apps/openglcompute/test_two_kernels.cpp deleted file mode 100644 index abff1aba5b23..000000000000 --- a/apps/openglcompute/test_two_kernels.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "Halide.h" - -using namespace Halide; - -int main(int argc, char **argv) { - ImageParam input(UInt(32), 3, "input"); - input.dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Var x, y, c, xi, yi; - Func f("f"); - f(x, y, c) = input(x, y, c) + 1; - f.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - - f.compute_root(); - f.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Target target = get_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - f.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } - - Func g("g"); - g(x, y, c) = f(x, y, c) - 1; - g.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - g.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } - g.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - std::string fn_name = std::string("two_kernels_filter") + (argc > 1 ? 
argv[1] : ""); - g.compile_to_file(fn_name, {input}, fn_name); - - return 0; -} diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index d45341536422..3aa380da450e 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -712,22 +712,6 @@ function(_Halide_add_targets_to_runtime TARGET) endfunction() function(_Halide_target_link_gpu_libs TARGET VISIBILITY) - # TODO(https://github.com/halide/Halide/issues/5633): verify that this is correct & necessary for OpenGLCompute - if ("${ARGN}" MATCHES "openglcompute") - if ("${ARGN}" MATCHES "egl") - find_package(OpenGL REQUIRED COMPONENTS OpenGL EGL) - target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::OpenGL OpenGL::EGL) - else () - if ("${ARGN}" MATCHES "linux" OR ("${ARGN}" MATCHES "host" AND Halide_HOST_TARGET MATCHES "linux")) - find_package(X11 REQUIRED) - target_link_libraries(${TARGET} ${VISIBILITY} X11::X11) - endif () - - find_package(OpenGL REQUIRED) - target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::GL) - endif () - endif () - if ("${ARGN}" MATCHES "vulkan") find_package(Vulkan REQUIRED) target_link_libraries(${TARGET} ${VISIBILITY} Vulkan::Vulkan) diff --git a/packaging/common/Description.txt b/packaging/common/Description.txt index 21464255c878..7f11935edb42 100644 --- a/packaging/common/Description.txt +++ b/packaging/common/Description.txt @@ -4,7 +4,7 @@ * CPU architectures: X86, ARM, Hexagon, PowerPC, RISC-V, WebAssembly * Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT - * GPU APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Direct X 12 + * GPU APIs: CUDA, OpenCL, Apple Metal, Direct X 12 Rather than being a standalone programming language, Halide is embedded in C++. This means you write C++ code that builds an in-memory representation of a diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index d723d66461d8..e6cede6c6edb 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -25,7 +25,6 @@ void define_enums(py::module &m) { .value("CUDA", DeviceAPI::CUDA) .value("Vulkan", DeviceAPI::Vulkan) .value("OpenCL", DeviceAPI::OpenCL) - .value("OpenGLCompute", DeviceAPI::OpenGLCompute) .value("Metal", DeviceAPI::Metal) .value("Hexagon", DeviceAPI::Hexagon); @@ -137,7 +136,6 @@ void define_enums(py::module &m) { .value("CLDoubles", Target::Feature::CLDoubles) .value("CLHalf", Target::Feature::CLHalf) .value("CLAtomics64", Target::Feature::CLAtomics64) - .value("OpenGLCompute", Target::Feature::OpenGLCompute) .value("EGL", Target::Feature::EGL) .value("UserContext", Target::Feature::UserContext) .value("Profile", Target::Feature::Profile) diff --git a/python_bindings/test/correctness/boundary_conditions.py b/python_bindings/test/correctness/boundary_conditions.py index 32abd12ff0e6..2fa5e8e8c59d 100644 --- a/python_bindings/test/correctness/boundary_conditions.py +++ b/python_bindings/test/correctness/boundary_conditions.py @@ -200,7 +200,6 @@ def test_all(vector_width, target, partition_policy): # https://github.com/halide/Halide/issues/2148 if target.has_feature(hl.TargetFeature.Metal) or \ target.has_feature(hl.TargetFeature.Vulkan) or \ - target.has_feature(hl.TargetFeature.OpenGLCompute) or \ target.has_feature(hl.TargetFeature.D3D12Compute): vector_width_power_max = 2 diff --git a/python_bindings/test/correctness/target.py b/python_bindings/test/correctness/target.py index 7876bc97ecef..a7031c2cd7d1 100644 --- 
a/python_bindings/test/correctness/target.py +++ b/python_bindings/test/correctness/target.py @@ -52,12 +52,11 @@ def test_target(): hl.TargetFeature.JIT, hl.TargetFeature.CUDA, hl.TargetFeature.OpenCL, - hl.TargetFeature.OpenGLCompute, hl.TargetFeature.Debug, ], ) ts = t1.to_string() - assert ts == "arm-32-android-cuda-debug-jit-opencl-openglcompute" + assert ts == "arm-32-android-cuda-debug-jit-opencl" assert hl.Target.validate_target_string(ts) # Expected failures: diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index f6a86f8a3e2a..f83a13d99614 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -74,9 +74,7 @@ class BoundSmallAllocations : public IRMutator { } bool must_be_constant(MemoryType memory_type) const { - return (memory_type == MemoryType::Register || - (device_api == DeviceAPI::OpenGLCompute && - memory_type == MemoryType::GPUShared)); + return memory_type == MemoryType::Register; } Stmt visit(const Realize *op) override { @@ -125,13 +123,6 @@ class BoundSmallAllocations : public IRMutator { << "Allocation " << op->name << " has a dynamic size. " << "Only fixed-size allocations can be stored in registers. " << "Try storing on the heap or stack instead."; - - user_assert(!(device_api == DeviceAPI::OpenGLCompute && - op->memory_type == MemoryType::GPUShared)) - << "Allocation " << op->name << " has a dynamic size. " - << "Only fixed-size allocations can be stored in shared memory " - << "in OpenGL compute shaders. Try storing in MemoryType::Heap " - << "instead."; } const int64_t *size_ptr = bound.defined() ? as_const_int(bound) : nullptr; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfb092d29bf0..77453fbce0a9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,7 +35,6 @@ set(HEADER_FILES CodeGen_LLVM.h CodeGen_Metal_Dev.h CodeGen_OpenCL_Dev.h - CodeGen_OpenGLCompute_Dev.h CodeGen_Posix.h CodeGen_PTX_Dev.h CodeGen_PyTorch.h @@ -206,7 +205,6 @@ set(SOURCE_FILES CodeGen_LLVM.cpp CodeGen_Metal_Dev.cpp CodeGen_OpenCL_Dev.cpp - CodeGen_OpenGLCompute_Dev.cpp CodeGen_Posix.cpp CodeGen_PowerPC.cpp CodeGen_PTX_Dev.cpp @@ -612,11 +610,6 @@ if (TARGET_D3D12COMPUTE) target_compile_definitions(Halide PRIVATE WITH_D3D12) endif () -option(TARGET_OPENGLCOMPUTE "Include OpenGLCompute target" ON) -if (TARGET_OPENGLCOMPUTE) - target_compile_definitions(Halide PRIVATE WITH_OPENGLCOMPUTE) -endif () - if (TARGET_VULKAN) message(STATUS "Enabling Vulkan target") target_compile_definitions(Halide PRIVATE WITH_VULKAN) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 3939edc4a678..89c18cb8ab28 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -30,7 +30,6 @@ extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[]; -extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeWebGPU_h[]; @@ -307,9 +306,6 @@ CodeGen_C::~CodeGen_C() { if (target.has_feature(Target::OpenCL)) { stream << halide_internal_runtime_header_HalideRuntimeOpenCL_h << "\n"; } - if 
(target.has_feature(Target::OpenGLCompute)) { - stream << halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h << "\n"; - } if (target.has_feature(Target::D3D12Compute)) { stream << halide_internal_runtime_header_HalideRuntimeD3D12Compute_h << "\n"; } diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 2fc5b5cae0df..78fc4224fb61 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -64,7 +64,6 @@ bool function_takes_user_context(const std::string &name) { "halide_memoization_cache_release", "halide_cuda_run", "halide_opencl_run", - "halide_openglcompute_run", "halide_metal_run", "halide_d3d12compute_run", "halide_vulkan_run", @@ -90,7 +89,6 @@ bool function_takes_user_context(const std::string &name) { "halide_vtcm_free", "halide_cuda_initialize_kernels", "halide_opencl_initialize_kernels", - "halide_openglcompute_initialize_kernels", "halide_metal_initialize_kernels", "halide_d3d12compute_initialize_kernels", "halide_vulkan_initialize_kernels", diff --git a/src/CodeGen_OpenGLCompute_Dev.cpp b/src/CodeGen_OpenGLCompute_Dev.cpp deleted file mode 100644 index f2f0949f33fd..000000000000 --- a/src/CodeGen_OpenGLCompute_Dev.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -#include "CodeGen_OpenGLCompute_Dev.h" -#include "CSE.h" -#include "CodeGen_C.h" -#include "CodeGen_GPU_Dev.h" -#include "Debug.h" -#include "Deinterleave.h" -#include "FindIntrinsics.h" -#include "IRMatch.h" -#include "IRMutator.h" -#include "IROperator.h" -#include "Simplify.h" -#include -#include -#include - -namespace Halide { -namespace Internal { - -using std::ostringstream; -using std::string; -using std::vector; - -namespace { - -char get_lane_suffix(int i) { - internal_assert(i >= 0 && i < 4); - return "rgba"[i]; -} - -class CodeGen_OpenGLCompute_C : public CodeGen_C { -public: - CodeGen_OpenGLCompute_C(std::ostream &s, const Target &t); - void add_kernel(const Stmt &stmt, - const std::string &name, - const std::vector &args); - -protected: - Type map_type(const Type &); - - std::string print_name(const std::string &name) override; - std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override; - - using CodeGen_C::visit; - - void visit(const Cast *) override; - - void visit(const FloatImm *) override; - void visit(const UIntImm *) override; - void visit(const IntImm *) override; - - void visit(const Max *op) override; - void visit(const Min *op) override; - - void visit(const Mod *) override; - - // these have specific functions - // in GLSL that operate on vectors - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GT *) override; - void visit(const GE *) override; - - void visit(const Shuffle *) override; - - void visit(const For *) override; - void visit(const Ramp *op) override; - void visit(const Broadcast *op) override; - void visit(const Load *op) override; - void visit(const Store *op) override; - void visit(const Call *op) override; - void visit(const Allocate *op) override; - void visit(const Free *op) override; - void visit(const Select *op) override; - void visit(const Evaluate *op) override; - - const std::map builtin = { - {"abs", "abs"}, - {"abs_f32", "abs"}, - {"acos_f32", "acos"}, - {"acosh_f32", "acosh"}, - {"asin_f32", "asin"}, - {"asinh_f32", "asinh"}, - {"atan2_f32", "atan"}, // also called atan in GLSL - {"atan_f32", "atan"}, - {"atanh_f32", "atanh"}, - {"ceil_f32", "ceil"}, - {"cos_f32", "cos"}, - {"cosh_f32", "cosh"}, - 
{"equal", "equal"}, - {"exp_f32", "exp"}, - {"fast_inverse_sqrt_f32", "inversesqrt"}, - {"floor_f32", "floor"}, - {"greaterThan", "greaterThan"}, - {"greaterThanEqual", "greaterThanEqual"}, - {"isnan", "isnan"}, - {"lessThan", "lessThan"}, - {"lessThanEqual", "lessThanEqual"}, - {"log_f32", "log"}, - {"max", "max"}, - {"min", "min"}, - {"mix", "mix"}, - {"mod", "mod"}, - {"notEqual", "notEqual"}, - {"sin_f32", "sin"}, - {"sinh_f32", "sinh"}, - {"sqrt_f32", "sqrt"}, - {"tan_f32", "tan"}, - {"tanh_f32", "tanh"}, - {"trunc_f32", "trunc"}, - }; - int workgroup_size[3] = {0, 0, 0}; - - // Maps each buffer with whether its base type is a vector. - std::map buffer_is_vector; -}; - -CodeGen_OpenGLCompute_C::CodeGen_OpenGLCompute_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { -} - -// Maps Halide types to appropriate GLSL types or emit error if no equivalent -// type is available. -Type CodeGen_OpenGLCompute_C::map_type(const Type &type) { - Type result = type; - if (type.is_scalar()) { - if (type.is_float()) { - user_assert(type.bits() <= 32) - << "GLSL: Can't represent a float with " << type.bits() << " bits.\n"; - result = Float(32); - } else if (type.is_bool()) { - // unchanged - } else if (type.is_int() && type.bits() <= 32) { - result = Int(32); - } else if (type.is_uint() && type.bits() <= 32) { - result = UInt(32); - } else { - user_error << "GLSL: Can't represent type '" << type << "'.\n"; - } - } else { - user_assert(type.lanes() <= 4) - << "GLSL: vector types wider than 4 aren't supported\n"; - user_assert(type.is_bool() || type.is_int() || type.is_uint() || type.is_float()) - << "GLSL: Can't represent vector type '" << type << "'.\n"; - Type scalar_type = type.element_of(); - result = map_type(scalar_type).with_lanes(type.lanes()); - } - return result; -} - -// Identifiers containing double underscores '__' are reserved in GLSL, so we -// have to use a different name mangling scheme than in the C code generator. 
-string CodeGen_OpenGLCompute_C::print_name(const string &name) { - const string mangled = CodeGen_C::print_name(name); - return replace_all(mangled, "__", "XX"); -} - -string CodeGen_OpenGLCompute_C::print_type(Type type, AppendSpaceIfNeeded space) { - ostringstream oss; - type = map_type(type); - if (type.is_scalar()) { - if (type.is_float()) { - oss << "float"; - } else if (type.is_bool()) { - oss << "bool"; - } else if (type.is_int()) { - oss << "int"; - } else if (type.is_uint()) { - oss << "uint"; - } else { - internal_error << "GLSL: invalid type '" << type << "' encountered.\n"; - } - } else { - if (type.is_float()) { - // no prefix for float vectors - } else if (type.is_bool()) { - oss << "b"; - } else if (type.is_int()) { - oss << "i"; - } else if (type.is_uint()) { - oss << "u"; - } else { - internal_error << "GLSL: invalid type '" << type << "' encountered.\n"; - } - oss << "vec" << type.lanes(); - } - - if (space == AppendSpace) { - oss << " "; - } - - return oss.str(); -} - -string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { - return "gl_LocalInvocationID.x"; - } else if (ends_with(name, ".__thread_id_y")) { - return "gl_LocalInvocationID.y"; - } else if (ends_with(name, ".__thread_id_z")) { - return "gl_LocalInvocationID.z"; - } else if (ends_with(name, ".__thread_id_w")) { - internal_error << "4-dimension loops with " << name << " are not supported\n"; - } else if (ends_with(name, ".__block_id_x")) { - return "gl_WorkGroupID.x"; - } else if (ends_with(name, ".__block_id_y")) { - return "gl_WorkGroupID.y"; - } else if (ends_with(name, ".__block_id_z")) { - return "gl_WorkGroupID.z"; - } else if (ends_with(name, ".__block_id_w")) { - internal_error << "4-dimension loops with " << name << " are not supported\n"; - } - internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; - return ""; -} - -int thread_loop_workgroup_index(const string &name) { - string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z", - ".__thread_id_w"}; - for (size_t i = 0; i < sizeof(ids) / sizeof(string); i++) { - if (ends_with(name, ids[i])) { - return i; - } - } - return -1; -} - -void CodeGen_OpenGLCompute_C::visit(const FloatImm *op) { - ostringstream oss; - // Print integral numbers with trailing ".0". For fractional numbers use a - // precision of 9 digits, which should be enough to recover the binary - // float unambiguously from the decimal representation (if iostreams - // implements correct rounding). - const float truncated = (op->value < 0 ? std::ceil(op->value) : std::floor(op->value)); - if (truncated == op->value) { - oss << std::fixed << std::setprecision(1) << op->value; - } else { - oss << std::setprecision(9) << op->value; - } - id = oss.str(); -} - -void CodeGen_OpenGLCompute_C::visit(const UIntImm *op) { - if (op->type == Bool()) { - if (op->value == 1) { - id = "true"; - } else { - id = "false"; - } - } else { - id = std::to_string(op->value) + "u"; - } -} - -void CodeGen_OpenGLCompute_C::visit(const Max *op) { - print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::PureExtern)); -} - -void CodeGen_OpenGLCompute_C::visit(const Min *op) { - print_expr(Call::make(op->type, "min", {op->a, op->b}, Call::PureExtern)); -} - -void CodeGen_OpenGLCompute_C::visit(const Mod *op) { - if (op->type.is_int() || op->type.is_uint()) { - // Just exploit the Euclidean identity - // FIXME: Why doesn't lower_euclidean_mod work for glsl? 
- // https://github.com/halide/Halide/issues/4979 - Expr zero = make_zero(op->type); - Expr equiv = select(op->a == zero, zero, - op->a - (op->a / op->b) * op->b); - equiv = common_subexpression_elimination(equiv); - print_expr(equiv); - } else { - print_expr(Call::make(op->type, "mod", {op->a, op->b}, Call::Extern)); - } -} - -// The following comparisons are defined for ivec and vec -// types, so we don't use call_builtin -void CodeGen_OpenGLCompute_C::visit(const EQ *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "equal", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const NE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "notEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const LT *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "lessThan", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const LE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "lessThanEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const GT *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "greaterThan", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const GE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "greaterThanEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Shuffle *op) { - // The halide Shuffle represents the llvm intrinisc - // shufflevector, however, for GLSL its use is limited to swizzling - // up to a four channel vec type. - - internal_assert(op->vectors.size() == 1); - - int shuffle_lanes = op->type.lanes(); - internal_assert(shuffle_lanes <= 4); - - string expr = print_expr(op->vectors[0]); - - // Create a swizzle expression for the shuffle - string swizzle; - for (int i = 0; i != shuffle_lanes; ++i) { - int channel = op->indices[i]; - internal_assert(channel < 4) << "Shuffle of invalid channel"; - swizzle += get_lane_suffix(channel); - } - - print_assignment(op->type, expr + "." + swizzle); -} - -void CodeGen_OpenGLCompute_C::visit(const Call *op) { - if (op->is_intrinsic(Call::gpu_thread_barrier)) { - internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n"; - - const auto *fence_type_ptr = as_const_int(op->args[0]); - internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n"; - auto fence_type = *fence_type_ptr; - - stream << get_indent() << "barrier();\n"; - - // barrier() is an execution barrier; for memory behavior, we'll use the - // least-common-denominator groupMemoryBarrier(), because other fence types - // require extensions or GL 4.3 as a minumum. - if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device || - fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) { - stream << "groupMemoryBarrier();\n"; - } - print_assignment(op->type, "0"); - } else if (op->is_intrinsic(Call::lerp)) { - // Implement lerp using GLSL's mix() function, which always uses - // floating point arithmetic. 
- Expr zero_val = op->args[0]; - Expr one_val = op->args[1]; - Expr weight = op->args[2]; - - internal_assert(weight.type().is_uint() || weight.type().is_float()); - if (weight.type().is_uint()) { - // Normalize integer weights to [0.0f, 1.0f] range. - internal_assert(weight.type().bits() < 32); - weight = Div::make(Cast::make(Float(32), weight), - Cast::make(Float(32), weight.type().max())); - } else if (op->type.is_uint()) { - // Round float weights down to next multiple of (1/op->type.imax()) - // to give same results as lerp based on integer arithmetic. - internal_assert(op->type.bits() < 32); - weight = floor(weight * op->type.max()) / op->type.max(); - } - - Type result_type = Float(32, op->type.lanes()); - Expr e = Call::make(result_type, "mix", {zero_val, one_val, weight}, Call::Extern); - - if (!op->type.is_float()) { - // Mirror rounding implementation of Halide's integer lerp. - e = Cast::make(op->type, floor(e + 0.5f)); - } - print_expr(e); - return; - } else if (op->is_intrinsic(Call::abs)) { - internal_assert(op->args.size() == 1); - Expr a = op->args[0]; - Type target_type = map_type(op->type); - if (op->type != Int(32)) { - print_assignment(target_type, print_type(target_type) + "(abs(" + print_expr(a) + "))"); - } else { - print_assignment(target_type, "abs(" + print_expr(a) + ")"); - } - return; - } else if (op->is_intrinsic(Call::absd)) { - internal_assert(op->args.size() == 2); - Expr a = op->args[0]; - Expr b = op->args[1]; - Expr e = cast(op->type, select(a < b, b - a, a - b)); - print_expr(e); - return; - } else if (op->is_intrinsic(Call::return_second)) { - internal_assert(op->args.size() == 2); - // Simply discard the first argument, which is generally a call to - // 'halide_printf'. - print_assignment(op->type, print_expr(op->args[1])); - return; - } else if (op->is_intrinsic(Call::round)) { - print_assignment(op->type, "roundEven(" + print_expr(op->args[0]) + ")"); - return; - } else if (op->name == "fast_inverse_f32") { - print_expr(make_one(op->type) / op->args[0]); - return; - } else if (op->name == "fast_inverse_sqrt_f32") { - print_expr(make_one(op->type) / sqrt(op->args[0])); - return; - } else if (op->name == "pow_f32") { - if (can_prove(op->args[0] > 0)) { - ostringstream rhs; - rhs << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")"; - print_assignment(op->type, rhs.str()); - return; - } else { - ostringstream base; - string a = print_expr(op->args[0]); - string b = print_expr(op->args[1]); - base << "pow(abs(" << a << "), " << b << ")"; - string c = print_assignment(op->type, base.str()); - Expr a_var = is_const(op->args[0]) ? op->args[0] : Variable::make(op->type, a); - Expr b_var = is_const(op->args[1]) ? op->args[1] : Variable::make(op->type, b); - Expr c_var = Variable::make(op->type, c); - // OpenGL isn't required to produce NaNs, so we return - // zero in the undefined case. 
- Expr equiv = select(a_var > 0 || b_var % 2 == 0, c_var, - b_var % 2 == 1, -c_var, - 0.0f); - print_expr(simplify(equiv)); - return; - } - } else if (op->is_intrinsic(Call::shift_right)) { - print_assignment(op->type, print_expr(op->args[0]) + " >> " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::shift_left)) { - print_assignment(op->type, print_expr(op->args[0]) + " << " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_not)) { - print_assignment(op->type, "~" + print_expr(op->args[0])); - } else if (op->is_intrinsic(Call::bitwise_and)) { - print_assignment(op->type, print_expr(op->args[0]) + " & " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_or)) { - print_assignment(op->type, print_expr(op->args[0]) + " | " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_xor)) { - print_assignment(op->type, print_expr(op->args[0]) + " ^ " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::div_round_to_zero)) { - print_assignment(op->type, print_expr(op->args[0]) + " / " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::mod_round_to_zero)) { - print_assignment(op->type, print_expr(op->args[0]) + " % " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::saturating_cast)) { - Expr e = lower_intrinsic(op); - print_expr(e); - return; - } else { - auto it = builtin.find(op->name); - if (it == builtin.end()) { - user_error << "GLSL: unknown function '" << op->name << "' encountered.\n"; - } - - ostringstream rhs; - rhs << it->second << "("; - for (size_t i = 0; i < op->args.size(); i++) { - if (i > 0) { - rhs << ", "; - } - rhs << print_expr(op->args[i]); - } - rhs << ")"; - print_assignment(op->type, rhs.str()); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Cast *op) { - Type value_type = op->value.type(); - // If both types are represented by the same GLSL type, no explicit cast - // is necessary. - Type target_type = map_type(op->type); - if (target_type == map_type(value_type)) { - Expr value = op->value; - if (value_type.code() == Type::Float) { - // float->int conversions may need explicit truncation if an - // integer type is embedded into a float. (Note: overflows are - // considered undefined behavior, so we do nothing about values - // that are out of range of the target type.) - if (op->type.code() == Type::UInt) { - value = simplify(floor(value)); - } else if (op->type.code() == Type::Int) { - value = simplify(trunc(value)); - } - } - // FIXME: Overflow is not UB for most Halide types - // https://github.com/halide/Halide/issues/4975 - value.accept(this); - } else { - print_assignment(target_type, print_type(target_type) + "(" + print_expr(op->value) + ")"); - } -} - -void CodeGen_OpenGLCompute_C::visit(const For *loop) { - user_assert(loop->for_type != ForType::GPULane) - << "The OpenGLCompute backend does not support the gpu_lanes() scheduling directive."; - - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; - internal_assert(is_const_zero(loop->min)); - - debug(4) << "loop extent is " << loop->extent << "\n"; - // - // Need to extract workgroup size. 
- // - int index = thread_loop_workgroup_index(loop->name); - if (index >= 0) { - const IntImm *int_limit = loop->extent.as(); - user_assert(int_limit != nullptr) << "For OpenGLCompute workgroup size must be a constant integer.\n"; - int new_workgroup_size = int_limit->value; - user_assert(workgroup_size[index] == 0 || - workgroup_size[index] == new_workgroup_size) - << "OpenGLCompute requires all gpu kernels have same workgroup size, " - << "but two different ones were encountered " << workgroup_size[index] - << " and " << new_workgroup_size - << " in dimension " << index << ".\n"; - workgroup_size[index] = new_workgroup_size; - debug(4) << "Workgroup size for index " << index << " is " << workgroup_size[index] << "\n"; - } - - stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) - << " = int(" << simt_intrinsic(loop->name) << ");\n"; - - loop->body.accept(this); - - } else { - user_assert(loop->for_type != ForType::Parallel) - << "Cannot use parallel loops inside OpenGLCompute kernel\n"; - CodeGen_C::visit(loop); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Ramp *op) { - if (op->lanes > 4) { - internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n"; - } - - ostringstream rhs; - // Print the sequence vec(0, 1, 2, ...). - rhs << print_type(op->type) << "("; - for (int i = 0; i < op->type.lanes(); i++) { - rhs << i; - if (i != op->type.lanes() - 1) { - rhs << ", "; - } - } - rhs << ")"; - - // Multiply by the stride and add the base. - rhs << " * " << print_expr(op->stride) << " + " << print_expr(op->base); - - print_assignment(op->type, rhs.str()); -} - -void CodeGen_OpenGLCompute_C::visit(const Broadcast *op) { - string id_value = print_expr(op->value); - ostringstream oss; - oss << print_type(op->type.with_lanes(op->lanes)) << "(" << id_value << ")"; - print_assignment(op->type.with_lanes(op->lanes), oss.str()); -} - -void CodeGen_OpenGLCompute_C::visit(const Load *op) { - user_assert(is_const_one(op->predicate)) << "GLSL: predicated load is not supported.\n"; - // https://github.com/halide/Halide/issues/4975 - - string name = print_name(op->name); - if (!allocations.contains(op->name)) { - name += ".data"; - } - - // If the index is scalar, just index the buffer using the index. - if (op->type.is_scalar()) { - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - string rhs = name + "[" + index_id + "]"; - print_assignment(op->type, rhs); - return; - } - - // If this is a dense vector load and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined() && buffer_is_vector[op->name]) { - string index_id = print_expr(ramp_base / op->type.lanes()); - string rhs = name + "[" + index_id + "]"; - print_assignment(op->type, rhs); - return; - } - - // Gather vector elements. 
- internal_assert(op->type.is_vector()); - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - string rhs = print_type(op->type) + "("; - for (int i = 0; i < op->type.lanes(); i++) { - rhs += name + "[" + index_id + "[" + std::to_string(i) + "]]"; - if (i != op->type.lanes() - 1) { - rhs += ", "; - } - } - rhs += ")"; - print_assignment(op->type, rhs); -} - -void CodeGen_OpenGLCompute_C::visit(const Store *op) { - user_assert(is_const_one(op->predicate)) << "GLSL: predicated store is not supported.\n"; - // https://github.com/halide/Halide/issues/4975 - - string name = print_name(op->name); - if (!allocations.contains(op->name)) { - name += ".data"; - } - - string value_id = print_expr(op->value); - - // If the index is scalar, just index the buffer using the index. - if (op->value.type().is_scalar()) { - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - stream << get_indent() << name << "[" << index_id << "] = "; - stream << value_id << ";\n"; - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); - return; - } - - // If this is a dense vector store and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined() && buffer_is_vector[op->name]) { - string index_id = print_expr(ramp_base / op->value.type().lanes()); - stream << get_indent() << name << "[" << index_id << "] = "; - stream << value_id << ";\n"; - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); - return; - } - - // Scatter vector elements. - internal_assert(op->value.type().is_vector()); - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - for (int i = 0; i < op->value.type().lanes(); i++) { - string sub_index_id = index_id + "[" + std::to_string(i) + "]"; - stream << get_indent() << name << "[" << sub_index_id << "] = "; - stream << value_id << "[" << std::to_string(i) << "];\n"; - } - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); -} - -void CodeGen_OpenGLCompute_C::visit(const Select *op) { - ostringstream rhs; - string true_val = print_expr(op->true_value); - string false_val = print_expr(op->false_value); - string cond = print_expr(op->condition); - if (op->type.is_scalar()) { - rhs << cond << " ? " << true_val << " : " << false_val; - } else { - rhs << print_type(op->type) << "("; - for (int i = 0; i < op->type.lanes(); i++) { - string index = "[" + std::to_string(i) + "]"; - rhs << cond << index << " ? 
" - << true_val << index << " : " - << false_val << index; - if (i != op->type.lanes() - 1) { - rhs << ", "; - } - } - rhs << ")"; - } - print_assignment(op->type, rhs.str()); -} - -class CodeGen_OpenGLCompute_Dev : public CodeGen_GPU_Dev { -public: - CodeGen_OpenGLCompute_Dev(const Target &target); - - // CodeGen_GPU_Dev interface - void add_kernel(Stmt stmt, - const std::string &name, - const std::vector &args) override; - - void init_module() override; - - std::vector compile_to_src() override; - - std::string get_current_kernel_name() override; - - void dump() override; - - std::string print_gpu_name(const std::string &name) override; - - std::string api_unique_name() override { - return "openglcompute"; - } - bool kernel_run_takes_types() const override { - return true; - } - -protected: - std::ostringstream src_stream; - std::string cur_kernel_name; - CodeGen_OpenGLCompute_C glc; -}; - -CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_Dev(const Target &target) - : glc(src_stream, target) { -} - -void CodeGen_OpenGLCompute_Dev::add_kernel(Stmt s, - const string &name, - const vector &args) { - debug(2) << "CodeGen_OpenGLCompute_Dev::compile " << name << "\n"; - - // TODO: do we have to uniquify these names, or can we trust that they are safe? - cur_kernel_name = name; - glc.add_kernel(s, name, args); -} - -namespace { -class FindSharedAllocations : public IRVisitor { - using IRVisitor::visit; - - void visit(const Allocate *op) override { - op->body.accept(this); - if (op->memory_type == MemoryType::GPUShared) { - allocs.push_back(op); - } - } - -public: - vector allocs; -}; - -// Check if all loads and stores to the member 'buffer' are dense, aligned, and -// have the same number of lanes. If this is indeed the case then the 'lanes' -// member stores the number of lanes in those loads and stores. -class CheckAlignedDenseVectorLoadStore : public IRVisitor { -public: - // True if all loads and stores from the buffer are dense, aligned, and all - // have the same number of lanes, false otherwise. - bool are_all_dense = true; - - // The number of lanes in the loads and stores. If the number of lanes is - // variable, then are_all_dense is set to false regardless, and this value - // is undefined. Initially set to -1 before any dense operation is - // discovered. - int lanes = -1; - - CheckAlignedDenseVectorLoadStore(string buffer) - : buffer(std::move(buffer)) { - } - -private: - // The name of the buffer to check. 
- string buffer; - - using IRVisitor::visit; - - void visit(const Load *op) override { - IRVisitor::visit(op); - - if (op->name != buffer) { - return; - } - - if (op->type.is_scalar()) { - are_all_dense = false; - return; - } - - Expr ramp_base = strided_ramp_base(op->index); - if (!ramp_base.defined()) { - are_all_dense = false; - return; - } - - if ((op->alignment.modulus % op->type.lanes() != 0) || - (op->alignment.remainder % op->type.lanes() != 0)) { - are_all_dense = false; - return; - } - - if (lanes != -1 && op->type.lanes() != lanes) { - are_all_dense = false; - return; - } - - lanes = op->type.lanes(); - } - - void visit(const Store *op) override { - IRVisitor::visit(op); - - if (op->name != buffer) { - return; - } - - if (op->value.type().is_scalar()) { - are_all_dense = false; - return; - } - - Expr ramp_base = strided_ramp_base(op->index); - if (!ramp_base.defined()) { - are_all_dense = false; - return; - } - - if ((op->alignment.modulus % op->value.type().lanes() != 0) || - (op->alignment.remainder % op->value.type().lanes() != 0)) { - are_all_dense = false; - return; - } - - if (lanes != -1 && op->value.type().lanes() != lanes) { - are_all_dense = false; - return; - } - - lanes = op->value.type().lanes(); - } -}; -} // namespace - -void CodeGen_OpenGLCompute_C::add_kernel(const Stmt &s, - const string &name, - const vector &args) { - - debug(2) << "Adding OpenGLCompute kernel " << name << "\n"; - cache.clear(); - - if (target.os == Target::Android) { - stream << "#version 310 es\n" - << "#extension GL_ANDROID_extension_pack_es31a : require\n"; - } else if (target.has_feature(Target::EGL)) { - stream << "#version 310 es\n"; - } else { - stream << "#version 430\n"; - } - stream << "float float_from_bits(int x) { return intBitsToFloat(int(x)); }\n"; - stream << "#define halide_maybe_unused(x) (void)(x)\n"; - - for (size_t i = 0; i < args.size(); i++) { - if (args[i].is_buffer) { - // - // layout(binding = 10) buffer buffer10 { - // vec3 data[]; - // } inBuffer; - // - CheckAlignedDenseVectorLoadStore check_dense(args[i].name); - s.accept(&check_dense); - int lanes = check_dense.are_all_dense ? check_dense.lanes : 1; - buffer_is_vector[args[i].name] = lanes > 1; - stream << "layout(binding=" << i << ")" - << " buffer buffer" << i << " { " - << print_type(args[i].type.with_lanes(lanes)) << " data[]; } " - << print_name(args[i].name) << ";\n"; - } else { - stream << "layout(location = " << i << ") uniform " << print_type(args[i].type) - << " " << print_name(args[i].name) << ";\n"; - } - } - - // Find all the shared allocations and declare them at global scope. - FindSharedAllocations fsa; - s.accept(&fsa); - for (const Allocate *op : fsa.allocs) { - internal_assert(op->extents.size() == 1 && is_const(op->extents[0])); - stream << "shared " - << print_type(op->type) << " " - << print_name(op->name) << "[" - << op->extents[0] << "];\n"; - } - - // We'll figure out the workgroup size while traversing the stmt - workgroup_size[0] = 0; - workgroup_size[1] = 0; - workgroup_size[2] = 0; - - stream << "void main()\n{\n"; - indent += 2; - print(s); - indent -= 2; - stream << "}\n"; - - // Declare the workgroup size. 
- indent += 2; - stream << "layout(local_size_x = " << workgroup_size[0]; - if (workgroup_size[1] > 1) { - stream << ", local_size_y = " << workgroup_size[1]; - } - if (workgroup_size[2] > 1) { - stream << ", local_size_z = " << workgroup_size[2]; - } - stream << ") in;\n// end of kernel " << name << "\n"; - indent -= 2; -} - -void CodeGen_OpenGLCompute_Dev::init_module() { - src_stream.str(""); - src_stream.clear(); - cur_kernel_name = ""; -} - -void CodeGen_OpenGLCompute_C::visit(const Allocate *op) { - debug(2) << "OpenGLCompute: Allocate " << op->name << " of type " << op->type << " on device\n"; - - stream << get_indent(); - Allocation alloc; - alloc.type = op->type; - allocations.push(op->name, alloc); - - internal_assert(!op->extents.empty()); - Expr extent = 1; - for (const Expr &e : op->extents) { - extent *= e; - } - extent = simplify(extent); - internal_assert(is_const(extent)); - - if (op->memory_type != MemoryType::GPUShared) { - stream << "{\n"; - indent += 2; - stream << get_indent(); - // Shared allocations were already declared at global scope. - stream << print_type(op->type) << " " - << print_name(op->name) << "[" - << op->extents[0] << "];\n"; - } - op->body.accept(this); - - if (op->memory_type != MemoryType::GPUShared) { - indent -= 2; - stream << get_indent() << "}\n"; - } - - buffer_is_vector[op->name] = op->type.is_vector(); -} - -void CodeGen_OpenGLCompute_C::visit(const Free *op) { - debug(2) << "OpenGLCompute: Free on device for " << op->name << "\n"; - - allocations.pop(op->name); -} - -void CodeGen_OpenGLCompute_C::visit(const Evaluate *op) { - if (is_const(op->value)) { - return; - } - print_expr(op->value); -} - -void CodeGen_OpenGLCompute_C::visit(const IntImm *op) { - if (op->type == Int(32)) { - // GL seems to interpret some large int immediates as uints. - id = "int(" + std::to_string(op->value) + ")"; - } else { - id = print_type(op->type) + "(" + std::to_string(op->value) + ")"; - } -} - -vector CodeGen_OpenGLCompute_Dev::compile_to_src() { - string str = src_stream.str(); - debug(1) << "GLSL Compute source:\n" - << str << "\n"; - vector buffer(str.begin(), str.end()); - buffer.push_back(0); - return buffer; -} - -string CodeGen_OpenGLCompute_Dev::get_current_kernel_name() { - return cur_kernel_name; -} - -void CodeGen_OpenGLCompute_Dev::dump() { - std::cerr << src_stream.str() << "\n"; -} - -std::string CodeGen_OpenGLCompute_Dev::print_gpu_name(const std::string &name) { - return name; -} - -} // namespace - -std::unique_ptr new_CodeGen_OpenGLCompute_Dev(const Target &target) { - return std::make_unique(target); -} - -} // namespace Internal -} // namespace Halide diff --git a/src/CodeGen_OpenGLCompute_Dev.h b/src/CodeGen_OpenGLCompute_Dev.h deleted file mode 100644 index f0a63c885909..000000000000 --- a/src/CodeGen_OpenGLCompute_Dev.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef HALIDE_CODEGEN_OPENGLCOMPUTE_DEV_H -#define HALIDE_CODEGEN_OPENGLCOMPUTE_DEV_H - -/** \file - * Defines the code-generator for producing GLSL kernel code for OpenGL Compute. 
- */ - -#include - -namespace Halide { - -struct Target; - -namespace Internal { - -struct CodeGen_GPU_Dev; - -std::unique_ptr new_CodeGen_OpenGLCompute_Dev(const Target &target); - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index b86c99f9269e..61b365f2f7aa 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -291,8 +291,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // have the same number of lanes. If this is indeed the case then the 'lanes' // member stores the number of lanes in those loads and stores. // -// FIXME: Refactor this and the version in CodeGen_OpenGLCompute_Dev to a common place! -// class CheckAlignedDenseVectorLoadStore : public IRVisitor { public: // True if all loads and stores from the buffer are dense, aligned, and all diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 33fa3b36e78e..551acfcdebf2 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -244,8 +244,6 @@ DeviceAPI Deserializer::deserialize_device_api(Serialize::DeviceAPI device_api) return DeviceAPI::CUDA; case Serialize::DeviceAPI::OpenCL: return DeviceAPI::OpenCL; - case Serialize::DeviceAPI::OpenGLCompute: - return DeviceAPI::OpenGLCompute; case Serialize::DeviceAPI::Metal: return DeviceAPI::Metal; case Serialize::DeviceAPI::Hexagon: diff --git a/src/DeviceAPI.h b/src/DeviceAPI.h index 1f67aaf7b048..12476a23b724 100644 --- a/src/DeviceAPI.h +++ b/src/DeviceAPI.h @@ -18,7 +18,6 @@ enum class DeviceAPI { Default_GPU, CUDA, OpenCL, - OpenGLCompute, Metal, Hexagon, HexagonDma, @@ -34,7 +33,6 @@ const DeviceAPI all_device_apis[] = {DeviceAPI::None, DeviceAPI::Default_GPU, DeviceAPI::CUDA, DeviceAPI::OpenCL, - DeviceAPI::OpenGLCompute, DeviceAPI::Metal, DeviceAPI::Hexagon, DeviceAPI::HexagonDma, diff --git a/src/DeviceInterface.cpp b/src/DeviceInterface.cpp index 9a0cb2f97e99..27f6b549ee7d 100644 --- a/src/DeviceInterface.cpp +++ b/src/DeviceInterface.cpp @@ -94,8 +94,6 @@ const halide_device_interface_t *get_device_interface_for_device_api(DeviceAPI d name = "opencl"; } else if (d == DeviceAPI::CUDA) { name = "cuda"; - } else if (d == DeviceAPI::OpenGLCompute) { - name = "openglcompute"; } else if (d == DeviceAPI::Hexagon) { name = "hexagon"; } else if (d == DeviceAPI::HexagonDma) { @@ -154,8 +152,6 @@ DeviceAPI get_default_device_api_for_target(const Target &target) { return DeviceAPI::OpenCL; } else if (target.has_feature(Target::CUDA)) { return DeviceAPI::CUDA; - } else if (target.has_feature(Target::OpenGLCompute)) { - return DeviceAPI::OpenGLCompute; } else if (target.arch != Target::Hexagon && target.has_feature(Target::HVX)) { return DeviceAPI::Hexagon; } else if (target.has_feature(Target::HexagonDma)) { @@ -192,9 +188,6 @@ Expr make_device_interface_call(DeviceAPI device_api, MemoryType memory_type) { case DeviceAPI::Metal: interface_name = "halide_metal_device_interface"; break; - case DeviceAPI::OpenGLCompute: - interface_name = "halide_openglcompute_device_interface"; - break; case DeviceAPI::Hexagon: interface_name = "halide_hexagon_device_interface"; break; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index cd59fd470d38..ef5a75344bb8 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -627,7 +627,6 @@ class ExtractSharedAndHeapAllocations : public IRMutator { if (!may_merge_allocs_of_different_type && mem_allocs[free_spaces[i]].group[0].type != alloc.type) { - // Types must also match for 
OpenGLCompute continue; } @@ -649,7 +648,6 @@ class ExtractSharedAndHeapAllocations : public IRMutator { if (!may_merge_allocs_of_different_type && mem_allocs[free_spaces[i]].group[0].type != alloc.type) { - // Types must also match for OpenGLCompute continue; } @@ -760,7 +758,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { // lifetimes, and then cluster the groups according to which // ones can share a single allocation. For cuda, opencl, and // similar we get one big combined allocation per memory - // type. For vulkan, openglcompute and direct3d, we also separate by + // type. For vulkan and direct3d, we also separate by // element type. map, vector> clustered_allocs; @@ -1034,8 +1032,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { : device_api(d), thread_id_var_name(unique_name('t')), num_threads_var_name(unique_name('t')), - may_merge_allocs_of_different_type(device_api != DeviceAPI::OpenGLCompute && - device_api != DeviceAPI::D3D12Compute && + may_merge_allocs_of_different_type(device_api != DeviceAPI::D3D12Compute && device_api != DeviceAPI::Vulkan && device_api != DeviceAPI::WebGPU) { } diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 52cb3714268c..bc03dd124d9a 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -99,9 +99,6 @@ ostream &operator<<(ostream &out, const DeviceAPI &api) { case DeviceAPI::OpenCL: out << ""; break; - case DeviceAPI::OpenGLCompute: - out << ""; - break; case DeviceAPI::Metal: out << ""; break; diff --git a/src/JITModule.cpp b/src/JITModule.cpp index 0d37c07284c3..ffd8949d4ca1 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -58,48 +58,6 @@ typedef struct CUctx_st *CUcontext; typedef struct cl_context_st *cl_context; typedef struct cl_command_queue_st *cl_command_queue; -void load_opengl(bool needs_egl) { -#if defined(__linux__) - if (have_symbol("glXGetCurrentContext") && have_symbol("glDeleteTextures")) { - debug(1) << "OpenGL support code already linked in...\n"; - } else { - debug(1) << "Looking for OpenGL support code...\n"; - string error; - if (needs_egl) { - // NVIDIA EGL prefers users to load libOpenGL.so instead of libGL.so - // The way we're using it, it seems like libGL.so.1 is a valid fallback. - // See here for more details: https://developer.nvidia.com/blog/linking-opengl-server-side-rendering - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libOpenGL.so.0", &error); - if (!error.empty()) { - debug(1) << "Could not find libOpenGL.so.0 when EGL requested. 
Falling back to libGL.so.1\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error); - } - user_assert(error.empty()) << "Could not find libOpenGL.so.0 or libGL.so.1\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libEGL.so.1", &error); - user_assert(error.empty()) << "Could not find libEGL.so.1\n"; - } else { - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error); - user_assert(error.empty()) << "Could not find libGL.so\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libX11.so.6", &error); - user_assert(error.empty()) << "Could not find libX11.so.6\n"; - } - } -#elif defined(__APPLE__) - if (have_symbol("aglCreateContext") && have_symbol("glDeleteTextures")) { - debug(1) << "OpenGL support code already linked in...\n"; - } else { - debug(1) << "Looking for OpenGL support code...\n"; - string error; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("/System/Library/Frameworks/AGL.framework/AGL", &error); - user_assert(error.empty()) << "Could not find AGL.framework\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("/System/Library/Frameworks/OpenGL.framework/OpenGL", &error); - user_assert(error.empty()) << "Could not find OpenGL.framework\n"; - } -#else - internal_error << "JIT support for OpenGL on anything other than linux or OS X not yet implemented\n"; -#endif -} - void load_metal() { #if defined(__APPLE__) if (have_symbol("MTLCreateSystemDefaultDevice")) { @@ -766,7 +724,6 @@ enum RuntimeKind { OpenCL, Metal, CUDA, - OpenGLCompute, // NOTE: this feature is deprecated and will be removed in Halide 17 Hexagon, D3D12Compute, Vulkan, @@ -774,7 +731,6 @@ enum RuntimeKind { OpenCLDebug, MetalDebug, CUDADebug, - OpenGLComputeDebug, // NOTE: this feature is deprecated and will be removed in Halide 17 HexagonDebug, D3D12ComputeDebug, VulkanDebug, @@ -812,7 +768,6 @@ JITModule &make_module(llvm::Module *for_module, Target target, one_gpu.set_feature(Target::Metal, false); one_gpu.set_feature(Target::CUDA, false); one_gpu.set_feature(Target::HVX, false); - one_gpu.set_feature(Target::OpenGLCompute, false); one_gpu.set_feature(Target::D3D12Compute, false); one_gpu.set_feature(Target::Vulkan, false); one_gpu.set_feature(Target::WebGPU, false); @@ -847,17 +802,6 @@ JITModule &make_module(llvm::Module *for_module, Target target, one_gpu.set_feature(Target::CUDA); module_name += "cuda"; break; - case OpenGLComputeDebug: - one_gpu.set_feature(Target::Debug); - one_gpu.set_feature(Target::OpenGLCompute); - module_name = "debug_openglcompute"; - load_opengl(one_gpu.has_feature(Target::EGL)); - break; - case OpenGLCompute: - one_gpu.set_feature(Target::OpenGLCompute); - module_name += "openglcompute"; - load_opengl(one_gpu.has_feature(Target::EGL)); - break; case HexagonDebug: one_gpu.set_feature(Target::Debug); one_gpu.set_feature(Target::HVX); @@ -1065,13 +1009,6 @@ std::vector JITSharedRuntime::get(llvm::Module *for_module, const Tar result.push_back(m); } } - if (target.has_feature(Target::OpenGLCompute)) { - auto kind = target.has_feature(Target::Debug) ? OpenGLComputeDebug : OpenGLCompute; - JITModule m = make_module(for_module, target, kind, result, create); - if (m.compiled()) { - result.push_back(m); - } - } if (target.has_feature(Target::HVX)) { auto kind = target.has_feature(Target::Debug) ? 
HexagonDebug : Hexagon; JITModule m = make_module(for_module, target, kind, result, create); diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index ad65bdc2ebc2..609fbc3467bc 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -111,13 +111,9 @@ DECLARE_CPP_INITMOD(module_jit_ref_count) DECLARE_CPP_INITMOD(msan) DECLARE_CPP_INITMOD(msan_stubs) DECLARE_CPP_INITMOD(opencl) -DECLARE_CPP_INITMOD(opengl_egl_context) -DECLARE_CPP_INITMOD(opengl_glx_context) -DECLARE_CPP_INITMOD(openglcompute) DECLARE_CPP_INITMOD(osx_clock) DECLARE_CPP_INITMOD(osx_get_symbol) DECLARE_CPP_INITMOD(osx_host_cpu_count) -DECLARE_CPP_INITMOD(osx_opengl_context) DECLARE_CPP_INITMOD(osx_yield) DECLARE_CPP_INITMOD(posix_aligned_alloc) DECLARE_CPP_INITMOD(posix_allocator) @@ -1211,23 +1207,6 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM modules.push_back(get_initmod_opencl(c, bits_64, debug)); } } - if (t.has_feature(Target::OpenGLCompute)) { - modules.push_back(get_initmod_openglcompute(c, bits_64, debug)); - if (t.os == Target::Android) { - // Only platform that supports OpenGL Compute for now. - modules.push_back(get_initmod_opengl_egl_context(c, bits_64, debug)); - } else if (t.os == Target::Linux) { - if (t.has_feature(Target::EGL)) { - modules.push_back(get_initmod_opengl_egl_context(c, bits_64, debug)); - } else { - modules.push_back(get_initmod_opengl_glx_context(c, bits_64, debug)); - } - } else if (t.os == Target::OSX) { - modules.push_back(get_initmod_osx_opengl_context(c, bits_64, debug)); - } else { - // You're on your own to provide definitions of halide_opengl_get_proc_address and halide_opengl_create_context - } - } if (t.has_feature(Target::Metal)) { modules.push_back(get_initmod_metal(c, bits_64, debug)); if (t.arch == Target::ARM) { diff --git a/src/Lower.cpp b/src/Lower.cpp index 37c4bac07efb..74af1aeffe28 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -211,7 +211,6 @@ void lower_impl(const vector &output_funcs, bool will_inject_host_copies = (t.has_gpu_feature() || - t.has_feature(Target::OpenGLCompute) || t.has_feature(Target::HexagonDma) || (t.arch != Target::Hexagon && (t.has_feature(Target::HVX)))); @@ -251,11 +250,10 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // OpenGL relies on GPU var canonicalization occurring before + // Vulkan relies on GPU var canonicalization occurring before // storage flattening. 
if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan) || - t.has_feature(Target::OpenGLCompute)) { + t.has_feature(Target::Vulkan)) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); @@ -327,8 +325,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after vectorizing:", s); if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan) || - t.has_feature(Target::OpenGLCompute)) { + t.has_feature(Target::Vulkan)) { debug(1) << "Injecting per-block gpu synchronization...\n"; s = fuse_gpu_thread_loops(s); log("Lowering after injecting per-block gpu synchronization:", s); diff --git a/src/Module.cpp b/src/Module.cpp index a00ff25e7d59..5bece0d7ebdd 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -332,7 +332,7 @@ struct ModuleContents { /** This is a copy of the code throughout the lowering process, which * reflects best the actual pipeline, without introducing device-specific * generated code from device-specific offloads (such as Cuda PTX, - * OpenGL Compute, etc...). In other words, we'd like to keep this + * etc...). In other words, we'd like to keep this * conceptually relevant and human-readable. */ Stmt conceptual_code; }; @@ -560,10 +560,6 @@ const Internal::Stmt &Module::get_conceptual_stmt() const { void Module::compile(const std::map &output_files) const { validate_outputs(output_files); - if (target().has_feature(Target::OpenGLCompute)) { - user_warning << "WARNING: OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.\n"; - } - // Minor but worthwhile optimization: if all of the output files are of types that won't // ever rely on submodules (e.g.: toplevel declarations in C/C++), don't bother resolving // the submodules, which can call compile_to_buffer(). diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 46e6544036b7..77a57efc1149 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -5,7 +5,6 @@ #include "CodeGen_GPU_Dev.h" #include "CodeGen_Metal_Dev.h" #include "CodeGen_OpenCL_Dev.h" -#include "CodeGen_OpenGLCompute_Dev.h" #include "CodeGen_PTX_Dev.h" #include "CodeGen_Vulkan_Dev.h" #include "CodeGen_WebGPU_Dev.h" @@ -166,7 +165,7 @@ class InjectGpuOffload : public IRMutator { return a.type.bits() > b.type.bits(); } else { // Ensure that buffer arguments come first: - // for many OpenGL/Compute systems, the + // for some GPU systems, the // legal indices for buffer args are much // more restrictive than for scalar args, // and scalar args can be 'grown' by @@ -267,9 +266,6 @@ class InjectGpuOffload : public IRMutator { // host arch or os. 
device_target.os = Target::OSUnknown; device_target.arch = Target::ArchUnknown; - if (target.has_feature(Target::OpenGLCompute)) { - cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(device_target); - } if (target.has_feature(Target::CUDA)) { cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target); } diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index c605d2038248..536b8994e686 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -957,10 +957,6 @@ void Pipeline::realize(JITUserContext *context, Target target = t; user_assert(defined()) << "Can't realize an undefined Pipeline\n"; - if (t.has_feature(Target::OpenGLCompute)) { - user_warning << "WARNING: OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.\n"; - } - debug(2) << "Realizing Pipeline for " << target << "\n"; if (target.has_unknowns()) { diff --git a/src/SelectGPUAPI.h b/src/SelectGPUAPI.h index ecb424364bb9..a38572d4946d 100644 --- a/src/SelectGPUAPI.h +++ b/src/SelectGPUAPI.h @@ -16,7 +16,7 @@ namespace Internal { /** Replace for loops with GPU_Default device_api with an actual * device API depending on what's enabled in the target. Choose the - * first of the following: opencl, cuda, openglcompute, opengl */ + * first of the following: opencl, cuda */ Stmt select_gpu_api(const Stmt &s, const Target &t); } // namespace Internal diff --git a/src/Serialization.cpp b/src/Serialization.cpp index f8be69271ff0..144d79af7e5e 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -214,8 +214,6 @@ Serialize::DeviceAPI Serializer::serialize_device_api(const DeviceAPI &device_ap return Serialize::DeviceAPI::CUDA; case DeviceAPI::OpenCL: return Serialize::DeviceAPI::OpenCL; - case DeviceAPI::OpenGLCompute: - return Serialize::DeviceAPI::OpenGLCompute; case DeviceAPI::Metal: return Serialize::DeviceAPI::Metal; case DeviceAPI::Hexagon: diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 223a33837c7a..d7e7c50002f6 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -422,8 +422,7 @@ class FlattenDimensions : public IRMutator { // Create image_load("name", name.buffer, x - x_min, x_extent, // y - y_min, y_extent, ...). Extents can be used by - // successive passes. OpenGL, for example, uses them - // for coordinate normalization. + // successive passes. vector args(2); args[0] = op->name; args[1] = buffer_var; @@ -600,7 +599,6 @@ Stmt storage_flattening(Stmt s, const vector &outputs, const map &env, const Target &target) { - // The OpenGL backend requires loop mins to be zero'd at this point. 
s = zero_gpu_loop_mins(s); // Make an environment that makes it easier to figure out which diff --git a/src/Target.cpp b/src/Target.cpp index c824fea1c928..082b5103bd0b 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -498,7 +498,6 @@ const std::map feature_name_map = { {"cl_doubles", Target::CLDoubles}, {"cl_half", Target::CLHalf}, {"cl_atomics64", Target::CLAtomics64}, - {"openglcompute", Target::OpenGLCompute}, {"egl", Target::EGL}, {"user_context", Target::UserContext}, {"profile", Target::Profile}, @@ -983,9 +982,6 @@ bool Target::supported() const { #if !defined(WITH_METAL) bad |= has_feature(Target::Metal); #endif -#if !defined(WITH_OPENGLCOMPUTE) - bad |= has_feature(Target::OpenGLCompute); -#endif #if !defined(WITH_D3D12) bad |= has_feature(Target::D3D12Compute); #endif @@ -1059,7 +1055,6 @@ bool Target::has_gpu_feature() const { has_feature(OpenCL) || has_feature(Metal) || has_feature(D3D12Compute) || - has_feature(OpenGLCompute) || has_feature(Vulkan) || has_feature(WebGPU)); } @@ -1118,14 +1113,12 @@ bool Target::supports_type(const Type &t) const { if (t.bits() == 64) { if (t.is_float()) { return (!has_feature(Metal) && - !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)) && (!has_feature(Vulkan) || has_feature(Target::VulkanFloat64)) && !has_feature(WebGPU)); } else { return (!has_feature(Metal) && - !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Vulkan) || has_feature(Target::VulkanInt64)) && !has_feature(WebGPU)); @@ -1157,8 +1150,6 @@ bool Target::supports_type(const Type &t, DeviceAPI device) const { // Shader Model 5.x can optionally support double-precision; 64-bit int // types are not supported. return t.bits() < 64; - } else if (device == DeviceAPI::OpenGLCompute) { - return t.bits() < 64; } else if (device == DeviceAPI::Vulkan) { if (t.is_float() && t.bits() == 64) { return has_feature(Target::VulkanFloat64); @@ -1214,9 +1205,6 @@ DeviceAPI Target::get_required_device_api() const { if (has_feature(Target::OpenCL)) { return DeviceAPI::OpenCL; } - if (has_feature(Target::OpenGLCompute)) { - return DeviceAPI::OpenGLCompute; - } if (has_feature(Target::Vulkan)) { return DeviceAPI::Vulkan; } @@ -1232,8 +1220,6 @@ Target::Feature target_feature_for_device_api(DeviceAPI api) { return Target::CUDA; case DeviceAPI::OpenCL: return Target::OpenCL; - case DeviceAPI::OpenGLCompute: - return Target::OpenGLCompute; case DeviceAPI::Metal: return Target::Metal; case DeviceAPI::Hexagon: @@ -1333,7 +1319,6 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) Metal, NoNEON, OpenCL, - OpenGLCompute, Vulkan, WebGPU, diff --git a/src/Target.h b/src/Target.h index 97c141f308e5..20730a313883 100644 --- a/src/Target.h +++ b/src/Target.h @@ -109,7 +109,6 @@ struct Target { CLDoubles = halide_target_feature_cl_doubles, CLHalf = halide_target_feature_cl_half, CLAtomics64 = halide_target_feature_cl_atomic64, - OpenGLCompute = halide_target_feature_openglcompute, // NOTE: This feature is deprecated and will be removed in Halide 17. EGL = halide_target_feature_egl, UserContext = halide_target_feature_user_context, Profile = halide_target_feature_profile, @@ -234,10 +233,7 @@ struct Target { /** Is a fully feature GPU compute runtime enabled? I.e. is * Func::gpu_tile and similar going to work? Currently includes - * CUDA, OpenCL, Metal and D3D12Compute. We do not include OpenGL, - * because it is not capable of gpgpu, and is not scheduled via - * Func::gpu_tile. 
- * TODO: Should OpenGLCompute be included here? */ + * CUDA, OpenCL, Metal and D3D12Compute. */ bool has_gpu_feature() const; /** Does this target allow using a certain type. Generally all diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index d91222d62f65..01a987b6f430 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -98,7 +98,6 @@ enum DeviceAPI: byte { Default_GPU, CUDA, OpenCL, - OpenGLCompute, Metal, Hexagon, HexagonDma, diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index b1331ed07e52..039fae2d1b11 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -43,13 +43,9 @@ set(RUNTIME_CPP msan msan_stubs opencl - opengl_egl_context - opengl_glx_context - openglcompute osx_clock osx_get_symbol osx_host_cpu_count - osx_opengl_context osx_yield posix_aligned_alloc posix_allocator @@ -135,7 +131,6 @@ set(RUNTIME_HEADER_FILES HalideRuntimeHexagonHost.h HalideRuntimeMetal.h HalideRuntimeOpenCL.h - HalideRuntimeOpenGLCompute.h HalideRuntimeQurt.h HalideRuntimeVulkan.h HalideRuntimeWebGPU.h diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 7b84e44f6928..d8ae1268fbaf 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1356,8 +1356,6 @@ typedef enum halide_target_feature_t { halide_target_feature_cl_doubles, ///< Enable double support on OpenCL targets halide_target_feature_cl_atomic64, ///< Enable 64-bit atomics operations on OpenCL targets - halide_target_feature_openglcompute, ///< Enable OpenGL Compute runtime. NOTE: This feature is deprecated and will be removed in Halide 17. - halide_target_feature_user_context, ///< Generated code takes a user_context pointer as first argument halide_target_feature_profile, ///< Launch a sampling profiler alongside the Halide pipeline that monitors and reports the runtime used by each Func diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h deleted file mode 100644 index f460703b798b..000000000000 --- a/src/runtime/HalideRuntimeOpenGLCompute.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H -#define HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H - -// Don't include HalideRuntime.h if the contents of it were already pasted into a generated header above this one -#ifndef HALIDE_HALIDERUNTIME_H - -#include "HalideRuntime.h" - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/** \file - * Routines specific to the Halide OpenGL Compute runtime. - */ - -#define HALIDE_RUNTIME_OPENGLCOMPUTE - -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern const struct halide_device_interface_t *halide_openglcompute_device_interface(); - -/** These are forward declared here to allow clients to override the - * Halide Glsl runtime. Do not call them. */ -// @{ - -/** This function sets up OpenGL context, loads relevant GL functions, then - * compiles src OpenGL compute shader into OpenGL program and stores it for future use. - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_openglcompute_initialize_kernels(void *user_context, void **state_ptr, - const char *src, int size); - -/** This function triggers execution of OpenGL program built around compute shader. - * Execution of the shader is parallelized into given number of blocks and threads. 
- * - * This function doesn't wait for the completion of the shader, but it sets memory - * barrier which forces successive retrieval of output data to wait until shader is done. - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_openglcompute_run(void *user_context, - void *state_ptr, - const char *entry_name, - int blocksX, int blocksY, int blocksZ, - int threadsX, int threadsY, int threadsZ, - int shared_mem_bytes, - struct halide_type_t arg_types[], - void *args[], - int8_t is_buffer[]); - -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr); -// @} - -/** This function retrieves pointers to OpenGL API functions. - * - * You may have to implement this yourself. Halide only provides implementations - * for some platforms." - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern void *halide_opengl_get_proc_address(void *user_context, const char *name); - -/** This function creates an OpenGL context for use by the OpenGL backend. - * - * You may have to implement this yourself as well. Halide only provides - * implementations for some platforms." - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_opengl_create_context(void *user_context); - -#ifdef __cplusplus -} // End extern "C" -#endif - -#endif // HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp index 710d1259678d..1625a6698ccc 100644 --- a/src/runtime/device_interface.cpp +++ b/src/runtime/device_interface.cpp @@ -473,10 +473,8 @@ WEAK int halide_default_buffer_copy(void *user_context, struct halide_buffer_t * // The right thing is that all devices have to support // device-to-device and device-to/from-arbitrarty-pointer. This - // means there will always have to be a device specifc version of - // this function and the default can go away or fail. At present - // there are some devices, e.g. OpenGL and OpenGLCompute, for which - // this is not yet implemented. + // means there will always have to be a device specific version of + // this function and the default can go away or fail. 
return halide_error_code_device_buffer_copy_failed; } diff --git a/src/runtime/mini_opengl.h b/src/runtime/mini_opengl.h deleted file mode 100644 index 1101fcd5a24c..000000000000 --- a/src/runtime/mini_opengl.h +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef MINI_OPENGL_H -#define MINI_OPENGL_H - -// ---------- OpenGL core (1.3 and earlier) ---------- - -typedef char GLchar; -typedef unsigned char GLubyte; -typedef unsigned int GLenum; -typedef unsigned char GLboolean; -typedef int GLint; -typedef unsigned int GLuint; -typedef int GLsizei; -typedef ptrdiff_t GLsizeiptr; -typedef float GLfloat; -typedef double GLdouble; -typedef void GLvoid; - -#define GL_NO_ERROR 0x0 -#define GL_FALSE 0x0 -#define GL_TRUE 0x1 -#define GL_TRIANGLES 0x0004 -#define GL_TRIANGLE_STRIP 0x0005 -#define GL_CULL_FACE 0x0B44 -#define GL_DEPTH_TEST 0x0B71 -#define GL_VIEWPORT 0x0BA2 -#define GL_PACK_ALIGNMENT 0x0D05 -#define GL_UNPACK_ALIGNMENT 0x0CF5 -#define GL_UNPACK_ROW_LENGTH 0x0CF2 -#define GL_PACK_ROW_LENGTH 0x0D02 -#define GL_TEXTURE_2D 0x0DE1 -#define GL_TEXTURE_WIDTH 0x1000 -#define GL_TEXTURE_HEIGHT 0x1001 -#define GL_BYTE 0x1400 -#define GL_UNSIGNED_BYTE 0x1401 -#define GL_SHORT 0x1402 -#define GL_UNSIGNED_SHORT 0x1403 -#define GL_INT 0x1404 -#define GL_UNSIGNED_INT 0x1405 -#define GL_FLOAT 0x1406 -#define GL_MODELVIEW 0x1700 -#define GL_PROJECTION 0x1701 -#define GL_RED 0x1903 -#define GL_RGB 0x1907 -#define GL_RGBA 0x1908 -#define GL_LUMINANCE 0x1909 -#define GL_LUMINANCE_ALPHA 0x190A -#define GL_VERSION 0x1F02 -#define GL_EXTENSIONS 0x1F03 -#define GL_NEAREST 0x2600 -#define GL_TEXTURE_MAG_FILTER 0x2800 -#define GL_TEXTURE_MIN_FILTER 0x2801 -#define GL_TEXTURE_WRAP_S 0x2802 -#define GL_TEXTURE_WRAP_T 0x2803 -#define GL_CLAMP_TO_EDGE 0x812F -#define GL_TEXTURE0 0x84C0 -#define GL_TEXTURE1 0x84C1 -#define GL_TEXTURE2 0x84C2 -#define GL_TEXTURE3 0x84C3 -#define GL_ACTIVE_TEXTURE 0x84E0 -#define GL_TEXTURE_BINDING_2D 0x8069 -#define GL_ACTIVE_UNIFORMS 0x8B86 - -typedef void (*PFNGLACTIVETEXTUREPROC)(GLenum texture); -typedef void (*PFNGLBINDTEXTUREPROC)(GLenum target, GLuint texture); -typedef void (*PFNGLDISABLEPROC)(GLenum cap); -typedef void (*PFNGLDELETETEXTURESPROC)(GLsizei n, const GLuint *textures); -typedef void (*PFNGLDRAWBUFFERSPROC)(GLsizei n, const GLenum *bufs); -typedef void (*PFNGLDRAWELEMENTSPROC)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices); -typedef void (*PFNGLGENTEXTURESPROC)(GLsizei n, GLuint *textures); -typedef GLenum (*PFNGLGETERRORPROC)(); -typedef const GLubyte *(*PFNGLGETSTRINGPROC)(GLenum name); -typedef void (*PFNGLGETTEXIMAGEPROC)(GLenum target, GLint level, - GLenum format, GLenum type, - GLvoid *pixels); -typedef void (*PFNGLLOADIDENTITYPROC)(); -typedef void (*PFNGLMATRIXMODEPROC)(GLenum mode); -typedef void (*PFNGLORTHOPROC)(GLdouble left, GLdouble right, - GLdouble bottom, GLdouble top, - GLdouble near_val, GLdouble far_val); -typedef void (*PFNGLPIXELSTOREIPROC)(GLenum pname, GLint param); - -typedef void (*PFNGLGETTEXLEVELPARAMETERIVPROC)(GLenum target, GLint level, - GLenum pname, GLint *params); -typedef void (*PFNGLTEXIMAGE2DPROC)(GLenum target, GLint level, - GLint internalFormat, - GLsizei width, GLsizei height, - GLint border, GLenum format, GLenum type, - const GLvoid *pixels); -typedef void (*PFNGLTEXPARAMETERIPROC)(GLenum target, GLenum pname, GLint param); -typedef void (*PFNGLTEXSUBIMAGE2DPROC)(GLenum target, GLint level, - GLint xoffset, GLint yoffset, - GLsizei width, GLsizei height, - GLenum format, GLenum type, - const GLvoid *data); -typedef 
void (*PFNGLVIEWPORTPROC)(GLint x, GLint y, GLsizei width, GLsizei height); -typedef void (*PFNGLREADPIXELS)(GLint x, GLint y, - GLsizei width, GLsizei height, - GLenum format, GLenum type, - GLvoid *pixels); - -// ---------- OpenGL 1.5 ---------- - -#define GL_ARRAY_BUFFER 0x8892 -#define GL_ELEMENT_ARRAY_BUFFER 0x8893 -#define GL_STATIC_DRAW 0x88E4 -#define GL_ARRAY_BUFFER_BINDING 0x8894 -#define GL_ELEMENT_ARRAY_BUFFER_BINDING 0x8895 - -typedef void (*PFNGLGENBUFFERSPROC)(GLsizei n, GLuint *buffers); -typedef void (*PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint *buffers); -typedef void (*PFNGLBINDBUFFERPROC)(GLenum target, GLuint buffer); -typedef void (*PFNGLBUFFERDATAPROC)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage); - -// ---------- OpenGL 2.0 ---------- - -#define GL_FRAGMENT_SHADER 0x8B30 -#define GL_VERTEX_SHADER 0x8B31 -#define GL_COMPILE_STATUS 0x8B81 -#define GL_LINK_STATUS 0x8B82 -#define GL_INFO_LOG_LENGTH 0x8B84 -#define GL_IMPLEMENTATION_COLOR_READ_FORMAT 0x8B9B -#define GL_IMPLEMENTATION_COLOR_READ_TYPE 0x8B9A -#define GL_CURRENT_PROGRAM 0x8B8D -#define GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS 0x8B4D -#define GL_MAX_VERTEX_ATTRIBS 0x8869 -#define GL_VERTEX_ATTRIB_ARRAY_ENABLED 0x8622 - -typedef void (*PFNGLATTACHSHADERPROC)(GLuint program, GLuint shader); -typedef void (*PFNGLCOMPILESHADERPROC)(GLuint shader); -typedef GLuint (*PFNGLCREATEPROGRAMPROC)(); -typedef GLuint (*PFNGLCREATESHADERPROC)(GLenum type); -typedef void (*PFNGLDELETEPROGRAMPROC)(GLuint program); -typedef void (*PFNGLDELETESHADERPROC)(GLuint shader); -typedef void (*PFNGLDISABLEVERTEXATTRIBARRAYPROC)(GLuint index); -typedef void (*PFNGLENABLEVERTEXATTRIBARRAYPROC)(GLuint index); -typedef GLint (*PFNGLGETATTRIBLOCATIONPROC)(GLuint program, const GLchar *name); -typedef void (*PFNGLGETPROGRAMIVPROC)(GLuint program, GLenum pname, GLint *params); -typedef void (*PFNGLGETPROGRAMINFOLOGPROC)(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -typedef void (*PFNGLGETSHADERIVPROC)(GLuint shader, GLenum pname, GLint *params); -typedef void (*PFNGLGETSHADERINFOLOGPROC)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -typedef GLint (*PFNGLGETUNIFORMLOCATIONPROC)(GLuint program, const GLchar *name); -typedef void (*PFNGLLINKPROGRAMPROC)(GLuint program); -typedef void (*PFNGLSHADERSOURCEPROC)(GLuint shader, GLsizei count, const GLchar **string, const GLint *length); -typedef void (*PFNGLUNIFORM1FPROC)(GLuint location, GLfloat value); -typedef void (*PFNGLUNIFORM1IPROC)(GLuint location, GLint value); -typedef void (*PFNGLUNIFORM1IVPROC)(GLint location, GLsizei count, const GLint *value); -typedef void (*PFNGLUNIFORM2IVPROC)(GLint location, GLsizei count, const GLint *value); -typedef void (*PFNGLUNIFORM1FVPROC)(GLint location, GLsizei count, const GLfloat *value); -typedef void (*PFNGLUSEPROGRAMPROC)(GLuint program); -typedef void (*PFNGLVERTEXATTRIBPOINTERPROC)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer); -typedef void (*PFNGLGETINTEGERV)(GLenum pname, GLint *data); -typedef void (*PFNGLGETBOOLEANV)(GLenum pname, GLboolean *data); -typedef void (*PFNGLFINISHPROC)(); -typedef void (*PFNGLGETVERTEXATTRIBIVPROC)(GLuint index, GLenum pname, GLint *params); - -// ---------- OpenGL 3.0 ---------- - -#define GL_MAJOR_VERSION 0x821B -#define GL_MINOR_VERSION 0x821C -#define GL_NUM_EXTENSIONS 0x821D -#define GL_RG 0x8227 -#define GL_R32F 0x822E -#define GL_RG32F 0x8230 -#define GL_RGBA32F 0x8814 -#define GL_RGB32F 
0x8815 -#define GL_LUMINANCE32F 0x8818 -#define GL_VERTEX_ARRAY_BINDING 0x85B5 - -// GL_ARB_framebuffer_object -#define GL_FRAMEBUFFER_COMPLETE 0x8CD5 -#define GL_COLOR_ATTACHMENT0 0x8CE0 -#define GL_FRAMEBUFFER 0x8D40 -#define GL_FRAMEBUFFER_BINDING 0x8CA6 - -typedef void (*PFNGLBINDFRAMEBUFFERPROC)(GLenum target, GLuint framebuffer); -typedef GLenum (*PFNGLCHECKFRAMEBUFFERSTATUSPROC)(GLenum target); -typedef void (*PFNGLDELETEFRAMEBUFFERSPROC)(GLsizei n, const GLuint *framebuffers); -typedef void (*PFNGLFRAMEBUFFERTEXTURE2DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); -typedef void (*PFNGLGENFRAMEBUFFERSPROC)(GLsizei n, GLuint *framebuffers); - -typedef void (*PFNGLGENVERTEXARRAYS)(GLsizei n, GLuint *arrays); -typedef void (*PFNGLBINDVERTEXARRAY)(GLuint array); -typedef void (*PFNGLDELETEVERTEXARRAYS)(GLsizei n, const GLuint *arrays); -typedef const GLubyte *(*PFNGLGETSTRINGI)(GLenum name, GLuint index); -typedef void (*PFNDRAWBUFFERS)(GLsizei n, const GLenum *bufs); - -// ---------- OpenGL ES 3.1 ---------- - -#define GL_TEXTURE_BUFFER_EXT 0x8c2a - -#define GL_COMPUTE_SHADER 0x91B9 -#define GL_DYNAMIC_COPY 0x88ea - -#define GL_READ_ONLY 0x88B8 -#define GL_WRITE_ONLY 0x88B9 - -#define GL_MAP_READ_BIT 0x0001 -#define GL_MAP_WRITE_BIT 0x0002 - -#define GL_SHADER_STORAGE_BUFFER 0x90D2 - -#define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x00000001 -#define GL_BUFFER_UPDATE_BARRIER_BIT 0x00000200 -#define GL_ALL_BARRIER_BITS 0xFFFFFFFF - -typedef unsigned int GLbitfield; -typedef ptrdiff_t GLintptr; - -typedef void (*PFNGLTEXBUFFEREXTPROC)(GLenum target, GLenum internalformat, GLuint buffer); -typedef void (*PFNGLBINDIMAGETEXTUREPROC)(GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format); -typedef void (*PFNGLMEMORYBARRIERPROC)(GLbitfield barriers); -typedef void *(*PFNGLMAPBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access); -typedef void (*PFNGLDISPATCHCOMPUTEPROC)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z); -typedef void (*PFNGLUNMAPBUFFERPROC)(GLenum target); -typedef void (*PFNGLBINDBUFFERBASEPROC)(GLenum target, GLuint index, GLuint buffer); -typedef void (*PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint *buffers); - -typedef void (*PFNGLGETACTIVEUNIFORM)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); -typedef GLint (*PFNGLGETUNIFORMLOCATION)(GLuint program, const GLchar *name); - -#endif // MINI_OPENGL_H diff --git a/src/runtime/opengl_egl_context.cpp b/src/runtime/opengl_egl_context.cpp deleted file mode 100644 index a41e51ee67a1..000000000000 --- a/src/runtime/opengl_egl_context.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" - -extern "C" { - -#define EGLAPI -#define EGLAPIENTRY -#define EGLAPIENTRYP EGLAPIENTRY * - -typedef int32_t EGLint; -typedef unsigned int EGLBoolean; -typedef unsigned int EGLenum; -typedef void *EGLContext; -typedef void *EGLDisplay; -typedef void *EGLNativeDisplayType; -typedef void *EGLConfig; -typedef void *EGLSurface; -typedef void *EGLDeviceEXT; - -typedef EGLBoolean(EGLAPIENTRYP PFNEGLQUERYDEVICESEXTPROC)( - EGLint max_devices, EGLDeviceEXT *devices, EGLint *num_devices); -typedef EGLDisplay(EGLAPIENTRYP PFNEGLGETPLATFORMDISPLAYEXTPROC)( - EGLenum platform, void *native_display, const EGLint *attrib_list); - -#define EGL_NO_CONTEXT ((EGLContext)0) -#define EGL_DEFAULT_DISPLAY ((EGLNativeDisplayType)0) -#define 
EGL_NO_DISPLAY ((EGLDisplay)0) -#define EGL_NO_SURFACE ((EGLSurface)0) - -#define EGL_SUCCESS 0x3000 - -#define EGL_ALPHA_SIZE 0x3021 -#define EGL_BLUE_SIZE 0x3022 -#define EGL_GREEN_SIZE 0x3023 -#define EGL_RED_SIZE 0x3024 -#define EGL_SURFACE_TYPE 0x3033 -#define EGL_NONE 0x3038 -#define EGL_RENDERABLE_TYPE 0x3040 -#define EGL_HEIGHT 0x3056 -#define EGL_WIDTH 0x3057 -#define EGL_CONTEXT_CLIENT_VERSION 0x3098 - -#define EGL_PLATFORM_DEVICE_EXT 0x313F - -#define EGL_PBUFFER_BIT 0x0001 -#define EGL_OPENGL_ES2_BIT 0x0004 - -#define EGL_FALSE 0 -#define EGL_TRUE 1 - -EGLAPI EGLint EGLAPIENTRY eglGetError(void); -EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext(void); -EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id); -EGLAPI EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor); -EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, - EGLConfig *configs, EGLint config_size, - EGLint *num_config); -EGLAPI EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, - EGLContext share_context, - const EGLint *attrib_list); -EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config, - const EGLint *attrib_list); -EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, - EGLSurface read, EGLContext ctx); - -EGLAPI void *eglGetProcAddress(const char *procname); - -extern int strcmp(const char *, const char *); - -WEAK int halide_opengl_create_context(void *user_context) { - if (eglGetCurrentContext() != EGL_NO_CONTEXT) { - return halide_error_code_success; - } - - EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - if (display == EGL_NO_DISPLAY || !eglInitialize(display, nullptr, nullptr)) { - PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT = - reinterpret_cast( - eglGetProcAddress("eglQueryDevicesEXT")); - if (eglQueryDevicesEXT == nullptr) { - return halide_error_code_generic_error; - } - - PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT = - reinterpret_cast( - eglGetProcAddress("eglGetPlatformDisplayEXT")); - if (eglGetPlatformDisplayEXT == nullptr) { - return halide_error_code_generic_error; - } - - const int kMaxDevices = 32; - EGLDeviceEXT egl_devices[kMaxDevices]; - EGLint num_devices = 0; - EGLint egl_error = eglGetError(); - if (!eglQueryDevicesEXT(kMaxDevices, egl_devices, &num_devices) || - egl_error != EGL_SUCCESS) { - return halide_error_code_generic_error; - } - - EGLBoolean initialized = EGL_FALSE; - for (EGLint i = 0; i < num_devices; ++i) { - display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, - egl_devices[i], nullptr); - if (eglGetError() == EGL_SUCCESS && display != EGL_NO_DISPLAY) { - int major, minor; - initialized = eglInitialize(display, &major, &minor); - if (eglGetError() == EGL_SUCCESS && initialized == EGL_TRUE) { - break; - } - } - } - - if (eglGetError() != EGL_SUCCESS || initialized != EGL_TRUE) { - error(user_context) << "Could not initialize EGL display"; - return halide_error_code_generic_error; - } - } - - EGLint attribs[] = { - EGL_SURFACE_TYPE, - EGL_PBUFFER_BIT, - EGL_RENDERABLE_TYPE, - EGL_OPENGL_ES2_BIT, - EGL_RED_SIZE, - 8, - EGL_GREEN_SIZE, - 8, - EGL_BLUE_SIZE, - 8, - EGL_ALPHA_SIZE, - 8, - EGL_NONE, - }; - EGLConfig config; - int numconfig; - EGLBoolean result = eglChooseConfig(display, attribs, &config, 1, &numconfig); - if (result != EGL_TRUE || numconfig != 1) { - debug(user_context) << "eglChooseConfig(): config not found: " - << " result=" << (int)result - << " 
eglGetError=" << eglGetError() - << " numConfig=" << numconfig; - error(user_context) << "eglChooseConfig(): config not found."; - return halide_error_code_generic_error; - } - - EGLint context_attribs[] = { - EGL_CONTEXT_CLIENT_VERSION, 2, - EGL_NONE}; - EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, - context_attribs); - if (context == EGL_NO_CONTEXT) { - error(user_context) << "eglCreateContext failed."; - return halide_error_code_generic_error; - } - - EGLint surface_attribs[] = { - EGL_WIDTH, 1, - EGL_HEIGHT, 1, - EGL_NONE}; - EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs); - if (surface == EGL_NO_SURFACE) { - error(user_context) << "Error: Could not create EGL window surface."; - return halide_error_code_generic_error; - } - - result = eglMakeCurrent(display, surface, surface, context); - if (result != EGL_TRUE) { - debug(user_context) << "eglMakeCurrent fails: " - << " result=" << (int)result - << " eglGetError=" << eglGetError(); - error(user_context) << "eglMakeCurrent failed."; - return halide_error_code_generic_error; - } - return halide_error_code_success; -} - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - return (void *)eglGetProcAddress(name); -} - -} // extern "C" diff --git a/src/runtime/opengl_glx_context.cpp b/src/runtime/opengl_glx_context.cpp deleted file mode 100644 index 093285668806..000000000000 --- a/src/runtime/opengl_glx_context.cpp +++ /dev/null @@ -1,156 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" - -extern "C" { - -typedef void *GLXContext; -typedef void *GLXFBConfig; -typedef int Bool; -typedef void Display; - -typedef void (*__GLXextFuncPtr)(); -extern __GLXextFuncPtr glXGetProcAddressARB(const char *); -extern void *XOpenDisplay(void *); -extern int XDefaultScreen(void *); -extern int glXQueryExtension(void *, void *, void *); -extern const char *glXQueryExtensionsString(Display *dpy, int screen); -extern GLXContext glXCreateNewContext(void *, void *, int, void *, int); -extern void **glXChooseFBConfig(void *, int, const int *, int *); -extern unsigned long glXCreatePbuffer(void *, void *, const int *); -extern int XFree(void *); -extern int XSync(void *, int); -extern GLXContext glXGetCurrentContext(); -extern int glXMakeContextCurrent(void *, unsigned long, unsigned long, void *); - -#define GLX_RENDER_TYPE 0x8011 -#define GLX_RGBA_TYPE 0x8014 -#define GLX_RGBA_BIT 1 -#define GLX_RED_SIZE 8 -#define GLX_GREEN_SIZE 8 -#define GLX_BLUE_SIZE 8 -#define GLX_ALPHA_SIZE 8 - -#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 -#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 -typedef GLXContext (*glXCreateContextAttribsARBProc)(Display *, GLXFBConfig, GLXContext, Bool, const int *); - -} // extern "C" - -namespace Halide { -namespace Runtime { -namespace Internal { - -// Helper to check for extension string presence. Adapted from: -// http://www.opengl.org/resources/features/OGLextensions/ -WEAK bool glx_extension_supported(const char *extlist, const char *extension) { - // Extension names should not have spaces. - if (strchr(extension, ' ') != nullptr || *extension == '\0') { - return false; - } - - const char *start = extlist; - while (const char *pos = strstr(start, extension)) { - const char *end = pos + strlen(extension); - // Ensure the found match is a full word, not a substring. 
- if ((pos == start || pos[-1] == ' ') && - (*end == ' ' || *end == '\0')) { - return true; - } - start = end; - } - return false; -} - -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -extern "C" { - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - return (void *)glXGetProcAddressARB(name); -} - -// Initialize OpenGL -WEAK int halide_opengl_create_context(void *user_context) { - const int desired_major_version = 3; - const int desired_minor_version = 2; - - if (glXGetCurrentContext()) { - // Already have a context - return halide_error_code_success; - } - - void *dpy = XOpenDisplay(nullptr); - if (!dpy) { - error(user_context) << "Could not open X11 display."; - return halide_error_code_generic_error; - } - - // GLX supported? - if (!glXQueryExtension(dpy, nullptr, nullptr)) { - error(user_context) << "GLX not supported by X server."; - return halide_error_code_generic_error; - } - - int screen = XDefaultScreen(dpy); - - int attribs[] = { - GLX_RENDER_TYPE, GLX_RGBA_BIT, - GLX_RED_SIZE, 8, - GLX_GREEN_SIZE, 8, - GLX_BLUE_SIZE, 8, - GLX_ALPHA_SIZE, 8, - 0}; - int num_configs = 0; - void **fbconfigs = glXChooseFBConfig(dpy, screen, attribs, &num_configs); - if (!num_configs) { - error(user_context) << "Could not get framebuffer config."; - return halide_error_code_generic_error; - } - void *fbconfig = fbconfigs[0]; - - const char *glxexts = glXQueryExtensionsString(dpy, screen); - void *share_list = nullptr; - int direct = 1; - void *context = nullptr; - - glXCreateContextAttribsARBProc glXCreateContextAttribsARB = nullptr; - glXCreateContextAttribsARB = (glXCreateContextAttribsARBProc) - glXGetProcAddressARB("glXCreateContextAttribsARB"); - - if (glx_extension_supported(glxexts, "GLX_ARB_create_context") && - glXCreateContextAttribsARB) { - int context_attribs[] = { - GLX_CONTEXT_MAJOR_VERSION_ARB, desired_major_version, - GLX_CONTEXT_MINOR_VERSION_ARB, desired_minor_version, - 0}; - context = glXCreateContextAttribsARB(dpy, fbconfig, share_list, direct, - context_attribs); - } - if (!context) { - // Open a legacy context - context = glXCreateNewContext(dpy, fbconfig, GLX_RGBA_TYPE, share_list, direct); - } - if (!context) { - error(user_context) << "Could not create OpenGL context."; - return halide_error_code_generic_error; - } - - int pbuffer_attribs[] = { - 0x8041 /* GLX_PBUFFER_WIDTH */, 32, - 0x8040 /* GLX_PBUFFER_HEIGHT */, 32, - 0}; - unsigned long pbuffer = glXCreatePbuffer(dpy, fbconfig, pbuffer_attribs); - - XFree(fbconfigs); - XSync(dpy, 0); - - if (!glXMakeContextCurrent(dpy, pbuffer, pbuffer, context)) { - error(user_context) << "Could not make context current."; - return halide_error_code_generic_error; - } - - return halide_error_code_success; -} -} diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp deleted file mode 100644 index edb1327d90a9..000000000000 --- a/src/runtime/openglcompute.cpp +++ /dev/null @@ -1,990 +0,0 @@ -// Ignore deprecation warnings inside our own runtime -#define HALIDE_ALLOW_DEPRECATED 1 - -#include "HalideRuntimeOpenGLCompute.h" -#include "device_buffer_utils.h" -#include "device_interface.h" -#include "mini_opengl.h" -#include "printer.h" - -// Implementation note: all function that directly or indirectly access the -// runtime state in halide_openglcompute_state must be declared as WEAK, otherwise -// the behavior at runtime is undefined. - -// List of all OpenGL functions used by the runtime. 
The list is used to -// declare and initialize the dispatch table in OpenGLState below. -// -// grep "global_state." ../../src/runtime/openglcompute.cpp | sed -n "s/^\(.*\)global_state\.\([^(]*\).*/\2/p" | sort | uniq -// +GetError, GetString -// -CheckAndReportError -// -#define USED_GL_FUNCTIONS \ - GLFUNC(PFNGLATTACHSHADERPROC, AttachShader); \ - GLFUNC(PFNGLBINDBUFFERPROC, BindBuffer); \ - GLFUNC(PFNGLBINDBUFFERBASEPROC, BindBufferBase); \ - GLFUNC(PFNGLBUFFERDATAPROC, BufferData); \ - GLFUNC(PFNGLCREATEPROGRAMPROC, CreateProgram); \ - GLFUNC(PFNGLCOMPILESHADERPROC, CompileShader); \ - GLFUNC(PFNGLCREATESHADERPROC, CreateShader); \ - GLFUNC(PFNGLDELETEBUFFERSPROC, DeleteBuffers); \ - GLFUNC(PFNGLDELETEPROGRAMPROC, DeleteProgram); \ - GLFUNC(PFNGLDELETESHADERPROC, DeleteShader); \ - GLFUNC(PFNGLDISPATCHCOMPUTEPROC, DispatchCompute); \ - GLFUNC(PFNGLFINISHPROC, Finish); \ - GLFUNC(PFNGLGENBUFFERSPROC, GenBuffers); \ - GLFUNC(PFNGLGETERRORPROC, GetError); \ - GLFUNC(PFNGLGETPROGRAMINFOLOGPROC, GetProgramInfoLog); \ - GLFUNC(PFNGLGETPROGRAMIVPROC, GetProgramiv); \ - GLFUNC(PFNGLGETSHADERINFOLOGPROC, GetShaderInfoLog); \ - GLFUNC(PFNGLGETSHADERIVPROC, GetShaderiv); \ - GLFUNC(PFNGLGETSTRINGPROC, GetString); \ - GLFUNC(PFNGLLINKPROGRAMPROC, LinkProgram); \ - GLFUNC(PFNGLMAPBUFFERRANGEPROC, MapBufferRange); \ - GLFUNC(PFNGLMEMORYBARRIERPROC, MemoryBarrier); \ - GLFUNC(PFNGLSHADERSOURCEPROC, ShaderSource); \ - GLFUNC(PFNGLUNIFORM1IPROC, Uniform1i); \ - GLFUNC(PFNGLUNIFORM1IPROC, Uniform1ui); \ - GLFUNC(PFNGLUNIFORM1FPROC, Uniform1f); \ - GLFUNC(PFNGLUNMAPBUFFERPROC, UnmapBuffer); \ - GLFUNC(PFNGLUSEPROGRAMPROC, UseProgram); \ - GLFUNC(PFNGLGETACTIVEUNIFORM, GetActiveUniform); \ - GLFUNC(PFNGLGETUNIFORMLOCATION, GetUniformLocation); - -using namespace Halide::Runtime::Internal; - -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGLCompute { - -extern WEAK halide_device_interface_t openglcompute_device_interface; - -WEAK const char *gl_error_name(int32_t err) { - switch (err) { - case 0x500: - return "GL_INVALID_ENUM"; - break; - case 0x501: - return "GL_INVALID_VALUE"; - break; - case 0x502: - return "GL_INVALID_OPERATION"; - break; - case 0x503: - return "GL_STACK_OVERFLOW"; - break; - case 0x504: - return "GL_STACK_UNDERFLOW"; - break; - case 0x505: - return "GL_OUT_OF_MEMORY"; - break; - case 0x506: - return "GL_INVALID_FRAMEBUFFER_OPERATION"; - break; - case 0x507: - return "GL_CONTEXT_LOST"; - break; - case 0x8031: - return "GL_TABLE_TOO_LARGE"; - break; - default: - break; - } - return ""; -} - -struct HalideMalloc { - ALWAYS_INLINE HalideMalloc(void *user_context, size_t size) - : user_context(user_context), ptr(halide_malloc(user_context, size)) { - } - ALWAYS_INLINE ~HalideMalloc() { - halide_free(user_context, ptr); - } - void *const user_context; - void *const ptr; -}; - -struct KernelInfo { - char *kernel_name; - GLuint program_id; - KernelInfo *next; -}; - -struct ModuleState { - KernelInfo *kernel; - ModuleState *next; -}; - -WEAK KernelInfo *find_kernel_by_name(const char *kernel_name, const ModuleState *module) { - KernelInfo *kernel = module->kernel; - while (kernel && strcmp(kernel_name, kernel->kernel_name) != 0) { - kernel = kernel->next; - } - return kernel; -} - -// All persistent state maintained by the runtime. 
-struct GlobalState { - void init(); - int CheckAndReportError(void *user_context, const char *location); - - bool initialized; - - // Declare pointers used OpenGL functions -#define GLFUNC(PTYPE, VAR) PTYPE VAR - USED_GL_FUNCTIONS; -#undef GLFUNC -}; - -WEAK int GlobalState::CheckAndReportError(void *user_context, const char *location) { - GLenum err = GetError(); - if (err == GL_NO_ERROR) { - return halide_error_code_success; - } - - error(user_context) << "OpenGL error " << gl_error_name(err) << "(" << (int)err << ")" - << " at " << location << "."; - return halide_error_code_generic_error; -} - -WEAK GlobalState global_state; - -// A list of module-specific state. Each module corresponds to a single Halide filter -WEAK ModuleState *state_list; - -// ---------- Helper functions ---------- - -WEAK void debug_buffer(void *user_context, halide_buffer_t *buf) { - debug(user_context) - << " device: " << buf->device << "\n" - << " texture_id: " << (GLuint)buf->device << "\n" - << " host: " << buf->host << "\n" - << " extent: " << buf->dim[0].extent << " " << buf->dim[1].extent - << " " << buf->dim[2].extent << " " << buf->dim[3].extent << "\n" - << " stride: " << buf->dim[0].stride << " " << buf->dim[1].stride - << " " << buf->dim[2].stride << " " << buf->dim[3].stride << "\n" - << " min: " << buf->dim[0].min << " " << buf->dim[1].min - << " " << buf->dim[2].min << " " << buf->dim[3].min << "\n" - << " type: " << buf->type << "\n" - << " host_dirty: " << buf->host_dirty() << "\n" - << " device_dirty: " << buf->device_dirty() << "\n"; -} - -WEAK void GlobalState::init() { - initialized = false; -#define GLFUNC(type, name) name = nullptr; - USED_GL_FUNCTIONS; -#undef GLFUNC -} - -WEAK int load_gl_func(void *user_context, const char *name, void **ptr, bool required) { - void *p = halide_opengl_get_proc_address(user_context, name); - if (!p && required) { - error(user_context) << "Could not load function pointer for " << name; - return halide_error_code_symbol_not_found; - } - *ptr = p; - return halide_error_code_success; -} - -// Initialize the OpenGL-specific parts of the runtime. -WEAK int halide_openglcompute_init(void *user_context) { - if (global_state.initialized) { - return halide_error_code_success; - } - - global_state.init(); - - // Make a context if there isn't one - auto result = halide_opengl_create_context(user_context); - if (result) { - return result; - } - - // Initialize pointers to OpenGL functions. -#define GLFUNC(TYPE, VAR) \ - if (load_gl_func(user_context, "gl" #VAR, (void **)&global_state.VAR, true) < 0) { \ - error(user_context) << "Failed to load function: gl" #VAR; \ - return halide_error_code_symbol_not_found; \ - } - USED_GL_FUNCTIONS; -#undef GLFUNC - - debug(user_context) << "Halide running on " << global_state.GetString(GL_VERSION) << "\n"; - - global_state.initialized = true; - return halide_error_code_success; -} - -// Release all data allocated by the runtime. -// -// The OpenGL context itself is generally managed by the host application, so -// we leave it untouched. 
-WEAK int halide_openglcompute_device_release(void *user_context) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) << "OpenGLCompute: halide_openglcompute_device_release(user_context: " - << user_context << ")\n"; - - ModuleState *mod = state_list; - while (mod) { - KernelInfo *kernel = mod->kernel; - while (kernel) { - KernelInfo *next_kernel = kernel->next; - global_state.DeleteProgram(kernel->program_id); - free(kernel->kernel_name); - free(kernel); - kernel = next_kernel; - } - mod->kernel = nullptr; - ModuleState *next = mod->next; - // do not call free(mod) to avoid dangling pointers: the module state - // is still referenced in the code generated by Halide (see - // CodeGen_GPU_Host::get_module_state). - mod = next; - } - - global_state = GlobalState(); - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms\n"; -#endif - - return halide_error_code_success; -} - -// Allocate a new texture matching the dimension and color format of the -// specified buffer. -WEAK int halide_openglcompute_device_malloc(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) << "OpenGLCompute: halide_openglcompute_device_malloc (user_context: " - << user_context << ", buf: " << buf << ")\n"; - - auto result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - - size_t size = buf->size_in_bytes(); - halide_abort_if_false(user_context, size != 0); - - if (buf->device) { - // This buffer already has a device allocation - debug(user_context) << "openglcompute_device_malloc: This buffer already has a " - "device allocation\n"; - return halide_error_code_success; - } - - for (int i = 0; i < buf->dimensions; i++) { - halide_abort_if_false(user_context, buf->dim[i].stride >= 0); - } - - debug(user_context) << " allocating buffer, " - << "extents: " << buf->dim[0].extent << "x" - << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" - << buf->dim[3].extent << " " - << "strides: " << buf->dim[0].stride << "x" - << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" - << buf->dim[3].stride << " " - << "(type: " << buf->type << ")\n"; - - result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - debug(user_context) << "openglcompute_device_malloc: initialization completed.\n"; - - if (!buf) { - return halide_error_code_buffer_argument_is_null; - } - - GLuint the_buffer; - global_state.GenBuffers(1, &the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: GenBuffers"); - if (result) { - return result; - } - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - // OpenGLCompute only supports int32, uint32, and float data - // types, all of which are 4 bytes. We'll inflate the size for - // smaller types. 
- size *= (4 / buf->type.bytes()); - halide_abort_if_false(user_context, size != 0); - global_state.BufferData(GL_ARRAY_BUFFER, size, nullptr, GL_DYNAMIC_COPY); - result = global_state.CheckAndReportError(user_context, "oglc: BufferData"); - if (result) { - return result; - } - - buf->device = the_buffer; - buf->device_interface = &openglcompute_device_interface; - buf->device_interface->impl->use_module(); - - debug(user_context) << "Allocated dev_buffer(i.e. vbo) " << the_buffer << "\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for malloc\n"; -#endif - - return halide_error_code_success; -} - -WEAK int halide_openglcompute_device_free(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized in call to halide_openglcompute_device_free."; - return halide_error_code_generic_error; - } - - if (buf->device == 0) { - return halide_error_code_success; - } - GLuint the_buffer = (GLuint)buf->device; - - debug(user_context) << "OGLC: halide_openglcompute_device_free (" - << "user_context: " << user_context - << ", the_buffer:" << the_buffer - << ")\n"; - - global_state.DeleteBuffers(1, &the_buffer); - - buf->device = 0; - buf->device_interface->impl->release_module(); - buf->device_interface = nullptr; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for free\n"; -#endif - - return halide_error_code_success; -} - -namespace { - -template -ALWAYS_INLINE void converting_copy_memory_helper(const device_copy ©, int d, int64_t src_off, int64_t dst_off) { - // Skip size-1 dimensions - while (d >= 0 && copy.extent[d] == 1) { - d--; - } - - if (d == -1) { - const Source *from = (Source *)(copy.src + src_off); - Dest *to = (Dest *)(copy.dst + dst_off); - for (uint64_t index = 0; index < copy.chunk_size; index++) { - *to++ = (Dest)*from++; - } - } else { - for (uint64_t i = 0; i < copy.extent[d]; i++) { - converting_copy_memory_helper(copy, d - 1, src_off, dst_off); - src_off += copy.src_stride_bytes[d]; - dst_off += copy.dst_stride_bytes[d]; - } - } -} - -} // namespace -// Copy image data from host memory to texture. 
-WEAK int halide_openglcompute_copy_to_device(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_copy_to_device)."; - return halide_error_code_generic_error; - } - - GLuint the_buffer = (GLuint)buf->device; - debug(user_context) << "OGLC: halide_openglcompute_copy_to_device (" - << "user_context: " << user_context - << ", buf: " << buf - << ", the_buffer:" << the_buffer << ")\n"; - - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - auto result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - size_t size = buf->number_of_elements() * 4; - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - debug(user_context) << "Calling global_state.MapBufferRange(GL_ARRAY_BUFFER, 0, " << (uint64_t)size << ", GL_MAP_READ_BIT|GL_MAP_WRITE_BIT)\n"; - void *device_data = global_state.MapBufferRange(GL_ARRAY_BUFFER, - 0, - size, - GL_MAP_READ_BIT | GL_MAP_WRITE_BIT); - result = global_state.CheckAndReportError(user_context, "oglc: MapBufferRange"); - if (result) { - return result; - } - halide_buffer_t buf_copy = *buf; - buf_copy.device = (uint64_t)device_data; - device_copy dev_copy = make_host_to_device_copy(&buf_copy); - - if (buf->type.code == halide_type_int) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_uint) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. 
- dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_float) { - if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit floating-point."; - return halide_error_code_generic_error; - } - } - global_state.UnmapBuffer(GL_ARRAY_BUFFER); - - debug(user_context) << " copied " << ((unsigned)size) << " bytes from " << buf->host << " to the device.\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for copy to dev\n"; -#endif - return halide_error_code_success; -} - -// Copy image data from texture back to host memory. -WEAK int halide_openglcompute_copy_to_host(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_copy_to_host)."; - return halide_error_code_generic_error; - } - - GLuint the_buffer = (GLuint)buf->device; - size_t size = buf->size_in_bytes(); - halide_abort_if_false(user_context, size != 0); - - debug(user_context) << "OGLC: halide_openglcompute_copy_to_host (" - << "user_context: " << user_context - << ", buf: " << buf - << ", the_buffer:" << the_buffer - << ", size=" << (unsigned)size << ")\n"; - - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - auto result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - void *device_data = global_state.MapBufferRange(GL_ARRAY_BUFFER, - 0, - size, - GL_MAP_READ_BIT); - result = global_state.CheckAndReportError(user_context, "oglc: MapBufferRange"); - if (result) { - return result; - } - - halide_buffer_t buf_copy = *buf; - buf_copy.device = (uint64_t)device_data; - device_copy dev_copy = make_device_to_host_copy(&buf_copy); - - if (buf->type.code == halide_type_int) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_uint) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. 
- // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_float) { - if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit floating-point."; - return halide_error_code_generic_error; - } - } - - global_state.UnmapBuffer(GL_ARRAY_BUFFER); - - debug(user_context) << " copied " << (unsigned)size << " bytes to the host.\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for copy to host\n"; -#endif - - return halide_error_code_success; -} - -} // namespace OpenGLCompute -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -using namespace Halide::Runtime::Internal::OpenGLCompute; - -// Create wrappers that satisfy old naming conventions - -extern "C" { - -WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, - const char *entry_name, int blocksX, int blocksY, - int blocksZ, int threadsX, int threadsY, int threadsZ, - int shared_mem_bytes, halide_type_t arg_types[], void *args[], - int8_t arg_is_buffer[]) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) - << "OpenGLCompute: halide_openglcompute_run (user_context: " << user_context << ", " - << "entry: " << entry_name << ", " - << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " - << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " - << "shmem: " << shared_mem_bytes << "\n"; - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_run)."; - return halide_error_code_generic_error; - } - - ModuleState *mod = (ModuleState *)state_ptr; - if (!mod) { - error(user_context) << "Internal error: module state is nullptr."; - return halide_error_code_generic_error; - } - - KernelInfo *kernel = find_kernel_by_name(entry_name, mod); - if (!kernel) { - error(user_context) << "Internal error: unknown kernel named '" << entry_name << "'"; - return halide_error_code_generic_error; - } - - global_state.UseProgram(kernel->program_id); - auto result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run UseProgram"); - if (result) { - return result; - } - - // Populate uniforms with values passed in arguments. - // Order of the passed arguments matches what was generated for this kernel. - int i = 0; - while (arg_types[i].bits != 0) { - debug(user_context) << " args " << i - << " " << arg_types[i] - << " [" << (*((void **)args[i])) << " ...] 
" - << arg_is_buffer[i] << "\n"; - if (arg_is_buffer[i] == 0) { - if (arg_types[i].code == halide_type_int) { - int value; - if (arg_types[i].bits == 8) { - value = *((int8_t *)args[i]); - } else if (arg_types[i].bits == 16) { - value = *((int16_t *)args[i]); - } else if (arg_types[i].bits == 32) { - value = *((int32_t *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1i(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1i"); - if (result) { - return result; - } - } else if (arg_types[i].code == halide_type_uint) { - unsigned value; - if (arg_types[i].bits == 8 || - arg_types[i].bits == 1) { - value = *((uint8_t *)args[i]); - } else if (arg_types[i].bits == 16) { - value = *((uint16_t *)args[i]); - } else if (arg_types[i].bits == 32) { - value = *((uint32_t *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1ui(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1ui"); - if (result) { - return result; - } - } else if (arg_types[i].code == halide_type_float) { - float value; - if (arg_types[i].bits == 32) { - value = *((float *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1f(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1f"); - if (result) { - return result; - } - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - } else { - uint64_t arg_value = ((halide_buffer_t *)args[i])->device; - - GLuint the_buffer = (GLuint)arg_value; - global_state.BindBufferBase(GL_SHADER_STORAGE_BUFFER, i, the_buffer); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run BindBufferBase"); - if (result) { - return result; - } - } - i++; - } - global_state.DispatchCompute(blocksX, blocksY, blocksZ); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run DispatchCompute"); - if (result) { - return result; - } - - global_state.MemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run MemoryBarrier"); - if (result) { - return result; - } - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for run\n"; -#endif - - return halide_error_code_success; -} - -WEAK int halide_openglcompute_device_sync(void *user_context, halide_buffer_t *) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_device_sync)."; - return halide_error_code_generic_error; - } - global_state.Finish(); -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for sync\n"; -#endif - return halide_error_code_success; -} - -namespace { -WEAK char *get_kernel_name(const char *start, const char *end) { - const size_t 
kernel_name_length = end - start; - char *kernel_name = (char *)malloc(kernel_name_length + 1); - memcpy(kernel_name, start, kernel_name_length); - kernel_name[kernel_name_length] = '\0'; - return kernel_name; -} -} // namespace - -// Called at the beginning of a code block generated by Halide. This function -// is responsible for setting up the OpenGL environment and compiling the GLSL -// code into a compute shader. -WEAK int halide_openglcompute_initialize_kernels(void *user_context, void **state_ptr, - const char *src, int size) { -#ifdef DEBUG_RUNTIME - halide_start_clock(user_context); - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - auto result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - - ModuleState **state = (ModuleState **)state_ptr; - ModuleState *module = *state; - if (!module) { - module = (ModuleState *)malloc(sizeof(ModuleState)); - module->kernel = nullptr; - module->next = state_list; - state_list = module; - *state = module; - } - - if (module->kernel) { - return halide_error_code_success; - } - - const char *END_OF_KERNEL_MARKER = "\n// end of kernel "; - const size_t END_OF_KERNEL_MARKER_LENGTH = strlen(END_OF_KERNEL_MARKER); - - while (true) { - const char *end_of_kernel_marker = strstr(src, END_OF_KERNEL_MARKER); - if (!end_of_kernel_marker) { - break; // end of kernels sources is reached - } - - const char *just_before_kernel_name = end_of_kernel_marker + END_OF_KERNEL_MARKER_LENGTH; - const char *just_beyond_kernel_name = strstr(just_before_kernel_name, "\n"); - if (!just_beyond_kernel_name) { - error(user_context) << "Failed to find kernel name."; - return halide_error_code_generic_error; - } - - char *kernel_name = get_kernel_name(just_before_kernel_name, just_beyond_kernel_name); - - size_t src_len = just_beyond_kernel_name - src; - - KernelInfo *kernel = (KernelInfo *)malloc(sizeof(KernelInfo)); - kernel->kernel_name = kernel_name; - kernel->next = module->kernel; - module->kernel = kernel; - - GLuint shader = global_state.CreateShader(GL_COMPUTE_SHADER); - result = global_state.CheckAndReportError(user_context, "create shader"); - if (result) { - return result; - } - const GLchar *sources = {src}; - const GLint sources_lengths = {(GLint)src_len}; - -#ifdef DEBUG_RUNTIME - print(user_context) << "Compute shader source for: " << kernel_name; - halide_print(user_context, src); -#endif - - global_state.ShaderSource(shader, 1, &sources, &sources_lengths); - result = global_state.CheckAndReportError(user_context, "shader source"); - if (result) { - return result; - } - global_state.CompileShader(shader); - result = global_state.CheckAndReportError(user_context, "compile shader"); - if (result) { - return result; - } - - GLint shader_ok = 0; - global_state.GetShaderiv(shader, GL_COMPILE_STATUS, &shader_ok); - if (shader_ok != GL_TRUE) { - debug(user_context) << "Could not compile shader:\n"; - GLint log_len; - global_state.GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_len); - HalideMalloc log_tmp(user_context, log_len); - if (log_tmp.ptr) { - char *log = (char *)log_tmp.ptr; - global_state.GetShaderInfoLog(shader, log_len, nullptr, log); - debug(user_context) << log << "\n"; - } - global_state.DeleteShader(shader); - error(user_context) << "Could not compile shader."; - return halide_error_code_generic_error; - } - - // Link GLSL program - GLuint program = global_state.CreateProgram(); - global_state.AttachShader(program, shader); - result = global_state.CheckAndReportError(user_context, "attach shader"); - 
if (result) { - return result; - } - global_state.LinkProgram(program); - result = global_state.CheckAndReportError(user_context, "link program"); - if (result) { - return result; - } - - // Release the individual shaders - global_state.DeleteShader(shader); - - GLint status; - global_state.GetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - GLint log_len; - global_state.GetProgramiv(program, GL_INFO_LOG_LENGTH, &log_len); - HalideMalloc log_tmp(user_context, log_len); - if (log_tmp.ptr) { - char *log = (char *)log_tmp.ptr; - global_state.GetProgramInfoLog(program, log_len, nullptr, log); - debug(user_context) << "Could not link GLSL program:\n" - << log << "\n"; - } - global_state.DeleteProgram(program); - error(user_context) << "Could not link GLSL program."; - return halide_error_code_generic_error; - } - kernel->program_id = program; - -#ifdef DEBUG_RUNTIME - GLint i; - GLint count; - - GLint size; // size of the variable - GLenum type; // type of the variable (float, vec3 or mat4, etc) - - const GLsizei bufSize = 64; // maximum name length - GLchar name[bufSize]; // variable name in GLSL - GLsizei length; // name length - - global_state.GetProgramiv(program, GL_ACTIVE_UNIFORMS, &count); - debug(user_context) << "Active Uniforms: " << count << "\n"; - - for (i = 0; i < count; i++) { - global_state.GetActiveUniform(program, (GLuint)i, bufSize, &length, &size, &type, name); - GLint loc = global_state.GetUniformLocation(program, name); - debug(user_context) << "Uniform " << i << " Type: " << type << " Name: " << name << " location: " << loc << "\n"; - } -#endif - src += src_len; // moving on to the next kernel - } -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms\n"; -#endif - - return halide_error_code_success; -} - -WEAK void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr) { -} - -WEAK int halide_openglcompute_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { - return halide_default_device_and_host_malloc(user_context, buf, &openglcompute_device_interface); -} - -WEAK int halide_openglcompute_device_and_host_free(void *user_context, struct halide_buffer_t *buf) { - return halide_default_device_and_host_free(user_context, buf, &openglcompute_device_interface); -} - -WEAK const struct halide_device_interface_t *halide_openglcompute_device_interface() { - return &openglcompute_device_interface; -} - -} // extern "C" - -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGLCompute { - -WEAK halide_device_interface_impl_t openglcompute_device_interface_impl = { - halide_use_jit_module, - halide_release_jit_module, - halide_openglcompute_device_malloc, - halide_openglcompute_device_free, - halide_openglcompute_device_sync, - halide_openglcompute_device_release, - halide_openglcompute_copy_to_host, - halide_openglcompute_copy_to_device, - halide_openglcompute_device_and_host_malloc, - halide_openglcompute_device_and_host_free, - halide_default_buffer_copy, - halide_default_device_crop, - halide_default_device_slice, - halide_default_device_release_crop, - halide_default_device_wrap_native, - halide_default_device_detach_native, -}; - -WEAK halide_device_interface_t openglcompute_device_interface = { - halide_device_malloc, - halide_device_free, - halide_device_sync, - halide_device_release, - halide_copy_to_host, - halide_copy_to_device, - halide_device_and_host_malloc, - 
halide_device_and_host_free, - halide_buffer_copy, - halide_device_crop, - halide_device_slice, - halide_device_release_crop, - halide_device_wrap_native, - halide_device_detach_native, - nullptr, - &openglcompute_device_interface_impl}; - -} // namespace OpenGLCompute -} // namespace Internal -} // namespace Runtime -} // namespace Halide diff --git a/src/runtime/osx_opengl_context.cpp b/src/runtime/osx_opengl_context.cpp deleted file mode 100644 index 734d94b039ab..000000000000 --- a/src/runtime/osx_opengl_context.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" -#include "scoped_mutex_lock.h" - -#define USE_AGL 0 -#if USE_AGL -extern "C" void *aglChoosePixelFormat(void *, int, const int *); -extern "C" void *aglCreateContext(void *, void *); -extern "C" int aglGetError(); -extern "C" void aglDestroyPixelFormat(void *); -extern "C" unsigned char aglSetCurrentContext(void *); -#endif - -#if !USE_AGL -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGL { - -WEAK halide_mutex cgl_functions_mutex; -WEAK bool cgl_initialized = false; -WEAK int (*CGLChoosePixelFormat)(int *attributes, void **pixel_format_result, int *num_formats); -WEAK int (*CGLCreateContext)(void *pixel_format, void *share_context, void **context_Result); -WEAK int (*CGLDestroyPixelFormat)(void *); -WEAK int (*CGLSetCurrentContext)(void *); - -} // namespace OpenGL -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -using namespace Halide::Runtime::Internal::OpenGL; -#endif - -extern "C" { - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - static void *dylib = nullptr; - if (!dylib) { - dylib = halide_load_library( - "/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL"); - if (!dylib) { - return nullptr; - } - } - return halide_get_library_symbol(dylib, name); -} - -// Initialize OpenGL -WEAK int halide_opengl_create_context(void *user_context) { -#if USE_AGL - void *ctx = nullptr; - - int attrib[] = {4 /* AGL_RGBA */, 0 /* Sentinel */}; - void *pf = aglChoosePixelFormat(nullptr, 0, attrib); - if (!pf) { - error(user_context) << "Could not create pixel format."; - return halide_error_code_generic_error; - } - ctx = aglCreateContext(pf, nullptr); - if (!ctx || aglGetError()) { - error(user_context) << "Could not create context."; - return halide_error_code_generic_error; - } - aglDestroyPixelFormat(pf); - if (!aglSetCurrentContext(ctx)) { - error(user_context) << "Could not activate OpenGL context."; - return halide_error_code_generic_error; - } -#else - { // locking scope - ScopedMutexLock lock(&cgl_functions_mutex); - - if (!cgl_initialized) { - if ((CGLChoosePixelFormat = - (int (*)(int *, void **, int *))halide_opengl_get_proc_address(user_context, "CGLChoosePixelFormat")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLCreateContext = - (int (*)(void *, void *, void **))halide_opengl_get_proc_address(user_context, "CGLCreateContext")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLDestroyPixelFormat = - (int (*)(void *))halide_opengl_get_proc_address(user_context, "CGLDestroyPixelFormat")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLSetCurrentContext = - (int (*)(void *))halide_opengl_get_proc_address(user_context, "CGLSetCurrentContext")) == nullptr) { - return halide_error_code_generic_error; - } - } - cgl_initialized = true; - } - - void *ctx = nullptr; - int attribs[] = { - /* 5 kCGLPFADoubleBuffer */ - 72, 
// kCGLPFANoRecovery - 96, // kCGLPFAAllowOfflineRenderers - 99, // kCGLPFAOpenGLProfile - 0x1000, // kCGLOGLPVersion_Legacy -- 0x3200 is kCGLOGLPVersion_3_2_Core -- kCGLOGLPVersion_GL4_Core is 0x4100 - 0 // sentinel ending list - }; - - void *fmt; - int numFormats = 0; - if (CGLChoosePixelFormat(attribs, &fmt, &numFormats) != 0) { - return halide_error_code_generic_error; - } - if (CGLCreateContext(fmt, nullptr, &ctx) != 0) { - CGLDestroyPixelFormat(fmt); - return halide_error_code_generic_error; - } - CGLSetCurrentContext(ctx); -#endif - return halide_error_code_success; -} -} diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index 5c64391b6259..a8651ae081a6 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -7,7 +7,6 @@ #include "HalideRuntimeHexagonHost.h" #include "HalideRuntimeMetal.h" #include "HalideRuntimeOpenCL.h" -#include "HalideRuntimeOpenGLCompute.h" #include "HalideRuntimeQurt.h" #include "HalideRuntimeVulkan.h" #include "HalideRuntimeWebGPU.h" @@ -160,12 +159,6 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_opencl_set_device_type, (void *)&halide_opencl_set_platform_name, (void *)&halide_opencl_wrap_cl_mem, - (void *)&halide_opengl_create_context, - (void *)&halide_opengl_get_proc_address, - (void *)&halide_openglcompute_device_interface, - (void *)&halide_openglcompute_initialize_kernels, - (void *)&halide_openglcompute_finalize_kernels, - (void *)&halide_openglcompute_run, (void *)&halide_pointer_to_string, (void *)&halide_print, (void *)&halide_profiler_get_pipeline_state, diff --git a/test/correctness/async_copy_chain.cpp b/test/correctness/async_copy_chain.cpp index 45b014c4bd8b..ae8623d446bb 100644 --- a/test/correctness/async_copy_chain.cpp +++ b/test/correctness/async_copy_chain.cpp @@ -25,12 +25,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - // Make a list of extern pipeline stages (just copies) all async // and connected by double buffers, then try various nestings of // them. This is a stress test of the async extern storage folding diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index 6e579c77b65f..3fc73d1c6139 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -22,12 +22,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - // Compute frames on GPU/CPU, and then sum then on // CPU/GPU. async() lets us overlap the CPU computation with the // copies. 
diff --git a/test/correctness/boundary_conditions.cpp b/test/correctness/boundary_conditions.cpp index def2d410226b..61422d130d01 100644 --- a/test/correctness/boundary_conditions.cpp +++ b/test/correctness/boundary_conditions.cpp @@ -20,12 +20,7 @@ bool expect_eq(T actual, T expected) { void schedule_test(Func f, int vector_width, Partition partition_policy, const Target &t) { if (vector_width != 1) { - if (t.has_feature(Target::OpenGLCompute)) { - // Vector stores not yet supported in OpenGLCompute backend - f.unroll(x, vector_width); - } else { - f.vectorize(x, vector_width); - } + f.vectorize(x, vector_width); } f.partition(x, partition_policy); f.partition(y, partition_policy); @@ -388,7 +383,6 @@ int main(int argc, char **argv) { int vector_width_max = 32; if (target.has_feature(Target::Metal) || target.has_feature(Target::Vulkan) || - target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) || target.has_feature(Target::WebGPU)) { // https://github.com/halide/Halide/issues/2148 diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index 9179ac83cd24..31ff92b4ae85 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -32,11 +32,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test copy to device.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_crop.cpp b/test/correctness/device_crop.cpp index ee4b900bc1f3..44fa0a4b2bde 100644 --- a/test/correctness/device_crop.cpp +++ b/test/correctness/device_crop.cpp @@ -30,11 +30,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place cropping.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_slice.cpp b/test/correctness/device_slice.cpp index 0b9e3ca5bbcb..3bebc6bbb541 100644 --- a/test/correctness/device_slice.cpp +++ b/test/correctness/device_slice.cpp @@ -32,11 +32,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place slicing.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/dynamic_allocation_in_gpu_kernel.cpp b/test/correctness/dynamic_allocation_in_gpu_kernel.cpp index 242b0e94ba06..9ba586a04a7d 100644 --- a/test/correctness/dynamic_allocation_in_gpu_kernel.cpp +++ b/test/correctness/dynamic_allocation_in_gpu_kernel.cpp @@ -4,7 +4,7 @@ using namespace Halide; int main(int argc, char **argv) { Target t(get_jit_target_from_environment()); - if (!t.has_gpu_feature() && !t.has_feature(Target::OpenGLCompute)) { + if (!t.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } @@ -23,7 +23,7 @@ int main(int argc, char **argv) { // All of the f's have a dynamic size required (it depends on p), // so we'll store them in global memory ("Heap"). On cuda we get - // one big heap allocation. On openglcompute/d3d we should get one + // one big heap allocation. 
On d3d we should get one // allocation per coalesced group, and groups can only be // coalesced if the types match, so we get an allocation for // [f1,f3,f6], another for [f2,f4], and a third for f5. diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index cbb864bd6409..51b03e73f82c 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -140,21 +140,16 @@ int main(int argc, char **argv) { // Now run all at the same time to check for concurrency issues. - // FIXME: Skipping OpenGLCompute, which has concurrency - // issues. Probably due to using the GL context on the wrong - // thread. - if (!target.has_feature(Target::OpenGLCompute)) { - Halide::Tools::ThreadPool pool(1); - std::vector> futures; - futures.emplace_back(pool.async(test1, true)); - futures.emplace_back(pool.async(test1, true)); - futures.emplace_back(pool.async(test2, true)); - futures.emplace_back(pool.async(test2, true)); - futures.emplace_back(pool.async(test3, true)); - futures.emplace_back(pool.async(test3, true)); - for (auto &f : futures) { - f.get(); - } + Halide::Tools::ThreadPool pool(1); + std::vector> futures; + futures.emplace_back(pool.async(test1, true)); + futures.emplace_back(pool.async(test1, true)); + futures.emplace_back(pool.async(test2, true)); + futures.emplace_back(pool.async(test2, true)); + futures.emplace_back(pool.async(test3, true)); + futures.emplace_back(pool.async(test3, true)); + for (auto &f : futures) { + f.get(); } // Vulkan will OOM unless allocation cache is used ... skip this since we just ran the same tests above concurrently diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index a956c6831afc..f98636ea8905 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -10,11 +10,6 @@ int main(int argc, char **argv) { return 0; } - if (t.has_feature(Target::OpenGLCompute)) { - printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n"); - return 0; - } - if (t.has_feature(Target::Vulkan)) { const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan); assert(interface->compute_capability != nullptr); diff --git a/test/correctness/gpu_jit_explicit_copy_to_device.cpp b/test/correctness/gpu_jit_explicit_copy_to_device.cpp index bfa57b40d80d..2b234e7f9d06 100644 --- a/test/correctness/gpu_jit_explicit_copy_to_device.cpp +++ b/test/correctness/gpu_jit_explicit_copy_to_device.cpp @@ -5,7 +5,7 @@ using namespace Halide; int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); - if (!target.has_gpu_feature() && !target.has_feature(Target::OpenGLCompute)) { + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } diff --git a/test/correctness/gpu_large_alloc.cpp b/test/correctness/gpu_large_alloc.cpp index 6800cf12248a..da3022172a60 100644 --- a/test/correctness/gpu_large_alloc.cpp +++ b/test/correctness/gpu_large_alloc.cpp @@ -21,7 +21,7 @@ int main(int argc, char **argv) { g(x, y) = clamp(f(x, y), 20, 100); Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { + if (target.has_gpu_feature()) { Var xi, yi; f.compute_root().gpu_tile(x, y, xi, yi, 16, 16); g.compute_root().gpu_tile(x, y, xi, yi, 16, 16); diff --git a/test/correctness/gpu_mixed_dimensionality.cpp b/test/correctness/gpu_mixed_dimensionality.cpp index 
b3decba3ee76..f76eb15efe80 100644 --- a/test/correctness/gpu_mixed_dimensionality.cpp +++ b/test/correctness/gpu_mixed_dimensionality.cpp @@ -5,7 +5,7 @@ using namespace Halide; int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); - if (!target.has_gpu_feature() && !target.has_feature(Target::OpenGLCompute)) { + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } diff --git a/test/correctness/gpu_multi_device.cpp b/test/correctness/gpu_multi_device.cpp index ea9be0e4eb82..ad1b0223f551 100644 --- a/test/correctness/gpu_multi_device.cpp +++ b/test/correctness/gpu_multi_device.cpp @@ -39,14 +39,6 @@ struct MultiDevicePipeline { .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::Metal); current_stage++; } - if (jit_target.has_feature(Target::OpenGLCompute)) { - stage[current_stage](x, y, c) = stage[current_stage - 1](x, y, c) + 69; - stage[current_stage] - .compute_root() - .reorder(c, x, y) - .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenGLCompute); - current_stage++; - } } void run(Buffer &result) { diff --git a/test/correctness/gpu_multi_kernel.cpp b/test/correctness/gpu_multi_kernel.cpp index 722c720c78c9..66e21b6896e5 100644 --- a/test/correctness/gpu_multi_kernel.cpp +++ b/test/correctness/gpu_multi_kernel.cpp @@ -16,7 +16,7 @@ int main(int argc, char *argv[]) { kernel3(x) = cast(round(x + kernel2(x))); Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { + if (target.has_gpu_feature()) { kernel1.gpu_tile(x, xi, 32).compute_root(); kernel2.gpu_tile(x, xi, 32).compute_root(); kernel3.gpu_tile(x, xi, 32); diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index 422775ac2021..37e932d78273 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -189,9 +189,7 @@ int main(int argc, char **argv) { } printf("Running dynamic shared test\n"); - if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { - printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); - } else if (t.has_feature(Target::Vulkan) && ((t.os == Target::IOS) || t.os == Target::OSX)) { + if (t.has_feature(Target::Vulkan) && ((t.os == Target::IOS) || t.os == Target::OSX)) { printf("Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamic sizes for shared memory)!\n"); } else { if (dynamic_shared_test(memory_type) != 0) { diff --git a/test/correctness/logical.cpp b/test/correctness/logical.cpp index 50ef8df9421f..1bd134bc37f4 100644 --- a/test/correctness/logical.cpp +++ b/test/correctness/logical.cpp @@ -31,9 +31,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -67,9 +65,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -101,9 +97,7 @@ int main(int argc, char **argv) { if (target.has_gpu_feature()) { 
f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -133,9 +127,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -193,9 +185,7 @@ int main(int argc, char **argv) { } if (target.has_gpu_feature()) { gpu.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - gpu.vectorize(xi, 4); - } + gpu.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { gpu.hexagon().vectorize(x, 128); } else { diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp index 618a30ea104a..e45b1876918d 100644 --- a/test/correctness/math.cpp +++ b/test/correctness/math.cpp @@ -50,7 +50,6 @@ bool relatively_equal(value_t a, value_t b, Target target) { // For HLSL, try again with a lower error threshold, as it might be using // fast but approximated trigonometric functions: if (target.supports_device_api(DeviceAPI::D3D12Compute) || - target.supports_device_api(DeviceAPI::OpenGLCompute) || target.supports_device_api(DeviceAPI::WebGPU)) { // this threshold value has been empirically determined since there // is no clear documentation on the precision of these algorithms @@ -299,12 +298,7 @@ int main(int argc, char **argv) { call_1_float_types(ceil, 256, -25, 25); call_1_float_types(trunc, 256, -25, 25); - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - // GLSL isn't required to support NaN, so keep things real - call_2_float_types(pow, 256, 0.0, 10.0, -4.0f, 4.0f); - } else { - call_2_float_types(pow, 256, -10.0, 10.0, -4.0f, 4.0f); - } + call_2_float_types(pow, 256, -10.0, 10.0, -4.0f, 4.0f); const int8_t int8_min = std::numeric_limits::min(); const int16_t int16_min = std::numeric_limits::min(); diff --git a/test/correctness/mul_div_mod.cpp b/test/correctness/mul_div_mod.cpp index f4f41c8fc9f8..8eca8141bba2 100644 --- a/test/correctness/mul_div_mod.cpp +++ b/test/correctness/mul_div_mod.cpp @@ -556,8 +556,6 @@ int main(int argc, char **argv) { for (int i = 2; i <= 4; i *= 2) { vector_widths.push_back(i); } - } else if (target.has_feature(Target::OpenGLCompute)) { - // Vector load/store unimplemented } else if (target.has_feature(Target::HVX)) { vector_widths.push_back(128); } else { diff --git a/test/correctness/newtons_method.cpp b/test/correctness/newtons_method.cpp index bdd8652b28a9..fa1d4744eedd 100644 --- a/test/correctness/newtons_method.cpp +++ b/test/correctness/newtons_method.cpp @@ -59,9 +59,8 @@ int find_pi() { T secant_result = evaluate_may_gpu(g()[0]); - // Trig in vulkan/openglcompute/d3d12 is approximate + // Trig in vulkan/d3d12 is approximate float tolerance = target.has_feature(Target::Vulkan) || - target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) ? 
1e-5f : 1e-20f; diff --git a/test/correctness/parallel_gpu_nested.cpp b/test/correctness/parallel_gpu_nested.cpp index a7e604b8435a..53ddcc768e3a 100644 --- a/test/correctness/parallel_gpu_nested.cpp +++ b/test/correctness/parallel_gpu_nested.cpp @@ -14,7 +14,7 @@ int main(int argc, char **argv) { f(x, y, z) = x * y + z * k + 1; Target t = get_jit_target_from_environment(); - if (t.has_gpu_feature() && !t.has_feature(Target::OpenGLCompute)) { + if (t.has_gpu_feature()) { Var xi, yi; f.gpu_tile(x, y, xi, yi, 16, 16); } else if (t.has_feature(Target::HVX)) { diff --git a/test/correctness/plain_c_includes.c b/test/correctness/plain_c_includes.c index 65a436014cbd..0caadc695f9a 100644 --- a/test/correctness/plain_c_includes.c +++ b/test/correctness/plain_c_includes.c @@ -10,7 +10,6 @@ #include "HalideRuntimeHexagonHost.h" #include "HalideRuntimeMetal.h" #include "HalideRuntimeOpenCL.h" -#include "HalideRuntimeOpenGLCompute.h" #include "HalideRuntimeQurt.h" int main(int argc, char **argv) { diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 8fc03b589a73..7c8fcbe4d15f 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -52,9 +52,9 @@ int main(int argc, char **argv) { // Full specification round-trip, crazy features t1 = Target(Target::Android, Target::ARM, 32, {Target::JIT, Target::CUDA, Target::OpenCL, - Target::OpenGLCompute, Target::Debug}); + Target::Debug}); ts = t1.to_string(); - if (ts != "arm-32-android-cuda-debug-jit-opencl-openglcompute") { + if (ts != "arm-32-android-cuda-debug-jit-opencl") { printf("to_string failure: %s\n", ts.c_str()); return 1; } diff --git a/test/correctness/vectorized_gpu_allocation.cpp b/test/correctness/vectorized_gpu_allocation.cpp index 9435509c2c6c..2a157cc93ada 100644 --- a/test/correctness/vectorized_gpu_allocation.cpp +++ b/test/correctness/vectorized_gpu_allocation.cpp @@ -11,12 +11,6 @@ int main(int argc, char **argv) { return 0; } - if (t.has_feature(Target::OpenGLCompute)) { - printf("[SKIP] No support for vector loads and stores in OpenGLCompute yet\n"); - // https://github.com/halide/Halide/issues/4979 - return 0; - } - // Fill input buffer. Buffer input(2, 2, 3); Buffer output(2, 2, 3); From 9c3615b07285d263dd0e617b61acb793e49f2c7d Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 11 Feb 2024 10:41:01 -0800 Subject: [PATCH 056/186] Add checks to prevent people from using negative split factors (#8076) * Add checks to prevent people from using negative split factors Our analysis passes assume that loop maxes are greater than loop mins, so negative split factors cause sufficient havoc that not even output bounds queries are safe. These are therefore checked on pipeline entry. This is a new way for output bounds queries to throw errors (in addition to the buffer pointers themselves being null, and maybe some buffer constraints). Testing this, I realized these errors were getting thrown twice, because the output buffer bounds query in Pipeline::realize was built around two recursive calls to realize, and both were calling the custom error handler. In addition to reporting errors in this class twice, this implies several other inefficiencies, e.g. jit call args were being prepped twice. I reworked it to be built around two calls to call_jit_code instead. 
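Roughly, the kind of pipeline this now rejects cleanly looks like the sketch below (illustrative only, modeled on the new negative_split_factors test added in this patch; the handler and variable names are placeholders, not part of the change itself):

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    // Swallow the error so the process can report it instead of aborting.
    void note_error(JITUserContext *, const char *msg) {
        printf("Saw expected error: %s\n", msg);
    }

    int main() {
        Param<int> split_factor;      // split factor only known at runtime
        Func f;
        Var x;
        f(x) = x;
        f.parallel(x, split_factor);  // cannot be validated at compile time
        split_factor.set(-17);        // not strictly positive

        f.jit_handlers().custom_error = note_error;
        f.realize({32});              // now trips the new
                                      // halide_error_split_factor_not_positive
                                      // check on pipeline entry
        return 0;
    }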
Fixes #7938 * Add test to cmakelists * Remove pointless target arg to call_jit_code It has to be the same as the cached target in the receiving object anyway --- Makefile | 2 + src/AddSplitFactorChecks.cpp | 68 +++++++++++++++++++++ src/AddSplitFactorChecks.h | 25 ++++++++ src/CMakeLists.txt | 6 +- src/Callable.cpp | 2 +- src/JITModule.cpp | 4 +- src/JITModule.h | 2 +- src/Lower.cpp | 5 ++ src/Pipeline.cpp | 62 +++++++++++++------ src/Pipeline.h | 3 +- src/runtime/HalideRuntime.h | 6 ++ src/runtime/errors.cpp | 9 +++ src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/negative_split_factors.cpp | 40 ++++++++++++ 15 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 src/AddSplitFactorChecks.cpp create mode 100644 src/AddSplitFactorChecks.h create mode 100644 test/correctness/negative_split_factors.cpp diff --git a/Makefile b/Makefile index e1457ea161e2..b73b1632a0eb 100644 --- a/Makefile +++ b/Makefile @@ -442,6 +442,7 @@ SOURCE_FILES = \ AddAtomicMutex.cpp \ AddImageChecks.cpp \ AddParameterChecks.cpp \ + AddSplitFactorChecks.cpp \ AlignLoads.cpp \ AllocationBoundsInference.cpp \ ApplySplit.cpp \ @@ -637,6 +638,7 @@ HEADER_FILES = \ AddAtomicMutex.h \ AddImageChecks.h \ AddParameterChecks.h \ + AddSplitFactorChecks.h \ AlignLoads.h \ AllocationBoundsInference.h \ ApplySplit.h \ diff --git a/src/AddSplitFactorChecks.cpp b/src/AddSplitFactorChecks.cpp new file mode 100644 index 000000000000..74ec033ebb4f --- /dev/null +++ b/src/AddSplitFactorChecks.cpp @@ -0,0 +1,68 @@ +#include "AddSplitFactorChecks.h" +#include "Definition.h" +#include "Function.h" +#include "IR.h" +#include "IROperator.h" +#include "Simplify.h" + +namespace Halide { +namespace Internal { + +namespace { + +void check_all_split_factors(const Function &f, const Definition &def, std::vector *stmts) { + const StageSchedule &sched = def.schedule(); + for (const Split &split : sched.splits()) { + if (split.split_type != Split::SplitVar) { + continue; + } + if (is_positive_const(split.factor)) { + // Common-case optimization + continue; + } + Expr positive = simplify(split.factor > 0); + if (is_const_one(positive)) { + // We statically proved it + continue; + } + // We need a runtime check that says: if the condition is + // entered, the split factor will be positive. We can still + // assume the pipeline preconditions, because they will be + // checked before this. 
+ std::ostringstream factor_str; + factor_str << split.factor; + Expr error = Call::make(Int(32), "halide_error_split_factor_not_positive", + {f.name(), + split_string(split.old_var, ".").back(), + split_string(split.outer, ".").back(), + split_string(split.inner, ".").back(), + factor_str.str(), split.factor}, + Call::Extern); + stmts->push_back(AssertStmt::make(positive, error)); + } + + for (const auto &s : def.specializations()) { + check_all_split_factors(f, s.definition, stmts); + } +} + +} // namespace + +Stmt add_split_factor_checks(const Stmt &s, const std::map &env) { + // Check split factors are strictly positive + std::vector stmts; + + for (const auto &p : env) { + const Function &f = p.second; + check_all_split_factors(f, f.definition(), &stmts); + for (const auto &u : f.updates()) { + check_all_split_factors(f, u, &stmts); + } + } + + stmts.push_back(s); + return Block::make(stmts); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/AddSplitFactorChecks.h b/src/AddSplitFactorChecks.h new file mode 100644 index 000000000000..8db610043808 --- /dev/null +++ b/src/AddSplitFactorChecks.h @@ -0,0 +1,25 @@ +#ifndef HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H +#define HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H + +/** \file + * + * Defines the lowering pass that adds the assertions that all split factors are + * strictly positive. + */ +#include + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +class Function; + +/** Insert checks that all split factors that depend on scalar parameters are + * strictly positive. */ +Stmt add_split_factor_checks(const Stmt &s, const std::map &env); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77453fbce0a9..cca681661c35 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ set(HEADER_FILES AddAtomicMutex.h AddImageChecks.h AddParameterChecks.h + AddSplitFactorChecks.h AlignLoads.h AllocationBoundsInference.h ApplySplit.h @@ -22,7 +23,7 @@ set(HEADER_FILES Bounds.h BoundsInference.h BoundConstantExtentLoops.h - BoundSmallAllocations.h + BoundSmallAllocations.h Buffer.h Callable.h CanonicalizeGPUVars.h @@ -178,6 +179,7 @@ set(SOURCE_FILES AddAtomicMutex.cpp AddImageChecks.cpp AddParameterChecks.cpp + AddSplitFactorChecks.cpp AlignLoads.cpp AllocationBoundsInference.cpp ApplySplit.cpp @@ -546,7 +548,7 @@ set_target_properties(Halide PROPERTIES # Note that we (deliberately) redeclare these versions here, even though the macros # with identical versions are expected to be defined in source; this allows us to # ensure that the versions defined between all build systems are identical. -target_compile_definitions(Halide PUBLIC +target_compile_definitions(Halide PUBLIC HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) diff --git a/src/Callable.cpp b/src/Callable.cpp index 905155e52254..95a34ed455b1 100644 --- a/src/Callable.cpp +++ b/src/Callable.cpp @@ -192,7 +192,7 @@ Callable::FailureFn Callable::check_fcci(size_t argc, const FullCallCheckInfo *a JITFuncCallContext jit_call_context(context, contents->saved_jit_handlers); - int exit_status = contents->jit_cache.call_jit_code(contents->jit_cache.jit_target, argv); + int exit_status = contents->jit_cache.call_jit_code(argv); // If we're profiling, report runtimes and reset profiler stats. 
contents->jit_cache.finish_profiling(context); diff --git a/src/JITModule.cpp b/src/JITModule.cpp index ffd8949d4ca1..735f782f67c1 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -1113,7 +1113,7 @@ Target JITCache::get_compiled_jit_target() const { return jit_target; } -int JITCache::call_jit_code(const Target &target, const void *const *args) { +int JITCache::call_jit_code(const void *const *args) { #if defined(__has_feature) #if __has_feature(memory_sanitizer) user_warning << "MSAN does not support JIT compilers of any sort, and will report " @@ -1122,7 +1122,7 @@ int JITCache::call_jit_code(const Target &target, const void *const *args) { "compilation for Halide code."; #endif #endif - if (target.arch == Target::WebAssembly) { + if (get_compiled_jit_target().arch == Target::WebAssembly) { internal_assert(wasm_module.contents.defined()); return wasm_module.run(args); } else { diff --git a/src/JITModule.h b/src/JITModule.h index 467fb82db207..59b4c3a4f9a0 100644 --- a/src/JITModule.h +++ b/src/JITModule.h @@ -300,7 +300,7 @@ struct JITCache { Target get_compiled_jit_target() const; - int call_jit_code(const Target &target, const void *const *args); + int call_jit_code(const void *const *args); void finish_profiling(JITUserContext *context); }; diff --git a/src/Lower.cpp b/src/Lower.cpp index 74af1aeffe28..ba0918831fc8 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -9,6 +9,7 @@ #include "AddAtomicMutex.h" #include "AddImageChecks.h" #include "AddParameterChecks.h" +#include "AddSplitFactorChecks.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" #include "BoundConstantExtentLoops.h" @@ -182,6 +183,10 @@ void lower_impl(const vector &output_funcs, s = bounds_inference(s, outputs, order, fused_groups, env, func_bounds, t); log("Lowering after computation bounds inference:", s); + debug(1) << "Asserting that all split factors are positive...\n"; + s = add_split_factor_checks(s, env); + log("Lowering after asserting that all split factors are positive:", s); + debug(1) << "Removing extern loops...\n"; s = remove_extern_loops(s); log("Lowering after removing extern loops:", s); diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index 536b8994e686..79d1701a2593 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -570,7 +570,18 @@ Target Pipeline::get_compiled_jit_target() const { void Pipeline::compile_jit(const Target &target_arg) { user_assert(defined()) << "Pipeline is undefined\n"; - Target target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); + Target target = target_arg; + + if (target.has_unknowns()) { + // If we've already jit-compiled for a specific target, use that. + target = get_compiled_jit_target(); + if (target.has_unknowns()) { + // Otherwise get the target from the environment + target = get_jit_target_from_environment(); + } + } + + target.set_features({Target::JIT, Target::UserContext}); // If we're re-jitting for the same target, we can just keep the old jit module. 
if (get_compiled_jit_target() == target) { @@ -751,17 +762,37 @@ Realization Pipeline::realize(JITUserContext *context, bufs.emplace_back(t, nullptr, sizes); } } - Realization r(std::move(bufs)); + Realization r{std::move(bufs)}; + + compile_jit(target); + JITUserContext empty_user_context = {}; + if (!context) { + context = &empty_user_context; + } + JITFuncCallContext jit_context(context, jit_handlers()); + JITCallArgs args(contents->inferred_args.size() + r.size()); + RealizationArg arg{r}; + prepare_jit_call_arguments(arg, contents->jit_cache.jit_target, + &context, true, args); + // Do an output bounds query if we can. Otherwise just assume the // output size is good. + int exit_status = 0; if (!target.has_feature(Target::NoBoundsQuery)) { - realize(context, r, target); + exit_status = call_jit_code(args); } - for (size_t i = 0; i < r.size(); i++) { - r[i].allocate(); + if (exit_status == 0) { + // Make the output allocations + for (size_t i = 0; i < r.size(); i++) { + r[i].allocate(); + } + // Do the actual computation + exit_status = call_jit_code(args); } - // Do the actual computation - realize(context, r, target); + + // If we're profiling, report runtimes and reset profiler stats. + contents->jit_cache.finish_profiling(context); + jit_context.finalize(exit_status); // Crop back to the requested size if necessary bool needs_crop = false; @@ -943,8 +974,8 @@ Pipeline::make_externs_jit_module(const Target &target, return result; } -int Pipeline::call_jit_code(const Target &target, const JITCallArgs &args) { - return contents->jit_cache.call_jit_code(target, args.store); +int Pipeline::call_jit_code(const JITCallArgs &args) { + return contents->jit_cache.call_jit_code(args.store); } void Pipeline::realize(RealizationArg outputs, const Target &t) { @@ -959,15 +990,6 @@ void Pipeline::realize(JITUserContext *context, debug(2) << "Realizing Pipeline for " << target << "\n"; - if (target.has_unknowns()) { - // If we've already jit-compiled for a specific target, use that. - target = get_compiled_jit_target(); - if (target.has_unknowns()) { - // Otherwise get the target from the environment - target = get_jit_target_from_environment(); - } - } - // We need to make a context for calling the jitted function to // carry the the set of custom handlers. Here's how handlers get // called when running jitted code: @@ -1041,7 +1063,7 @@ void Pipeline::realize(JITUserContext *context, // exception. debug(2) << "Calling jitted function\n"; - int exit_status = call_jit_code(target, args); + int exit_status = call_jit_code(args); debug(2) << "Back from jitted function. Exit status was " << exit_status << "\n"; // If we're profiling, report runtimes and reset profiler stats. 
@@ -1111,7 +1133,7 @@ void Pipeline::infer_input_bounds(JITUserContext *context, } Internal::debug(2) << "Calling jitted function\n"; - int exit_status = call_jit_code(contents->jit_cache.jit_target, args); + int exit_status = call_jit_code(args); jit_context.finalize(exit_status); Internal::debug(2) << "Back from jitted function\n"; bool changed = false; diff --git a/src/Pipeline.h b/src/Pipeline.h index 19272b2ed68d..37537db04fb7 100644 --- a/src/Pipeline.h +++ b/src/Pipeline.h @@ -149,7 +149,6 @@ class Pipeline { private: Internal::IntrusivePtr contents; - // For the three method below, precisely one of the first two args should be non-null void prepare_jit_call_arguments(RealizationArg &output, const Target &target, JITUserContext **user_context, bool is_bounds_inference, Internal::JITCallArgs &args_result); @@ -160,7 +159,7 @@ class Pipeline { static AutoSchedulerFn find_autoscheduler(const std::string &autoscheduler_name); - int call_jit_code(const Target &target, const Internal::JITCallArgs &args); + int call_jit_code(const Internal::JITCallArgs &args); // Get the value of contents->jit_target, but reality-check that the contents // sensibly match the value. Return Target() if not jitted. diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index d8ae1268fbaf..64034b8be328 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1242,6 +1242,10 @@ enum halide_error_code_t { /** An explicit storage bound provided is too small to store * all the values produced by the function. */ halide_error_code_storage_bound_too_small = -45, + + /** A factor used to split a loop was discovered to be zero or negative at + * runtime. */ + halide_error_code_split_factor_not_positive = -46, }; /** Halide calls the functions below on various error conditions. The @@ -1316,6 +1320,8 @@ extern int halide_error_device_dirty_with_no_device_support(void *user_context, extern int halide_error_storage_bound_too_small(void *user_context, const char *func_name, const char *var_name, int provided_size, int required_size); extern int halide_error_device_crop_failed(void *user_context); +extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor); + // @} /** Optional features a compilation Target can have. diff --git a/src/runtime/errors.cpp b/src/runtime/errors.cpp index 003dde531dfc..0879cc4a7c60 100644 --- a/src/runtime/errors.cpp +++ b/src/runtime/errors.cpp @@ -291,4 +291,13 @@ WEAK int halide_error_device_crop_failed(void *user_context) { return halide_error_code_device_crop_failed; } +WEAK int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor) { + error(user_context) << "In schedule for func " << func_name + << ", the factor used to split the variable " << orig + << " into " << outer << " and " << inner << " is " << factor_str + << ". This evaluated to " << factor << ", which is not strictly positive. 
" + << "Consider using max(" << factor_str << ", 1) instead."; + return halide_error_code_split_factor_not_positive; +} + } // extern "C" diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index a8651ae081a6..db8ada2f4b8e 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -85,6 +85,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_error_param_too_small_u64, (void *)&halide_error_requirement_failed, (void *)&halide_error_specialize_fail, + (void *)&halide_error_split_factor_not_positive, (void *)&halide_error_unaligned_host_ptr, (void *)&halide_error_storage_bound_too_small, (void *)&halide_error_device_crop_failed, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 3b946edda6d9..f77393a21114 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -222,6 +222,7 @@ tests(GROUPS correctness multiple_outputs.cpp mux.cpp narrow_predicates.cpp + negative_split_factors.cpp nested_tail_strategies.cpp newtons_method.cpp non_nesting_extern_bounds_query.cpp diff --git a/test/correctness/negative_split_factors.cpp b/test/correctness/negative_split_factors.cpp new file mode 100644 index 000000000000..bc032022b60f --- /dev/null +++ b/test/correctness/negative_split_factors.cpp @@ -0,0 +1,40 @@ +#include "Halide.h" +#include "halide_test_dirs.h" + +#include +#include + +using namespace Halide; + +bool error_occurred = false; +void my_error_handler(JITUserContext *user_context, const char *msg) { + error_occurred = true; +} + +int main(int argc, char **argv) { + // Trying to realize a Pipeline with a negative or zero split factor should + // error out cleanly, and not for example segfault because the output bounds + // query returned a garbage buffer. + + Param split; + + Func f; + Var x; + + f(x) = x; + f.parallel(x, split); + + split.set(-17); + + f.jit_handlers().custom_error = my_error_handler; + + f.realize({32}); + + if (!error_occurred) { + printf("There was supposed to be an error!\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From ada6345a8a1416ee5a29796f1f0f684df6c5f976 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 12 Feb 2024 10:10:00 -0800 Subject: [PATCH 057/186] Fix rfactor adding too many pure loops (#8086) When you rfactor an update definition, the new update definition must use all the pure vars of the Func, even though the one you're rfactoring may not have used them all. We also want to preserve any scheduling already done to the pure vars, so we want to preserve the dims list and splits list from the original definition. The code accounted for this by checking the dims list for any missing pure vars and adding them at the end (just before Var::outermost()), but this didn't account for the fact that they may no longer exist in the dims list due to splits that didn't reuse the outer name. In these circumstances we could end up with too many pure loops. E.g. if x has been split into xo and xi, then the code was adding a loop for x even though there were already loops for xo and xi, which of course produces garbage output. This PR instead just checks which pure vars are actually used in the update definition up front, and then uses that to tell which ones should be added. 
Fixes #7890 --- src/Func.cpp | 26 +++++++++++++++++++++++--- test/correctness/fuzz_schedule.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/Func.cpp b/src/Func.cpp index 978d2b19a436..7e0995cc33b5 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -788,6 +788,17 @@ Func Stage::rfactor(vector> preserved) { vector &args = definition.args(); vector &values = definition.values(); + // Figure out which pure vars were used in this update definition. + std::set pure_vars_used; + internal_assert(args.size() == dim_vars.size()); + for (size_t i = 0; i < args.size(); i++) { + if (const Internal::Variable *var = args[i].as()) { + if (var->name == dim_vars[i].name()) { + pure_vars_used.insert(var->name); + } + } + } + // Check whether the operator is associative and determine the operator and // its identity for each value in the definition if it is a Tuple const auto &prover_result = prove_associativity(func_name, args, values); @@ -1012,16 +1023,20 @@ Func Stage::rfactor(vector> preserved) { // Determine the dims of the new update definition + // The new update definition needs all the pure vars of the Func, but the + // one we're rfactoring may not have used them all. Add any missing ones to + // the dims list. + // Add pure Vars from the original init definition to the dims list // if they are not already in the list for (const Var &v : dim_vars) { - const auto &iter = std::find_if(dims.begin(), dims.end(), - [&v](const Dim &dim) { return var_name_match(dim.var, v.name()); }); - if (iter == dims.end()) { + if (!pure_vars_used.count(v.name())) { Dim d = {v.name(), ForType::Serial, DeviceAPI::None, DimType::PureVar, Partition::Auto}; + // Insert it just before Var::outermost dims.insert(dims.end() - 1, d); } } + // Then, we need to remove lifted RVars from the dims list for (const string &rv : rvars_removed) { remove(rv); @@ -1888,6 +1903,11 @@ Stage &Stage::reorder(const std::vector &vars) { dims_old.swap(dims); + // We're not allowed to reorder Var::outermost inwards (rfactor assumes it's + // the last one). 
+ user_assert(dims.back().var == Var::outermost().name()) + << "Var::outermost() may not be reordered inside any other var.\n"; + return *this; } diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index a774335a07bf..78fe9e0cb757 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -202,6 +202,31 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/7890 + { + Func input("input"); + Func local_sum("local_sum"); + Func blurry("blurry"); + Var x("x"), y("y"); + RVar yryf; + input(x, y) = 2 * x + 5 * y; + RDom r(-2, 5, -2, 5, "rdom_r"); + local_sum(x, y) = 0; + local_sum(x, y) += input(x + r.x, y + r.y); + blurry(x, y) = cast(local_sum(x, y) / 25); + + Var yo, yi, xo, xi, u; + blurry.split(y, yo, yi, 2, TailStrategy::Auto); + local_sum.split(x, xo, xi, 4, TailStrategy::Auto); + local_sum.update(0).split(x, xo, xi, 1, TailStrategy::Auto); + local_sum.update(0).rfactor(r.x, u); + blurry.store_root(); + local_sum.compute_root(); + Pipeline p({blurry}); + auto buf = p.realize({32, 32}); + check_blur_output(buf, correct); + } + // https://github.com/halide/Halide/issues/8054 { ImageParam input(Float(32), 2, "input"); From d8cfed69531b0e1a29955115e5f3148209d657df Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 13 Feb 2024 13:47:09 -0800 Subject: [PATCH 058/186] Forward the partition methods from generator outputs (#8090) --- src/Generator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Generator.h b/src/Generator.h index 99d106056842..e819bd2a88a8 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -2280,6 +2280,8 @@ class GeneratorOutputBase : public GIOBase { HALIDE_FORWARD_METHOD(Func, align_bounds) HALIDE_FORWARD_METHOD(Func, align_extent) HALIDE_FORWARD_METHOD(Func, align_storage) + HALIDE_FORWARD_METHOD(Func, always_partition) + HALIDE_FORWARD_METHOD(Func, always_partition_all) HALIDE_FORWARD_METHOD_CONST(Func, args) HALIDE_FORWARD_METHOD(Func, bound) HALIDE_FORWARD_METHOD(Func, bound_extent) @@ -2303,9 +2305,12 @@ class GeneratorOutputBase : public GIOBase { HALIDE_FORWARD_METHOD(Func, hexagon) HALIDE_FORWARD_METHOD(Func, in) HALIDE_FORWARD_METHOD(Func, memoize) + HALIDE_FORWARD_METHOD(Func, never_partition) + HALIDE_FORWARD_METHOD(Func, never_partition_all) HALIDE_FORWARD_METHOD_CONST(Func, num_update_definitions) HALIDE_FORWARD_METHOD_CONST(Func, outputs) HALIDE_FORWARD_METHOD(Func, parallel) + HALIDE_FORWARD_METHOD(Func, partition) HALIDE_FORWARD_METHOD(Func, prefetch) HALIDE_FORWARD_METHOD(Func, print_loop_nest) HALIDE_FORWARD_METHOD(Func, rename) From c8f43f3b9a8b44fab5e500cd5c9af7204090b182 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 13 Feb 2024 13:47:19 -0800 Subject: [PATCH 059/186] Parallelize some tests (#8078) * Parallelize some tests This reduces the time taken to run all correctness tests from 8:15 to 3:15 on my machine. 
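The converted tests all follow the same pattern: queue each task on a shared thread pool and join on futures. A simplified sketch is below; it assumes the ThreadPool<T> interface from test/common/halide_thread_pool.h, and `run_one_task`, `tasks`, and `sharder` are placeholders for the per-test specifics.

```
// Sketch of the per-test parallelism pattern used by the tests in this change.
Halide::Tools::ThreadPool<bool> pool;
std::vector<std::future<bool>> futures;
for (size_t t = 0; t < tasks.size(); t++) {
    if (!sharder.should_run(t)) continue;
    futures.push_back(pool.async([&, t]() { return run_one_task(tasks[t]); }));
}
for (auto &f : futures) {
    if (!f.get()) {
        return 1;  // a task reported failure
    }
}
```
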
* The FIXME is actually fine * Remove debug print * Fix when we're willing to run x86 code in simd_op_check * Use separate imageparams per task * Deep-copy the LoopLevels * Make float16_t neon op check test at least build * Revert accidental serialization * Throw return values from callable into the void We don't have a custom error handler in place, so they're always zero * Skip test under ASAN * Fix unintentional change to test --- src/Schedule.cpp | 6 +- test/correctness/float16_t_neon_op_check.cpp | 59 +---- test/correctness/simd_op_check.h | 243 ++++++++++++++----- test/correctness/simd_op_check_hvx.cpp | 12 +- test/correctness/simd_op_check_wasm.cpp | 5 + test/correctness/simd_op_check_x86.cpp | 12 +- test/correctness/unroll_huge_mux.cpp | 13 +- test/correctness/vector_cast.cpp | 12 +- test/correctness/vector_math.cpp | 13 +- test/correctness/vector_reductions.cpp | 15 +- 10 files changed, 239 insertions(+), 151 deletions(-) diff --git a/src/Schedule.cpp b/src/Schedule.cpp index a2a34f34862e..72737b596e91 100644 --- a/src/Schedule.cpp +++ b/src/Schedule.cpp @@ -354,9 +354,9 @@ FuncSchedule FuncSchedule::deep_copy( internal_assert(contents.defined()) << "Cannot deep-copy undefined FuncSchedule\n"; FuncSchedule copy; - copy.contents->store_level = contents->store_level; - copy.contents->compute_level = contents->compute_level; - copy.contents->hoist_storage_level = contents->hoist_storage_level; + copy.contents->store_level.set(contents->store_level); + copy.contents->compute_level.set(contents->compute_level); + copy.contents->hoist_storage_level.set(contents->hoist_storage_level); copy.contents->storage_dims = contents->storage_dims; copy.contents->bounds = contents->bounds; copy.contents->estimates = contents->estimates; diff --git a/test/correctness/float16_t_neon_op_check.cpp b/test/correctness/float16_t_neon_op_check.cpp index a83db47758f5..33d2541cbd4a 100644 --- a/test/correctness/float16_t_neon_op_check.cpp +++ b/test/correctness/float16_t_neon_op_check.cpp @@ -64,7 +64,7 @@ class SimdOpCheck : public SimdOpCheckTest { // bits, 192 bits, and 256 bits for everything. 
struct TestParams { const int bits; - ImageParam in_f; + std::function in_f; std::vector> vl_params; Expr f_1, f_2, f_3, u_1, i_1; }; @@ -77,7 +77,7 @@ class SimdOpCheck : public SimdOpCheckTest { for (auto &test_param : test_params) { // outer loop for {fp32, fp16} const int bits = test_param.bits; - ImageParam in_f = test_param.in_f; + auto in_f = test_param.in_f; Expr f_1 = test_param.f_1; Expr f_2 = test_param.f_2; Expr f_3 = test_param.f_3; @@ -256,7 +256,7 @@ class SimdOpCheck : public SimdOpCheckTest { suffix_map.emplace(tasks.back().name, suffix); } - void compile_and_check(Func error, const string &op, const string &name, int vector_width, std::ostringstream &error_msg) override { + void compile_and_check(Func error, const string &op, const string &name, int vector_width, const std::vector &arg_types, std::ostringstream &error_msg) override { std::string fn_name = "test_" + name; std::string file_name = output_directory + fn_name; @@ -315,52 +315,11 @@ class SimdOpCheck : public SimdOpCheckTest { } // namespace int main(int argc, char **argv) { - Target host = get_host_target(); - Target hl_target = get_target_from_environment(); - Target jit_target = get_jit_target_from_environment(); - printf("host is: %s\n", host.to_string().c_str()); - printf("HL_TARGET is: %s\n", hl_target.to_string().c_str()); - printf("HL_JIT_TARGET is: %s\n", jit_target.to_string().c_str()); - - // Only for 64bit target with fp16 feature - if (!(hl_target.arch == Target::ARM && hl_target.bits == 64 && hl_target.has_feature(Target::ARMFp16))) { - Halide::Internal::Test::Sharder::accept_sharded_status(); - printf("[SKIP] To run this test, set HL_TARGET=arm-64--arm_fp16. \n"); - return 0; - } - - // Create Test Object - // Use smaller dimension than default(768, 128) to avoid fp16 overflow in reduction test case - SimdOpCheck test(hl_target, 384, 32); - - if (!test.can_run_code()) { - printf("[WARN] To run verification of realization, set HL_JIT_TARGET=arm-64--arm_fp16. \n"); - } - - if (argc > 1) { - test.filter = argv[1]; - } - - if (getenv("HL_SIMD_OP_CHECK_FILTER")) { - test.filter = getenv("HL_SIMD_OP_CHECK_FILTER"); - } - - if (argc > 2) { - // Don't forget: if you want to run the standard tests to a specific output - // directory, you'll need to invoke with the first arg enclosed - // in quotes (to avoid it being wildcard-expanded by the shell): - // - // correctness_simd_op_check "*" /path/to/output - // - test.output_directory = argv[2]; - } - - bool success = test.test_all(); - - if (!success) { - return 1; - } - - printf("Success!\n"); + // FIXME + printf("[SKIP] Test is currently broken. See https://github.com/halide/Halide/issues/8083"); return 0; + + return SimdOpCheckTest::main( + argc, argv, + {Target("arm-64-linux-arm_fp16")}); } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 7b1057b7f3ea..fce3172132ba 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -3,11 +3,59 @@ #include "Halide.h" #include "halide_test_dirs.h" +#include "halide_thread_pool.h" #include "test_sharding.h" #include #include +namespace { + +using namespace Halide; + +// Some exprs of each type to use in checked expressions. These will turn +// into loads to thread-local image params. 
+Expr input(const Type &t, const Expr &arg) { + return Internal::Call::make(t, "input", {arg}, Internal::Call::Extern); +} +Expr in_f16(const Expr &arg) { + return input(Float(16), arg); +} +Expr in_bf16(const Expr &arg) { + return input(BFloat(16), arg); +} +Expr in_f32(const Expr &arg) { + return input(Float(32), arg); +} +Expr in_f64(const Expr &arg) { + return input(Float(64), arg); +} +Expr in_i8(const Expr &arg) { + return input(Int(8), arg); +} +Expr in_i16(const Expr &arg) { + return input(Int(16), arg); +} +Expr in_i32(const Expr &arg) { + return input(Int(32), arg); +} +Expr in_i64(const Expr &arg) { + return input(Int(64), arg); +} +Expr in_u8(const Expr &arg) { + return input(UInt(8), arg); +} +Expr in_u16(const Expr &arg) { + return input(UInt(16), arg); +} +Expr in_u32(const Expr &arg) { + return input(UInt(32), arg); +} +Expr in_u64(const Expr &arg) { + return input(UInt(64), arg); +} +} // namespace + namespace Halide { struct TestResult { std::string op; @@ -33,32 +81,18 @@ class SimdOpCheckTest { std::string filter{"*"}; std::string output_directory{Internal::get_test_tmp_dir()}; std::vector tasks; - std::mt19937 rng; Target target; - ImageParam in_f32{Float(32), 1, "in_f32"}; - ImageParam in_f64{Float(64), 1, "in_f64"}; - ImageParam in_f16{Float(16), 1, "in_f16"}; - ImageParam in_bf16{BFloat(16), 1, "in_bf16"}; - ImageParam in_i8{Int(8), 1, "in_i8"}; - ImageParam in_u8{UInt(8), 1, "in_u8"}; - ImageParam in_i16{Int(16), 1, "in_i16"}; - ImageParam in_u16{UInt(16), 1, "in_u16"}; - ImageParam in_i32{Int(32), 1, "in_i32"}; - ImageParam in_u32{UInt(32), 1, "in_u32"}; - ImageParam in_i64{Int(64), 1, "in_i64"}; - ImageParam in_u64{UInt(64), 1, "in_u64"}; - - const std::vector image_params{in_f32, in_f64, in_f16, in_bf16, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64}; - const std::vector arg_types{in_f32, in_f64, in_f16, in_bf16, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64}; int W; int H; + int rng_seed; + using Sharder = Halide::Internal::Test::Sharder; SimdOpCheckTest(const Target t, int w, int h) - : target(t), W(w), H(h) { + : target(t), W(w), H(h), rng_seed(0) { target = target .with_feature(Target::NoBoundsQuery) .with_feature(Target::NoAsserts) @@ -67,7 +101,7 @@ class SimdOpCheckTest { virtual ~SimdOpCheckTest() = default; void set_seed(int seed) { - rng.seed(seed); + rng_seed = seed; } virtual bool can_run_code() const { @@ -112,7 +146,12 @@ class SimdOpCheckTest { return can_run_the_code; } - virtual void compile_and_check(Func error, const std::string &op, const std::string &name, int vector_width, std::ostringstream &error_msg) { + virtual void compile_and_check(Func error, + const std::string &op, + const std::string &name, + int vector_width, + const std::vector &arg_types, + std::ostringstream &error_msg) { std::string fn_name = "test_" + name; std::string file_name = output_directory + fn_name; @@ -197,6 +236,56 @@ class SimdOpCheckTest { TestResult check_one(const std::string &op, const std::string &name, int vector_width, Expr e) { std::ostringstream error_msg; + // Map the input calls in the Expr to loads to local + // imageparams, so that we're not sharing state across threads. 
+ std::vector image_params{ + ImageParam{Float(32), 1, "in_f32"}, + ImageParam{Float(64), 1, "in_f64"}, + ImageParam{Float(16), 1, "in_f16"}, + ImageParam{BFloat(16), 1, "in_bf16"}, + ImageParam{Int(8), 1, "in_i8"}, + ImageParam{UInt(8), 1, "in_u8"}, + ImageParam{Int(16), 1, "in_i16"}, + ImageParam{UInt(16), 1, "in_u16"}, + ImageParam{Int(32), 1, "in_i32"}, + ImageParam{UInt(32), 1, "in_u32"}, + ImageParam{Int(64), 1, "in_i64"}, + ImageParam{UInt(64), 1, "in_u64"}}; + + for (auto &p : image_params) { + const int alignment_bytes = image_param_alignment(); + p.set_host_alignment(alignment_bytes); + const int alignment = alignment_bytes / p.type().bytes(); + p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); + } + + const std::vector arg_types(image_params.begin(), image_params.end()); + + class HookUpImageParams : public Internal::IRMutator { + using Internal::IRMutator::visit; + + Expr visit(const Internal::Call *op) override { + if (op->name == "input") { + for (auto &p : image_params) { + if (p.type() == op->type) { + return p(mutate(op->args[0])); + } + } + } else if (op->call_type == Internal::Call::Halide && !op->func.weak) { + Internal::Function f(op->func); + f.mutate(this); + } + return Internal::IRMutator::visit(op); + } + const std::vector &image_params; + + public: + HookUpImageParams(const std::vector &image_params) + : image_params(image_params) { + } + } hook_up_image_params(image_params); + e = hook_up_image_params.mutate(e); + class HasInlineReduction : public Internal::IRVisitor { using Internal::IRVisitor::visit; void visit(const Internal::Call *op) override { @@ -250,42 +339,70 @@ class SimdOpCheckTest { Halide::Func error("error_" + name); error() = Halide::cast(maximum(absd(f(r_check.x, r_check.y), f_scalar(r_check.x, r_check.y)))); - setup_images(); - compile_and_check(error, op, name, vector_width, error_msg); + compile_and_check(error, op, name, vector_width, arg_types, error_msg); bool can_run_the_code = can_run_code(); if (can_run_the_code) { Target run_target = get_run_target(); - error.infer_input_bounds({}, run_target); - // Fill the inputs with noise - for (auto p : image_params) { - Halide::Buffer<> buf = p.get(); - if (!buf.defined()) continue; - assert(buf.data()); - Type t = buf.type(); - // For floats/doubles, we only use values that aren't - // subject to rounding error that may differ between - // vectorized and non-vectorized versions - if (t == Float(32)) { - buf.as().for_each_value([&](float &f) { f = (rng() & 0xfff) / 8.0f - 0xff; }); - } else if (t == Float(64)) { - buf.as().for_each_value([&](double &f) { f = (rng() & 0xfff) / 8.0 - 0xff; }); - } else if (t == Float(16)) { - buf.as().for_each_value([&](float16_t &f) { f = float16_t((rng() & 0xff) / 8.0f - 0xf); }); - } else { - // Random bits is fine - for (uint32_t *ptr = (uint32_t *)buf.data(); - ptr != (uint32_t *)buf.data() + buf.size_in_bytes() / 4; - ptr++) { - // Never use the top four bits, to avoid - // signed integer overflow. 
- *ptr = ((uint32_t)rng()) & 0x0fffffff; + // Make some unallocated input buffers + std::vector> inputs(image_params.size()); + + std::vector args(image_params.size()); + for (size_t i = 0; i < args.size(); i++) { + args[i] = image_params[i]; + inputs[i] = Runtime::Buffer<>(args[i].type, nullptr, 0); + } + auto callable = error.compile_to_callable(args, run_target); + + Runtime::Buffer output = Runtime::Buffer::make_scalar(); + output(0) = 1; // To ensure we'll fail if it's never written to + + // Do the bounds query call + assert(inputs.size() == 12); + (void)callable(inputs[0], inputs[1], inputs[2], inputs[3], + inputs[4], inputs[5], inputs[6], inputs[7], + inputs[8], inputs[9], inputs[10], inputs[11], + output); + + std::mt19937 rng; + rng.seed(rng_seed); + + // Allocate the input buffers and fill them with noise + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].size_in_bytes()) { + inputs[i].allocate(); + + Type t = inputs[i].type(); + // For floats/doubles, we only use values that aren't + // subject to rounding error that may differ between + // vectorized and non-vectorized versions + if (t == Float(32)) { + inputs[i].as().for_each_value([&](float &f) { f = (rng() & 0xfff) / 8.0f - 0xff; }); + } else if (t == Float(64)) { + inputs[i].as().for_each_value([&](double &f) { f = (rng() & 0xfff) / 8.0 - 0xff; }); + } else if (t == Float(16)) { + inputs[i].as().for_each_value([&](float16_t &f) { f = float16_t((rng() & 0xff) / 8.0f - 0xf); }); + } else { + // Random bits is fine + for (uint32_t *ptr = (uint32_t *)inputs[i].data(); + ptr != (uint32_t *)inputs[i].data() + inputs[i].size_in_bytes() / 4; + ptr++) { + // Never use the top four bits, to avoid + // signed integer overflow. + *ptr = ((uint32_t)rng()) & 0x0fffffff; + } } } } - Realization r = error.realize(); - double e = Buffer(r[0])(); + + // Do the real call + (void)callable(inputs[0], inputs[1], inputs[2], inputs[3], + inputs[4], inputs[5], inputs[6], inputs[7], + inputs[8], inputs[9], inputs[10], inputs[11], + output); + + double e = output(0); // Use a very loose tolerance for floating point tests. 
The // kinds of bugs we're looking for are codegen bugs that // return the wrong value entirely, not floating point @@ -329,16 +446,10 @@ class SimdOpCheckTest { tasks.emplace_back(Task{op, name, vector_width, e}); } virtual void add_tests() = 0; - virtual void setup_images() { - for (auto p : image_params) { - p.reset(); - - const int alignment_bytes = 16; - p.set_host_alignment(alignment_bytes); - const int alignment = alignment_bytes / p.type().bytes(); - p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); - } + virtual int image_param_alignment() { + return 16; } + virtual bool test_all() { /* First add some tests based on the target */ add_tests(); @@ -348,21 +459,33 @@ class SimdOpCheckTest { const std::string run_target_str = run_target.to_string(); Sharder sharder; - bool success = true; + + Halide::Tools::ThreadPool pool; + std::vector> futures; + for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - auto result = check_one(task.op, task.name, task.vector_width, task.expr); + futures.push_back(pool.async([&]() { + return check_one(task.op, task.name, task.vector_width, task.expr); + })); + } + + for (auto &f : futures) { + auto result = f.get(); constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - success = false; + // The thread-pool destructor will block until in-progress tasks + // are done, and then will discard any tasks that haven't been + // launched yet. + return false; } } - return success; + return true; } template diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 450ef3f06fe6..29bdde4a9163 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -23,16 +23,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest { SimdOpCheckHVX(Target t, int w = 768 /*256*3*/, int h = 128) : SimdOpCheckTest(t, w, h) { } - void setup_images() override { - for (auto p : image_params) { - p.reset(); - // HVX needs 128 byte alignment - constexpr int kHostAlignmentBytes = 128; - p.set_host_alignment(kHostAlignmentBytes); - Expr min = p.dim(0).min(); - p.dim(0).set_min((min / 128) * 128); - } + int image_param_alignment() override { + return 128; } + void add_tests() override { Expr f32_1 = in_f32(x), f32_2 = in_f32(x + 16), f32_3 = in_f32(x + 32); Expr f64_1 = in_f64(x), f64_2 = in_f64(x + 16), f64_3 = in_f64(x + 32); diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 56e2e4231876..2045b42699f4 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -533,6 +533,11 @@ class SimdOpCheckWASM : public SimdOpCheckTest { } // namespace int main(int argc, char **argv) { +#ifdef HALIDE_INTERNAL_USING_ASAN + printf("[SKIP] This test causes an ASAN crash relating to ASAN's use of sigaltstack. 
It doesn't seem to be due to a bug in the test itself (see https://github.com/halide/Halide/pull/8078#issuecomment-1935407878)"); + return 0; +#endif + return SimdOpCheckTest::main( argc, argv, { diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index 990e4e886307..b4c086ce0fc3 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -663,15 +663,15 @@ int main(int argc, char **argv) { // Always turn on f16c when using avx. Sandy Bridge had avx without // f16c, but f16c is orthogonal to everything else, so there's no // real reason to test avx without it. - Target("x86-64-linux-sse41-avx-f16c"), - Target("x86-64-linux-sse41-avx-f16c-avx2"), + Target("x86-64-linux-sse41-avx-f16c-fma"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2"), // See above: don't test avx512 without extra features, the test // isn't yet set up to test it properly. // Target("x86-64-linux-sse41-avx-avx2-avx512"), // Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_knl"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), }); } diff --git a/test/correctness/unroll_huge_mux.cpp b/test/correctness/unroll_huge_mux.cpp index 233ee038c4e8..b24420fe68bb 100644 --- a/test/correctness/unroll_huge_mux.cpp +++ b/test/correctness/unroll_huge_mux.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) { Var x; std::vector exprs; - for (int i = 0; i < 10000; i++) { + for (int i = 0; i < 5000; i++) { exprs.push_back(x & i); } @@ -21,17 +21,6 @@ int main(int argc, char **argv) { f.bound(x, 0, (int)exprs.size()); f.unroll(x); - // For 10000 expressions in the mux, this test uses more than 8MB - // in stack because the simplifier's Block visitor is still - // recursive and has a large stack frame. We'll put a cap on it to - // at least make sure the problem doesn't get worse. If this test - // crashes try raising the cap to see if we have a stack size - // regression. 
- // - // https://github.com/halide/Halide/issues/6238 - - set_compiler_stack_size(16 * 1024 * 1024); - f.compile_jit(); printf("Success!\n"); diff --git a/test/correctness/vector_cast.cpp b/test/correctness/vector_cast.cpp index 3b6eae0fa2e6..575d97842176 100644 --- a/test/correctness/vector_cast.cpp +++ b/test/correctness/vector_cast.cpp @@ -1,6 +1,6 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" - #include using namespace Halide; @@ -164,11 +164,17 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; + Halide::Tools::ThreadPool pool; + std::vector> futures; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (!task.fn()) { - exit(1); + futures.push_back(pool.async(task.fn)); + } + + for (auto &f : futures) { + if (!f.get()) { + return 1; } } diff --git a/test/correctness/vector_math.cpp b/test/correctness/vector_math.cpp index 6e7f19a8bb1e..c5036fd1346f 100644 --- a/test/correctness/vector_math.cpp +++ b/test/correctness/vector_math.cpp @@ -1,4 +1,5 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" #include @@ -742,11 +743,19 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; + + std::vector> futures; + + Halide::Tools::ThreadPool pool; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (!task.fn(task.lanes, task.seed)) { - exit(1); + futures.push_back(pool.async(task.fn, task.lanes, task.seed)); + } + + for (auto &f : futures) { + if (!f.get()) { + return 1; } } diff --git a/test/correctness/vector_reductions.cpp b/test/correctness/vector_reductions.cpp index f1c250cfec3d..9db9475e7fca 100644 --- a/test/correctness/vector_reductions.cpp +++ b/test/correctness/vector_reductions.cpp @@ -1,4 +1,5 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" using namespace Halide; @@ -194,15 +195,17 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; - Target prev_target; + + std::vector> futures; + Halide::Tools::ThreadPool pool; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (task.target != prev_target) { - std::cout << "vector_reductions: Testing with " << task.target << "\n"; - prev_target = task.target; - } - task.fn(); + futures.push_back(pool.async(task.fn)); + } + + for (auto &f : futures) { + f.wait(); } std::cout << "Success!\n"; From 6edea167432abc11bd3c324c144f7dccb33d7574 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 14 Feb 2024 20:26:27 +0000 Subject: [PATCH 060/186] Allow disabling of mutlithreading in simd op check (#8096) simd_op_check_xtensa is not threadsafe at present --- test/correctness/simd_op_check.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index fce3172132ba..f386b7efc094 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -450,6 +450,10 @@ class SimdOpCheckTest { return 16; } + virtual bool use_multiple_threads() const { + return true; + } + virtual bool test_all() { /* First add some tests based on the target */ add_tests(); @@ -460,7 +464,10 @@ class SimdOpCheckTest { Sharder sharder; - Halide::Tools::ThreadPool pool; + Halide::Tools::ThreadPool pool( + 
use_multiple_threads() ? + Halide::Tools::ThreadPool::num_processors_online() : + 1); std::vector> futures; for (size_t t = 0; t < tasks.size(); t++) { From 40a622fa15f369a68a03e7e32529e39c54e9f0a2 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 14 Feb 2024 23:34:23 +0300 Subject: [PATCH 061/186] clang does not support `_Float16` when targeting i386 (#8085) See https://github.com/halide/Halide/issues/7678 --- src/runtime/HalideRuntime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 64034b8be328..b235117e9f5e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -91,7 +91,7 @@ extern "C" { // Ideally there would be a better way to detect if the type // is supported, even in a compiler independent fashion, but // coming up with one has proven elusive. -#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) +#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) && !defined(__i386__) #if defined(__is_identifier) #if !__is_identifier(_Float16) #define HALIDE_CPP_COMPILER_HAS_FLOAT16 From f2d750f355fccadcd03af51bcb58af724719859c Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 14 Feb 2024 23:35:52 +0300 Subject: [PATCH 062/186] tests: correctness/float16_t: mark `__extendhfsf2` with default visibility (#8084) ``` [2336/4154] /usr/bin/clang++-17 -DHALIDE_ENABLE_RTTI -DHALIDE_VERSION_MAJOR=17 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -DHALIDE_WITH_EXCEPTIONS -I/build/halide-17.0.0/test/common -I/build/halide-17.0.0/tools -I/build/halide-17.0.0/build/stage-1/halide/include -g -fdebug-default-version=4 -fprofile-use=/build/halide-17.0.0/build-profile/default.profdata -fcs-profile-generate -Xclang -mllvm -Xclang -vp-counters-per-site=100.0 -fuse-ld=lld-17 -Wl,--build-id=sha1 -std=c++17 -flto=thin -fPIE -fvisibility=hidden -fvisibility-inlines-hidden -Winvalid-pch -Xclang -include-pch -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx -MD -MT test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -MF test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o.d -o test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -c /build/halide-17.0.0/test/correctness/float16_t.cpp <...> ld.lld-17: error: undefined hidden symbol: __extendhfsf2 >>> referenced by float16_t.cpp:391 (/build/halide-17.0.0/test/correctness/float16_t.cpp:391) >>> lto.tmp:(main) >>> did you mean: __extendbfsf2 >>> defined in: /lib/x86_64-linux-gnu/libgcc_s.so.1 clang++-17: error: linker command failed with exit code 1 (use -v to see invocation) ``` --- test/correctness/float16_t.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp index d135e8108fa7..d4399b008f0a 100644 --- a/test/correctness/float16_t.cpp +++ b/test/correctness/float16_t.cpp @@ -12,7 +12,7 @@ extern "C" { // In Clang 15 and later, this function is passed a uint16... but in the xmm0 register on x86-64. // So we'll declare it as a float and just grab the upper 16 bits. 
-__attribute__((weak)) float __extendhfsf2(float actually_a_float16) { +__attribute__((weak, visibility("default"))) float __extendhfsf2(float actually_a_float16) { uint16_t data; memcpy(&data, &actually_a_float16, sizeof(data)); return (float)Halide::float16_t::make_from_bits(data); @@ -20,7 +20,7 @@ __attribute__((weak)) float __extendhfsf2(float actually_a_float16) { #else -__attribute__((weak)) float __extendhfsf2(uint16_t data) { +__attribute__((weak, visibility("default"))) float __extendhfsf2(uint16_t data) { return (float)Halide::float16_t::make_from_bits(data); } From b5825618d186a24b8ff55bf0d810b88546133805 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 14 Feb 2024 13:57:09 -0800 Subject: [PATCH 063/186] Fix reduce_expr_modulo of vector in Solve.cpp (#8089) * Fix reduce_expr_modulo of vector in Solve.cpp * Fix test --- src/Solve.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Solve.cpp b/src/Solve.cpp index 22bd14e44412..b25719cff8c7 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -394,7 +394,7 @@ class SolveExpression : public IRMutator { if (a_uses_var && !b_uses_var) { const int64_t *ib = as_const_int(b); auto is_multiple_of_b = [&](const Expr &e) { - if (ib) { + if (ib && op->type.is_scalar()) { int64_t r = 0; return reduce_expr_modulo(e, *ib, &r) && r == 0; } else { @@ -1478,6 +1478,9 @@ void solve_test() { check_solve(min(x + y, x - z), x + min(y, 0 - z)); check_solve(max(x + y, x - z), x + max(y, 0 - z)); + check_solve((5 * Broadcast::make(x, 4) + y) / 5, + Broadcast::make(x, 4) + (Broadcast::make(y, 4) / 5)); + debug(0) << "Solve test passed\n"; } From 9a740b584e63cc67e841f134d61d79502e973252 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 14 Feb 2024 14:41:51 -0800 Subject: [PATCH 064/186] [Vulkan] Region allocator fixes for memory requirements and allocations (#8087) * Add region allocator tests that check alignment, nearest_multiple and collect routines * Fix can_split() routine to use conformed sizes so that split allocation matches Fix region size accounting so that coalesce never has zero size regions to merge * Fix aligned_offset() routine to check for zero alignment (which means no constraint) * Fix ifdef for internal debugging * Clean up debug internal log messages * Use memory_requirements to determine nearest_multiple during initialization Query memory_requirements for each region, and reallocate if driver requires additional device memory * Formatting pass --------- Co-authored-by: Derek Gerstmann --- src/runtime/internal/block_allocator.h | 126 ++++++------ src/runtime/internal/memory_arena.h | 2 +- src/runtime/internal/memory_resources.h | 2 +- src/runtime/internal/region_allocator.h | 246 +++++++++++++++--------- src/runtime/vulkan_memory.h | 36 +++- test/runtime/block_allocator.cpp | 180 ++++++++++++++++- 6 files changed, 424 insertions(+), 168 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 3ff850e5b19f..feee56a4e531 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -126,7 +126,7 @@ BlockAllocator *BlockAllocator::create(void *user_context, const Config &cfg, co allocators.system.allocate(user_context, sizeof(BlockAllocator))); if (result == nullptr) { - error(user_context) << "BlockAllocator: Failed to create instance! Out of memory!\n"; + error(user_context) << "BlockAllocator: Failed to create instance! 
Out of memory\n"; return nullptr; } @@ -160,12 +160,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "dedicated=" << (request.dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(request.properties.usage) << " " << "caching=" << halide_memory_caching_name(request.properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -180,7 +180,7 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -288,7 +288,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl if (result == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Failed to allocate region of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; #endif // allocator has enough free space, but not enough contiguous space // -- collect and try to reallocate @@ -302,17 +302,17 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { if (!is_compatible_block(block, properties)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... incompatible properties!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << "\n" - << " block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << "\n" - << " block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << "\n"; - debug(user_context) << " request_size=" << (uint32_t)size << "\n" - << " request_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " request_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " request_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " + << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " + << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " + << "request_size=" << (uint32_t)size << " " + << "request_usage=" << halide_memory_usage_name(properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; @@ -320,20 +320,20 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo if (dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip blocks that can't be dedicated to a single allocation return false; } else if (block->memory.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip dedicated blocks that are already allocated return false; @@ -355,16 +355,16 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro const BlockResource *block = static_cast(block_entry->value); if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: found suitable block ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? 
"true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: found suitable block (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "request_size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif return block_entry; } @@ -373,13 +373,13 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: couldn't find suitable block!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? "true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: couldn't find suitable block! (" + << "user_context=" << (void *)(user_context) << " " + << "request_size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif } return block_entry; @@ -388,22 +388,22 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro BlockAllocator::BlockEntry * BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: reserving block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: reserving block ... ! (" + << "requested_size=" << (uint32_t)size << " " + << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: creating block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? 
"true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: creating block ... ! (" + << "requested_size=" << (uint32_t)size << " " + << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif block_entry = create_block_entry(user_context, properties, size, dedicated); } @@ -422,14 +422,14 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " - << "block_resource=" << (void *)(block) << ")...\n"; + << "block_resource=" << (void *)(block) << ")..."; #endif halide_abort_if_false(user_context, block != nullptr); RegionAllocator *region_allocator = RegionAllocator::create( user_context, block, {allocators.system, allocators.region}); if (region_allocator == nullptr) { - error(user_context) << "BlockAllocator: Failed to create new region allocator!\n"; + error(user_context) << "BlockAllocator: Failed to create new region allocator\n"; return nullptr; } @@ -440,7 +440,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " - << "region_allocator=" << (void *)(region_allocator) << ")...\n"; + << "region_allocator=" << (void *)(region_allocator) << ")..."; #endif if (region_allocator == nullptr) { return 0; @@ -459,13 +459,13 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { error(user_context) << "BlockAllocator: No free blocks found! 
Maximum block count reached (" - << (int32_t)(config.maximum_block_count) << ")!\n"; + << (int32_t)(config.maximum_block_count) << ")\n"; return nullptr; } BlockEntry *block_entry = block_list.append(user_context); if (block_entry == nullptr) { - debug(user_context) << "BlockAllocator: Failed to allocate new block entry!\n"; + debug(user_context) << "BlockAllocator: Failed to allocate new block entry\n"; return nullptr; } @@ -473,7 +473,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p debug(user_context) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; + << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); @@ -492,7 +492,7 @@ int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Bloc #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << ")...\n"; + << "block=" << (void *)(block_entry->value) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -506,7 +506,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc debug(user_context) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; + << "deallocator=" << (void *)(allocators.block.deallocate) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -520,7 +520,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; + debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); MemoryBlock *memory_block = &(block->memory); @@ -531,7 +531,7 @@ int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) int BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; + debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); MemoryBlock *memory_block = &(block->memory); diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h index 5953e12e470a..7d6c33da8f5d 100644 --- a/src/runtime/internal/memory_arena.h +++ b/src/runtime/internal/memory_arena.h @@ -271,7 +271,7 @@ void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index void *entry_ptr = lookup_entry(user_context, block, index); block->free_index = block->indices[index]; 
block->status[index] = AllocationStatus::InUse; -#if DEBUG_RUNTIME_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL memset(entry_ptr, 0, config.entry_size); #endif return entry_ptr; diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index e30afb0dd4ea..d41fa57304fb 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -127,7 +127,7 @@ ALWAYS_INLINE bool is_power_of_two_alignment(size_t x) { // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { halide_abort_if_false(nullptr, is_power_of_two_alignment(alignment)); - return (offset + (alignment - 1)) & ~(alignment - 1); + return (alignment == 0) ? (offset) : (offset + (alignment - 1)) & ~(alignment - 1); } // Returns a suitable alignment such that requested alignment is a suitable diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 13c6b69f12e7..02c2cd7e6aa0 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -73,7 +73,7 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size) const; + bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); @@ -155,7 +155,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " - << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; + << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } @@ -164,15 +164,15 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)"; #endif return nullptr; } - if (can_split(block_region, request.size)) { + if (can_split(block_region, request.size, request.alignment)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; + << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; #endif split_block_region(user_context, block_region, request.size, request.alignment); } @@ -200,9 +200,6 @@ int RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { } release_block_region(user_context, block_region); free_block_region(user_context, block_region); - if (can_coalesce(block_region)) { - block_region = coalesce_block_regions(user_context, block_region); - } return 0; } @@ -232,8 +229,10 @@ bool RegionAllocator::is_last_block_region(void *user_context, const BlockRegion bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest 
&request) const { if (!is_available(region)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not available! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not available! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -241,8 +240,10 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // skip incompatible block regions for this request if (!is_compatible_block_region(region, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... incompatible properties! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... incompatible properties! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -253,8 +254,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // is the adjusted size larger than the current region? if (actual_size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not enough space for adjusted size! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not enough space for adjusted size! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(actual_size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -262,8 +267,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // will the adjusted size fit within the remaining unallocated space? if ((actual_size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable block region! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " found suitable block region! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(actual_size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return true; // you betcha } @@ -272,20 +281,29 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: find block region ( " + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; +#endif BlockRegion *block_region = block->regions; while (block_region != nullptr) { if (is_block_region_suitable_for_request(user_context, block_region, request)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable region ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: found suitable region ( " + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_region; } @@ -299,13 +317,13 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: couldn't find suitable region!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: couldn't find suitable region! (" + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } @@ -342,12 +360,12 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << "RegionAllocator: Freeing unused region to coalesce (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -361,7 +379,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!"; #endif prev_region->next_ptr = block_region->next_ptr; @@ -379,7 +397,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)"; #endif if (next_region->next_ptr) { @@ -393,8 +411,10 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size) const { - return (block_region && (block_region->memory.size > size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { + size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); + size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); + return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); } BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { @@ -406,7 +426,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block_region->block_ptr->reserved << " " - << ")\n"; + << ")"; #endif 
halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -420,18 +440,20 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion size_t empty_size = block_region->memory.size - split_size; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment \n" - << " requested_size=" << (uint32_t)size << "\n" - << " split_size=" << (uint32_t)split_size << "\n" - << " requested_alignment=" << (uint32_t)alignment << " " - << " required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << " actual_alignment=" << (uint32_t)actual_alignment << ")\n"; + debug(user_context) << "RegionAllocator: Conforming size and alignment (" + << "requested_size=" << (uint32_t)size << " " + << "split_size=" << (uint32_t)split_size << " " + << "split_offset=" << (uint32_t)split_size << " " + << "empty_size=" << (uint32_t)empty_size << " " + << "requested_alignment=" << (uint32_t)alignment << " " + << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " + << "actual_alignment=" << (uint32_t)actual_alignment << ")"; #endif #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; #endif BlockRegion *next_region = block_region->next_ptr; @@ -453,7 +475,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Creating block region (" + debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " << "size=" << (uint32_t)size << " " @@ -461,8 +483,16 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo << "dedicated=" << (dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(properties.usage) << " " << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; #endif + size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); + size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); + size_t actual_offset = aligned_offset(offset, actual_alignment); + + if (actual_size == 0) { + error(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; + return nullptr; + } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { @@ -470,16 +500,6 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo return nullptr; } -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Added block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; -#endif - - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - block_region->memory.handle = nullptr; block_region->memory.offset = actual_offset; block_region->memory.size = actual_size; @@ -490,11 +510,13 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo block_region->usage_count = 0; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Creating region (" + debug(user_context) << "RegionAllocator: Created block region allocation (" + << "user_context=" << (void *)(user_context) << " " << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << ")\n"; + << ")"; #endif return block_region; @@ -504,7 +526,12 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_ptr=" << ((block_region) ? ((void *)block_region->block_ptr) : nullptr) << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << ((block_region) ? (uint32_t)(block_region->usage_count) : 0) << " " + << "memory_offset=" << ((block_region) ? (uint32_t)(block_region->memory.offset) : 0) << " " + << "memory_size=" << ((block_region) ? (uint32_t)(block_region->memory.size) : 0) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << ") ... 
"; #endif if (block_region == nullptr) { return 0; @@ -517,12 +544,13 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block if (block_region->status != AllocationStatus::Available) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Releasing region (" + debug(user_context) << " releasing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " - << ")\n"; + << ")"; #endif block->reserved -= block_region->memory.size; @@ -535,7 +563,7 @@ int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_region=" << (void *)(block_region) << ") ..."; #endif block_region->usage_count = 0; @@ -549,7 +577,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) - << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; + << " offset=" << (int32_t)block_region->memory.offset << ")"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); @@ -560,25 +588,25 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r memory_region->is_owner = true; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Allocating region (" + debug(user_context) << " allocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } else { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Re-using region (" + debug(user_context) << " re-using region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } block_region->status = block_region->memory.dedicated ? 
AllocationStatus::Dedicated : AllocationStatus::InUse; @@ -590,24 +618,26 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " + << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)(block_region) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "status=" << (uint32_t)block_region->status << " " - << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; + << "usage_count=" << (uint32_t)block_region->usage_count << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << " deallocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif + // NOTE: Deallocate but leave memory size as is, so that coalesce can compute region merging sizes halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); - block_region->memory.size = 0; - block_region->memory.offset = 0; block_region->memory.handle = nullptr; } block_region->usage_count = 0; @@ -618,7 +648,7 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re int RegionAllocator::release(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing all regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif BlockRegion *block_region = block->regions; @@ -635,45 +665,67 @@ int RegionAllocator::release(void *user_context) { bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Collecting free block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; - uint32_t count = 0; + uint32_t collected_count = 0; + uint32_t remaining_count = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { + debug(user_context) << " checking region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << (uint32_t)(block_region->usage_count) << " " + << "status=" << (uint32_t)(block_region->status) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")"; + if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL - count++; + collected_count++; debug(user_context) << " collecting region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) 
<< " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif block_region = coalesce_block_regions(user_context, block_region); has_collected = true; + } else { +#ifdef DEBUG_RUNTIME_INTERNAL + remaining_count++; +#endif } if (is_last_block_region(user_context, block_region)) { break; } block_region = block_region->next_ptr; } +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << " scanned active regions (" + << "block_ptr=" << (void *)block << " " + << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << " " + << ")"; +#endif if (has_collected) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " collected unused regions (" << "block_ptr=" << (void *)block << " " - << "region_count=" << (uint32_t)count << " " - << "collected=" << (uint32_t)(reserved - block->reserved) << " " - << ")\n"; + << "collected_count=" << (uint32_t)collected_count << " " + << "remaining_count=" << (uint32_t)remaining_count << " " + << "reclaimed=" << (uint32_t)(reserved - block->reserved) << " " + << ")"; #endif } return has_collected; @@ -682,23 +734,27 @@ bool RegionAllocator::collect(void *user_context) { int RegionAllocator::destroy(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying all block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif - for (BlockRegion *block_region = block->regions; block_region != nullptr;) { - - if (is_last_block_region(user_context, block_region)) { - destroy_block_region(user_context, block_region); - block_region = nullptr; - } else { - BlockRegion *prev_region = block_region; - block_region = block_region->next_ptr; - destroy_block_region(user_context, prev_region); + if (block->regions != nullptr) { + for (BlockRegion *block_region = block->regions; block_region != nullptr;) { + + if (is_last_block_region(user_context, block_region)) { + destroy_block_region(user_context, block_region); + block_region = nullptr; + } else { + BlockRegion *prev_region = block_region; + block_region = block_region->next_ptr; + destroy_block_region(user_context, prev_region); + } } } block->reserved = 0; block->regions = nullptr; block->allocator = nullptr; - MemoryArena::destroy(user_context, arena); + if (arena != nullptr) { + MemoryArena::destroy(user_context, arena); + } arena = nullptr; return 0; } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 70a6bda64e5d..96535f3446ba 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -614,7 +614,8 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " - << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; @@ -630,6 +631,9 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block if (memory_requirements.alignment > block->properties.alignment) { block->properties.alignment = memory_requirements.alignment; } + if (memory_requirements.alignment > block->properties.nearest_multiple) { + block->properties.nearest_multiple = memory_requirements.alignment; + } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -867,6 +871,36 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; return halide_error_code_device_malloc_failed; } + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. + // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + VkMemoryRequirements memory_requirements = {0}; + vkGetBufferMemoryRequirements(instance->device, *buffer, &memory_requirements); + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + if (memory_requirements.size > region->size) { + vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); +#ifdef DEBUG_RUNTIME + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; +#endif + create_info.size = memory_requirements.size; + VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); + if (result != VK_SUCCESS) { + error(user_context) << "VulkanRegionAllocator: Failed to recreate buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_malloc_failed; + } + } + #ifdef DEBUG_RUNTIME debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n"; #endif diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b56c817e1f4e..b2190f63b592 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -21,7 +21,7 @@ int allocate_block(void *user_context, MemoryBlock *block) { << "block=" << (void *)(block) << " " << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -34,7 +34,7 @@ int deallocate_block(void *user_context, MemoryBlock *block) { << "block=" << (void *)(block) << " " << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -47,7 +47,7 @@ int allocate_region(void *user_context, MemoryRegion *region) { << "region=" << (void *)(region) << " " << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) 
<< " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -60,7 +60,7 @@ int deallocate_region(void *user_context, MemoryRegion *region) { << "region=" << (void *)(region) << " " << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -74,7 +74,173 @@ int main(int argc, char **argv) { MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; - // test class interface + // test region allocator class interface + { + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == request.size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->reclaim(user_context, r1); + HALIDE_CHECK(user_context, allocated_region_memory == (1 * request.size)); + + MemoryRegion *r3 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r3 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->retain(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->release(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->reclaim(user_context, r3); + instance->release(user_context, r1); + + // [r1 = available] [r2 = in use] [r3 = available] ... 
no contiguous regions + HALIDE_CHECK(user_context, false == instance->collect(user_context)); + + // release r2 to make three consecutive regions to collect + instance->release(user_context, r2); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test region allocator nearest_multiple padding + { + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == padded_size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + + instance->release(user_context, r1); + instance->release(user_context, r2); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two 
half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test block allocator class interface { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; @@ -116,7 +282,7 @@ int main(int argc, char **argv) { debug(user_context) << "Test : block_allocator::destroy (" << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " - << ") !\n"; + << ") ..."; HALIDE_CHECK(user_context, allocated_block_memory == 0); HALIDE_CHECK(user_context, allocated_region_memory == 0); @@ -125,7 +291,7 @@ int main(int argc, char **argv) { debug(user_context) << "Test : block_allocator::destroy (" << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " - << ") !\n"; + << ") ..."; HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } From e6e1b6f2dfa42120613b8fc0b9ea7768454fff9d Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Wed, 14 Feb 2024 17:58:55 -0800 Subject: [PATCH 065/186] Ensure string(REPLACE) is called with the right number of arguments (#8097) --- dependencies/wasm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies/wasm/CMakeLists.txt b/dependencies/wasm/CMakeLists.txt index 7c0a02b377f1..c5122a042dd5 100644 --- a/dependencies/wasm/CMakeLists.txt +++ b/dependencies/wasm/CMakeLists.txt @@ -164,7 +164,7 @@ function(find_node_js) execute_process(COMMAND "${NODE_JS_EXECUTABLE}" --version OUTPUT_VARIABLE NODE_JS_VERSION_RAW OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REPLACE "v" "" NODE_JS_VERSION ${NODE_JS_VERSION_RAW}) + string(REPLACE "v" "" NODE_JS_VERSION "${NODE_JS_VERSION_RAW}") if (NODE_JS_VERSION VERSION_LESS "16.13") message(FATAL_ERROR "Halide requires Node v16.13 or later, but found ${NODE_JS_VERSION_RAW} at ${NODE_JS_EXECUTABLE}. 
Please set NODE_JS_EXECUTABLE on the CMake command line.") From 2855ca31aa12c990d58ab1c4cab0dff2be4abea4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 15 Feb 2024 09:06:36 -0800 Subject: [PATCH 066/186] Strip asserts right at the end of lowering (#8094) The simplifier exploits asserts to make simplification. When compiling with NoAsserts, certain assertions aren't ever introduced, which means that the simplifier can't exploit certain things that we know to be true. Mostly this has a negative effect on code size. E.g. tail cases get generated even though they are actually dead code. This PR keeps all the assertions right until the end of lowering, when it strips them in a dedicated pass. This reduces object file size for a large production blob of Halide code by ~10%, without measurably affecting runtime. --- Makefile | 2 + src/AddImageChecks.cpp | 39 ++++-------- src/CMakeLists.txt | 2 + src/Lower.cpp | 7 +++ src/ScheduleFunctions.cpp | 6 +- src/StripAsserts.cpp | 121 ++++++++++++++++++++++++++++++++++++++ src/StripAsserts.h | 18 ++++++ 7 files changed, 164 insertions(+), 31 deletions(-) create mode 100644 src/StripAsserts.cpp create mode 100644 src/StripAsserts.h diff --git a/Makefile b/Makefile index b73b1632a0eb..72c05619e3ea 100644 --- a/Makefile +++ b/Makefile @@ -603,6 +603,7 @@ SOURCE_FILES = \ StorageFlattening.cpp \ StorageFolding.cpp \ StrictifyFloat.cpp \ + StripAsserts.cpp \ Substitute.cpp \ Target.cpp \ Tracing.cpp \ @@ -785,6 +786,7 @@ HEADER_FILES = \ StorageFlattening.h \ StorageFolding.h \ StrictifyFloat.h \ + StripAsserts.h \ Substitute.h \ Target.h \ Tracing.h \ diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp index dfe9ae88c85f..77d8015f32b9 100644 --- a/src/AddImageChecks.cpp +++ b/src/AddImageChecks.cpp @@ -162,7 +162,6 @@ Stmt add_image_checks_inner(Stmt s, const FuncValueBounds &fb, bool will_inject_host_copies) { - bool no_asserts = t.has_feature(Target::NoAsserts); bool no_bounds_query = t.has_feature(Target::NoBoundsQuery); // First hunt for all the referenced buffers @@ -618,12 +617,9 @@ Stmt add_image_checks_inner(Stmt s, replace_with_constrained[name] = constrained_var; } - Expr error = 0; - if (!no_asserts) { - error = Call::make(Int(32), "halide_error_constraint_violated", - {name, var, constrained_var_str, constrained_var}, - Call::Extern); - } + Expr error = Call::make(Int(32), "halide_error_constraint_violated", + {name, var, constrained_var_str, constrained_var}, + Call::Extern); // Check the var passed in equals the constrained version (when not in inference mode) asserts_constrained.push_back(AssertStmt::make(var == constrained_var, error)); @@ -679,14 +675,12 @@ Stmt add_image_checks_inner(Stmt s, } }; - if (!no_asserts) { - // Inject the code that checks the host pointers. - prepend_stmts(&asserts_host_non_null); - prepend_stmts(&asserts_host_alignment); - prepend_stmts(&asserts_device_not_dirty); - prepend_stmts(&dims_no_overflow_asserts); - prepend_lets(&lets_overflow); - } + // Inject the code that checks the host pointers. + prepend_stmts(&asserts_host_non_null); + prepend_stmts(&asserts_host_alignment); + prepend_stmts(&asserts_device_not_dirty); + prepend_stmts(&dims_no_overflow_asserts); + prepend_lets(&lets_overflow); // Replace uses of the var with the constrained versions in the // rest of the program. We also need to respect the existence of @@ -698,15 +692,10 @@ Stmt add_image_checks_inner(Stmt s, // all in reverse order compared to execution, as we incrementally // prepending code. 
- // Inject the code that checks the constraints are correct. We - // need these regardless of how NoAsserts is set, because they are - // what gets Halide to actually exploit the constraint. + // Inject the code that checks the constraints are correct. prepend_stmts(&asserts_constrained); - - if (!no_asserts) { - prepend_stmts(&asserts_required); - prepend_stmts(&asserts_type_checks); - } + prepend_stmts(&asserts_required); + prepend_stmts(&asserts_type_checks); // Inject the code that returns early for inference mode. if (!no_bounds_query) { @@ -714,9 +703,7 @@ Stmt add_image_checks_inner(Stmt s, prepend_stmts(&buffer_rewrites); } - if (!no_asserts) { - prepend_stmts(&asserts_proposed); - } + prepend_stmts(&asserts_proposed); // Inject the code that defines the proposed sizes. prepend_lets(&lets_proposed); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cca681661c35..557574f284c4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -156,6 +156,7 @@ set(HEADER_FILES StorageFlattening.h StorageFolding.h StrictifyFloat.h + StripAsserts.h Substitute.h Target.h Tracing.h @@ -340,6 +341,7 @@ set(SOURCE_FILES StorageFlattening.cpp StorageFolding.cpp StrictifyFloat.cpp + StripAsserts.cpp Substitute.cpp Target.cpp Tracing.cpp diff --git a/src/Lower.cpp b/src/Lower.cpp index ba0918831fc8..560e0353c7a4 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -68,6 +68,7 @@ #include "StorageFlattening.h" #include "StorageFolding.h" #include "StrictifyFloat.h" +#include "StripAsserts.h" #include "Substitute.h" #include "Tracing.h" #include "TrimNoOps.h" @@ -427,6 +428,12 @@ void lower_impl(const vector &output_funcs, s = hoist_prefetches(s); log("Lowering after hoisting prefetches:", s); + if (t.has_feature(Target::NoAsserts)) { + debug(1) << "Stripping asserts...\n"; + s = strip_asserts(s); + log("Lowering after stripping asserts:", s); + } + debug(1) << "Lowering after final simplification:\n" << s << "\n\n"; diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index c575cd47477d..aa45841253b7 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -1368,11 +1368,7 @@ class InjectFunctionRealization : public IRMutator { // This is also the point at which we inject explicit bounds // for this realization. - if (target.has_feature(Target::NoAsserts)) { - return s; - } else { - return inject_explicit_bounds(s, func); - } + return inject_explicit_bounds(s, func); } Stmt build_realize_function_from_group(Stmt s, int func_index) { diff --git a/src/StripAsserts.cpp b/src/StripAsserts.cpp new file mode 100644 index 000000000000..9d9c667f4db1 --- /dev/null +++ b/src/StripAsserts.cpp @@ -0,0 +1,121 @@ +#include "StripAsserts.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRVisitor.h" +#include + +namespace Halide { +namespace Internal { + +namespace { + +bool may_discard(const Expr &e) { + class MayDiscard : public IRVisitor { + using IRVisitor::visit; + + void visit(const Call *op) override { + // Extern calls that are side-effecty in the sense that you can't + // move them around in the IR, but we're free to discard because + // they're just getters. 
+ static const std::set discardable{ + Call::buffer_get_dimensions, + Call::buffer_get_min, + Call::buffer_get_extent, + Call::buffer_get_stride, + Call::buffer_get_max, + Call::buffer_get_host, + Call::buffer_get_device, + Call::buffer_get_device_interface, + Call::buffer_get_shape, + Call::buffer_get_host_dirty, + Call::buffer_get_device_dirty, + Call::buffer_get_type}; + + if (!(op->is_pure() || + discardable.count(op->name))) { + result = false; + } + } + + public: + bool result = true; + } d; + e.accept(&d); + + return d.result; +} + +class StripAsserts : public IRMutator { + using IRMutator::visit; + + // We're going to track which symbols are used so that we can strip lets we + // don't need after removing the asserts. + std::set used; + + // Drop all assert stmts. Assumes that you don't want any side-effects from + // the condition. + Stmt visit(const AssertStmt *op) override { + return Evaluate::make(0); + } + + Expr visit(const Variable *op) override { + used.insert(op->name); + return op; + } + + Expr visit(const Load *op) override { + used.insert(op->name); + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + used.insert(op->name); + return IRMutator::visit(op); + } + + // Also dead-code eliminate any let stmts wrapped around asserts + Stmt visit(const LetStmt *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + if (may_discard(op->value)) { + return body; + } else { + // We visit the value just to keep the used variable set + // accurate. + mutate(op->value); + return Evaluate::make(op->value); + } + } else if (body.same_as(op->body)) { + mutate(op->value); + return op; + } else if (may_discard(op->value) && !used.count(op->name)) { + return body; + } else { + mutate(op->value); + return LetStmt::make(op->name, op->value, body); + } + } + + Stmt visit(const Block *op) override { + Stmt first = mutate(op->first); + Stmt rest = mutate(op->rest); + if (first.same_as(op->first) && rest.same_as(op->rest)) { + return op; + } else if (is_no_op(rest)) { + return first; + } else if (is_no_op(first)) { + return rest; + } else { + return Block::make(first, rest); + } + } +}; + +} // namespace + +Stmt strip_asserts(const Stmt &s) { + return StripAsserts().mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/StripAsserts.h b/src/StripAsserts.h new file mode 100644 index 000000000000..48b22b3a5218 --- /dev/null +++ b/src/StripAsserts.h @@ -0,0 +1,18 @@ +#ifndef HALIDE_STRIP_ASSERTS_H +#define HALIDE_STRIP_ASSERTS_H + +/** \file + * Defines the lowering pass that strips asserts when NoAsserts is set. 
+ */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +Stmt strip_asserts(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif From d9668c5bcf7325cd669bf34f55d40f8c935453cb Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 15 Feb 2024 17:57:16 +0000 Subject: [PATCH 067/186] Fix clang-tidy error in runtime.printer.h (parameter shadows member) (#8074) --- src/runtime/.clang-tidy | 2 ++ src/runtime/printer.h | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime/.clang-tidy b/src/runtime/.clang-tidy index 9c335b626fdf..4032f4ea3d9c 100644 --- a/src/runtime/.clang-tidy +++ b/src/runtime/.clang-tidy @@ -73,6 +73,8 @@ Checks: > bugprone-use-after-move, bugprone-virtual-near-miss, + clang-diagnostic-shadow-field, + misc-confusable-identifiers, -misc-const-correctness, -misc-definitions-in-headers, diff --git a/src/runtime/printer.h b/src/runtime/printer.h index 6a379561dbe5..af07a21730fd 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -184,8 +184,8 @@ namespace { template class HeapPrinter : public PrinterBase { public: - NEVER_INLINE explicit HeapPrinter(void *user_context) - : PrinterBase(user_context, (char *)malloc(buffer_length), buffer_length) { + NEVER_INLINE explicit HeapPrinter(void *user_context_) + : PrinterBase(user_context_, (char *)malloc(buffer_length), buffer_length) { if (!start) { allocation_error(); } @@ -247,8 +247,8 @@ class StackPrinter : public PrinterBase { char scratch[buffer_length]; public: - explicit StackPrinter(void *user_context) - : PrinterBase(user_context, scratch, buffer_length) { + explicit StackPrinter(void *user_context_) + : PrinterBase(user_context_, scratch, buffer_length) { static_assert(buffer_length <= 256, "StackPrinter is meant only for small buffer sizes; you are probably making a mistake."); } }; From 4fc1e57ea34f267ec1ea085bbab228569429170a Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Fri, 16 Feb 2024 13:58:23 -0800 Subject: [PATCH 068/186] Fix an issue where the Halide compiler hits an internal error for bool types in widening intrinsics. (#8099) * Fix an issue where the Halide compiler hits an internal error when bool types are used with e.g. widening_mul. This situation did not arise from user code doing this directly, but rather through some chain o lowering with float16 types. The test cases added to correctness_intrinsics target the issue directly and do fail without the fix. I did not add broader coverage for bool types and intrinsics as it would require more thinking. Most of them overflow for the true/true case and thus are of questionable use, however widening operations cannot overflow... Certainly we could define the language to forbid this, but currently the frontend does not do so. As indicated above, the use case driving this was not using bool arithmetic to begin with. * Formatting. --- src/FindIntrinsics.cpp | 273 ++++++++++++++++++-------------- test/correctness/intrinsics.cpp | 5 + 2 files changed, 159 insertions(+), 119 deletions(-) diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index a77a7b1798f3..febd88d2399b 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -13,6 +13,8 @@ using namespace Halide::ConciseCasts; namespace { +// This routine provides a guard on the return type of intrisics such that only +// these types will ever be considered in the visiting that happens here. 
bool find_intrinsics_for_type(const Type &t) { // Currently, we only try to find and replace intrinsics for vector types that aren't bools. return t.is_vector() && t.bits() >= 8; @@ -28,17 +30,36 @@ Expr narrow(Expr a) { return Cast::make(result_type, std::move(a)); } +// Check a type to make sure it can be narrowed. find_intrinsics_for_type +// attempts to prevent this code from narrowing in cases that do not work, but +// it is incomplete for two reasons: +// +// - Arguments can be narrowed and that guard is only on return type, which +// are different for widening operations. +// +// - find_intrinsics_for_type does not cull out float16, and it probably +// should not as while it's ok to skip matching bool things, float16 things +// are useful. +bool can_narrow(const Type &t) { + return (t.is_float() && t.bits() >= 32) || + t.bits() >= 8; +} + Expr lossless_narrow(const Expr &x) { - return lossless_cast(x.type().narrow(), x); + return can_narrow(x.type()) ? lossless_cast(x.type().narrow(), x) : Expr(); } // Remove a widening cast even if it changes the sign of the result. Expr strip_widening_cast(const Expr &x) { - Expr narrow = lossless_narrow(x); - if (narrow.defined()) { - return narrow; + if (can_narrow(x.type())) { + Expr narrow = lossless_narrow(x); + if (narrow.defined()) { + return narrow; + } + return lossless_cast(x.type().narrow().with_code(halide_type_uint), x); + } else { + return Expr(); } - return lossless_cast(x.type().narrow().with_code(halide_type_uint), x); } Expr saturating_narrow(const Expr &a) { @@ -217,16 +238,18 @@ class FindIntrinsics : public IRMutator { // Try widening both from the same signedness as the result, and from uint. for (halide_type_code_t code : {op->type.code(), halide_type_uint}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); - if (narrow_a.defined() && narrow_b.defined()) { - Expr result = widening_add(narrow_a, narrow_b); - if (result.type() != op->type) { - result = Cast::make(op->type, result); + if (narrow_a.defined() && narrow_b.defined()) { + Expr result = widening_add(narrow_a, narrow_b); + if (result.type() != op->type) { + result = Cast::make(op->type, result); + } + return mutate(result); } - return mutate(result); } } @@ -235,41 +258,43 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - // Pulling casts out of VectorReduce nodes breaks too much codegen, skip for now. - Expr narrow_a = (a.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, a); - Expr narrow_b = (b.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, b); - - // This case should have been handled by the above check for widening_add. - internal_assert(!(narrow_a.defined() && narrow_b.defined())) - << "find_intrinsics failed to find a widening_add: " << a << " + " << b << "\n"; - - if (narrow_a.defined()) { - Expr result; - if (b.type().code() != narrow_a.type().code()) { - // Need to do a safe reinterpret. 
- Type t = b.type().with_code(code); - result = widen_right_add(cast(t, b), narrow_a); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_add(b, narrow_a); - } - internal_assert(result.type() == op->type); - return mutate(result); - } else if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_add(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_add(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + // Pulling casts out of VectorReduce nodes breaks too much codegen, skip for now. + Expr narrow_a = (a.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, a); + Expr narrow_b = (b.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, b); + + // This case should have been handled by the above check for widening_add. + internal_assert(!(narrow_a.defined() && narrow_b.defined())) + << "find_intrinsics failed to find a widening_add: " << a << " + " << b << "\n"; + + if (narrow_a.defined()) { + Expr result; + if (b.type().code() != narrow_a.type().code()) { + // Need to do a safe reinterpret. + Type t = b.type().with_code(code); + result = widen_right_add(cast(t, b), narrow_a); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_add(b, narrow_a); + } + internal_assert(result.type() == op->type); + return mutate(result); + } else if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_add(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_add(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -294,22 +319,24 @@ class FindIntrinsics : public IRMutator { // Try widening both from the same type as the result, and from uint. 
for (halide_type_code_t code : {op->type.code(), halide_type_uint}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); - if (narrow_a.defined() && narrow_b.defined()) { - Expr negative_narrow_b = lossless_negate(narrow_b); - Expr result; - if (negative_narrow_b.defined()) { - result = widening_add(narrow_a, negative_narrow_b); - } else { - result = widening_sub(narrow_a, narrow_b); - } - if (result.type() != op->type) { - result = Cast::make(op->type, result); + if (narrow_a.defined() && narrow_b.defined()) { + Expr negative_narrow_b = lossless_negate(narrow_b); + Expr result; + if (negative_narrow_b.defined()) { + result = widening_add(narrow_a, negative_narrow_b); + } else { + result = widening_sub(narrow_a, narrow_b); + } + if (result.type() != op->type) { + result = Cast::make(op->type, result); + } + return mutate(result); } - return mutate(result); } } @@ -324,22 +351,24 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_b = lossless_cast(narrow, b); - - if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_sub(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_sub(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_b = lossless_cast(narrow, b); + + if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_sub(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_sub(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -401,40 +430,42 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); - - // This case should have been handled by the above check for widening_mul. - internal_assert(!(narrow_a.defined() && narrow_b.defined())) - << "find_intrinsics failed to find a widening_mul: " << a << " + " << b << "\n"; - - if (narrow_a.defined()) { - Expr result; - if (b.type().code() != narrow_a.type().code()) { - // Need to do a safe reinterpret. 
- Type t = b.type().with_code(code); - result = widen_right_mul(cast(t, b), narrow_a); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_mul(b, narrow_a); - } - internal_assert(result.type() == op->type); - return mutate(result); - } else if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_mul(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_mul(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); + + // This case should have been handled by the above check for widening_mul. + internal_assert(!(narrow_a.defined() && narrow_b.defined())) + << "find_intrinsics failed to find a widening_mul: " << a << " + " << b << "\n"; + + if (narrow_a.defined()) { + Expr result; + if (b.type().code() != narrow_a.type().code()) { + // Need to do a safe reinterpret. + Type t = b.type().with_code(code); + result = widen_right_mul(cast(t, b), narrow_a); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_mul(b, narrow_a); + } + internal_assert(result.type() == op->type); + return mutate(result); + } else if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_mul(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_mul(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -853,21 +884,25 @@ class FindIntrinsics : public IRMutator { } else if (op->is_intrinsic(Call::widening_add) && (op->type.bits() >= 16)) { internal_assert(op->args.size() == 2); for (halide_type_code_t t : {op->type.code(), halide_type_uint}) { - Type narrow_t = op->type.narrow().narrow().with_code(t); - Expr narrow_a = lossless_cast(narrow_t, op->args[0]); - Expr narrow_b = lossless_cast(narrow_t, op->args[1]); - if (narrow_a.defined() && narrow_b.defined()) { - return mutate(Cast::make(op->type, widening_add(narrow_a, narrow_b))); + if (can_narrow(op->type)) { + Type narrow_t = op->type.narrow().narrow().with_code(t); + Expr narrow_a = lossless_cast(narrow_t, op->args[0]); + Expr narrow_b = lossless_cast(narrow_t, op->args[1]); + if (narrow_a.defined() && narrow_b.defined()) { + return mutate(Cast::make(op->type, widening_add(narrow_a, narrow_b))); + } } } } else if (op->is_intrinsic(Call::widening_sub) && (op->type.bits() >= 16)) { internal_assert(op->args.size() == 2); for (halide_type_code_t t : {op->type.code(), halide_type_uint}) { - Type narrow_t = op->type.narrow().narrow().with_code(t); - Expr narrow_a = lossless_cast(narrow_t, op->args[0]); - Expr narrow_b = lossless_cast(narrow_t, op->args[1]); - if (narrow_a.defined() && narrow_b.defined()) { - return mutate(Cast::make(op->type, widening_sub(narrow_a, narrow_b))); + if (can_narrow(op->type)) { + Type narrow_t = op->type.narrow().narrow().with_code(t); + Expr narrow_a = lossless_cast(narrow_t, op->args[0]); + Expr narrow_b = lossless_cast(narrow_t, op->args[1]); + if 
(narrow_a.defined() && narrow_b.defined()) { + return mutate(Cast::make(op->type, widening_sub(narrow_a, narrow_b))); + } } } } diff --git a/test/correctness/intrinsics.cpp b/test/correctness/intrinsics.cpp index 19f9c610b099..339a5c2525e5 100644 --- a/test/correctness/intrinsics.cpp +++ b/test/correctness/intrinsics.cpp @@ -121,6 +121,8 @@ Expr make_leaf(Type t, const char *name) { } int main(int argc, char **argv) { + Expr i1x = make_leaf(Int(1, 4), "i1x"); + Expr i1y = make_leaf(Int(1, 4), "i1y"); Expr i8x = make_leaf(Int(8, 4), "i8x"); Expr i8y = make_leaf(Int(8, 4), "i8y"); Expr i8z = make_leaf(Int(8, 4), "i8w"); @@ -150,15 +152,18 @@ int main(int argc, char **argv) { // check(u32(u8x) * 256, u32(widening_shift_left(u8x, u8(8)))); // Check widening arithmetic + check(i8(i1x) + i1y, widening_add(i1x, i1y)); check(i16(i8x) + i8y, widening_add(i8x, i8y)); check(u16(u8x) + u8y, widening_add(u8x, u8y)); check(i16(u8x) + u8y, i16(widening_add(u8x, u8y))); check(f32(f16x) + f32(f16y), widening_add(f16x, f16y)); + check(i8(i1x) - i1y, widening_sub(i1x, i1y)); check(i16(i8x) - i8y, widening_sub(i8x, i8y)); check(i16(u8x) - u8y, widening_sub(u8x, u8y)); check(f32(f16x) - f32(f16y), widening_sub(f16x, f16y)); + check(i8(i1x) * i1y, widening_mul(i1x, i1y)); check(i16(i8x) * i8y, widening_mul(i8x, i8y)); check(u16(u8x) * u8y, widening_mul(u8x, u8y)); check(i32(i8x) * i8y, i32(widening_mul(i8x, i8y))); From c4d56c6202476274280b8251810cd926913b499c Mon Sep 17 00:00:00 2001 From: Tarushii Goel Date: Mon, 19 Feb 2024 17:46:15 -0500 Subject: [PATCH 069/186] Small Tutorial Fix (#8111) * Update lesson_17_predicated_rdom.cpp * Update lesson_17_predicated_rdom.cpp --- tutorial/lesson_17_predicated_rdom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/lesson_17_predicated_rdom.cpp b/tutorial/lesson_17_predicated_rdom.cpp index 77c43bdb55c2..b36fc49a773a 100644 --- a/tutorial/lesson_17_predicated_rdom.cpp +++ b/tutorial/lesson_17_predicated_rdom.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); circle(x, y) = x + y; - // Say we want an update that squares the values inside a + // Say we want an update that multiplies by two the values inside a // circular region centered at (3, 3) with radius of 3. To do // this, we first define the minimal bounding box over the // circular region using an RDom. From 46132176ff262a337c7cd4acb2839c18d49b6911 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 09:13:15 -0800 Subject: [PATCH 070/186] Optionally print the time taken by each lowering pass (#8116) * Optionally print the time taken by each lowering pass I've been copy-pasting this from branch to branch, but I should just check it in. This is useful for performance optimization of the compiler itself. 
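
For reference, here is a minimal way to exercise it from a JIT driver
(illustrative only, not part of this patch; the pipeline is a placeholder and
setenv is the POSIX call):

```cpp
#include "Halide.h"
#include <cstdlib>

int main() {
    // Any non-empty value enables the report; the logger only checks that
    // the variable is set.
    setenv("HL_TIME_LOWERING_PASSES", "1", /*overwrite=*/1);

    Halide::Func f("f");
    Halide::Var x("x");
    f(x) = x * 2;

    // Lowering runs here; the sorted per-pass timings are printed when the
    // logger is destroyed at the end of lowering.
    f.compile_jit();
    return 0;
}
```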
--- src/Lower.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/Lower.cpp b/src/Lower.cpp index 560e0353c7a4..6b56f23fcff9 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -91,15 +91,39 @@ namespace { class LoweringLogger { Stmt last_written; + std::chrono::time_point last_time; + std::vector> timings; + bool time_lowering_passes = false; public: + LoweringLogger() { + last_time = std::chrono::high_resolution_clock::now(); + static bool should_time = !get_env_variable("HL_TIME_LOWERING_PASSES").empty(); + time_lowering_passes = should_time; + } + void operator()(const string &message, const Stmt &s) { + auto t = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = t - last_time; if (!s.same_as(last_written)) { debug(2) << message << "\n" << s << "\n"; last_written = s; + last_time = t; } else { debug(2) << message << " (unchanged)\n\n"; + last_time = t; + } + timings.emplace_back(diff.count() * 1000, message); + } + + ~LoweringLogger() { + if (time_lowering_passes) { + debug(0) << "Lowering pass runtimes:\n"; + std::sort(timings.begin(), timings.end()); + for (const auto &p : timings) { + debug(0) << " " << p.first << " ms : " << p.second << "\n"; + } } } }; From ef31bf95f056ee7ee3c6eb76f6ac3690ad8f4f5f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 09:13:43 -0800 Subject: [PATCH 071/186] Do less redundant work in UnpackBuffers (#8104) We were redundantly creating a handle Variable every time we encountered something like foo.stride.0, instead of just the first time we encounter a Variable that refers to an input Parameter/Buffer. Speeds up this already-fast lowering pass by 10% or so. No measurable impact on total lowering time. --- src/UnpackBuffers.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/UnpackBuffers.cpp b/src/UnpackBuffers.cpp index 2f5b5a65bef6..3040f0b273ba 100644 --- a/src/UnpackBuffers.cpp +++ b/src/UnpackBuffers.cpp @@ -26,37 +26,46 @@ class FindBufferSymbols : public IRVisitor { void visit_param(const string &ref_name, const Parameter ¶m) { if (param.defined() && param.is_buffer()) { const string &name = param.name(); - buffers[name] = - BufferInfo{Variable::make(type_of(), name + ".buffer", param), - param.dimensions()}; + auto r = buffers.try_emplace(name); + if (r.second) { + // It's the first time we've seen this Parameter + r.first->second.handle = Variable::make(type_of(), name + ".buffer", param); + r.first->second.dimensions = param.dimensions(); + } } } void visit_buffer(const string &ref_name, const Buffer<> &buffer) { if (buffer.defined()) { const string &name = buffer.name(); - buffers[name] = - BufferInfo{Variable::make(type_of(), name + ".buffer", buffer), - buffer.dimensions()}; + auto r = buffers.try_emplace(name); + if (r.second) { + // It's the first time we've seen this Buffer + r.first->second.handle = Variable::make(type_of(), name + ".buffer", buffer); + r.first->second.dimensions = buffer.dimensions(); + } } } void visit(const Variable *op) override { - visit_param(op->name, op->param); - visit_buffer(op->name, op->image); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + visit_buffer(op->name, op->image); + } } void visit(const Load *op) override { - visit_param(op->name, op->param); - visit_buffer(op->name, op->image); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + visit_buffer(op->name, 
op->image); + } IRVisitor::visit(op); } void visit(const Store *op) override { - visit_param(op->name, op->param); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + } IRVisitor::visit(op); } From 57164dfe3d98e0e27bb44bb0efc525d8c5411e00 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 10:52:54 -0800 Subject: [PATCH 072/186] Avoid redundant scope lookups (#8103) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. Scopes are different enough from maps that the interface really needs to be distinct. 
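
Roughly, the token-based flow looks like this (illustrative sketch, not code
from this patch; it assumes the usual Halide::Internal namespace, and the
caller supplies the scope and value):

```cpp
void example(Scope<Expr> &scope, const Expr &value) {
    // push returns an opaque PushToken (an iterator into the underlying map).
    auto token = scope.push("x", value);

    // ... recursively visit the body with "x" bound ...

    // Mutable access to the innermost binding, ignoring containing scopes.
    if (Expr *e = scope.shallow_find("x")) {
        *e = simplify(*e);
    }

    // Pop using the token; no fresh lookup of "x" is needed.
    scope.pop(token);
}
```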
--- src/Bounds.cpp | 65 +++++++++++----------- src/CSE.cpp | 4 +- src/ClampUnsafeAccesses.cpp | 6 ++- src/CodeGen_ARM.cpp | 5 +- src/CodeGen_C.cpp | 5 +- src/CodeGen_D3D12Compute_Dev.cpp | 5 +- src/CodeGen_Hexagon.cpp | 11 ++-- src/CodeGen_LLVM.cpp | 5 +- src/CodeGen_Metal_Dev.cpp | 9 ++-- src/CodeGen_OpenCL_Dev.cpp | 8 +-- src/CodeGen_Posix.cpp | 4 +- src/CodeGen_Vulkan_Dev.cpp | 28 +++++----- src/CodeGen_WebGPU_Dev.cpp | 8 +-- src/CodeGen_X86.cpp | 38 +++++++------ src/EliminateBoolVectors.cpp | 4 +- src/ExprUsesVar.h | 4 +- src/FindIntrinsics.cpp | 4 +- src/FuseGPUThreadLoops.cpp | 16 +++--- src/HexagonOptimize.cpp | 32 +++++------ src/LICM.cpp | 4 +- src/LoopCarry.cpp | 13 +++-- src/LowerWarpShuffles.cpp | 17 +++--- src/ModulusRemainder.cpp | 4 +- src/Monotonic.cpp | 4 +- src/Prefetch.cpp | 7 ++- src/PrintLoopNest.cpp | 15 +++--- src/Scope.h | 93 +++++++++++++++++++++++--------- src/Simplify.cpp | 43 +++++++-------- src/Simplify_Exprs.cpp | 32 +++++------ src/Simplify_Stmts.cpp | 20 +++---- src/SlidingWindow.cpp | 7 ++- src/Solve.cpp | 18 +++---- src/StageStridedLoads.cpp | 8 +-- src/StmtToHTML.cpp | 4 +- src/StorageFlattening.cpp | 5 +- src/UniquifyVariableNames.cpp | 7 ++- src/VectorizeLoops.cpp | 4 +- 37 files changed, 305 insertions(+), 261 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index a08bb0b9ad61..16fd69f3e8fb 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -406,13 +406,12 @@ class Bounds : public IRVisitor { if (const_bound) { bounds_of_type(op->type); - if (scope.contains(op->name)) { - const Interval &scope_interval = scope.get(op->name); - if (scope_interval.has_upper_bound() && is_const(scope_interval.max)) { - interval.max = Interval::make_min(interval.max, scope_interval.max); + if (const Interval *scope_interval = scope.find(op->name)) { + if (scope_interval->has_upper_bound() && is_const(scope_interval->max)) { + interval.max = Interval::make_min(interval.max, scope_interval->max); } - if (scope_interval.has_lower_bound() && is_const(scope_interval.min)) { - interval.min = Interval::make_max(interval.min, scope_interval.min); + if (scope_interval->has_lower_bound() && is_const(scope_interval->min)) { + interval.min = Interval::make_max(interval.min, scope_interval->min); } } @@ -429,8 +428,8 @@ class Bounds : public IRVisitor { } } } else { - if (scope.contains(op->name)) { - interval = scope.get(op->name); + if (const Interval *in = scope.find(op->name)) { + interval = *in; } else if (op->type.is_vector()) { // Uh oh, we need to take the min/max lane of some unknown vector. Treat as unbounded. bounds_of_type(op->type); @@ -2054,11 +2053,10 @@ class FindInnermostVar : public IRVisitor { int innermost_depth = -1; void visit(const Variable *op) override { - if (vars_depth.contains(op->name)) { - int depth = vars_depth.get(op->name); - if (depth > innermost_depth) { + if (const int *depth = vars_depth.find(op->name)) { + if (*depth > innermost_depth) { innermost_var = op->name; - innermost_depth = depth; + innermost_depth = *depth; } } } @@ -2545,16 +2543,17 @@ class BoxesTouched : public IRGraphVisitor { // If this let stmt is a redefinition of a previous one, we should // remove the old let stmt from the 'children' map since it is // no longer valid at this point. 
- if ((f.vi.instance > 0) && let_stmts.contains(op->name)) { - const Expr &val = let_stmts.get(op->name); - CollectVars collect(op->name); - val.accept(&collect); - f.old_let_vars = collect.vars; - - VarInstance old_vi = VarInstance(f.vi.var, f.vi.instance - 1); - for (const auto &v : f.old_let_vars) { - internal_assert(vars_renaming.count(v)); - children[get_var_instance(v)].erase(old_vi); + if (f.vi.instance > 0) { + if (const Expr *val = let_stmts.find(op->name)) { + CollectVars collect(op->name); + val->accept(&collect); + f.old_let_vars = collect.vars; + + VarInstance old_vi = VarInstance(f.vi.var, f.vi.instance - 1); + for (const auto &v : f.old_let_vars) { + internal_assert(vars_renaming.count(v)); + children[get_var_instance(v)].erase(old_vi); + } } } let_stmts.push(op->name, op->value); @@ -2756,17 +2755,17 @@ class BoxesTouched : public IRGraphVisitor { expr_uses_var(box[i].min, l.min_name))) || (box[i].has_upper_bound() && (expr_uses_var(box[i].max, l.max_name) || expr_uses_var(box[i].max, l.min_name)))) { - internal_assert(let_stmts.contains(l.var)); - const Expr &val = let_stmts.get(l.var); - v_bound = bounds_of_expr_in_scope(val, scope, func_bounds); + const Expr *val = let_stmts.find(l.var); + internal_assert(val); + v_bound = bounds_of_expr_in_scope(*val, scope, func_bounds); bool fixed = v_bound.min.same_as(v_bound.max); v_bound.min = simplify(v_bound.min); v_bound.max = fixed ? v_bound.min : simplify(v_bound.max); - internal_assert(scope.contains(l.var)); - const Interval &old_bound = scope.get(l.var); - v_bound.max = simplify(min(v_bound.max, old_bound.max)); - v_bound.min = simplify(max(v_bound.min, old_bound.min)); + const Interval *old_bound = scope.find(l.var); + internal_assert(old_bound); + v_bound.max = simplify(min(v_bound.max, old_bound->max)); + v_bound.min = simplify(max(v_bound.min, old_bound->min)); } if (box[i].has_lower_bound()) { @@ -3017,14 +3016,14 @@ class BoxesTouched : public IRGraphVisitor { } Expr min_val, max_val; - if (scope.contains(op->name + ".loop_min")) { - min_val = scope.get(op->name + ".loop_min").min; + if (const Interval *in = scope.find(op->name + ".loop_min")) { + min_val = in->min; } else { min_val = bounds_of_expr_in_scope(op->min, scope, func_bounds).min; } - if (scope.contains(op->name + ".loop_max")) { - max_val = scope.get(op->name + ".loop_max").max; + if (const Interval *in = scope.find(op->name + ".loop_max")) { + max_val = in->max; } else { max_val = bounds_of_expr_in_scope(op->extent, scope, func_bounds).max; max_val += bounds_of_expr_in_scope(op->min, scope, func_bounds).max; diff --git a/src/CSE.cpp b/src/CSE.cpp index 7d39fcc90dc5..d8ecd619db81 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -201,8 +201,8 @@ class RemoveLets : public IRGraphMutator { Scope scope; Expr visit(const Variable *op) override { - if (scope.contains(op->name)) { - return scope.get(op->name); + if (const Expr *e = scope.find(op->name)) { + return *e; } else { return op; } diff --git a/src/ClampUnsafeAccesses.cpp b/src/ClampUnsafeAccesses.cpp index 5e2e1f5d5b2e..b3dd9ddc235e 100644 --- a/src/ClampUnsafeAccesses.cpp +++ b/src/ClampUnsafeAccesses.cpp @@ -50,8 +50,10 @@ struct ClampUnsafeAccesses : IRMutator { } Expr visit(const Variable *var) override { - if (is_inside_indexing && let_var_inside_indexing.contains(var->name)) { - let_var_inside_indexing.ref(var->name) = true; + if (is_inside_indexing) { + if (bool *b = let_var_inside_indexing.shallow_find(var->name)) { + *b = true; + } } return var; } diff --git a/src/CodeGen_ARM.cpp 
b/src/CodeGen_ARM.cpp index 9c6525703f16..7852532183bf 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -82,13 +82,14 @@ class SubstituteInStridedLoads : public IRMutator { Expr visit(const Shuffle *op) override { int stride = op->slice_stride(); const Variable *var = op->vectors[0].as(); + const Expr *vec = nullptr; if (var && poisoned_vars.count(var->name) == 0 && op->vectors.size() == 1 && 2 <= stride && stride <= 4 && op->slice_begin() < stride && - loads.contains(var->name)) { - return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes()); + (vec = loads.find(var->name))) { + return Shuffle::make_slice({*vec}, op->slice_begin(), op->slice_stride(), op->type.lanes()); } else { return IRMutator::visit(op); } diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 89c18cb8ab28..b0cdcb3e956c 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1936,8 +1936,9 @@ void CodeGen_C::visit(const Load *op) { user_assert(is_const_one(op->predicate)) << "Predicated scalar load is not supported by C backend.\n"; string id_index = print_expr(op->index); - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type.element_of() == t.element_of()); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type.element_of() == t.element_of()); if (type_cast_needed) { const char *const_flag = output_kind == CPlusPlusImplementation ? " const" : ""; rhs << "((" << print_type(t.element_of()) << const_flag << " *)" << name << ")"; diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index c8e45ea2ae09..4fd614cc0dfc 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -592,8 +592,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Load *op) { string id_index = print_expr(op->index); // Get the rhs just for the cache. - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == op->type); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type == op->type); ostringstream rhs; if (type_cast_needed) { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 9463a4c921aa..a77e9c7c1a76 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -221,8 +221,8 @@ class SloppyUnpredicateLoadsAndStores : public IRMutator { } } } else if (const Variable *op = e.as()) { - if (monotonic_vectors.contains(op->name)) { - return monotonic_vectors.get(op->name); + if (const auto *p = monotonic_vectors.find(op->name)) { + return *p; } } else if (const Let *op = e.as()) { auto v = get_extreme_lanes(op->value); @@ -2245,10 +2245,9 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { codegen(alloc->body); // If there was no early free, free it now. 
- if (allocations.contains(alloc->name)) { - Allocation alloc_obj = allocations.get(alloc->name); - internal_assert(alloc_obj.destructor); - trigger_destructor(alloc_obj.destructor_function, alloc_obj.destructor); + if (const Allocation *alloc_obj = allocations.find(alloc->name)) { + internal_assert(alloc_obj->destructor); + trigger_destructor(alloc_obj->destructor_function, alloc_obj->destructor); allocations.pop(alloc->name); sym_pop(alloc->name); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a5c32cf83cc7..8922461524c5 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1268,7 +1268,8 @@ void CodeGen_LLVM::sym_pop(const string &name) { llvm::Value *CodeGen_LLVM::sym_get(const string &name, bool must_succeed) const { // look in the symbol table - if (!symbol_table.contains(name)) { + llvm::Value *const *v = symbol_table.find(name); + if (!v) { if (must_succeed) { std::ostringstream err; err << "Symbol not found: " << name << "\n"; @@ -1283,7 +1284,7 @@ llvm::Value *CodeGen_LLVM::sym_get(const string &name, bool must_succeed) const return nullptr; } } - return symbol_table.get(name); + return *v; } bool CodeGen_LLVM::sym_exists(const string &name) const { diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 69d47279e9ae..79060294798e 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -390,8 +390,9 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Load *op) { string id_index = print_expr(op->index); // Get the rhs just for the cache. - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == op->type); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type == op->type); ostringstream rhs; if (type_cast_needed) { rhs << "((" << get_memory_space(op->name) << " " @@ -467,8 +468,8 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Store *op) { << id_value << "[" << i << "];\n"; } } else { - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == t); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && alloc->type == t); string id_index = print_expr(op->index); stream << get_indent(); diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 52feed53f9e0..c86e483cc5a8 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -484,8 +484,8 @@ string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_array_access(const string &na const Type &type, const string &id_index) { ostringstream rhs; - bool type_cast_needed = !(allocations.contains(name) && - allocations.get(name).type == type); + const auto *alloc = allocations.find(name); + bool type_cast_needed = !(alloc && alloc->type == type); if (type_cast_needed) { rhs << "((" << get_memory_space(name) << " " @@ -583,8 +583,8 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Store *op) { // For atomicAdd, we check if op->value - store[index] is independent of store. // The atomicAdd operations in OpenCL only supports integers so we also check that. 
bool is_atomic_add = t.is_int_or_uint() && !expr_uses_var(delta, op->name); - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == t); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && alloc->type == t); auto print_store_var = [&]() { if (type_cast_needed) { stream << "((" diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index af508194b06e..f812b63cce9d 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -342,8 +342,8 @@ void CodeGen_Posix::free_allocation(const std::string &name) { } string CodeGen_Posix::get_allocation_name(const std::string &n) { - if (allocations.contains(n)) { - return allocations.get(n).name; + if (const auto *alloc = allocations.find(n)) { + return alloc->name; } else { return n; } diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 61b365f2f7aa..39dd65b67671 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1539,10 +1539,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by SPIR-V codegen\n"; // Construct the pointer to read from - internal_assert(symbol_table.contains(op->name)); - SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId variable_id = id_and_storage_class.first; - SpvStorageClass storage_class = id_and_storage_class.second; + const SymbolIdStorageClassPair *id_and_storage_class = symbol_table.find(op->name); + internal_assert(id_and_storage_class); + SpvId variable_id = id_and_storage_class->first; + SpvStorageClass storage_class = id_and_storage_class->second; internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); @@ -1576,10 +1576,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { op->value.accept(this); SpvId value_id = builder.current_id(); - internal_assert(symbol_table.contains(op->name)); - SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId variable_id = id_and_storage_class.first; - SpvStorageClass storage_class = id_and_storage_class.second; + const SymbolIdStorageClassPair *id_and_storage_class = symbol_table.find(op->name); + internal_assert(id_and_storage_class); + SpvId variable_id = id_and_storage_class->first; + SpvStorageClass storage_class = id_and_storage_class->second; internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); @@ -1665,9 +1665,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + intrinsic.first; // Intrinsics are inserted when adding the kernel - internal_assert(symbol_table.contains(intrinsic_var_name)); - SpvId intrinsic_id = symbol_table.get(intrinsic_var_name).first; - SpvStorageClass storage_class = symbol_table.get(intrinsic_var_name).second; + const auto *intrin = symbol_table.find(intrinsic_var_name); + internal_assert(intrin); + SpvId intrinsic_id = intrin->first; + SpvStorageClass storage_class = intrin->second; // extract and cast to the extent type (which is what's expected by Halide's for loops) Type unsigned_type = UInt(32); @@ -1908,8 +1909,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *op) { debug(3) << "Vulkan: Popping allocation called " 
<< op->name << " off the symbol table\n"; - internal_assert(symbol_table.contains(op->name)); - SpvId variable_id = symbol_table.get(op->name).first; + const auto *id = symbol_table.find(op->name); + internal_assert(id); + SpvId variable_id = id->first; storage_access_map.erase(variable_id); symbol_table.pop(op->name); } diff --git a/src/CodeGen_WebGPU_Dev.cpp b/src/CodeGen_WebGPU_Dev.cpp index 08d3a542f41b..de55113ff695 100644 --- a/src/CodeGen_WebGPU_Dev.cpp +++ b/src/CodeGen_WebGPU_Dev.cpp @@ -684,8 +684,8 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const Load *op) { // Get the allocation type, which may be different from the result type. Type alloc_type = result_type; - if (allocations.contains(op->name)) { - alloc_type = allocations.get(op->name).type; + if (const auto *alloc = allocations.find(op->name)) { + alloc_type = alloc->type; } else if (workgroup_allocations.count(op->name)) { alloc_type = workgroup_allocations.at(op->name)->type; } @@ -826,8 +826,8 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const Store *op) { // Get the allocation type, which may be different from the value type. Type alloc_type = value_type; - if (allocations.contains(op->name)) { - alloc_type = allocations.get(op->name).type; + if (const auto *alloc = allocations.find(op->name)) { + alloc_type = alloc->type; } else if (workgroup_allocations.count(op->name)) { alloc_type = workgroup_allocations.at(op->name)->type; } diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8d87f4c1937e..0320e64b5ae5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -866,28 +866,32 @@ void CodeGen_X86::visit(const Allocate *op) { } void CodeGen_X86::visit(const Load *op) { - if (mem_type.contains(op->name) && mem_type.get(op->name) == MemoryType::AMXTile) { - const Ramp *ramp = op->index.as(); - internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; - Value *ptr = codegen_buffer_pointer(op->name, op->type, ramp->base); - LoadInst *load = builder->CreateAlignedLoad(llvm_type_of(upgrade_type_for_storage(op->type)), ptr, llvm::Align(op->type.bytes())); - add_tbaa_metadata(load, op->name, op->index); - value = load; - return; + if (const auto *mt = mem_type.find(op->name)) { + if (*mt == MemoryType::AMXTile) { + const Ramp *ramp = op->index.as(); + internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; + Value *ptr = codegen_buffer_pointer(op->name, op->type, ramp->base); + LoadInst *load = builder->CreateAlignedLoad(llvm_type_of(upgrade_type_for_storage(op->type)), ptr, llvm::Align(op->type.bytes())); + add_tbaa_metadata(load, op->name, op->index); + value = load; + return; + } } CodeGen_Posix::visit(op); } void CodeGen_X86::visit(const Store *op) { - if (mem_type.contains(op->name) && mem_type.get(op->name) == MemoryType::AMXTile) { - Value *val = codegen(op->value); - Halide::Type value_type = op->value.type(); - const Ramp *ramp = op->index.as(); - internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; - Value *ptr = codegen_buffer_pointer(op->name, value_type, ramp->base); - StoreInst *store = builder->CreateAlignedStore(val, ptr, llvm::Align(value_type.bytes())); - add_tbaa_metadata(store, op->name, op->index); - return; + if (const auto *mt = mem_type.find(op->name)) { + if (*mt == MemoryType::AMXTile) { + Value *val = codegen(op->value); + Halide::Type value_type = op->value.type(); + const Ramp *ramp = op->index.as(); + internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; + Value *ptr = codegen_buffer_pointer(op->name, value_type, ramp->base); + 
StoreInst *store = builder->CreateAlignedStore(val, ptr, llvm::Align(value_type.bytes())); + add_tbaa_metadata(store, op->name, op->index); + return; + } } CodeGen_Posix::visit(op); } diff --git a/src/EliminateBoolVectors.cpp b/src/EliminateBoolVectors.cpp index cebfe0f0019b..62cdbdbef5b5 100644 --- a/src/EliminateBoolVectors.cpp +++ b/src/EliminateBoolVectors.cpp @@ -15,8 +15,8 @@ class EliminateBoolVectors : public IRMutator { Scope lets; Expr visit(const Variable *op) override { - if (lets.contains(op->name)) { - return Variable::make(lets.get(op->name), op->name); + if (const Type *t = lets.find(op->name)) { + return Variable::make(*t, op->name); } else { return op; } diff --git a/src/ExprUsesVar.h b/src/ExprUsesVar.h index 3bf129d259f7..84c3f7ae23d4 100644 --- a/src/ExprUsesVar.h +++ b/src/ExprUsesVar.h @@ -36,8 +36,8 @@ class ExprUsesVars : public IRGraphVisitor { void visit_name(const std::string &name) { if (vars.contains(name)) { result = true; - } else if (scope.contains(name)) { - include(scope.get(name)); + } else if (const Expr *e = scope.find(name)) { + IRGraphVisitor::include(*e); } } diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index febd88d2399b..d453d0134c29 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -1118,8 +1118,8 @@ class SubstituteInWideningLets : public IRMutator { Scope replacements; Expr visit(const Variable *op) override { - if (replacements.contains(op->name)) { - return replacements.get(op->name); + if (const Expr *e = replacements.find(op->name)) { + return *e; } else { return op; } diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index ef5a75344bb8..abde50d62e1f 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1140,21 +1140,21 @@ class ExtractRegisterAllocations : public IRMutator { } Expr visit(const Load *op) override { - string new_name = op->name; - if (alloc_renaming.contains(op->name)) { - new_name = alloc_renaming.get(op->name); + const string *new_name = alloc_renaming.find(op->name); + if (!new_name) { + new_name = &(op->name); } - return Load::make(op->type, new_name, mutate(op->index), + return Load::make(op->type, *new_name, mutate(op->index), op->image, op->param, mutate(op->predicate), op->alignment); } Stmt visit(const Store *op) override { - string new_name = op->name; - if (alloc_renaming.contains(op->name)) { - new_name = alloc_renaming.get(op->name); + const string *new_name = alloc_renaming.find(op->name); + if (!new_name) { + new_name = &(op->name); } - return Store::make(new_name, mutate(op->value), mutate(op->index), + return Store::make(*new_name, mutate(op->value), mutate(op->index), op->param, mutate(op->predicate), op->alignment); } diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index b76a9eb1cfef..deabd95d1d1b 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1357,8 +1357,8 @@ class EliminateInterleaves : public IRMutator { } if (const Load *load = x.as()) { - if (buffers.contains(load->name)) { - return buffers.get(load->name) != BufferState::NotInterleaved; + if (const auto *state = buffers.find(load->name)) { + return *state != BufferState::NotInterleaved; } } @@ -1398,8 +1398,8 @@ class EliminateInterleaves : public IRMutator { } if (const Load *load = x.as()) { - if (buffers.contains(load->name)) { - return buffers.get(load->name) != BufferState::NotInterleaved; + if (const auto *state = buffers.find(load->name)) { + return *state != BufferState::NotInterleaved; } } @@ -1816,34 +1816,33 @@ class 
EliminateInterleaves : public IRMutator { Expr value = mutate(op->value); Expr index = mutate(op->index); - if (buffers.contains(op->name)) { + if (BufferState *state = buffers.shallow_find(op->name)) { // When inspecting the stores to a buffer, update the state. - BufferState &state = buffers.ref(op->name); if (!is_const_one(predicate) || !op->value.type().is_vector()) { // TODO(psuriana): This store is predicated. Mark the buffer as // not interleaved for now. - state = BufferState::NotInterleaved; + *state = BufferState::NotInterleaved; } else if (yields_removable_interleave(value)) { // The value yields a removable interleave. If we aren't tracking // this buffer, mark it as interleaved. - if (state == BufferState::Unknown) { - state = BufferState::Interleaved; + if (*state == BufferState::Unknown) { + *state = BufferState::Interleaved; } } else if (!yields_interleave(value)) { // The value does not yield an interleave. Mark the // buffer as not interleaved. - state = BufferState::NotInterleaved; + *state = BufferState::NotInterleaved; } else { // If the buffer yields an interleave, but is not an // interleave itself, we don't want to change the // buffer state. } - internal_assert(aligned_buffer_access.contains(op->name) && "Buffer not found in scope"); - bool &aligned_accesses = aligned_buffer_access.ref(op->name); + bool *aligned_accesses = aligned_buffer_access.shallow_find(op->name); + internal_assert(aligned_accesses) << "Buffer not found in scope"; int64_t aligned_offset = 0; if (!alignment_analyzer.is_aligned(op, &aligned_offset)) { - aligned_accesses = false; + *aligned_accesses = false; } } if (deinterleave_buffers.contains(op->name)) { @@ -1872,12 +1871,13 @@ class EliminateInterleaves : public IRMutator { // which is only true if any of the stores are // actually interleaved (and don't just yield an // interleave). 
- internal_assert(aligned_buffer_access.contains(op->name) && "Buffer not found in scope"); - bool &aligned_accesses = aligned_buffer_access.ref(op->name); + bool *aligned_accesses = aligned_buffer_access.shallow_find(op->name); + internal_assert(aligned_accesses) << "Buffer not found in scope"; + int64_t aligned_offset = 0; if (!alignment_analyzer.is_aligned(op, &aligned_offset)) { - aligned_accesses = false; + *aligned_accesses = false; } } else { // This is not a double vector load, so we can't diff --git a/src/LICM.cpp b/src/LICM.cpp index 641f4982a3e2..719b41442cfc 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -350,8 +350,8 @@ class GroupLoopInvariants : public IRMutator { const Scope &depth; void visit(const Variable *op) override { - if (depth.contains(op->name)) { - result = std::max(result, depth.get(op->name)); + if (const int *d = depth.find(op->name)) { + result = std::max(result, *d); } } diff --git a/src/LoopCarry.cpp b/src/LoopCarry.cpp index 050cdfbfc8d9..bfc2abc8ddf1 100644 --- a/src/LoopCarry.cpp +++ b/src/LoopCarry.cpp @@ -27,8 +27,8 @@ Expr is_linear(const Expr &e, const Scope &linear) { return Expr(); } if (const Variable *v = e.as()) { - if (linear.contains(v->name)) { - return linear.get(v->name); + if (const Expr *e = linear.find(v->name)) { + return *e; } else { return make_zero(v->type); } @@ -140,18 +140,17 @@ class StepForwards : public IRGraphMutator { using IRGraphMutator::visit; Expr visit(const Variable *op) override { - if (linear.contains(op->name)) { - Expr step = linear.get(op->name); - if (!step.defined()) { + if (const Expr *step = linear.find(op->name)) { + if (!step->defined()) { // It's non-linear success = false; return op; - } else if (is_const_zero(step)) { + } else if (is_const_zero(*step)) { // It's a known inner constant return op; } else { // It's linear - return Expr(op) + step; + return Expr(op) + *step; } } else { // It's some external constant diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp index 79332c9336e5..ad48c37db78f 100644 --- a/src/LowerWarpShuffles.cpp +++ b/src/LowerWarpShuffles.cpp @@ -149,8 +149,8 @@ class DetermineAllocStride : public IRVisitor { } else if (const Variable *var = e.as()) { if (var->name == lane_var) { return 1; - } else if (dependent_vars.contains(var->name)) { - return dependent_vars.get(var->name); + } else if (const Expr *e = dependent_vars.find(var->name)) { + return *e; } else { return 0; } @@ -475,8 +475,9 @@ class LowerWarpShuffles : public IRMutator { if ((lt && equal(lt->a, this_lane) && is_const(lt->b)) || (le && equal(le->a, this_lane) && is_const(le->b))) { Expr condition = mutate(op->condition); - internal_assert(bounds.contains(this_lane_name)); - Interval interval = bounds.get(this_lane_name); + const Interval *in = bounds.find(this_lane_name); + internal_assert(in); + Interval interval = *in; interval.max = lt ? simplify(lt->b - 1) : le->b; ScopedBinding bind(bounds, this_lane_name, interval); Stmt then_case = mutate(op->then_case); @@ -488,10 +489,10 @@ class LowerWarpShuffles : public IRMutator { } Stmt visit(const Store *op) override { - if (allocation_info.contains(op->name)) { + if (const auto *alloc = allocation_info.find(op->name)) { Expr idx = mutate(op->index); Expr value = mutate(op->value); - Expr stride = allocation_info.get(op->name).stride; + Expr stride = alloc->stride; internal_assert(stride.defined() && warp_size.defined()); // Reduce the index to an index in my own stripe. 
We have @@ -639,9 +640,9 @@ class LowerWarpShuffles : public IRMutator { } Expr visit(const Load *op) override { - if (allocation_info.contains(op->name)) { + if (const auto *alloc = allocation_info.find(op->name)) { Expr idx = mutate(op->index); - Expr stride = allocation_info.get(op->name).stride; + Expr stride = alloc->stride; // Break the index into lane and stripe components Expr lane = simplify(reduce_expr(idx / stride, warp_size, bounds), true, bounds); diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index cfccce1da786..13b3c72a181d 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -110,8 +110,8 @@ void ComputeModulusRemainder::visit(const Reinterpret *) { } void ComputeModulusRemainder::visit(const Variable *op) { - if (scope.contains(op->name)) { - result = scope.get(op->name); + if (const auto *m = scope.find(op->name)) { + result = *m; } else { result = ModulusRemainder{}; } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index dd8e17d5b177..fee151f00a22 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -280,8 +280,8 @@ class DerivativeBounds : public IRVisitor { void visit(const Variable *op) override { if (op->name == var) { result = ConstantInterval::single_point(1); - } else if (scope.contains(op->name)) { - result = scope.get(op->name); + } else if (const auto *r = scope.find(op->name)) { + result = *r; } else { result = ConstantInterval::single_point(0); } diff --git a/src/Prefetch.cpp b/src/Prefetch.cpp index c0fb1f5c9a64..144b1950c5cd 100644 --- a/src/Prefetch.cpp +++ b/src/Prefetch.cpp @@ -86,10 +86,9 @@ class InjectPrefetch : public IRMutator { using IRMutator::visit; Box get_buffer_bounds(const string &name, int dims) { - if (buffer_bounds.contains(name)) { - const Box &b = buffer_bounds.ref(name); - internal_assert((int)b.size() == dims); - return b; + if (const Box *b = buffer_bounds.find(name)) { + internal_assert((int)b->size() == dims); + return *b; } // It is an external buffer. diff --git a/src/PrintLoopNest.cpp b/src/PrintLoopNest.cpp index 52f1c319951a..9d38efaaf80a 100644 --- a/src/PrintLoopNest.cpp +++ b/src/PrintLoopNest.cpp @@ -94,12 +94,16 @@ class PrintLoopNest : public IRVisitor { Expr min_val = op->min, extent_val = op->extent; const Variable *min_var = min_val.as(); const Variable *extent_var = extent_val.as(); - if (min_var && constants.contains(min_var->name)) { - min_val = constants.get(min_var->name); + if (min_var) { + if (const Expr *e = constants.find(min_var->name)) { + min_val = *e; + } } - if (extent_var && constants.contains(extent_var->name)) { - extent_val = constants.get(extent_var->name); + if (extent_var) { + if (const Expr *e = constants.find(extent_var->name)) { + extent_val = *e; + } } if (extent_val.defined() && is_const(extent_val) && @@ -151,9 +155,8 @@ class PrintLoopNest : public IRVisitor { void visit(const LetStmt *op) override { if (is_const(op->value)) { - constants.push(op->name, op->value); + ScopedBinding bind(constants, op->name, op->value); op->body.accept(this); - constants.pop(op->name); } else { op->body.accept(this); } diff --git a/src/Scope.h b/src/Scope.h index 9d1cc43e1164..94d9eb9c165b 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -150,7 +150,39 @@ class Scope { return iter->second.top_ref(); } - /** Tests if a name is in scope */ + /** Returns a const pointer to an entry if it exists in this scope or any + * containing scope, or nullptr if it does not. Use this instead of if + * (scope.contains(foo)) { ... scope.get(foo) ... } to avoid doing two + * lookups. 
*/ + template::value>::type> + const T2 *find(const std::string &name) const { + typename std::map>::const_iterator iter = table.find(name); + if (iter == table.end() || iter->second.empty()) { + if (containing_scope) { + return containing_scope->find(name); + } else { + return nullptr; + } + } + return &(iter->second.top_ref()); + } + + /** A version of find that returns a non-const pointer, but ignores + * containing scope. */ + template::value>::type> + T2 *shallow_find(const std::string &name) { + typename std::map>::iterator iter = table.find(name); + if (iter == table.end() || iter->second.empty()) { + return nullptr; + } else { + return &(iter->second.top_ref()); + } + } + + /** Tests if a name is in scope. If you plan to use the value if it is, call + * find instead. */ bool contains(const std::string &name) const { typename std::map>::const_iterator iter = table.find(name); if (iter == table.end() || iter->second.empty()) { @@ -173,19 +205,28 @@ class Scope { } } - /** Add a new (name, value) pair to the current scope. Hide old - * values that have this name until we pop this name. + struct PushToken { + typename std::map>::iterator iter; + }; + + /** Add a new (name, value) pair to the current scope. Hide old values that + * have this name until we pop this name. Returns a token that can be used + * to pop the same value without doing a fresh lookup. */ template::value>::type> - void push(const std::string &name, T2 &&value) { - table[name].push(std::forward(value)); + PushToken push(const std::string &name, T2 &&value) { + auto it = table.try_emplace(name).first; + it->second.push(std::forward(value)); + return PushToken{it}; } template::value>::type> - void push(const std::string &name) { - table[name].push(); + PushToken push(const std::string &name) { + auto it = table.try_emplace(name).first; + it->second.push(); + return PushToken{it}; } /** A name goes out of scope. Restore whatever its old value @@ -201,6 +242,14 @@ class Scope { } } + /** Pop a name using a token returned by push instead of a string. */ + void pop(PushToken p) { + p.iter->second.pop(); + if (p.iter->second.empty()) { + table.erase(p.iter); + } + } + /** Iterate through the scope. Does not capture any containing scope. */ class const_iterator { typename std::map>::const_iterator iter; @@ -271,20 +320,17 @@ std::ostream &operator<<(std::ostream &stream, const Scope &s) { template struct ScopedBinding { Scope *scope = nullptr; - std::string name; + typename Scope::PushToken token; ScopedBinding() = default; ScopedBinding(Scope &s, const std::string &n, T value) - : scope(&s), name(n) { - scope->push(name, std::move(value)); + : scope(&s), token(scope->push(n, std::move(value))) { } ScopedBinding(bool condition, Scope &s, const std::string &n, const T &value) - : scope(condition ? &s : nullptr), name(n) { - if (condition) { - scope->push(name, value); - } + : scope(condition ? &s : nullptr), + token(condition ? 
scope->push(n, value) : typename Scope::PushToken{}) { } bool bound() const { @@ -293,7 +339,7 @@ struct ScopedBinding { ~ScopedBinding() { if (scope) { - scope->pop(name); + scope->pop(token); } } @@ -301,7 +347,7 @@ struct ScopedBinding { ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), - name(std::move(that.name)) { + token(that.token) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } @@ -313,20 +359,17 @@ struct ScopedBinding { template<> struct ScopedBinding { Scope<> *scope; - std::string name; + Scope<>::PushToken token; ScopedBinding(Scope<> &s, const std::string &n) - : scope(&s), name(n) { - scope->push(name); + : scope(&s), token(scope->push(n)) { } ScopedBinding(bool condition, Scope<> &s, const std::string &n) - : scope(condition ? &s : nullptr), name(n) { - if (condition) { - scope->push(name); - } + : scope(condition ? &s : nullptr), + token(condition ? scope->push(n) : Scope<>::PushToken{}) { } ~ScopedBinding() { if (scope) { - scope->pop(name); + scope->pop(token); } } @@ -334,7 +377,7 @@ struct ScopedBinding { ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), - name(std::move(that.name)) { + token(that.token) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } diff --git a/src/Simplify.cpp b/src/Simplify.cpp index 339ef2917c83..61cf7886cb70 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -34,8 +34,8 @@ Simplify::Simplify(bool r, const Scope *bi, const Scopecontains(iter.name())) { - bounds.alignment = ai->get(iter.name()); + if (const auto *a = ai->find(iter.name())) { + bounds.alignment = *a; } if (bounds.min_defined || bounds.max_defined || bounds.alignment.modulus != 1) { @@ -74,18 +74,18 @@ std::pair, bool> Simplify::mutate_with_changes(const std::vect void Simplify::found_buffer_reference(const string &name, size_t dimensions) { for (size_t i = 0; i < dimensions; i++) { string stride = name + ".stride." + std::to_string(i); - if (var_info.contains(stride)) { - var_info.ref(stride).old_uses++; + if (auto *info = var_info.shallow_find(stride)) { + info->old_uses++; } string min = name + ".min." 
+ std::to_string(i); - if (var_info.contains(min)) { - var_info.ref(min).old_uses++; + if (auto *info = var_info.shallow_find(min)) { + info->old_uses++; } } - if (var_info.contains(name)) { - var_info.ref(name).old_uses++; + if (auto *info = var_info.shallow_find(name)) { + info->old_uses++; } } @@ -187,8 +187,8 @@ void Simplify::ScopedFact::learn_upper_bound(const Variable *v, int64_t val) { ExprInfo b; b.max_defined = true; b.max = val; - if (simplify->bounds_and_alignment_info.contains(v->name)) { - b.intersect(simplify->bounds_and_alignment_info.get(v->name)); + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { + b.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, b); bounds_pop_list.push_back(v); @@ -198,8 +198,8 @@ void Simplify::ScopedFact::learn_lower_bound(const Variable *v, int64_t val) { ExprInfo b; b.min_defined = true; b.min = val; - if (simplify->bounds_and_alignment_info.contains(v->name)) { - b.intersect(simplify->bounds_and_alignment_info.get(v->name)); + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { + b.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, b); bounds_pop_list.push_back(v); @@ -228,10 +228,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { // TODO: Visiting it again is inefficient Simplify::ExprInfo expr_info; simplify->mutate(eq->b, &expr_info); - if (simplify->bounds_and_alignment_info.contains(v->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { // We already know something about this variable and don't want to suppress it. - auto existing_knowledge = simplify->bounds_and_alignment_info.get(v->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, expr_info); bounds_pop_list.push_back(v); @@ -245,10 +244,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { // TODO: Visiting it again is inefficient Simplify::ExprInfo expr_info; simplify->mutate(eq->a, &expr_info); - if (simplify->bounds_and_alignment_info.contains(vb->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(vb->name)) { // We already know something about this variable and don't want to suppress it. - auto existing_knowledge = simplify->bounds_and_alignment_info.get(vb->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(vb->name, expr_info); bounds_pop_list.push_back(vb); @@ -257,10 +255,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { Simplify::ExprInfo expr_info; expr_info.alignment.modulus = *modulus; expr_info.alignment.remainder = *remainder; - if (simplify->bounds_and_alignment_info.contains(v->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { // We already know something about this variable and don't want to suppress it. 
- auto existing_knowledge = simplify->bounds_and_alignment_info.get(v->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, expr_info); bounds_pop_list.push_back(v); @@ -417,8 +414,8 @@ bool can_prove(Expr e, const Scope &bounds) { Expr visit(const Variable *op) override { auto it = vars.find(op->name); - if (lets.contains(op->name)) { - return Variable::make(op->type, lets.get(op->name)); + if (const std::string *n = lets.find(op->name)) { + return Variable::make(op->type, *n); } else if (it == vars.end()) { std::string name = "v" + std::to_string(count++); vars[op->name] = name; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index a8e5fcce1a8d..b5fcc96ac0cd 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -221,35 +221,32 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *bounds) { } Expr Simplify::visit(const Variable *op, ExprInfo *bounds) { - if (bounds_and_alignment_info.contains(op->name)) { - const ExprInfo &b = bounds_and_alignment_info.get(op->name); + if (const ExprInfo *b = bounds_and_alignment_info.find(op->name)) { if (bounds) { - *bounds = b; + *bounds = *b; } - if (b.min_defined && b.max_defined && b.min == b.max) { - return make_const(op->type, b.min); + if (b->min_defined && b->max_defined && b->min == b->max) { + return make_const(op->type, b->min); } } - if (var_info.contains(op->name)) { - auto &info = var_info.ref(op->name); - + if (auto *info = var_info.shallow_find(op->name)) { // if replacement is defined, we should substitute it in (unless // it's a var that has been hidden by a nested scope). - if (info.replacement.defined()) { - internal_assert(info.replacement.type() == op->type) + if (info->replacement.defined()) { + internal_assert(info->replacement.type() == op->type) << "Cannot replace variable " << op->name << " of type " << op->type - << " with expression of type " << info.replacement.type() << "\n"; - info.new_uses++; + << " with expression of type " << info->replacement.type() << "\n"; + info->new_uses++; // We want to remutate the replacement, because we may be // injecting it into a context where it is known to be a // constant (e.g. due to an if). - return mutate(info.replacement, bounds); + return mutate(info->replacement, bounds); } else { // This expression was not something deemed // substitutable - no replacement is defined. - info.old_uses++; + info->old_uses++; return op; } } else { @@ -321,15 +318,14 @@ Expr Simplify::visit(const Load *op, ExprInfo *bounds) { // unreachable loads. 
if (is_const_one(op->predicate)) { string alloc_extent_name = op->name + ".total_extent_bytes"; - if (bounds_and_alignment_info.contains(alloc_extent_name)) { + if (const auto *alloc_info = bounds_and_alignment_info.find(alloc_extent_name)) { if (index_info.max_defined && index_info.max < 0) { in_unreachable = true; return unreachable(op->type); } - const ExprInfo &alloc_info = bounds_and_alignment_info.get(alloc_extent_name); - if (alloc_info.max_defined && index_info.min_defined) { + if (alloc_info->max_defined && index_info.min_defined) { int index_min_bytes = index_info.min * op->type.bytes(); - if (index_min_bytes > alloc_info.max) { + if (index_min_bytes > alloc_info->max) { in_unreachable = true; return unreachable(op->type); } diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 11b146ecdc6a..f6cb81345961 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -305,19 +305,19 @@ Stmt Simplify::visit(const Store *op) { // but perhaps the branch was hard to prove constant true or false. This // provides an alternative mechanism to simplify these unreachable stores. string alloc_extent_name = op->name + ".total_extent_bytes"; - if (is_const_one(op->predicate) && - bounds_and_alignment_info.contains(alloc_extent_name)) { - if (index_info.max_defined && index_info.max < 0) { - in_unreachable = true; - return Evaluate::make(unreachable()); - } - const ExprInfo &alloc_info = bounds_and_alignment_info.get(alloc_extent_name); - if (alloc_info.max_defined && index_info.min_defined) { - int index_min_bytes = index_info.min * op->value.type().bytes(); - if (index_min_bytes > alloc_info.max) { + if (is_const_one(op->predicate)) { + if (const auto *alloc_info = bounds_and_alignment_info.find(alloc_extent_name)) { + if (index_info.max_defined && index_info.max < 0) { in_unreachable = true; return Evaluate::make(unreachable()); } + if (alloc_info->max_defined && index_info.min_defined) { + int index_min_bytes = index_info.min * op->value.type().bytes(); + if (index_min_bytes > alloc_info->max) { + in_unreachable = true; + return Evaluate::make(unreachable()); + } + } } } diff --git a/src/SlidingWindow.cpp b/src/SlidingWindow.cpp index ab25ad32bc87..dfb50d714e37 100644 --- a/src/SlidingWindow.cpp +++ b/src/SlidingWindow.cpp @@ -69,10 +69,9 @@ class ExpandExpr : public IRMutator { const Scope &scope; Expr visit(const Variable *var) override { - if (scope.contains(var->name)) { - Expr expr = scope.get(var->name); - debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; - return expr; + if (const Expr *expr = scope.find(var->name)) { + debug(4) << "Fully expanded " << var->name << " -> " << *expr << "\n"; + return *expr; } else { return var; } diff --git a/src/Solve.cpp b/src/Solve.cpp index b25719cff8c7..09245d90bf24 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -786,17 +786,15 @@ class SolveExpression : public IRMutator { if (op->name == var) { uses_var = true; return op; - } else if (scope.contains(op->name)) { - CacheEntry e = scope.get(op->name); - uses_var = uses_var || e.uses_var; - failed = failed || e.failed; - return e.expr; - } else if (external_scope.contains(op->name)) { - Expr e = external_scope.get(op->name); + } else if (const CacheEntry *e = scope.find(op->name)) { + uses_var = uses_var || e->uses_var; + failed = failed || e->failed; + return e->expr; + } else if (const Expr *e = external_scope.find(op->name)) { // Expressions in the external scope haven't been solved // yet. 
This will either pull its solution from the cache, // or solve it and then put it into the cache. - return mutate(e); + return mutate(*e); } else { return op; } @@ -948,13 +946,13 @@ class SolveForInterval : public IRVisitor { void visit(const Variable *op) override { internal_assert(op->type.is_bool()); - if (scope.contains(op->name)) { + if (const Expr *e = scope.find(op->name)) { pair key = {op->name, target}; auto it = solved_vars.find(key); if (it != solved_vars.end()) { result = it->second; } else { - scope.get(op->name).accept(this); + e->accept(this); solved_vars[key] = result; } } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index feeab56a4122..723fc738ce51 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -103,8 +103,8 @@ class FindStridedLoads : public IRVisitor { if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; - if (allocation_scope.contains(op->name)) { - a = allocation_scope.get(op->name); + if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { + a = *a_ptr; } found_loads[Key{op->name, base, stride, r->lanes, op->type, a, s}][offset].push_back(op); } @@ -161,8 +161,8 @@ class ReplaceStridedLoads : public IRMutator { protected: Expr visit(const Load *op) override { const Allocate *alloc = nullptr; - if (allocation_scope.contains(op->name)) { - alloc = allocation_scope.get(op->name); + if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { + alloc = *a_ptr; } auto it = replacements.find({alloc, op}); if (it != replacements.end()) { diff --git a/src/StmtToHTML.cpp b/src/StmtToHTML.cpp index 9c317ba35525..79cf6563551e 100644 --- a/src/StmtToHTML.cpp +++ b/src/StmtToHTML.cpp @@ -1134,8 +1134,8 @@ class HTMLCodePrinter : public IRVisitor { std::string variable(const std::string &x, const std::string &tooltip) { int id; - if (scope.contains(x)) { - id = scope.get(x); + if (const int *i = scope.find(x)) { + id = *i; } else { id = gen_unique_id(); scope.push(x, id); diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index d7e7c50002f6..13d7d6475120 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -31,10 +31,9 @@ class ExpandExpr : public IRMutator { const Scope &scope; Expr visit(const Variable *var) override { - if (scope.contains(var->name)) { - Expr expr = scope.get(var->name); + if (const Expr *e = scope.find(var->name)) { // Mutate the expression, so lets can get replaced recursively. 
- expr = mutate(expr); + Expr expr = mutate(*e); debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; return expr; } else { diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 26689ec34633..85a6ba521771 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -104,10 +104,9 @@ class UniquifyVariableNames : public IRMutator { } Expr visit(const Variable *op) override { - if (renaming.contains(op->name)) { - string new_name = renaming.get(op->name); - if (new_name != op->name) { - return Variable::make(op->type, new_name); + if (const string *new_name = renaming.find(op->name)) { + if (*new_name != op->name) { + return Variable::make(op->type, *new_name); } } return op; diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 6d10d2e9d5f3..0745a34a9d39 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -297,8 +297,8 @@ bool is_interleaved_ramp(const Expr &e, const Scope &scope, InterleavedRam return true; } } else if (const Variable *var = e.as()) { - if (scope.contains(var->name)) { - return is_interleaved_ramp(scope.get(var->name), scope, result); + if (const Expr *e = scope.find(var->name)) { + return is_interleaved_ramp(*e, scope, result); } } return false; From 4399ed819bbc23f6d89a0baece854419587120d2 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Thu, 22 Feb 2024 20:07:47 -0800 Subject: [PATCH 073/186] Add Intel APX and AVX10 target flags and LLVM attribute setting. (#8052) * Add target flag and LLVM enables support for Intel AVX10. * Go ahead and add APX support as well. Correct spelling of APX target attributes. * Implement AVX10 and APX cpu feature detection. (As yet untested.) * Expand target feature flags for AVX10. --------- Co-authored-by: Steven Johnson --- .../src/halide/halide_/PyEnums.cpp | 2 + src/CodeGen_X86.cpp | 43 ++++++++++++++++--- src/Target.cpp | 39 ++++++++++++++++- src/Target.h | 2 + src/runtime/HalideRuntime.h | 2 + test/correctness/simd_op_check_x86.cpp | 2 + 6 files changed, 83 insertions(+), 7 deletions(-) diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index e6cede6c6edb..4edd8029c340 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -192,6 +192,8 @@ void define_enums(py::module &m) { .value("VulkanV12", Target::VulkanV12) .value("VulkanV13", Target::VulkanV13) .value("Semihosting", Target::Feature::Semihosting) + .value("AVX10_1", Target::Feature::AVX10_1) + .value("X86APX", Target::Feature::X86APX) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0320e64b5ae5..b0df27af0f2f 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -28,6 +28,14 @@ namespace { // existing flags, so that instruction patterns can just check for the // oldest feature flag that supports an instruction. 
Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX10_1)) { + if (t.vector_bits >= 256) { + t.set_feature(Target::AVX2); + } + if (t.vector_bits >= 512) { + t.set_feature(Target::AVX512_SapphireRapids); + } + } if (t.has_feature(Target::AVX512_SapphireRapids)) { t.set_feature(Target::AVX512_Zen4); } @@ -54,6 +62,7 @@ Target complete_x86_target(Target t) { if (t.has_feature(Target::AVX)) { t.set_feature(Target::SSE41); } + return t; } @@ -1035,9 +1044,31 @@ string CodeGen_X86::mattrs() const { } #if LLVM_VERSION >= 180 if (gather_might_be_slow(target)) { - attrs.push_back("+prefer-no-gather"); + attrs.emplace_back("+prefer-no-gather"); } #endif + + if (target.has_feature(Target::AVX10_1)) { + switch (target.vector_bits) { + case 256: + attrs.emplace_back("+avx10.1-256"); + break; + case 512: + attrs.emplace_back("+avx10.1-512"); + break; + default: + user_error << "AVX10 only supports 256 or 512 bit variants at present.\n"; + break; + } + } + + if (target.has_feature(Target::X86APX)) { + attrs.emplace_back("+egpr"); + attrs.emplace_back("+push2pop2"); + attrs.emplace_back("+ppx"); + attrs.emplace_back("+ndd"); + } + return join_strings(attrs, ","); } @@ -1046,10 +1077,12 @@ bool CodeGen_X86::use_soft_float_abi() const { } int CodeGen_X86::native_vector_bits() const { - if (target.has_feature(Target::AVX512) || - target.has_feature(Target::AVX512_Skylake) || - target.has_feature(Target::AVX512_KNL) || - target.has_feature(Target::AVX512_Cannonlake)) { + if (target.has_feature(Target::AVX10_1)) { + return target.vector_bits; + } else if (target.has_feature(Target::AVX512) || + target.has_feature(Target::AVX512_Skylake) || + target.has_feature(Target::AVX512_KNL) || + target.has_feature(Target::AVX512_Cannonlake)) { return 512; } else if (target.has_feature(Target::AVX) || target.has_feature(Target::AVX2)) { diff --git a/src/Target.cpp b/src/Target.cpp index 082b5103bd0b..ac96ae019065 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -251,6 +251,8 @@ Target calculate_host_target() { // Call cpuid with eax=7, ecx=0 int info2[4]; cpuid(info2, 7, 0); + int info3[4]; + cpuid(info3, 7, 1); const uint32_t avx2 = 1U << 5; const uint32_t avx512f = 1U << 16; const uint32_t avx512dq = 1U << 17; @@ -283,8 +285,6 @@ Target calculate_host_target() { const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) - int info3[4]; - cpuid(info3, 7, 1); // TODO: port to family/model -based detection. if ((info3[0] & avxvnni) == avxvnni && (info3[0] & avx512bf16) == avx512bf16) { @@ -292,7 +292,40 @@ Target calculate_host_target() { } } } + + // AVX10 converged vector instructions. + const uint32_t avx10 = 1U << 19; + if (info2[3] & avx10) { + int info_avx10[4]; + cpuid(info_avx10, 0x24, 0x0); + + // This checks that the AVX10 version is greater than zero. + // It isn't really needed as for now only one version exists, but + // the docs indicate bits 0:7 of EBX should be >= 0 so... + if ((info[1] & 0xff) >= 1) { + initial_features.push_back(Target::AVX10_1); + + const uint32_t avx10_128 = 1U << 16; + const uint32_t avx10_256 = 1U << 17; + const uint32_t avx10_512 = 1U << 18; + // Choose the maximum one that is available. + if (info[1] & avx10_512) { + vector_bits = 512; + } else if (info[1] & avx10_256) { + vector_bits = 256; + } else if (info[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case. + vector_bits = 128; + } + } + } + + // APX register extensions, etc. 
+ const uint32_t apx = 1U << 21; + if (info3[3] & apx) { + initial_features.push_back(Target::X86APX); + } } + #endif #endif #endif @@ -556,6 +589,8 @@ const std::map feature_name_map = { {"vk_v12", Target::VulkanV12}, {"vk_v13", Target::VulkanV13}, {"semihosting", Target::Semihosting}, + {"avx10_1", Target::AVX10_1}, + {"x86apx", Target::X86APX}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. }; diff --git a/src/Target.h b/src/Target.h index 20730a313883..3bc586822f75 100644 --- a/src/Target.h +++ b/src/Target.h @@ -167,6 +167,8 @@ struct Target { VulkanV12 = halide_target_feature_vulkan_version12, VulkanV13 = halide_target_feature_vulkan_version13, Semihosting = halide_target_feature_semihosting, + AVX10_1 = halide_target_feature_avx10_1, + X86APX = halide_target_feature_x86_apx, FeatureEnd = halide_target_feature_end }; Target() = default; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index b235117e9f5e..62fbaeb66d43 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1425,6 +1425,8 @@ typedef enum halide_target_feature_t { halide_target_feature_vulkan_version12, ///< Enable Vulkan v1.2 runtime target support. halide_target_feature_vulkan_version13, ///< Enable Vulkan v1.3 runtime target support. halide_target_feature_semihosting, ///< Used together with Target::NoOS for the baremetal target built with semihosting library and run with semihosting mode where minimum I/O communication with a host PC is available. + halide_target_feature_avx10_1, ///< Intel AVX10 version 1 support. vector_bits is used to indicate width. + halide_target_feature_x86_apx, ///< Intel x86 APX support. Covers initial set of features released as APX: egpr,push2pop2,ppx,ndd . halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. } halide_target_feature_t; diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index b4c086ce0fc3..8286bc68f9e6 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -673,5 +673,7 @@ int main(int argc, char **argv) { Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake"), Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), + // Can be enabled when AVX10 and APX support are stable in LLVM. + // Target("x86-64-linux-avx10_1-vector_bits_256-x86apx"), }); } From aae84f69ebbffe1689f25ab4bd80a2143b626bf2 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 09:56:17 -0800 Subject: [PATCH 074/186] Use a caching version of stmt_uses_vars in TightenProducerConsumer nodes (#8102) We were making a very large number stmt_uses_vars queries that covered the same sub-stmts. I solved it by adding a cache. Speeds up local laplacian lowering by 10% by basically removing this pass from the profile. 
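To make the caching idea concrete: the `CachingStmtUsesVars` class added below keys a memo table on the sub-`Stmt` being queried, so a shared sub-statement is analyzed at most once no matter how many times the pass asks about it. The following is only a stripped-down, self-contained sketch of that memoization pattern, not Halide code; `Node`, `uses_vars`, and the `cache` map are invented names standing in for the real IR types.

```cpp
// Hypothetical DAG types for illustration only; the real pass caches
// stmt_uses_vars results keyed on the Stmt handle itself.
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Node {
    bool uses_var = false;                        // stand-in for "mentions a queried variable"
    std::vector<std::shared_ptr<Node>> children;  // shared sub-statements
};

// Memoized query: each distinct node is analyzed at most once, even if it is
// reachable along many paths, so repeated queries stay linear in the IR size.
bool uses_vars(const std::shared_ptr<Node> &n, std::map<const Node *, bool> &cache) {
    auto it = cache.find(n.get());
    if (it != cache.end()) {
        return it->second;  // shared subtree: reuse the earlier answer
    }
    bool result = n->uses_var;
    for (const auto &c : n->children) {
        result = uses_vars(c, cache) || result;
    }
    cache.emplace(n.get(), result);
    return result;
}

int main() {
    auto leaf = std::make_shared<Node>();
    leaf->uses_var = true;
    auto a = std::make_shared<Node>();
    auto b = std::make_shared<Node>();
    a->children = {leaf};
    b->children = {leaf, a};  // leaf is shared; it is only visited once
    std::map<const Node *, bool> cache;
    assert(uses_vars(b, cache));
    return 0;
}
```
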
Also a drive-by typo fix in Lower.cpp --- src/AsyncProducers.cpp | 80 +++++++++++++++++++++++++++++++++++------- src/Lower.cpp | 2 +- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 92012ccfe4c1..352219478923 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -569,11 +569,67 @@ class InitializeSemaphores : public IRMutator { } }; +// A class to support stmt_uses_vars queries that repeatedly hit the same +// sub-stmts. Used to support TightenProducerConsumerNodes below. +class CachingStmtUsesVars : public IRMutator { + const Scope<> &query; + bool found_use = false; + std::map cache; + + using IRMutator::visit; + Expr visit(const Variable *op) override { + found_use |= query.contains(op->name); + return op; + } + + Expr visit(const Call *op) override { + found_use |= query.contains(op->name); + IRMutator::visit(op); + return op; + } + + Stmt visit(const Provide *op) override { + found_use |= query.contains(op->name); + IRMutator::visit(op); + return op; + } + +public: + CachingStmtUsesVars(const Scope<> &q) + : query(q) { + } + + using IRMutator::mutate; + Stmt mutate(const Stmt &s) override { + auto it = cache.find(s); + if (it != cache.end()) { + found_use |= it->second; + } else { + bool old = found_use; + found_use = false; + Stmt stmt = IRMutator::mutate(s); + if (found_use) { + cache.emplace(s, true); + } else { + cache.emplace(s, false); + } + found_use |= old; + } + return s; + } + + bool check_stmt(const Stmt &s) { + found_use = false; + mutate(s); + return found_use; + } +}; + // Tighten the scope of consume nodes as much as possible to avoid needless synchronization. class TightenProducerConsumerNodes : public IRMutator { using IRMutator::visit; - Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope &scope) { + Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope<> &scope, CachingStmtUsesVars &uses_vars) { if (const LetStmt *let = body.as()) { Stmt orig = body; // 'orig' is only used to keep a reference to the let @@ -595,7 +651,7 @@ class TightenProducerConsumerNodes : public IRMutator { body = ProducerConsumer::make(name, is_producer, body); } else { // Recurse onto a non-let-node - body = make_producer_consumer(name, is_producer, body, scope); + body = make_producer_consumer(name, is_producer, body, scope, uses_vars); } for (auto it = containing_lets.rbegin(); it != containing_lets.rend(); it++) { @@ -611,7 +667,6 @@ class TightenProducerConsumerNodes : public IRMutator { vector sub_stmts; Stmt rest; do { - Stmt first = block->first; sub_stmts.push_back(block->first); rest = block->rest; block = rest.as(); @@ -619,18 +674,18 @@ class TightenProducerConsumerNodes : public IRMutator { sub_stmts.push_back(rest); for (Stmt &s : sub_stmts) { - if (stmt_uses_vars(s, scope)) { - s = make_producer_consumer(name, is_producer, s, scope); + if (uses_vars.check_stmt(s)) { + s = make_producer_consumer(name, is_producer, s, scope, uses_vars); } } return Block::make(sub_stmts); } else if (const ProducerConsumer *pc = body.as()) { - return ProducerConsumer::make(pc->name, pc->is_producer, make_producer_consumer(name, is_producer, pc->body, scope)); + return ProducerConsumer::make(pc->name, pc->is_producer, make_producer_consumer(name, is_producer, pc->body, scope, uses_vars)); } else if (const Realize *r = body.as()) { return Realize::make(r->name, r->types, r->memory_type, r->bounds, r->condition, - make_producer_consumer(name, 
is_producer, r->body, scope)); + make_producer_consumer(name, is_producer, r->body, scope, uses_vars)); } else { return ProducerConsumer::make(name, is_producer, body); } @@ -638,17 +693,18 @@ class TightenProducerConsumerNodes : public IRMutator { Stmt visit(const ProducerConsumer *op) override { Stmt body = mutate(op->body); - Scope scope; - scope.push(op->name, 0); + Scope<> scope; + scope.push(op->name); Function f = env.find(op->name)->second; if (f.outputs() == 1) { - scope.push(op->name + ".buffer", 0); + scope.push(op->name + ".buffer"); } else { for (int i = 0; i < f.outputs(); i++) { - scope.push(op->name + "." + std::to_string(i) + ".buffer", 0); + scope.push(op->name + "." + std::to_string(i) + ".buffer"); } } - return make_producer_consumer(op->name, op->is_producer, body, scope); + CachingStmtUsesVars uses_vars{scope}; + return make_producer_consumer(op->name, op->is_producer, body, scope, uses_vars); } const map &env; diff --git a/src/Lower.cpp b/src/Lower.cpp index 6b56f23fcff9..52c049b63c72 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -331,7 +331,7 @@ void lower_impl(const vector &output_funcs, debug(1) << "Simplifying...\n"; s = simplify(s); s = unify_duplicate_lets(s); - log("Lowering after second simplifcation:", s); + log("Lowering after second simplification:", s); debug(1) << "Reduce prefetch dimension...\n"; s = reduce_prefetch_dimension(s, t); From 2b5beb3dfd2e079d21bce146b09c6645e0ba7df5 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 17:11:47 -0800 Subject: [PATCH 075/186] Fix hoist_storage not handling condition correctly. (#8123) The allocation condition wasn't getting relaxed over the scope and loop vars like the extents were. --- src/StorageFlattening.cpp | 32 +++++++++++++++++++++++--------- test/correctness/skip_stages.cpp | 26 +++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 13d7d6475120..ba4cc9b8acca 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -293,23 +293,37 @@ class FlattenDimensions : public IRMutator { stmt = LetStmt::make(op->name + ".buffer", builder.build(), stmt); if (hoisted_storages_map.count(op->name) > 0) { HoistedStorageData &hoisted_storage_data = hoisted_storages[hoisted_storages_map[op->name]]; - vector bounded_extents; - for (const auto &e : allocation_extents) { - Expr expanded_extent = e; + + auto expand_and_bound = [&](Expr e) { // Iterate from innermost outwards for (auto it = hoisted_storages.rbegin(); it != hoisted_storages.rend(); it++) { - expanded_extent = expand_expr(expanded_extent, it->scope); + e = expand_expr(e, it->scope); if (it->name == op->name) { break; } } - expanded_extent = simplify(common_subexpression_elimination(expanded_extent)); - Interval bounds = bounds_of_expr_in_scope(expanded_extent, hoisted_storage_data.loop_vars); - user_assert(bounds.max.defined()) << "Couldn't infer the upper bound for the storage size of " << op->name << ", consider using bound_storage.\n"; - bounded_extents.push_back(bounds.max); + + e = simplify(common_subexpression_elimination(e)); + Interval bounds = bounds_of_expr_in_scope(e, hoisted_storage_data.loop_vars); + return bounds.max; + }; + + vector bounded_extents; + for (const auto &e : allocation_extents) { + Expr expanded_extent = expand_and_bound(e); + user_assert(expanded_extent.defined() && + !expanded_extent.same_as(Interval::pos_inf())) + << "Couldn't infer the upper bound for the storage size of " << op->name << ", 
consider using bound_storage.\n"; + bounded_extents.push_back(expanded_extent); + } + + Expr expanded_condition = expand_and_bound(condition); + if (!expanded_condition.defined() || + expanded_condition.same_as(Interval::pos_inf())) { + expanded_condition = const_true(); } - HoistedAllocationInfo hoisted_alloc(op->name, op->types[0], op->memory_type, bounded_extents, condition); + HoistedAllocationInfo hoisted_alloc(op->name, op->types[0], op->memory_type, bounded_extents, expanded_condition); hoisted_storage_data.hoisted_allocations.push_back(hoisted_alloc); } else { diff --git a/test/correctness/skip_stages.cpp b/test/correctness/skip_stages.cpp index ea298670b6bf..970966a78e30 100644 --- a/test/correctness/skip_stages.cpp +++ b/test/correctness/skip_stages.cpp @@ -27,7 +27,7 @@ void check_counts(int a = 0, int b = 0, int c = 0, int d = 0) { } int main(int argc, char **argv) { - Var x; + Var x, y; Param toggle1, toggle2; { @@ -201,6 +201,30 @@ int main(int argc, char **argv) { check_counts(11); } + { + // Check the interation with storage hoisting + + // This Func may or may not be loaded, depending on y + Func maybe_loaded("maybe_loaded"); + maybe_loaded(x, y) = x + y; + + // This Func may or may not be used, depending on y + Func maybe_used("maybe_used"); + maybe_used(x, y) = maybe_loaded(x, y); + + Func output("output"); + output(x, y) = select(y % 100 == 37, 0, maybe_used(x, y)); + + // The allocation condition depends on y, but the actual allocation + // happens at the root level. + maybe_loaded.compute_at(output, y).hoist_storage_root(); + maybe_used.compute_at(output, y).hoist_storage_root(); + + // This will fail to compile with an undefined symbol if we haven't + // handled the condition correctly. + output.realize({100, 100}); + } + printf("Success!\n"); return 0; } From 36d74a8cbf9c4129f608cd97d231961f1bd99c4c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 17:56:59 -0800 Subject: [PATCH 076/186] Rewrite the skip stages lowering pass (#8115) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. 
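A minimal standalone sketch of why a pointer-or-null return works where an iterator does not (this is not the real `Scope` from src/Scope.h; `MiniScope` and its members are invented for illustration): the hit may live in a containing scope, so the result would belong to a different map than the one whose `end()` the caller holds.

```cpp
#include <cassert>
#include <map>
#include <string>
#include <utility>

template<typename T>
class MiniScope {
    std::map<std::string, T> table;
    const MiniScope *containing = nullptr;  // enclosing scope, searched on a miss
public:
    explicit MiniScope(const MiniScope *parent = nullptr) : containing(parent) {}
    void push(const std::string &k, T v) {
        table[k] = std::move(v);
    }
    // Returns a pointer into whichever scope holds the name, or nullptr.
    const T *find(const std::string &k) const {
        auto it = table.find(k);
        if (it != table.end()) {
            return &it->second;
        }
        return containing ? containing->find(k) : nullptr;
    }
};

int main() {
    MiniScope<int> outer;
    outer.push("x", 1);
    MiniScope<int> inner(&outer);
    inner.push("y", 2);
    // "x" resolves in the enclosing scope: an iterator from outer's map could
    // not be compared against inner's end(), but a pointer (or null) is fine.
    assert(inner.find("x") && *inner.find("x") == 1);
    assert(inner.find("z") == nullptr);
    return 0;
}
```
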
Scopes are different enough from maps that the interface really needs to be distinct. * Pacify clang-tidy * Fix unintentional mutation of interval in scope * Fix accidental Scope::get * Rewrite the skip stages lowering pass Skip stages was slow due to crappy computational complexity (quadratic?) I reworked it into a two-pass linear-time algorithm. The first part remembers which pieces of IR are actually relevant to the task, and the second pass performs the task using a bounds-inference-like algorithm. On main resnet50 spends 519 ms in this pass. This commit reduces it to 40 ms. Local laplacian with 100 pyramid levels spends 7.4 seconds in this pass. This commit reduces it to ~3 ms. This commit also moves the cache store for memoized Funcs into the produce node, instead of at the top of the consume node, because it naturally places it inside a condition you inject into the produce node. * clang-tidy fixes * Fix skip stages interaction with compute_with * Unify let visitors, and use fewer stack frames for them * Fix accidental leakage of .used into .loaded * Visit the bodies of uninteresting let chains * Another used -> loaded * Fix hoist_storage not handling condition correctly. --------- Co-authored-by: Steven Johnson --- src/BoundsInference.cpp | 7 +- src/IR.cpp | 1 + src/IR.h | 4 + src/Lower.cpp | 2 +- src/Memoization.cpp | 14 +- src/Scope.h | 5 + src/SkipStages.cpp | 1044 ++++++++++++++++++------------ src/SkipStages.h | 8 +- src/Util.cpp | 2 +- test/correctness/skip_stages.cpp | 43 ++ 10 files changed, 721 insertions(+), 409 deletions(-) diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 31b441ea4251..5965303197bc 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1383,9 +1383,14 @@ Stmt bounds_inference(Stmt s, fused_pairs_in_groups.push_back(pairs); } + // Add a note in the IR for where the outermost dynamic-stage skipping + // checks should go. These are injected in a later pass. + Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic); + s = Block::make(Evaluate::make(marker), s); + // Add a note in the IR for where assertions on input images // should go. Those are handled by a later lowering pass. - Expr marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic); + marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic); s = Block::make(Evaluate::make(marker), s); // Add a synthetic outermost loop to act as 'root'. diff --git a/src/IR.cpp b/src/IR.cpp index 3dcb73281412..c0bdb718291d 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -674,6 +674,7 @@ const char *const intrinsic_op_names[] = { "shift_right", "signed_integer_overflow", "size_of_halide_buffer_t", + "skip_stages_marker", "sliding_window_marker", "sorted_avg", "strict_float", diff --git a/src/IR.h b/src/IR.h index 82722af8173a..252e4588db03 100644 --- a/src/IR.h +++ b/src/IR.h @@ -594,6 +594,10 @@ struct Call : public ExprNode { signed_integer_overflow, size_of_halide_buffer_t, + // Marks the point in lowering where the outermost skip stages checks + // should be introduced. + skip_stages_marker, + // Takes a realization name and a loop variable. 
Declares that values of // the realization that were stored on earlier loop iterations of the // given loop are potentially loaded in this loop iteration somewhere diff --git a/src/Lower.cpp b/src/Lower.cpp index 52c049b63c72..3b357eb3061e 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -269,7 +269,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after discarding safe promises:", s); debug(1) << "Dynamically skipping stages...\n"; - s = skip_stages(s, order); + s = skip_stages(s, outputs, fused_groups, env); log("Lowering after dynamically skipping stages:", s); debug(1) << "Forking asynchronous producers...\n"; diff --git a/src/Memoization.cpp b/src/Memoization.cpp index d07914591cc5..be99c3b8fcba 100644 --- a/src/Memoization.cpp +++ b/src/Memoization.cpp @@ -425,13 +425,10 @@ class InjectMemoization : public IRMutator { Stmt body = mutate(op->body); - std::string cache_miss_name = op->name + ".cache_miss"; - Expr cache_miss = Variable::make(Bool(), cache_miss_name); - if (op->is_producer) { - Stmt mutated_body = IfThenElse::make(cache_miss, body); - return ProducerConsumer::make(op->name, op->is_producer, mutated_body); - } else { + std::string cache_miss_name = op->name + ".cache_miss"; + Expr cache_miss = Variable::make(Bool(), cache_miss_name); + const Function f(iter->second); KeyInfo key_info(f, top_level_name, memoize_instance); @@ -447,9 +444,10 @@ class InjectMemoization : public IRMutator { key_info.store_computation(cache_key_name, computed_bounds_name, eviction_key_name, f.outputs(), op->name)); - Stmt mutated_body = Block::make(cache_store_back, body); - return ProducerConsumer::make(op->name, op->is_producer, mutated_body); + body = Block::make(body, cache_store_back); + body = IfThenElse::make(cache_miss, body); } + return ProducerConsumer::make(op->name, op->is_producer, body); } else { return IRMutator::visit(op); } diff --git a/src/Scope.h b/src/Scope.h index 94d9eb9c165b..f0578874762f 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -205,6 +205,11 @@ class Scope { } } + /** How many distinct names exist (does not count nested definitions of the same name) */ + size_t size() const { + return table.size(); + } + struct PushToken { typename std::map>::iterator iter; }; diff --git a/src/SkipStages.cpp b/src/SkipStages.cpp index 9da328c6f374..caf292972fbb 100644 --- a/src/SkipStages.cpp +++ b/src/SkipStages.cpp @@ -1,4 +1,5 @@ #include "SkipStages.h" +#include "Bounds.h" #include "CSE.h" #include "Debug.h" #include "ExprUsesVar.h" @@ -9,6 +10,7 @@ #include "Scope.h" #include "Simplify.h" #include "Substitute.h" +#include "UniquifyVariableNames.h" #include #include @@ -16,526 +18,774 @@ namespace Halide { namespace Internal { -using std::set; -using std::string; -using std::vector; +// This lowering pass skips run produce nodes and sometimes allocating for +// stages where the result can't affect the output. It's essentially computation +// and allocation bounds inference but simpler. For each production, instead of +// inferring the bounds to compute, we want to infer a single boolean that tells +// us whether or not to run it. For each allocation, instead of inferring the +// region to allocate, we want to infer a single boolean that tells us whether +// or not to allocate. +// +// Like with bounds inference, if we infer this from scratch for each Func, +// using the conditions under which its consumers are computed, we'd get a +// quadratic blow-up in the size of these conditions as you go along the from +// the output to the inputs. 
Instead, for each stage, we want the condition +// under which it will be computed in terms of symbolic variables that signify +// whether or not its immediate consumers are going to be computed. These +// conditions can depend on loop variables, so we potentially need a fresh set +// of these variables at each loop level that contains produce nodes. namespace { -bool extern_call_uses_buffer(const Call *op, const std::string &func) { - if (op->is_extern()) { - if (starts_with(op->name, "halide_memoization")) { - return false; - } - for (const auto &arg : op->args) { - const Variable *var = arg.as(); - if (var && - starts_with(var->name, func + ".") && - ends_with(var->name, ".buffer")) { - return true; - } - } - } - return false; -} - -class PredicateFinder : public IRVisitor { -public: - Expr predicate; - PredicateFinder(const string &b, bool s) - : predicate(const_false()), - buffer(b), +// A prepass to rule out certain pieces of IR as uninteresting, to speed up the +// main pass. +class SkipStagesAnalysis : public IRVisitor { + using IRVisitor::visit; - treat_selects_as_guards(s) { - } + // Is the visitor currently inside the condition of an IfThenElse or a + // Select (note: *not* one of the branches - the condition itself) + bool in_condition = false; -private: - using IRVisitor::visit; - string buffer; - bool varies = false; - bool treat_selects_as_guards; - bool in_produce = false; - Scope<> varying; - Scope<> in_pipeline; - Scope<> local_buffers; + // What is the nearest enclosing conditional node for the realize node of + // each func. nullptr for outputs, because they don't have realize nodes. + std::map conditional_around_realize_node; - void visit(const Variable *op) override { - bool this_varies = varying.contains(op->name); + // What is the current nearest enclosing conditional node. + const IRNode *enclosing_conditional = nullptr; - varies |= this_varies; + void visit(const Select *op) override { + { + ScopedValue bind(in_condition, true); + op->condition.accept(this); + } + { + ScopedValue bind(enclosing_conditional, op); + op->true_value.accept(this); + op->false_value.accept(this); + } } - void visit(const For *op) override { - op->min.accept(this); - bool min_varies = varies; - op->extent.accept(this); - bool should_pop = false; - if (!is_const_one(op->extent) || min_varies) { - should_pop = true; - varying.push(op->name); + void visit(const IfThenElse *op) override { + { + ScopedValue bind(in_condition, true); + op->condition.accept(this); } - op->body.accept(this); - if (should_pop) { - varying.pop(op->name); - } else if (expr_uses_var(predicate, op->name)) { - predicate = Let::make(op->name, op->min, predicate); + { + ScopedValue bind(enclosing_conditional, op); + op->then_case.accept(this); + if (op->else_case.defined()) { + op->else_case.accept(this); + } } } - template - void visit_let(const T *op) { - struct Frame { - const T *op; - ScopedBinding<> binding; - }; - vector frames; + // Have we encountered a Var or Call used inside a condition. If this + // happens in the value field of a let, then that let name should also be + // marked as interesting, because it could show up in a .used or .loaded + // condition. 
+ bool found_var_used_in_condition = false; - decltype(op->body) body; - do { - bool old_varies = varies; - varies = false; + void visit(const LetStmt *op) override { + op->body.accept(this); + { + ScopedValue bind(in_condition, in_condition || + interesting_vars.count(op->name)); + found_var_used_in_condition = false; op->value.accept(this); - - frames.push_back(Frame{op, ScopedBinding<>(varies, varying, op->name)}); - - varies |= old_varies; - body = op->body; - op = body.template as(); - } while (op); - - body.accept(this); - - for (auto it = frames.rbegin(); it != frames.rend(); it++) { - if (expr_uses_var(predicate, it->op->name)) { - predicate = Let::make(it->op->name, it->op->value, predicate); + if (found_var_used_in_condition) { + // The value referred to a var or call that gets used in a + // condition somewhere, therefore this LetStmt could also get + // hoisted into a condition at some point. + interesting_vars.insert(op->name); } } } - void visit(const LetStmt *op) override { - visit_let(op); + void visit(const Let *op) override { + op->body.accept(this); + { + ScopedValue bind(in_condition, in_condition || + interesting_vars.count(op->name)); + bool old = found_var_used_in_condition; + found_var_used_in_condition = false; + op->value.accept(this); + if (found_var_used_in_condition) { + interesting_vars.insert(op->name); + } + // Is this expression interesting? I.e. might it show up in a .used + // or .loaded? Either the body Expr was interesting in its own right + // (refered to something used in a conditional somewhere), or the + // value was interesting, and presumably the value is used in the + // body. + found_var_used_in_condition = found_var_used_in_condition || old; + } } - void visit(const Let *op) override { - visit_let(op); + void visit(const Block *op) override { + // Reverse order + op->rest.accept(this); + op->first.accept(this); } + Scope<> in_produce; void visit(const ProducerConsumer *op) override { - ScopedBinding<> bind(in_pipeline, op->name); - if (op->is_producer && op->name == buffer) { - ScopedValue sv(in_produce, true); - IRVisitor::visit(op); + size_t id = func_id.at(op->name); + + if (op->is_producer && + !unconditionally_used_funcs.count(id) && + conditional_around_realize_node.count(id)) { + // This node could have an if statement injected here + ScopedValue s(enclosing_conditional, op); + ScopedBinding<> bind(in_produce, op->name); + op->body.accept(this); } else { - IRVisitor::visit(op); + // Either it's a consume, or it's used unconditionally outside of + // and after this produce node (remember we're iterating in reverse + // order), or it's an output (there is no enclosing realize node). + op->body.accept(this); } } - // Logical operators with eager constant folding - Expr make_and(Expr a, Expr b) { - if (is_const_zero(a) || is_const_one(b)) { - return a; - } else if (is_const_zero(b) || is_const_one(a)) { - return b; - } else if (equal(a, b)) { - return a; - } else { - return a && b; + Scope<> in_realize; + void visit(const Realize *op) override { + size_t id = func_id.at(op->name); + + // There may have already been a Realize node for this Func. We need to + // analyze this node from scratch. + unconditionally_used_funcs.erase(id); + + conditional_around_realize_node[id] = enclosing_conditional; + + // Don't consider the realization bounds, which can't contain Func uses, + // or the new or free exprs, which can't access Func data. 
+ { + ScopedBinding<> bind(in_realize, op->name); + op->body.accept(this); + } + + if (conditionally_used_funcs.count(id)) { + // Was used conditionally in a different Realize node, and used + // unconditionally in this one. + unconditionally_used_funcs.erase(id); + } else if (!unconditionally_used_funcs.count(id)) { + // Was used conditionally in this Realize node. + conditionally_used_funcs.insert(id); } } - Expr make_or(Expr a, Expr b) { - if (is_const_zero(a) || is_const_one(b)) { - return b; - } else if (is_const_zero(b) || is_const_one(a)) { - return a; - } else if (equal(a, b)) { - return a; - } else { - return a || b; - } - } - - Expr make_select(const Expr &a, Expr b, Expr c) { - if (is_const_one(a)) { - return b; - } else if (is_const_zero(a)) { - return c; - } else if (is_const_one(b)) { - return make_or(a, c); - } else if (is_const_zero(b)) { - return make_and(make_not(a), c); - } else if (is_const_one(c)) { - return make_or(make_not(a), b); - } else if (is_const_zero(c)) { - return make_and(a, b); - } else { - return select(a, b, c); + void visit(const Call *op) override { + if (op->call_type == Call::Halide) { + if (in_condition) { + interesting_vars.insert(op->name); + found_var_used_in_condition = true; + } + size_t id = func_id.at(op->name); + if (!in_produce.contains(op->name) && + enclosing_conditional == conditional_around_realize_node[id]) { + unconditionally_used_funcs.insert(id); + } } + IRVisitor::visit(op); } - Expr make_not(const Expr &a) { - if (is_const_one(a)) { - return make_zero(a.type()); - } else if (is_const_zero(a)) { - return make_one(a.type()); - } else { - return !a; + void visit(const Variable *op) override { + if (in_condition) { + interesting_vars.insert(op->name); + found_var_used_in_condition = true; + } + if (op->type.is_handle()) { + auto it = func_id.find(op->name); + if (it != func_id.end() && + in_realize.contains(op->name) && + !in_produce.contains(op->name) && + enclosing_conditional == conditional_around_realize_node[it->second]) { + unconditionally_used_funcs.insert(it->second); + } } } - template - void visit_conditional(const Expr &condition, T true_case, T false_case) { - Expr old_predicate = predicate; +public: + SkipStagesAnalysis(const std::map &func_id) + : func_id(func_id) { + } - predicate = const_false(); - true_case.accept(this); - Expr true_predicate = predicate; + const std::map func_id; - predicate = const_false(); - if (false_case.defined()) { - false_case.accept(this); - } - Expr false_predicate = predicate; + // Vars which could conceivably end up in a skip-stages predicate. These are + // the ones that are used (possibly transitively) in conditions in Select or + // IfThenElse nodes. + std::set interesting_vars; - bool old_varies = varies; - predicate = const_false(); - varies = false; - condition.accept(this); + // All Funcs that are unconditionally called within the scope of at least + // one of their Realize nodes (and therefore could never be skipped so we + // don't need to worry about them in the mutator below) + std::set unconditionally_used_funcs; - predicate = make_or(predicate, old_predicate); - if (varies) { - predicate = make_or(predicate, make_or(true_predicate, false_predicate)); - } else { - predicate = make_or(predicate, make_select(condition, true_predicate, false_predicate)); - } + // All Funcs that are conditionally called within the scope of at least one + // of their Realize nodes, and therefore must not be added to + // unconditionally_used_funcs. 
+ std::set conditionally_used_funcs; +}; - varies = varies || old_varies; +class SkipStages : public IRMutator { +public: + SkipStages(const SkipStagesAnalysis &analysis, const std::vector &name_for_id) + : analysis(analysis), name_for_id(name_for_id) { } - void visit(const Select *op) override { - if (treat_selects_as_guards) { - visit_conditional(op->condition, op->true_value, op->false_value); - } else { - IRVisitor::visit(op); - } - } +protected: + const SkipStagesAnalysis &analysis; + const std::vector &name_for_id; - void visit(const IfThenElse *op) override { - visit_conditional(op->condition, op->then_case, op->else_case); - } + using IRMutator::visit; - void visit(const Call *op) override { - varies |= in_pipeline.contains(op->name); + struct FuncInfo { + // Condition under which values are used and need to be correct. + Expr used; - IRVisitor::visit(op); + // Condition under which values are accessed, but don't need to be + // correct. May be distinct from used if the calls to this Func are + // guarded by selects. + Expr loaded; + }; + + // Conditions for each Func that describe how it is used in the Stmt just + // mutated, and any Stmts that come after it in the same enclosing loop + // body. (TODO: worry about fork) + std::map func_info; + + bool found_marker = false; + + // Might there be nested lets with the same name? Set to true if we ever + // stamp down a .used let more than once for the same Func. + bool need_uniquify = false; - if (!in_produce && (op->name == buffer || extern_call_uses_buffer(op, buffer))) { - predicate = const_true(); + // Func ids for which we have ever stamped down a .used or .loaded let. + std::set lets_emitted; + + // Have we made use of .used or .loaded vars that haven't been wrapped in a + // LetStmt yet (while iterating from inside out)? + bool inner_unbound_use_of_used_or_loaded_vars = false; + + Stmt emit_defs(Stmt stmt) { + for (auto &p : func_info) { + stmt = LetStmt::make(used_var_name(p.first), p.second.used, stmt); + stmt = LetStmt::make(loaded_var_name(p.first), p.second.loaded, stmt); + need_uniquify |= !lets_emitted.insert(p.first).second; } + return stmt; } - void visit(const Provide *op) override { - IRVisitor::visit(op); - if (in_produce && op->name != buffer && !local_buffers.contains(op->name)) { - predicate = const_true(); + Stmt visit(const Block *op) override { + // We want to iterate in reverse, which really just requires changing + // the block visitor. + Stmt rest = mutate(op->rest); + found_marker = false; + Stmt first = mutate(op->first); + if (found_marker) { + // This is where the outermost .used definitions go + internal_assert(first.as()); + if (inner_unbound_use_of_used_or_loaded_vars) { + rest = emit_defs(rest); + } + if (need_uniquify) { + rest = uniquify_variable_names(rest); + } + return rest; + } + if (first.same_as(op->first) && + rest.same_as(op->rest)) { + return op; + } else { + return Block::make(std::move(first), std::move(rest)); } } - void visit(const Realize *op) override { - ScopedBinding<> bind(local_buffers, op->name); - IRVisitor::visit(op); - } + Expr visit(const Call *op) override { + if (op->name == "halide_memoization_cache_lookup") { + // The buffer reference in a cache lookup doesn't count as a use - + // it's an out parameter. However, do *do* need to conditionalize + // the lookup on whether or not the buffer needs to be allocated. 
+ Expr last_arg = op->args.back(); + const Call *c = last_arg.as(); + internal_assert(c && + c->is_intrinsic(Call::make_struct) && + !c->args.empty()) + << last_arg; + const Variable *v = c->args[0].as(); + internal_assert(v); + auto it = analysis.func_id.find(v->name); + internal_assert(it != analysis.func_id.end()); + size_t func = it->second; + if (func_info.find(func) != func_info.end()) { + return Call::make(op->type, Call::if_then_else, {loaded_var(func), Expr(op), make_zero(op->type)}, Call::PureIntrinsic); + } else { + // Not in the func info map, so it must be unconditionally used. + return op; + } + } - void visit(const Allocate *op) override { - // This code works to ensure expressions depending on an - // allocation don't get moved outside the allocation and are - // marked as varying if predicate depends on the value of the - // allocation. - ScopedBinding<> - bind_host_ptr(varying, op->name), - bind_buffer(varying, op->name + ".buffer"); - IRVisitor::visit(op); + Expr e = IRMutator::visit(op); + if (op->call_type == Call::Halide) { + size_t id = analysis.func_id.at(op->name); + if (!analysis.unconditionally_used_funcs.count(id)) { + // We're unconditionally used. Clobber any existing info. + func_info[id] = FuncInfo{const_true(), const_true()}; + } + } else if (op->is_intrinsic(Call::skip_stages_marker)) { + found_marker = true; + } + return e; } -}; -class ProductionGuarder : public IRMutator { -public: - ProductionGuarder(const string &b, Expr compute_p, Expr alloc_p) - : buffer(b), compute_predicate(std::move(compute_p)), alloc_predicate(std::move(alloc_p)) { + Expr visit(const Variable *op) override { + if (op->type == halide_type_of()) { + auto it = analysis.func_id.find(op->name); + if (it != analysis.func_id.end() && + !analysis.unconditionally_used_funcs.count(it->second)) { + // Conservatively assume any use of a .buffer symbol depends on + // the Func being allocated and the values being correct. + func_info[it->second] = FuncInfo{const_true(), const_true()}; + } + } + return op; } -private: - string buffer; - Expr compute_predicate; - Expr alloc_predicate; - - using IRMutator::visit; - - bool memoize_call_uses_buffer(const Call *op) { - internal_assert(op->call_type == Call::Extern); - internal_assert(starts_with(op->name, "halide_memoization")); - for (const auto &arg : op->args) { - const Variable *var = arg.as(); - if (var && - starts_with(var->name, buffer + ".") && - ends_with(var->name, ".buffer")) { - return true; + void merge_func_info(std::map *old, + const std::map &new_info, + const Expr &used = Expr{}, + const Expr &evaluated = Expr{}) { + for (const auto &it : new_info) { + FuncInfo fi = it.second; + if (used.defined()) { + fi.used = fi.used && used; + } + if (evaluated.defined()) { + fi.loaded = fi.loaded && evaluated; + } + auto [p, inserted] = old->try_emplace(it.first, fi); + if (!inserted) { + // Merge with any existing info + if (!is_const_one(p->second.used)) { + p->second.used = p->second.used || fi.used; + } + if (!is_const_one(p->second.loaded)) { + p->second.loaded = p->second.loaded || fi.loaded; + } } } - return false; } - Expr visit(const Call *op) override { + // Is an Expr safe to lift into a .used or .loaded condition. 
+ bool may_lift(const Expr &e) { + class MayLift : public IRVisitor { + using IRVisitor::visit; + void visit(const Call *op) override { + if (!op->is_pure() && op->call_type != Call::Halide) { + result = false; + } else { + IRVisitor::visit(op); + } + } - if ((op->name == "halide_memoization_cache_lookup") && - memoize_call_uses_buffer(op)) { - // We need to guard call to halide_memoization_cache_lookup to only - // be executed if the corresponding buffer is allocated. We ignore - // the compute_predicate since in the case that alloc_predicate is - // true but compute_predicate is false, the consumer would still load - // data from the buffer even if it won't actually use the result, - // hence, we need to allocate some scratch memory for the consumer - // to load from. For memoized func, the memory might already be in - // the cache, so we perform the lookup instead of allocating a new one. - return Call::make(op->type, Call::if_then_else, - {alloc_predicate, op, 0}, Call::PureIntrinsic); - } else if ((op->name == "halide_memoization_cache_store") && - memoize_call_uses_buffer(op)) { - // We need to wrap the halide_memoization_cache_store with the - // compute_predicate, since the data to be written is only valid if - // the producer of the buffer is executed. - return Call::make(op->type, Call::if_then_else, - {compute_predicate, op, 0}, Call::PureIntrinsic); + public: + bool result = true; + } v; + e.accept(&v); + return v.result; + } + + // Come up with an upper bound for the truth value of an expression with the + // given var eliminated. + Expr relax_over_var(const Expr &e, const std::string &var) { + Scope domain; + domain.push(var, Interval::everything()); + Interval in = bounds_of_expr_in_scope(e, domain); + if (!in.has_upper_bound()) { + return const_true(); } else { - return IRMutator::visit(op); + return simplify(in.max); } } - Stmt visit(const ProducerConsumer *op) override { - // If the compute_predicate at this stage depends on something - // vectorized we should bail out. - Stmt stmt = IRMutator::visit(op); - - if (op->is_producer) { - op = stmt.as(); - internal_assert(op); - if (op->name == buffer) { - Stmt body = IfThenElse::make(compute_predicate, op->body); - stmt = ProducerConsumer::make(op->name, op->is_producer, body); - } - } - return stmt; - } -}; + // Come up with an upper bound for the truth value of an expression with any + // calls to the given func eliminated. + Expr relax_over_calls(const Expr &e, const std::string &func) { + class ReplaceCalls : public IRMutator { + const std::string &func; -class StageSkipper : public IRMutator { -public: - StageSkipper(const string &f) - : func(f) { - } + using IRMutator::visit; -private: - string func; - using IRMutator::visit; + Expr visit(const Call *op) override { + if (op->call_type == Call::Halide && op->name == func) { + return cast(op->type, var); + } + return IRMutator::visit(op); + } - Scope<> vector_vars; - bool in_vector_loop = false; + public: + const std::string var_name; + const Expr var; - Stmt visit(const For *op) override { - bool old_in_vector_loop = in_vector_loop; + ReplaceCalls(const std::string &func) + : func(func), + var_name(unique_name('t')), + var(Variable::make(Int(32), var_name)) { + } + } replacer(func); - // We want to be sure that the predicate doesn't vectorize. 
- if (op->for_type == ForType::Vectorized) { - vector_vars.push(op->name); - in_vector_loop = true; + return relax_over_var(replacer.mutate(e), replacer.var_name); + } + + Expr visit(const Select *op) override { + if (!may_lift(op->condition)) { + return IRMutator::visit(op); } - Stmt stmt = IRMutator::visit(op); + std::map old; + old.swap(func_info); + mutate(op->true_value); + merge_func_info(&old, func_info, op->condition); + func_info.clear(); + mutate(op->false_value); + merge_func_info(&old, func_info, !op->condition); + old.swap(func_info); + mutate(op->condition); // Check for any calls in the condition + + return op; + } + + Stmt mutate_conditional_stmt(const Stmt &s, const Expr &condition) { + std::map old; + old.swap(func_info); + Stmt stmt = mutate(s); + merge_func_info(&old, func_info, condition, condition); + old.swap(func_info); + return stmt; + } - if (op->for_type == ForType::Vectorized) { - vector_vars.pop(op->name); + Stmt visit(const IfThenElse *op) override { + if (!may_lift(op->condition)) { + // We won't be able to lift the condition + return IRMutator::visit(op); } - in_vector_loop = old_in_vector_loop; - - return stmt; + Stmt then_case = mutate_conditional_stmt(op->then_case, op->condition); + Stmt else_case; + if (op->else_case.defined()) { + else_case = mutate_conditional_stmt(op->else_case, !op->condition); + } + mutate(op->condition); + if (then_case.same_as(op->then_case) && + else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } } - Stmt visit(const LetStmt *op) override { - struct Frame { - const LetStmt *op; - bool vector_var; - }; - vector frames; - Stmt result; - - while (op) { - bool vector_var = in_vector_loop && expr_uses_vars(op->value, vector_vars); - frames.push_back({op, vector_var}); - if (vector_var) { - vector_vars.push(op->name); - } - result = op->body; - op = result.as(); + template + auto visit_let(const T *op) -> decltype(op->body) { + const T *orig = op; + + // Peel off any uninteresting lets without wasting stack frames. 
+ std::vector> containing_lets; + decltype(op->body) body; + while (op && !analysis.interesting_vars.count(op->name)) { + containing_lets.emplace_back(op->name, op->value); + body = op->body; + op = body.template as(); } - result = mutate(result); + bool changed = false; + if (op) { + std::map old; + old.swap(func_info); + body = mutate(op->body); + internal_assert(body.defined()); + if (may_lift(op->value)) { + for (auto &it : func_info) { + if (expr_uses_var(it.second.used, op->name)) { + it.second.used = Let::make(op->name, op->value, it.second.used); + } + if (expr_uses_var(it.second.loaded, op->name)) { + it.second.loaded = Let::make(op->name, op->value, it.second.loaded); + } + } + } else { + // Treat the let value as an unknown + for (auto &it : func_info) { + if (expr_uses_var(it.second.used, op->name)) { + it.second.used = relax_over_var(it.second.used, op->name); + } + if (expr_uses_var(it.second.loaded, op->name)) { + it.second.loaded = relax_over_var(it.second.loaded, op->name); + } + } + } + merge_func_info(&old, func_info); + old.swap(func_info); + mutate(op->value); + if (body.same_as(op->body)) { + body = op; + } else { + internal_assert(body.defined()); + body = T::make(op->name, op->value, std::move(body)); + changed = true; + } + } else if (std::is_same::value) { + auto new_body = mutate(body); + changed = !new_body.same_as(body); + body = std::move(new_body); + } else { + // Just visit the body + mutate(body); + } - for (auto it = frames.rbegin(); it != frames.rend(); it++) { - if (it->vector_var) { - vector_vars.pop(it->op->name); + // Rewrap any uninteresting lets + for (auto it = containing_lets.rbegin(); it != containing_lets.rend(); it++) { + mutate(it->second); // Visit the value of each let + if (changed) { + body = T::make(it->first, std::move(it->second), std::move(body)); } - result = LetStmt::make(it->op->name, it->op->value, result); } - return result; + + if (changed) { + internal_assert(body.defined()); + return body; + } else { + return orig; + } } - Stmt visit(const Realize *op) override { - if (op->name == func) { - debug(3) << "Finding compute predicate for " << op->name << "\n"; - PredicateFinder find_compute(op->name, true); - op->body.accept(&find_compute); - - debug(3) << "Simplifying compute predicate for " << op->name << ": " << find_compute.predicate << "\n"; - Expr compute_predicate = simplify(common_subexpression_elimination(find_compute.predicate)); - - debug(3) << "Compute predicate for " << op->name << " : " << compute_predicate << "\n"; - - if (expr_uses_vars(compute_predicate, vector_vars)) { - // Don't try to skip stages if the predicate may vary - // per lane. This will just unvectorize the - // production, which is probably contrary to the - // intent of the user. 
- compute_predicate = const_true(); - } + Expr visit(const Let *op) override { + return visit_let(op); + } - if (!is_const_one(compute_predicate)) { + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } - debug(3) << "Finding allocate predicate for " << op->name << "\n"; - PredicateFinder find_alloc(op->name, false); - op->body.accept(&find_alloc); - debug(3) << "Simplifying allocate predicate for " << op->name << "\n"; - Expr alloc_predicate = simplify(common_subexpression_elimination(find_alloc.predicate)); + std::string used_var_name(size_t id) { + return name_for_id[id] + ".used"; + } - debug(3) << "Allocate predicate for " << op->name << " : " << alloc_predicate << "\n"; + Expr used_var(size_t id) { + return Variable::make(Bool(), used_var_name(id)); + } + + std::string loaded_var_name(size_t id) { + return name_for_id[id] + ".loaded"; + } - ProductionGuarder g(op->name, compute_predicate, alloc_predicate); - Stmt body = g.mutate(op->body); + Expr loaded_var(size_t id) { + return Variable::make(Bool(), loaded_var_name(id)); + } - debug(3) << "Done guarding computation for " << op->name << "\n"; + Scope<> in_realize; + Scope<> in_realize_and_produce_or_consume; - return Realize::make(op->name, op->types, op->memory_type, op->bounds, - alloc_predicate, body); + Stmt visit(const ProducerConsumer *op) override { + size_t id = analysis.func_id.at(op->name); + const bool unconditionally_used = analysis.unconditionally_used_funcs.count(id); + + if (op->is_producer && !unconditionally_used) { + // The body of this is conditional, based on a yet-to-be defined symbolic value. + Expr used = used_var(id); + Stmt body; + + auto it = func_info.try_emplace(id, FuncInfo{const_false(), const_false()}).first; + + // Save the info about how this Func is called. We don't + // care about self-calls in the produce node. + FuncInfo fi = it->second; + ScopedBinding<> bind_if(in_realize.contains(op->name), + in_realize_and_produce_or_consume, op->name); + + body = mutate_conditional_stmt(op->body, used); + // Restore the info about how this Func is called. Calls to + // it in its own producer don't count towards skip stages + // analysis. + it->second = fi; + body = IfThenElse::make(used, body); + inner_unbound_use_of_used_or_loaded_vars = true; + + if (body.same_as(op->body)) { + return op; } else { - return IRMutator::visit(op); + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } } else { - return IRMutator::visit(op); - } - } -}; + ScopedBinding<> bind_if(!unconditionally_used && + in_realize.contains(op->name), + in_realize_and_produce_or_consume, op->name); -// Find Funcs where at least one of the consume nodes only uses the -// Func conditionally. We may want to guard the production of these -// Funcs. 
-class MightBeSkippable : public IRVisitor { + Stmt s = IRMutator::visit(op); - using IRVisitor::visit; - - bool in_conditional_stmt{false}; + if (analysis.interesting_vars.count(op->name)) { + for (auto &p : func_info) { + p.second.used = relax_over_calls(p.second.used, op->name); + p.second.loaded = relax_over_calls(p.second.loaded, op->name); + } + } - void visit(const Call *op) override { - IRVisitor::visit(op); - if (op->call_type == Call::Halide) { - unconditionally_used.insert(op->name); + return s; } } - void visit(const IfThenElse *op) override { - op->condition.accept(this); - - std::set old; - unconditionally_used.swap(old); - - ScopedValue old_in_conditional(in_conditional_stmt, true); - op->then_case.accept(this); - - std::set used_in_true; - used_in_true.swap(unconditionally_used); - if (op->else_case.defined()) { - op->else_case.accept(this); + Stmt visit(const Realize *op) override { + size_t id = analysis.func_id.at(op->name); + if (analysis.unconditionally_used_funcs.count(id)) { + return IRMutator::visit(op); } - // Take the set intersection of the true and false paths, and add them to the set. - std::set_intersection(used_in_true.begin(), used_in_true.end(), - unconditionally_used.begin(), unconditionally_used.end(), - std::inserter(old, old.begin())); + Stmt body; + { + ScopedBinding<> bind(in_realize, op->name); + body = mutate(op->body); + } + Expr condition = mutate(op->condition); + auto it = func_info.find(id); + if (it != func_info.end()) { + if (!is_const_one(it->second.loaded)) { + inner_unbound_use_of_used_or_loaded_vars = true; + condition = condition && loaded_var(id); + } + } - unconditionally_used.swap(old); + // We don't need to visit the bounds, because there can't be call nodes + // in them. + if (body.same_as(op->body) && + condition.same_as(op->condition)) { + return op; + } else { + return Realize::make(op->name, op->types, op->memory_type, + op->bounds, std::move(condition), std::move(body)); + } } - void visit(const Select *op) override { - op->condition.accept(this); - - std::set old; - unconditionally_used.swap(old); + bool in_vector_loop = false; - op->true_value.accept(this); - std::set used_in_true; - used_in_true.swap(unconditionally_used); + Stmt visit(const For *op) override { + ScopedValue s(in_vector_loop, + in_vector_loop || op->for_type == ForType::Vectorized); + bool old_inner_unbound_uses = inner_unbound_use_of_used_or_loaded_vars; + inner_unbound_use_of_used_or_loaded_vars = false; + + std::map old; + old.swap(func_info); + + Stmt body; + body = mutate(op->body); + // There can't be calls in the min and extent, so no need to visit; + // those. + + const bool in_sliding_loop = in_realize_and_produce_or_consume.size() < in_realize.size(); + bool may_emit = + !in_vector_loop && + !in_sliding_loop && + inner_unbound_use_of_used_or_loaded_vars; + + Stmt body_before = body; + if (may_emit) { + body = emit_defs(body); + } - op->false_value.accept(this); + // Now relax all the conditions that depend on this loop variable. 
+ bool anything_depended_on_loop_var = false; + for (auto &p : func_info) { + if (expr_uses_var(p.second.used, op->name)) { + p.second.used = relax_over_var(p.second.used, op->name); + anything_depended_on_loop_var = true; + } + if (expr_uses_var(p.second.loaded, op->name)) { + p.second.loaded = relax_over_var(p.second.loaded, op->name); + anything_depended_on_loop_var = true; + } + } - // Again, take the set intersection - std::set_intersection(used_in_true.begin(), used_in_true.end(), - unconditionally_used.begin(), unconditionally_used.end(), - std::inserter(old, old.begin())); + if (!anything_depended_on_loop_var) { + // Adding definitions for .used and .loaded symbols is unnecessary + // here. We can just use the ones one loop level further out. + body = body_before; + } else if (may_emit) { + inner_unbound_use_of_used_or_loaded_vars = false; + } + inner_unbound_use_of_used_or_loaded_vars |= old_inner_unbound_uses; - unconditionally_used.swap(old); - } + // To consider: Could add that the loop has non-zero extent here. That + // somewhat blurs the lines between bounds inference and skip stages. + merge_func_info(&old, func_info); + old.swap(func_info); - void visit(const ProducerConsumer *op) override { - if (!op->is_producer) { - op->body.accept(this); - if (!unconditionally_used.count(op->name) || in_conditional_stmt) { - // This Func has a least one consume clause in which - // it is only used conditionally. - candidates.insert(op->name); - } + if (body.same_as(op->body)) { + return op; } else { - IRVisitor::visit(op); - // Calls inside the produce don't count - that's the block of code we intend to guard. - unconditionally_used.erase(op->name); + return For::make(op->name, op->min, op->extent, + op->for_type, op->partition_policy, op->device_api, std::move(body)); } } +}; - set unconditionally_used; +// Just drop the skip stages marker in the IR. Used when we deduce that we don't +// need to run the mutator above. +class StripSkipStagesMarker : public IRMutator { + using IRMutator::visit; -public: - set candidates; + Expr visit(const Call *op) override { + if (op->is_intrinsic(Call::skip_stages_marker)) { + return 0; + } else { + return op; + } + } }; } // namespace -Stmt skip_stages(Stmt stmt, const vector &order) { - // Don't consider the last stage, because it's the output, so it's - // never skippable. - MightBeSkippable check; - stmt.accept(&check); - for (size_t i = order.size() - 1; i > 0; i--) { - debug(2) << "skip_stages checking " << order[i - 1] << "\n"; - if (check.candidates.count(order[i - 1])) { - debug(2) << "skip_stages can skip " << order[i - 1] << "\n"; - StageSkipper skipper(order[i - 1]); - Stmt new_stmt = skipper.mutate(stmt); - if (!new_stmt.same_as(stmt)) { - // Might have made earlier stages skippable too - new_stmt.accept(&check); - } - stmt = new_stmt; +Stmt skip_stages(const Stmt &stmt, + const std::vector &outputs, + const std::vector> &order, + const std::map &env) { + + // Each thing we might want to skip gets a unique id, sorted by realization + // order of the corresponding Func. + std::map func_id; + std::vector name_for_id(order.size()); + for (size_t i = 0; i < order.size(); i++) { + // Funcs in a compute_with group get the same id, because you can either + // skip them all or skip none of them. + for (const auto &f : order[i]) { + func_id[f] = i; + } + name_for_id[i] = order[i][0]; + } + + // Map any .buffer symbols back to the id of the Func they refer to. 
+ for (const auto &p : env) { + for (const auto &buf : p.second.output_buffers()) { + func_id[buf.name() + ".buffer"] = func_id[p.first]; } } - return stmt; + + // Make a map from Funcs to the first member of any compute_with group they belong to. + SkipStagesAnalysis analysis(func_id); + stmt.accept(&analysis); + + if (analysis.conditionally_used_funcs.empty()) { + // Nothing to do. No Funcs can be skipped. Just strip the skip stages + // marker. + return StripSkipStagesMarker().mutate(stmt); + } + + // There may be no calls to the output, which means they'll show up in + // neither set. Add them to the unconditionally used set so that the mutator + // knows to skip them. + for (const Function &f : outputs) { + analysis.unconditionally_used_funcs.insert(func_id[f.name()]); + } + + return SkipStages(analysis, name_for_id).mutate(stmt); } } // namespace Internal diff --git a/src/SkipStages.h b/src/SkipStages.h index a50886700485..2f0b86f5e971 100644 --- a/src/SkipStages.h +++ b/src/SkipStages.h @@ -1,6 +1,7 @@ #ifndef HALIDE_SKIP_STAGES #define HALIDE_SKIP_STAGES +#include #include #include @@ -13,12 +14,17 @@ namespace Halide { namespace Internal { +class Function; + /** Avoid computing certain stages if we can infer a runtime condition * to check that tells us they won't be used. Does this by analyzing * all reads of each buffer allocated, and inferring some condition * that tells us if the reads occur. If the condition is non-trivial, * inject ifs that guard the production. */ -Stmt skip_stages(Stmt s, const std::vector &order); +Stmt skip_stages(const Stmt &s, + const std::vector &outputs, + const std::vector> &order, + const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Util.cpp b/src/Util.cpp index e1d1f3307848..d7f3c36a7993 100644 --- a/src/Util.cpp +++ b/src/Util.cpp @@ -619,7 +619,7 @@ struct TickStackEntry { namespace { -vector tick_stack; +thread_local vector tick_stack; } // namespace diff --git a/test/correctness/skip_stages.cpp b/test/correctness/skip_stages.cpp index 970966a78e30..a981e5bf3160 100644 --- a/test/correctness/skip_stages.cpp +++ b/test/correctness/skip_stages.cpp @@ -201,6 +201,49 @@ int main(int argc, char **argv) { check_counts(11); } + for (int test_case = 0; test_case <= 2; test_case++) { + // Test a data-dependent stage skip. Double all values that exist in + // rows that do not contain any negative numbers. + Func input("input"); + input(x, y) = select(y % 3 == 0 && x == 37, -1, x); + + Func any_negative("any_negative"); + const int W = 100, H = 100; + RDom r(0, W); + any_negative(y) = cast(false); + any_negative(y) = any_negative(y) || (input(r, y) < 0); + + Func doubled("doubled"); + doubled(x, y) = call_counter(input(x, y) * 2, 0); + + Func output("output"); + output(x, y) = select(any_negative(y), input(x, y), doubled(x, y)); + + input.compute_root(); + + if (test_case == 0) { + // any_negative(y) is a constant condition over this loop, so 'double' can be skipped. + doubled.compute_at(output, y); + any_negative.compute_root(); + } else if (test_case == 1) { + // any_negative(y) is not constant here, so 'double' can't be skipped. + Var yo, yi; + output.split(y, yo, yi, 10); + doubled.compute_at(output, yo); + any_negative.compute_root(); + } else { + // double is computed outside of the consume node for any_negative, + // so the condition can't be lifted because it contains a call that + // may change in value. 
+ doubled.compute_at(output, y); + any_negative.compute_at(output, y); + } + + reset_counts(); + output.realize({W, H}); + check_counts(test_case == 0 ? 66 * 100 : 100 * 100); + } + { // Check the interation with storage hoisting From 7636c44acc2954a7c20275618093973da6767359 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 18:03:33 -0800 Subject: [PATCH 077/186] Remove two dead vars from the Makefile (#8125) These appear to be unused --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index 72c05619e3ea..17e8a80e1ca4 100644 --- a/Makefile +++ b/Makefile @@ -230,9 +230,6 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # On ubuntu, this requires packages flatbuffers-compiler and libflatbuffers-dev ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') -# Note: if updating here, be sure to update in CMakeLists.txt as well -HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 -HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 endif # This is required on some hosts like powerpc64le-linux-gnu because we may build From 8b3312ce9b343aef10ca7101a3f3f67db5501c71 Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Tue, 5 Mar 2024 17:16:06 +0100 Subject: [PATCH 078/186] Add support for setting the default allocator and deallocator functions in Halide::Runtime::Buffer. (#8132) --- src/runtime/HalideBuffer.h | 26 +++++++++++++++++++---- test/correctness/halide_buffer.cpp | 33 ++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h index 4ac2317278bc..7f914d0a4ff2 100644 --- a/src/runtime/HalideBuffer.h +++ b/src/runtime/HalideBuffer.h @@ -142,8 +142,8 @@ struct AllInts : std::false_type {}; template struct AllInts : std::false_type {}; -// A helper to detect if there are any zeros in a container namespace Internal { +// A helper to detect if there are any zeros in a container template bool any_zero(const Container &c) { for (int i : c) { @@ -153,6 +153,11 @@ bool any_zero(const Container &c) { } return false; } + +struct DefaultAllocatorFns { + static inline void *(*default_allocate_fn)(size_t) = nullptr; + static inline void (*default_deallocate_fn)(void *) = nullptr; +}; } // namespace Internal /** A struct acting as a header for allocations owned by the Buffer @@ -711,6 +716,13 @@ class Buffer { } public: + static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) { + Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn; + } + static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) { + Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn; + } + /** Determine if a Buffer can be constructed from some other Buffer type. * If this can be determined at compile time, fail with a static assert; otherwise * return a boolean based on runtime typing. */ @@ -893,7 +905,7 @@ class Buffer { #if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC // Only use aligned_alloc() if no custom allocators are specified. - if (!allocate_fn && !deallocate_fn) { + if (!allocate_fn && !deallocate_fn && !Internal::DefaultAllocatorFns::default_allocate_fn && !Internal::DefaultAllocatorFns::default_deallocate_fn) { // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes // on any supported platform, so we will just overallocate by 'alignment' // so that the user storage also starts at an aligned point. 
This is a bit @@ -908,10 +920,16 @@ class Buffer { // else fall thru #endif if (!allocate_fn) { - allocate_fn = malloc; + allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn; + if (!allocate_fn) { + allocate_fn = malloc; + } } if (!deallocate_fn) { - deallocate_fn = free; + deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn; + if (!deallocate_fn) { + deallocate_fn = free; + } } static_assert(sizeof(AllocationHeader) <= alignment); diff --git a/test/correctness/halide_buffer.cpp b/test/correctness/halide_buffer.cpp index 6c35f4b7a409..accaf6f6bb3e 100644 --- a/test/correctness/halide_buffer.cpp +++ b/test/correctness/halide_buffer.cpp @@ -6,6 +6,22 @@ using namespace Halide::Runtime; +static void *my_malloced_addr = nullptr; +static int my_malloc_count = 0; +static void *my_freed_addr = nullptr; +static int my_free_count = 0; +void *my_malloc(size_t size) { + void *ptr = malloc(size); + my_malloced_addr = ptr; + my_malloc_count++; + return ptr; +} +void my_free(void *ptr) { + my_freed_addr = ptr; + my_free_count++; + free(ptr); +} + template void check_equal_shape(const Buffer &a, const Buffer &b) { if (a.dimensions() != b.dimensions()) abort(); @@ -515,6 +531,23 @@ int main(int argc, char **argv) { assert(b.dim(3).stride() == b2.dim(3).stride()); } + { + // Test setting default allocate and deallocate functions. + Buffer<>::set_default_allocate_fn(my_malloc); + Buffer<>::set_default_deallocate_fn(my_free); + + assert(my_malloc_count == 0); + assert(my_free_count == 0); + auto b = Buffer(5, 4).fill(1); + assert(my_malloced_addr != nullptr && my_malloced_addr < b.data()); + assert(my_malloc_count == 1); + assert(my_free_count == 0); + b.deallocate(); + assert(my_malloc_count == 1); + assert(my_free_count == 1); + assert(my_malloced_addr == my_freed_addr); + } + printf("Success!\n"); return 0; } From d33ffa20f233224adcf80aa147cadf7f594dda51 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Mar 2024 09:50:07 -0800 Subject: [PATCH 079/186] Make realization order invariant to unique_name suffixes (#8124) * Make realization order invariant to unique_name suffixes * Add test * definition_order -> uint64 everywhere * Use visitation order instead of definition order --------- Co-authored-by: Steven Johnson --- src/FindCalls.cpp | 83 ++++++++++++------- src/FindCalls.h | 5 ++ src/RealizationOrder.cpp | 71 +++++++++++++++- test/correctness/CMakeLists.txt | 1 + test/correctness/stable_realization_order.cpp | 41 +++++++++ 5 files changed, 167 insertions(+), 34 deletions(-) create mode 100644 test/correctness/stable_realization_order.cpp diff --git a/src/FindCalls.cpp b/src/FindCalls.cpp index 77c5ae7645cd..9345c89dcac5 100644 --- a/src/FindCalls.cpp +++ b/src/FindCalls.cpp @@ -8,24 +8,22 @@ namespace Halide { namespace Internal { -using std::map; -using std::string; -using std::vector; - namespace { + /* Find all the internal halide calls in an expr */ class FindCalls : public IRVisitor { public: - map calls; + std::map calls; + std::vector order; using IRVisitor::visit; void include_function(const Function &f) { - map::iterator iter = calls.find(f.name()); - if (iter == calls.end()) { - calls[f.name()] = f; + auto [it, inserted] = calls.emplace(f.name(), f); + if (inserted) { + order.push_back(f); } else { - user_assert(iter->second.same_as(f)) + user_assert(it->second.same_as(f)) << "Can't compile a pipeline using multiple functions with same name: " << f.name() << "\n"; } @@ -41,64 +39,87 @@ class FindCalls : public IRVisitor { } }; -void 
populate_environment_helper(const Function &f, map &env, - bool recursive = true, bool include_wrappers = false) { - map::const_iterator iter = env.find(f.name()); - if (iter != env.end()) { +void populate_environment_helper(const Function &f, + std::map *env, + std::vector *order, + bool recursive = true, + bool include_wrappers = false) { + std::map::const_iterator iter = env->find(f.name()); + if (iter != env->end()) { user_assert(iter->second.same_as(f)) << "Can't compile a pipeline using multiple functions with same name: " << f.name() << "\n"; return; } + auto insert_func = [](const Function &f, + std::map *env, + std::vector *order) { + auto [it, inserted] = env->emplace(f.name(), f); + if (inserted) { + order->push_back(f); + } + }; + FindCalls calls; f.accept(&calls); if (f.has_extern_definition()) { for (const ExternFuncArgument &arg : f.extern_arguments()) { if (arg.is_func()) { - Function g(arg.func); - calls.calls[g.name()] = g; + insert_func(Function{arg.func}, &calls.calls, &calls.order); } } } if (include_wrappers) { for (const auto &it : f.schedule().wrappers()) { - Function g(it.second); - calls.calls[g.name()] = g; + insert_func(Function{it.second}, &calls.calls, &calls.order); } } if (!recursive) { - env.insert(calls.calls.begin(), calls.calls.end()); + for (const Function &g : calls.order) { + insert_func(g, env, order); + } } else { - env[f.name()] = f; - - for (const auto &i : calls.calls) { - populate_environment_helper(i.second, env, recursive, include_wrappers); + insert_func(f, env, order); + for (const Function &g : calls.order) { + populate_environment_helper(g, env, order, recursive, include_wrappers); } } } } // namespace -map build_environment(const vector &funcs) { - map env; +std::map build_environment(const std::vector &funcs) { + std::map env; + std::vector order; for (const Function &f : funcs) { - populate_environment_helper(f, env, true, true); + populate_environment_helper(f, &env, &order, true, true); } return env; } -map find_transitive_calls(const Function &f) { - map res; - populate_environment_helper(f, res, true, false); +std::vector called_funcs_in_order_found(const std::vector &funcs) { + std::map env; + std::vector order; + for (const Function &f : funcs) { + populate_environment_helper(f, &env, &order, true, true); + } + return order; +} + +std::map find_transitive_calls(const Function &f) { + std::map res; + std::vector order; + populate_environment_helper(f, &res, &order, true, false); return res; } -map find_direct_calls(const Function &f) { - map res; - populate_environment_helper(f, res, false, false); +std::map find_direct_calls(const Function &f) { + std::map res; + std::vector order; + populate_environment_helper(f, &res, &order, false, false); return res; } diff --git a/src/FindCalls.h b/src/FindCalls.h index f55140ae9162..40787d922a4f 100644 --- a/src/FindCalls.h +++ b/src/FindCalls.h @@ -36,6 +36,11 @@ std::map find_transitive_calls(const Function &f); * a map of them. */ std::map build_environment(const std::vector &funcs); +/** Returns the same Functions as build_environment, but returns a vector of + * Functions instead, where the order is the order in which the Functions were + * first encountered. This is stable to changes in the names of the Functions. 
*/
+std::vector<Function> called_funcs_in_order_found(const std::vector<Function> &funcs);
+
 } // namespace Internal
 } // namespace Halide
 
diff --git a/src/RealizationOrder.cpp b/src/RealizationOrder.cpp
index 8541c17ea862..af12ba80c228 100644
--- a/src/RealizationOrder.cpp
+++ b/src/RealizationOrder.cpp
@@ -41,6 +41,7 @@ find_fused_groups(const map<string, Function> &env,
     map<string, vector<string>> fused_groups;
     map<string, string> group_name;
 
+    int counter = 0;
     for (const auto &iter : env) {
         const string &fn = iter.first;
         if (visited.find(fn) == visited.end()) {
@@ -48,7 +49,7 @@ find_fused_groups(const map<string, Function> &env,
             find_fused_groups_dfs(fn, fuse_adjacency_list, visited, group);
 
             // Create a unique name for the fused group.
-            string rename = unique_name("_fg");
+            string rename = "_fg" + std::to_string(counter++);
             fused_groups.emplace(rename, group);
             for (const auto &m : group) {
                 group_name.emplace(m, rename);
@@ -69,7 +70,7 @@ void realization_order_dfs(const string &current,
     internal_assert(iter != graph.end());
 
     for (const string &fn : iter->second) {
-        internal_assert(fn != current);
+        internal_assert(fn != current) << fn;
         if (visited.find(fn) == visited.end()) {
             realization_order_dfs(fn, graph, visited, result_set, order);
         } else {
@@ -235,8 +236,63 @@ void check_fused_stages_are_scheduled_in_order(const Function &f) {
     }
 }
 
+// Reorder Funcs in a vector to have an order that's resistant to unique_name
+// calls, so that multitarget builds don't get arbitrary changes to topological
+// ordering, and so that machine-generated schedules (which depend on the
+// topological order) are less likely to be invalidated by things that have
+// happened in the same process earlier.
+//
+// To do this, we break each name into a prefix, the visitation order counter of
+// the Func, and then finally the full original name. The prefix is what you get
+// after stripping off anything after a $ (to handle suffixes introduced by
+// multi-character unique_name calls), and then stripping off any digits (to
+// handle suffixes introduced by single-character unique_name calls). The
+// visitation order is when the Func is first encountered in an IRVisitor
+// traversal of the entire Pipeline.
+//
+// This is gross. The reason we don't just break ties by visitation order alone
+// is because that way it's likely to be consistent with the realization
+// order before this sorting was done.
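A minimal sketch of the prefix rule described in the comment above, for illustration only (prefix_of and the example Func names are not part of the patch; assumes <string> and <cctype>):

    std::string prefix_of(std::string name) {
        name = name.substr(0, name.find('$'));  // drop any multi-character unique_name suffix
        while (!name.empty() && std::isdigit((unsigned char)name.back())) {
            name.pop_back();                    // drop trailing digits from single-character suffixes
        }
        return name;
    }
    // prefix_of("blur_y$2") == "blur_y", prefix_of("f13") == "f", so the sort key
    // (prefix, visitation counter, full name) orders "f" and "f13" by when each
    // Func was first visited rather than by whatever suffix unique_name assigned.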
+void sort_funcs_by_name_and_counter(vector *funcs, + const map &env, + const map &visitation_order) { + vector> items; + items.reserve(funcs->size()); + for (size_t i = 0; i < funcs->size(); i++) { + const string &full_name = (*funcs)[i]; + string prefix = split_string(full_name, "$")[0]; + while (!prefix.empty() && std::isdigit(prefix.back())) { + prefix.pop_back(); + } + auto env_it = env.find(full_name); + uint64_t counter = 0; + if (env_it != env.end()) { + auto v_it = visitation_order.find(full_name); + internal_assert(v_it != visitation_order.end()) + << "Func " << full_name + << " is somehow in the visitation order but not the environment."; + counter = v_it->second; + } + + items.emplace_back(prefix, counter, full_name); + } + std::sort(items.begin(), items.end()); + for (size_t i = 0; i < items.size(); i++) { + (*funcs)[i] = std::move(std::get<2>(items[i])); + } +} + } // anonymous namespace +map compute_visitation_order(const vector &outputs) { + vector funcs = called_funcs_in_order_found(outputs); + map result; + for (uint64_t i = 0; i < funcs.size(); i++) { + result[funcs[i].name()] = i; + } + return result; +} + pair, vector>> realization_order( const vector &outputs, map &env) { @@ -318,6 +374,10 @@ pair, vector>> realization_order( } } } + auto visitation_order = compute_visitation_order(outputs); + for (auto &p : graph) { + sort_funcs_by_name_and_counter(&p.second, env, visitation_order); + } // Compute the realization order of the fused groups (i.e. the dummy nodes) // and also the realization order of the functions within a fused group. @@ -376,7 +436,12 @@ vector topological_order(const vector &outputs, s.push_back(callee.first); } } - graph.emplace(caller.first, s); + graph.emplace(caller.first, std::move(s)); + } + + auto visitation_order = compute_visitation_order(outputs); + for (auto &p : graph) { + sort_funcs_by_name_and_counter(&p.second, env, visitation_order); } vector order; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index f77393a21114..9b934b768cdd 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -295,6 +295,7 @@ tests(GROUPS correctness split_fuse_rvar.cpp split_reuse_inner_name_bug.cpp split_store_compute.cpp + stable_realization_order.cpp stack_allocations.cpp stage_strided_loads.cpp stencil_chain_in_update_definitions.cpp diff --git a/test/correctness/stable_realization_order.cpp b/test/correctness/stable_realization_order.cpp new file mode 100644 index 000000000000..f62423559327 --- /dev/null +++ b/test/correctness/stable_realization_order.cpp @@ -0,0 +1,41 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +int main(int argc, char **argv) { + // Verify that the realization order is invariant to anything to do with + // unique_name counters. + + std::vector expected; + + for (int i = 0; i < 10; i++) { + std::map env; + Var x, y; + Expr s = 0; + std::vector funcs(8); + for (size_t i = 0; i < funcs.size() - 1; i++) { + funcs[i](x, y) = x + y; + s += funcs[i](x, y); + env[funcs[i].name()] = funcs[i].function(); + } + funcs.back()(x, y) = s; + env[funcs.back().name()] = funcs.back().function(); + + auto r = realization_order({funcs.back().function()}, env).first; + // Ties in the realization order are supposed to be broken by any + // alphabetical prefix of the Func name followed by time of + // definition. All the Funcs in this test have the same name, so it + // should just depend on time of definition. 
+ assert(r.size() == funcs.size()); + for (size_t i = 0; i < funcs.size(); i++) { + if (funcs[i].name() != r[i]) { + debug(0) << "Unexpected realization order: " + << funcs[i].name() << " != " << r[i] << "\n"; + } + } + } + + printf("Success!\n"); + return 0; +} From 05ae15a82983c76fffcc0a2c3f4aecfd7098d4db Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Mar 2024 09:50:19 -0800 Subject: [PATCH 080/186] Make gpu thread and block for loop names opaque (#8133) This is one of our largest remaining type of magic name. These were explicitly constructed in lots of places and then explicitly checked for with ends_with in lots of places. This PR makes the names opaque. Only CanonicalizeGPUVars.cpp knows what they are, and they don't have to be a single fixed thing as long as they're consistent within a process. Also reduced the number of GPU dimensions to three more uniformly. We were already asserting this, but there was lots of dead code in lowering passes after gpu loop validation that allowed for four. Also fixed a bug I found in is_block_uniform. It didn't consider that the dependence on a gpu thread variable in a load index could be because a let variable encountered depends on a gpu thread variable. --- src/CanonicalizeGPUVars.cpp | 43 ++++++++-------- src/CanonicalizeGPUVars.h | 7 +++ src/CodeGen_D3D12Compute_Dev.cpp | 36 +++++-------- src/CodeGen_GPU_Dev.cpp | 74 +++++++++++---------------- src/CodeGen_GPU_Dev.h | 4 -- src/CodeGen_Metal_Dev.cpp | 22 +++----- src/CodeGen_OpenCL_Dev.cpp | 22 +++----- src/CodeGen_PTX_Dev.cpp | 23 ++++----- src/CodeGen_Vulkan_Dev.cpp | 55 ++++++-------------- src/CodeGen_WebGPU_Dev.cpp | 22 +++----- src/DeviceArgument.cpp | 2 +- src/Expr.cpp | 7 +++ src/Expr.h | 3 ++ src/FuseGPUThreadLoops.cpp | 68 ++++++++++++------------ src/OffloadGPULoops.cpp | 44 ++++++---------- src/PartitionLoops.cpp | 8 +-- src/TrimNoOps.cpp | 2 +- test/correctness/fuse_gpu_threads.cpp | 10 ++-- 18 files changed, 195 insertions(+), 257 deletions(-) diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp index 7e993d7a72c1..aef1f55c5577 100644 --- a/src/CanonicalizeGPUVars.cpp +++ b/src/CanonicalizeGPUVars.cpp @@ -11,23 +11,26 @@ namespace Halide { namespace Internal { using std::map; -using std::string; using std::vector; -namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"}; - -string get_thread_name(int index) { +const std::string &gpu_thread_name(int index) { + static std::string gpu_thread_names[3] = {"." + unique_name("thread_id_x"), + "." + unique_name("thread_id_y"), + "." + unique_name("thread_id_z")}; internal_assert(index >= 0 && index < 3); - return thread_names[index]; + return gpu_thread_names[index]; } -string get_block_name(int index) { +const std::string &gpu_block_name(int index) { + static std::string gpu_block_names[3] = {"." + unique_name("block_id_x"), + "." + unique_name("block_id_y"), + "." 
+ unique_name("block_id_z")}; internal_assert(index >= 0 && index < 3); - return block_names[index]; + return gpu_block_names[index]; } +namespace { + class CountGPUBlocksThreads : public IRVisitor { using IRVisitor::visit; @@ -73,12 +76,12 @@ class CountGPUBlocksThreads : public IRVisitor { }; class CanonicalizeGPUVars : public IRMutator { - map gpu_vars; + map gpu_vars; using IRMutator::visit; - string find_replacement(const string &suffix, const string &name) { - vector v = split_string(name, suffix); + std::string find_replacement(const std::string &suffix, const std::string &name) { + vector v = split_string(name, suffix); internal_assert(v.size() == 2); const auto &iter = gpu_vars.find(v[0]); if (iter != gpu_vars.end()) { @@ -87,7 +90,7 @@ class CanonicalizeGPUVars : public IRMutator { return name; } - string canonicalize_let(const string &name) { + std::string canonicalize_let(const std::string &name) { if (ends_with(name, ".loop_max")) { return find_replacement(".loop_max", name); } else if (ends_with(name, ".loop_min")) { @@ -100,7 +103,7 @@ class CanonicalizeGPUVars : public IRMutator { } Stmt visit(const For *op) override { - string name = op->name; + std::string name = op->name; Expr min = mutate(op->min); Expr extent = mutate(op->extent); Stmt body = mutate(op->body); @@ -113,13 +116,13 @@ class CanonicalizeGPUVars : public IRMutator { op->body.accept(&counter); if (op->for_type == ForType::GPUBlock) { - name += "." + get_block_name(counter.nblocks); + name += gpu_block_name(counter.nblocks); debug(5) << "Replacing " << op->name << " with GPU block name " << name << "\n"; } else if (op->for_type == ForType::GPUThread) { - name += "." + get_thread_name(counter.nthreads); + name += gpu_thread_name(counter.nthreads); debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n"; } else if (op->for_type == ForType::GPULane) { - name += "." + get_thread_name(0); + name += gpu_thread_name(0); } if (name != op->name) { @@ -143,7 +146,7 @@ class CanonicalizeGPUVars : public IRMutator { } Stmt visit(const LetStmt *op) override { - vector> lets; + vector> lets; Stmt result; do { @@ -154,7 +157,7 @@ class CanonicalizeGPUVars : public IRMutator { result = mutate(result); for (auto it = lets.rbegin(); it != lets.rend(); it++) { - string name = canonicalize_let(it->first); + std::string name = canonicalize_let(it->first); if (name != it->first) { Expr new_var = Variable::make(Int(32), name); result = substitute(it->first, new_var, result); @@ -168,7 +171,7 @@ class CanonicalizeGPUVars : public IRMutator { Stmt visit(const IfThenElse *op) override { Expr condition = mutate(op->condition); - map old_gpu_vars; + map old_gpu_vars; old_gpu_vars.swap(gpu_vars); Stmt then_case = mutate(op->then_case); diff --git a/src/CanonicalizeGPUVars.h b/src/CanonicalizeGPUVars.h index 25d57a52dfc8..573471179a6a 100644 --- a/src/CanonicalizeGPUVars.h +++ b/src/CanonicalizeGPUVars.h @@ -15,6 +15,13 @@ namespace Internal { * by the nesting order: innermost is assigned to x and so on. */ Stmt canonicalize_gpu_vars(Stmt s); +/** Names for the thread and block id variables. Includes the leading + * dot. Indexed from inside out, so 0 gives you the innermost loop. 
*/ +// @{ +const std::string &gpu_thread_name(int index); +const std::string &gpu_block_name(int index); +// @} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 4fd614cc0dfc..4b5ea37d8a0e 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -3,6 +3,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" @@ -221,22 +222,18 @@ string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_reinterpret(Type namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "tid_in_tgroup.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "tid_in_tgroup.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "tid_in_tgroup.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "HLSL (SM5.1) does not support more than three dimensions for compute kernel threads.\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "tgroup_index.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "tgroup_index.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "tgroup_index.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "HLSL (SM5.1) does not support more than three dimensions for compute dispatch groups.\n"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -300,15 +297,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The D3D12Compute backend does not support the gpu_lanes() scheduling directive."; - if (!is_gpu_var(loop->name)) { - user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n"; + if (!is_gpu(loop->for_type)) { CodeGen_GPU_C::visit(loop); return; } - - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) @@ -1153,7 +1145,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, struct FindThreadGroupSize : public IRVisitor { using IRVisitor::visit; void visit(const For *loop) override { - if (!is_gpu_var(loop->name)) { + if (!is_gpu(loop->for_type)) { return loop->body.accept(this); } if (loop->for_type != ForType::GPUThread) { @@ -1175,13 +1167,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, loop->body.accept(this); } int thread_loop_workgroup_index(const string &name) { - string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z", - ".__thread_id_w"}; - for (auto &id : ids) { - if (ends_with(name, id)) { - return (&id - ids); + for (int i = 0; i < 3; i++) { + if (ends_with(name, gpu_thread_name(i))) { + return i; } } return -1; diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 08e456e78ce2..07148a508144 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -1,5 +1,7 
@@ #include "CodeGen_GPU_Dev.h" +#include "CanonicalizeGPUVars.h" #include "Deinterleave.h" +#include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" #include "IRVisitor.h" @@ -9,50 +11,6 @@ namespace Internal { CodeGen_GPU_Dev::~CodeGen_GPU_Dev() = default; -bool CodeGen_GPU_Dev::is_gpu_var(const std::string &name) { - return is_gpu_block_var(name) || is_gpu_thread_var(name); -} - -bool CodeGen_GPU_Dev::is_gpu_block_var(const std::string &name) { - return (ends_with(name, ".__block_id_x") || - ends_with(name, ".__block_id_y") || - ends_with(name, ".__block_id_z") || - ends_with(name, ".__block_id_w")); -} - -bool CodeGen_GPU_Dev::is_gpu_thread_var(const std::string &name) { - return (ends_with(name, ".__thread_id_x") || - ends_with(name, ".__thread_id_y") || - ends_with(name, ".__thread_id_z") || - ends_with(name, ".__thread_id_w")); -} - -namespace { -// Check to see if an expression is uniform within a block. -// This is done by checking to see if the expression depends on any GPU -// thread indices. -class IsBlockUniform : public IRVisitor { - using IRVisitor::visit; - - void visit(const Variable *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { - result = false; - } - } - -public: - bool result = true; - - IsBlockUniform() = default; -}; -} // namespace - -bool CodeGen_GPU_Dev::is_block_uniform(const Expr &expr) { - IsBlockUniform v; - expr.accept(&v); - return v.result; -} - namespace { // Check to see if a buffer is a candidate for constant memory storage. // A buffer is a candidate for constant memory if it is never written to, @@ -71,7 +29,7 @@ class IsBufferConstant : public IRVisitor { void visit(const Load *op) override { if (op->name == buffer && - !CodeGen_GPU_Dev::is_block_uniform(op->index)) { + expr_uses_vars(op->index, depends_on_thread_var)) { result = false; } if (result) { @@ -79,6 +37,32 @@ class IsBufferConstant : public IRVisitor { } } + void visit(const LetStmt *op) override { + op->value.accept(this); + ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var), + depends_on_thread_var, + op->name); + op->body.accept(this); + } + + void visit(const Let *op) override { + op->value.accept(this); + ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var), + depends_on_thread_var, + op->name); + op->body.accept(this); + } + + void visit(const For *op) override { + ScopedBinding<> bind_if(op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane, + depends_on_thread_var, + op->name); + IRVisitor::visit(op); + } + + Scope<> depends_on_thread_var; + public: bool result = true; const std::string &buffer; diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h index f6100116b955..ff80480003bc 100644 --- a/src/CodeGen_GPU_Dev.h +++ b/src/CodeGen_GPU_Dev.h @@ -55,10 +55,6 @@ struct CodeGen_GPU_Dev { return false; } - static bool is_gpu_var(const std::string &name); - static bool is_gpu_block_var(const std::string &name); - static bool is_gpu_thread_var(const std::string &name); - /** Checks if expr is block uniform, i.e. does not depend on a thread * var. 
*/ static bool is_block_uniform(const Expr &expr); diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 79060294798e..35b22058aec1 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -2,6 +2,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Metal_Dev.h" @@ -187,22 +188,18 @@ string CodeGen_Metal_Dev::CodeGen_Metal_C::print_reinterpret(Type type, const Ex namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "tid_in_tgroup.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "tid_in_tgroup.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "tid_in_tgroup.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "Metal does not support more than three dimensions in a kernel (threads).\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "tgroup_index.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "tgroup_index.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "tgroup_index.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "Metal does not support more than three dimensions in a kernel (groups).\n"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -272,10 +269,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The Metal backend does not support the gpu_lanes() scheduling directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index c86e483cc5a8..d7c7951936f3 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -4,6 +4,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_OpenCL_Dev.h" @@ -184,22 +185,18 @@ string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_reinterpret(Type type, const namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "get_local_id(0)"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "get_local_id(1)"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "get_local_id(2)"; - } else if (ends_with(name, ".__thread_id_w")) { - return "get_local_id(3)"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "get_group_id(0)"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "get_group_id(1)"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "get_group_id(2)"; - 
} else if (ends_with(name, ".__block_id_w")) { - return "get_group_id(3)"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -210,10 +207,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The OpenCL backend does not support the gpu_lanes() scheduling directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 6be2f1b7e988..0d63427b8d83 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -1,5 +1,6 @@ #include "CodeGen_PTX_Dev.h" #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" @@ -105,8 +106,8 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev { } Type upgrade_type_for_storage(const Type &t) const override; - /** Map from simt variable names (e.g. foo.__block_id_x) to the llvm - * ptx intrinsic functions to call to get them. */ + /** Map from simt variable names (e.g. foo.block_id_x) to the llvm ptx + * intrinsic functions to call to get them. */ std::string simt_intrinsic(const std::string &name); bool supports_atomic_add(const Type &t) const override; @@ -282,29 +283,25 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } string CodeGen_PTX_Dev::simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "llvm.nvvm.read.ptx.sreg.tid.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "llvm.nvvm.read.ptx.sreg.tid.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "llvm.nvvm.read.ptx.sreg.tid.z"; - } else if (ends_with(name, ".__thread_id_w")) { - return "llvm.nvvm.read.ptx.sreg.tid.w"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "llvm.nvvm.read.ptx.sreg.ctaid.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "llvm.nvvm.read.ptx.sreg.ctaid.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "llvm.nvvm.read.ptx.sreg.ctaid.z"; - } else if (ends_with(name, ".__block_id_w")) { - return "llvm.nvvm.read.ptx.sreg.ctaid.w"; } internal_error << "simt_intrinsic called on bad variable name\n"; return ""; } void CodeGen_PTX_Dev::visit(const For *loop) { - if (is_gpu_var(loop->name)) { + if (is_gpu(loop->for_type)) { Expr simt_idx = Call::make(Int(32), simt_intrinsic(loop->name), std::vector(), Call::Extern); internal_assert(is_const_zero(loop->min)); sym_push(loop->name, codegen(simt_idx)); diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 39dd65b67671..157a3cbdc9ea 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -4,6 +4,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" @@ -381,12 +382,10 @@ class CheckAlignedDenseVectorLoadStore : public IRVisitor { struct 
FindWorkGroupSize : public IRVisitor { using IRVisitor::visit; void visit(const For *loop) override { - if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - return loop->body.accept(this); - } + user_assert(loop->for_type != ForType::GPULane) + << "The Vulkan backend does not support the gpu_lanes() scheduling directive."; - if ((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) { + if (is_gpu(loop->for_type)) { // This should always be true at this point in codegen internal_assert(is_const_zero(loop->min)); @@ -411,11 +410,8 @@ struct FindWorkGroupSize : public IRVisitor { } int thread_loop_workgroup_index(const std::string &name) { - std::string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z"}; - for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { + for (size_t i = 0; i < 3; i++) { + if (ends_with(name, gpu_thread_name(i))) { return i; } } @@ -1630,20 +1626,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *stmt) { namespace { std::pair simt_intrinsic(const std::string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return {"LocalInvocationId", 0}; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return {"LocalInvocationId", 1}; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return {"LocalInvocationId", 2}; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return {"WorkgroupId", 0}; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return {"WorkgroupId", 1}; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return {"WorkgroupId", 2}; - } else if (ends_with(name, "id_w")) { - user_error << "Vulkan only supports <=3 dimensions for gpu blocks"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return {"", -1}; @@ -1654,11 +1648,7 @@ std::pair simt_intrinsic(const std::string &name) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(For): name=" << op->name << " min=" << op->min << " extent=" << op->extent << "\n"; - if (is_gpu_var(op->name)) { - internal_assert((op->for_type == ForType::GPUBlock) || - (op->for_type == ForType::GPUThread)) - << "kernel loops must be either gpu block or gpu thread\n"; - + if (is_gpu(op->for_type)) { // This should always be true at this point in codegen internal_assert(is_const_zero(op->min)); auto intrinsic = simt_intrinsic(op->name); @@ -2477,11 +2467,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func local_size_y_id, local_size_z_id}; - const char *local_size_names[3] = { - "__thread_id_x", - "__thread_id_y", - "__thread_id_z"}; - debug(1) << "Vulkan: Using dynamic workgroup local size with default of [" << local_size_x << ", " << local_size_y << ", " << local_size_z << "]...\n"; // annotate each local size with a corresponding specialization constant @@ -2489,8 +2474,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func SpvId constant_id = (uint32_t)(descriptor_set_table.back().specialization_constants.size() + 1); SpvBuilder::Literals spec_id = {constant_id}; builder.add_annotation(local_size_ids[dim], SpvDecorationSpecId, spec_id); - builder.add_symbol(local_size_names[dim], 
local_size_ids[dim], builder.current_module().id()); - SpecializationBinding spec_binding = {constant_id, (uint32_t)sizeof(uint32_t), local_size_names[dim]}; + builder.add_symbol(gpu_thread_name(dim), local_size_ids[dim], builder.current_module().id()); + SpecializationBinding spec_binding = {constant_id, (uint32_t)sizeof(uint32_t), gpu_thread_name(dim)}; descriptor_set_table.back().specialization_constants.push_back(spec_binding); descriptor_set_table.back().workgroup_size_binding.local_size_constant_id[dim] = constant_id; } @@ -2520,18 +2505,12 @@ namespace { class FindIntrinsicsUsed : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { auto intrinsic = simt_intrinsic(op->name); - intrinsics_used.insert(intrinsic.first); + intrinsics_used.insert(op->name); } op->body.accept(this); } - void visit(const Variable *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { - auto intrinsic = simt_intrinsic(op->name); - intrinsics_used.insert(intrinsic.first); - } - } public: std::unordered_set intrinsics_used; diff --git a/src/CodeGen_WebGPU_Dev.cpp b/src/CodeGen_WebGPU_Dev.cpp index de55113ff695..815013798bb4 100644 --- a/src/CodeGen_WebGPU_Dev.cpp +++ b/src/CodeGen_WebGPU_Dev.cpp @@ -4,6 +4,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_WebGPU_Dev.h" @@ -603,22 +604,18 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const FloatImm *op) { namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "local_id.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "local_id.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "local_id.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "WebGPU does not support more than three dimensions.\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "group_id.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "group_id.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "group_id.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "WebGPU does not support more than three dimensions.\n"; } internal_error << "invalid simt_intrinsic name: " << name << "\n"; return ""; @@ -646,10 +643,7 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The WebGPU backend does not support the gpu_lanes() directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() diff --git a/src/DeviceArgument.cpp b/src/DeviceArgument.cpp index 82278be273e5..104538611a65 100644 --- a/src/DeviceArgument.cpp +++ b/src/DeviceArgument.cpp @@ -65,7 +65,7 @@ void HostClosure::visit(const Call *op) { } void HostClosure::visit(const For *loop) { - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + if (is_gpu(loop->for_type)) { // The size of the threads and blocks is not part of the 
closure ScopedBinding<> p(ignore, loop->name); loop->body.accept(this); diff --git a/src/Expr.cpp b/src/Expr.cpp index a619661dedf6..c3a7deb483aa 100644 --- a/src/Expr.cpp +++ b/src/Expr.cpp @@ -87,6 +87,13 @@ bool is_parallel(ForType for_type) { for_type == ForType::GPULane); } +/** Returns true if for_type is GPUBlock, GPUThread, or GPULane. */ +bool is_gpu(ForType for_type) { + return (for_type == ForType::GPUBlock || + for_type == ForType::GPUThread || + for_type == ForType::GPULane); +} + } // namespace Internal Range::Range(const Expr &min_in, const Expr &extent_in) diff --git a/src/Expr.h b/src/Expr.h index 327462f973c0..31850fc56001 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -415,6 +415,9 @@ bool is_unordered_parallel(ForType for_type); /** Returns true if for_type executes for loop iterations in parallel. */ bool is_parallel(ForType for_type); +/** Returns true if for_type is GPUBlock, GPUThread, or GPULane. */ +bool is_gpu(ForType for_type); + /** A reference-counted handle to a statement node. */ struct Stmt : public IRHandle { Stmt() = default; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index abde50d62e1f..4294f2ebc825 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -4,6 +4,7 @@ #include "Bounds.h" #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CompilerLogger.h" #include "ExprUsesVar.h" @@ -29,17 +30,14 @@ using std::vector; namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"}; - class ExtractBlockSize : public IRVisitor { - Expr block_extent[4], block_count[4]; - string block_var_name[4]; + Expr block_extent[3], block_count[3]; + string block_var_name[3]; using IRVisitor::visit; void found_thread_for(int dim, const string &name, const Expr &extent) { - internal_assert(dim >= 0 && dim < 4); + internal_assert(dim >= 0 && dim < 3); if (!block_extent[dim].defined()) { block_extent[dim] = extent; } else { @@ -48,17 +46,17 @@ class ExtractBlockSize : public IRVisitor { } void found_block_for(int dim, const string &name, Expr extent) { - internal_assert(dim >= 0 && dim < 4); + internal_assert(dim >= 0 && dim < 3); internal_assert(!block_count[dim].defined()); block_count[dim] = std::move(extent); block_var_name[dim] = name; } void visit(const For *op) override { - for (int i = 0; i < 4; i++) { - if (ends_with(op->name, thread_names[i])) { + for (int i = 0; i < 3; i++) { + if (ends_with(op->name, gpu_thread_name(i))) { found_thread_for(i, op->name, op->extent); - } else if (ends_with(op->name, block_names[i])) { + } else if (ends_with(op->name, gpu_block_name(i))) { found_block_for(i, op->name, op->extent); } } @@ -88,21 +86,21 @@ class ExtractBlockSize : public IRVisitor { public: int blocks_dimensions() const { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { if (!block_count[i].defined()) { return i; } } - return 4; + return 3; } int threads_dimensions() const { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { if (!block_extent[i].defined()) { return i; } } - return 4; + return 3; } Expr num_threads(int d) const { @@ -114,12 +112,13 @@ class ExtractBlockSize : public IRVisitor { } Expr block_var(int d) const { + // The name of the actual for loop return Variable::make(Int(32), block_var_name[d]); } Expr thread_var(int d) const { // Thread variables get canonical names - return Variable::make(Int(32), "." 
+ thread_names[d]); + return Variable::make(Int(32), gpu_thread_name(d)); } }; @@ -142,8 +141,8 @@ class NormalizeDimensionality : public IRMutator { return s; } while (max_depth < block_size.threads_dimensions()) { - string name = thread_names[max_depth]; - s = For::make("." + name, 0, 1, ForType::GPUThread, Partition::Never, device_api, s); + s = For::make(gpu_thread_name(max_depth), 0, 1, ForType::GPUThread, + Partition::Never, device_api, s); max_depth++; } return s; @@ -166,7 +165,8 @@ class NormalizeDimensionality : public IRMutator { } Stmt visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { + if (op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane) { depth++; if (depth > max_depth) { max_depth = depth; @@ -191,10 +191,11 @@ class ReplaceForWithIf : public IRMutator { const ExtractBlockSize &block_size; Stmt visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { + if (op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane) { int dim; - for (dim = 0; dim < 4; dim++) { - if (ends_with(op->name, thread_names[dim])) { + for (dim = 0; dim < 3; dim++) { + if (ends_with(op->name, gpu_thread_name(dim))) { break; } } @@ -203,7 +204,7 @@ class ReplaceForWithIf : public IRMutator { Stmt body = mutate(op->body); - Expr var = Variable::make(Int(32), "." + thread_names[dim]); + Expr var = Variable::make(Int(32), gpu_thread_name(dim)); body = substitute(op->name, var + op->min, body); if (equal(op->extent, block_size.num_threads(dim))) { @@ -322,7 +323,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { } Stmt visit(const For *op) override { - bool is_thread_loop = CodeGen_GPU_Dev::is_gpu_thread_var(op->name); + bool is_thread_loop = op->for_type == ForType::GPUThread || op->for_type == ForType::GPULane; ScopedValue old_in_threads(in_threads, in_threads || is_thread_loop); // Set aside the allocations we've found so far. @@ -1366,7 +1367,7 @@ class FuseGPUThreadLoopsSingleKernel : public IRMutator { ExtractSharedAndHeapAllocations &block_allocations; Stmt visit(const For *op) override { - if (ends_with(op->name, ".__block_id_x")) { + if (ends_with(op->name, gpu_block_name(0))) { Stmt body = op->body; // This is the innermost loop over blocks. @@ -1407,17 +1408,17 @@ class FuseGPUThreadLoopsSingleKernel : public IRMutator { debug(3) << "Replaced for with if:\n" << body << "\n\n"; - // There is always a loop over thread_id_x - string thread_id = "." + thread_names[0]; + // There is always a loop over the innermost thread dimension + string thread_id = gpu_thread_name(0); // Add back in any register-level allocations body = register_allocs.rewrap(body, thread_id); body = For::make(thread_id, 0, block_size_x, innermost_loop_type, op->partition_policy, op->device_api, body); // Rewrap the whole thing in other loops over threads for (int i = 1; i < block_size.threads_dimensions(); i++) { - thread_id = "." + thread_names[i]; + thread_id = gpu_thread_name(i); body = register_allocs.rewrap(body, thread_id); - body = For::make("." 
+ thread_names[i], 0, block_size.num_threads(i), + body = For::make(thread_id, 0, block_size.num_threads(i), ForType::GPUThread, op->partition_policy, op->device_api, body); } thread_id.clear(); @@ -1452,14 +1453,15 @@ class FuseGPUThreadLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { - user_assert(!(CodeGen_GPU_Dev::is_gpu_thread_var(op->name))) + user_assert(!(op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane)) << "Loops over GPU thread variable: \"" << op->name << "\" is outside of any loop over a GPU block variable. " << "This schedule is malformed. There must be a GPU block " << "variable, and it must reordered to be outside all GPU " << "thread variables.\n"; - if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type == ForType::GPUBlock) { // Do the analysis of thread block size and shared memory // usage. ExtractBlockSize block_size; @@ -1498,7 +1500,7 @@ class ZeroGPULoopMins : public IRMutator { (op->device_api == DeviceAPI::Vulkan); Stmt stmt = IRMutator::visit(op); - if (CodeGen_GPU_Dev::is_gpu_var(op->name) && !is_const_zero(op->min)) { + if (is_gpu(op->for_type) && !is_const_zero(op->min)) { op = stmt.as(); internal_assert(op); Expr adjusted = Variable::make(Int(32), op->name) + op->min; @@ -1526,7 +1528,7 @@ class FindInnermostGPUBlock : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type == ForType::GPUBlock) { // Set the last found GPU block to found_gpu_block. found_gpu_block = op; } @@ -1567,7 +1569,7 @@ class NormalizeIfStatements : public IRMutator { bool inside_gpu_blocks = false; Stmt visit(const For *op) override { - if (!CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type != ForType::GPUBlock) { return IRMutator::visit(op); } ScopedValue old_inside_gpu_blocks(inside_gpu_blocks, true); diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 77a57efc1149..4a33c8f1bc00 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -1,5 +1,6 @@ #include +#include "CanonicalizeGPUVars.h" #include "Closure.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" @@ -31,13 +32,13 @@ namespace { // amount of shared memory to allocate. 
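A minimal standalone sketch of the dimension-indexed matching that the ExtractBounds visitor below adopts: loop over the three grid dimensions and compare loop names against a per-dimension canonical suffix, instead of branching on hard-coded strings. The suffix literals here are placeholders for illustration only; the real canonical names come from gpu_thread_name()/gpu_block_name() in CanonicalizeGPUVars.h.

#include <array>
#include <cassert>
#include <string>

// Placeholder canonical suffixes (illustrative only, not Halide's actual names).
static const std::array<std::string, 3> kThreadSuffix = {{".thread_id_x", ".thread_id_y", ".thread_id_z"}};

static bool ends_with(const std::string &s, const std::string &suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    std::array<int, 3> num_threads = {{1, 1, 1}};
    // Pretend we visited a loop named "f.s0.y.thread_id_y" with extent 16.
    const std::string loop_name = "f.s0.y.thread_id_y";
    const int extent = 16;
    for (int i = 0; i < 3; i++) {
        if (ends_with(loop_name, kThreadSuffix[i])) {
            num_threads[i] = extent;
        }
    }
    assert(num_threads[1] == 16);
    return 0;
}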
class ExtractBounds : public IRVisitor { public: - Expr num_threads[4]; - Expr num_blocks[4]; + Expr num_threads[3]; + Expr num_blocks[3]; Expr shared_mem_size; ExtractBounds() : shared_mem_size(0) { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { num_threads[i] = num_blocks[i] = 1; } } @@ -48,26 +49,17 @@ class ExtractBounds : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { internal_assert(is_const_zero(op->min)); } - if (ends_with(op->name, ".__thread_id_x")) { - num_threads[0] = op->extent; - } else if (ends_with(op->name, ".__thread_id_y")) { - num_threads[1] = op->extent; - } else if (ends_with(op->name, ".__thread_id_z")) { - num_threads[2] = op->extent; - } else if (ends_with(op->name, ".__thread_id_w")) { - num_threads[3] = op->extent; - } else if (ends_with(op->name, ".__block_id_x")) { - num_blocks[0] = op->extent; - } else if (ends_with(op->name, ".__block_id_y")) { - num_blocks[1] = op->extent; - } else if (ends_with(op->name, ".__block_id_z")) { - num_blocks[2] = op->extent; - } else if (ends_with(op->name, ".__block_id_w")) { - num_blocks[3] = op->extent; + for (int i = 0; i < 3; i++) { + if (ends_with(op->name, gpu_thread_name(i))) { + num_threads[i] = op->extent; + } + if (ends_with(op->name, gpu_block_name(i))) { + num_blocks[i] = op->extent; + } } op->body.accept(this); @@ -127,7 +119,7 @@ class InjectGpuOffload : public IRMutator { using IRMutator::visit; Stmt visit(const For *loop) override { - if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + if (!is_gpu(loop->for_type)) { return IRMutator::visit(loop); } @@ -142,12 +134,10 @@ class InjectGpuOffload : public IRMutator { debug(2) << "Kernel bounds: (" << bounds.num_threads[0] << ", " << bounds.num_threads[1] << ", " - << bounds.num_threads[2] << ", " - << bounds.num_threads[3] << ") threads, (" + << bounds.num_threads[2] << ") threads, (" << bounds.num_blocks[0] << ", " << bounds.num_blocks[1] << ", " - << bounds.num_blocks[2] << ", " - << bounds.num_blocks[3] << ") blocks\n"; + << bounds.num_blocks[2] << ") blocks\n"; // compute a closure over the state passed into the kernel HostClosure c; @@ -222,10 +212,6 @@ class InjectGpuOffload : public IRMutator { } arg_is_buffer.emplace_back(cast(0)); - // TODO: only three dimensions can be passed to - // cuLaunchKernel. How should we handle blkid[3]? - internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) - << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 99b7a7cc25e1..a17f5db5b7c1 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -2,6 +2,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "ExprUsesVar.h" #include "IREquality.h" @@ -566,8 +567,7 @@ class PartitionLoops : public IRMutator { } } mutation_checker{op, op->partition_policy == Partition::Always}; - ScopedValue old_in_gpu_loop(in_gpu_loop, in_gpu_loop || - CodeGen_GPU_Dev::is_gpu_var(op->name)); + ScopedValue old_in_gpu_loop(in_gpu_loop, in_gpu_loop || is_gpu(op->for_type)); // If we're inside GPU kernel, and the body contains thread // barriers or warp shuffles, it's not safe to partition loops. 
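The checks above now key off the loop's ForType rather than its name. A standalone sketch of that classification, using a reduced stand-in for Halide's ForType enum (the real one lives in Expr.h and has more members), mirroring the is_gpu() helper added in Expr.cpp above:

#include <cassert>

// Reduced stand-in for Halide::Internal::ForType (illustrative only).
enum class ForType { Serial, Parallel, Vectorized, Unrolled, GPUBlock, GPUThread, GPULane };

// Mirrors the is_gpu() helper: a loop is a GPU loop if its ForType says so,
// regardless of what the loop variable happens to be named.
bool is_gpu(ForType t) {
    return t == ForType::GPUBlock ||
           t == ForType::GPUThread ||
           t == ForType::GPULane;
}

int main() {
    assert(is_gpu(ForType::GPUThread));
    assert(is_gpu(ForType::GPULane));
    assert(!is_gpu(ForType::Parallel));
    return 0;
}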
@@ -877,12 +877,12 @@ class RenormalizeGPULoops : public IRMutator { bool old_in_gpu_loop = in_gpu_loop; Stmt stmt; - if (in_gpu_loop || CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (in_gpu_loop || is_gpu(op->for_type)) { gpu_vars.push(op->name); in_gpu_loop = true; } - if (ends_with(op->name, "__thread_id_x")) { + if (ends_with(op->name, gpu_thread_name(0))) { internal_assert(!in_thread_loop); in_thread_loop = true; stmt = IRMutator::visit(op); diff --git a/src/TrimNoOps.cpp b/src/TrimNoOps.cpp index 25c164ed44b4..bbcf0dd3fdfb 100644 --- a/src/TrimNoOps.cpp +++ b/src/TrimNoOps.cpp @@ -355,7 +355,7 @@ class TrimNoOps : public IRMutator { Stmt visit(const For *op) override { // Bounds of GPU loops can't depend on outer gpu loop vars - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { debug(3) << "TrimNoOps found gpu loop var: " << op->name << "\n"; return IRMutator::visit(op); } diff --git a/test/correctness/fuse_gpu_threads.cpp b/test/correctness/fuse_gpu_threads.cpp index 9ddba37db2b7..63361e76b928 100644 --- a/test/correctness/fuse_gpu_threads.cpp +++ b/test/correctness/fuse_gpu_threads.cpp @@ -6,8 +6,7 @@ using namespace Halide::Internal; class CheckThreadExtent : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if ((op->name == ".__thread_id_x") || (op->name == ".__thread_id_y")) { - assert(op->for_type == ForType::GPUThread); + if (op->for_type == ForType::GPUThread) { // Assert the min and extent to be 0 and 16 for this particular test case const int64_t *min = as_const_int(op->min); const int64_t *extent = as_const_int(op->extent); @@ -19,6 +18,11 @@ class CheckThreadExtent : public IRVisitor { }; int main(int argc, char **argv) { + // Canonical GPU for loop names are uniqued to make sure they don't collide + // with user-provided names. We'll test that works by trying for a collision: + unique_name("thread_id_x"); + unique_name("block_id_x"); + Target target = get_jit_target_from_environment(); if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); @@ -51,7 +55,7 @@ int main(int argc, char **argv) { .vectorize(x, 4, TailStrategy::RoundUp) .gpu_threads(x, y); - // Lower it and inspect the IR to verify the min/extent of GPU ".__thread_id_x" + // Lower it and inspect the IR to verify the min/extent of GPU thread loops Module m = consumer.compile_to_module({consumer.infer_arguments()}, "fuse_gpu_threads", target); CheckThreadExtent c; m.functions().front().body.accept(&c); From 10e07e647ccc9b1d0e0523b8c110f40722fc7525 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Tue, 5 Mar 2024 09:53:29 -0800 Subject: [PATCH 081/186] Add class template type deduction guides to avoid CTAD warning. (#8135) * Add class template type dedeuction guides to avoid CTAD warning. * Formatting. --- src/Debug.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Debug.h b/src/Debug.h index 9f47a5aebeb6..432ba07dc115 100644 --- a/src/Debug.h +++ b/src/Debug.h @@ -77,6 +77,9 @@ struct PrintSpan { : span(span) { } }; +// Class template argument deduction (CTAD) guide to prevent warnings. +template +PrintSpan(const T &) -> PrintSpan; template inline StreamT &operator<<(StreamT &stream, const PrintSpan &wrapper) { @@ -108,6 +111,9 @@ struct PrintSpanLn { : span(span) { } }; +// Class template argument deduction (CTAD) guide to prevent warnings. 
+template +PrintSpanLn(const T &) -> PrintSpanLn; template inline StreamT &operator<<(StreamT &stream, const PrintSpanLn &wrapper) { From 754e6ec9c076733971895bb7f8fe087e3bde9e11 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 6 Mar 2024 11:46:23 -0800 Subject: [PATCH 082/186] [vulkan] Add conform API methods to memory allocator to fix block allocations (#8130) * Add conform API methods to block and region allocator classes Override conform requests for Vulkan memory allocator Cleanup memory requirement constraints for Vulkan Add conform test cases to block_allocator runtime test. * Clang format/tidy pas * Fix unsigned int comparisons * Clang format pass * Fix other unsigned int comparisons * Fix mismatched template types for max() * Fix whitespace for clang format --------- Co-authored-by: Derek Gerstmann --- src/runtime/internal/block_allocator.h | 132 ++++++++----- src/runtime/internal/memory_resources.h | 4 + src/runtime/internal/region_allocator.h | 184 ++++++++++-------- src/runtime/vulkan_memory.h | 241 ++++++++++++++++++------ test/runtime/block_allocator.cpp | 189 ++++++++++++++++++- 5 files changed, 566 insertions(+), 184 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index feee56a4e531..89b1a929e79b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,10 +55,11 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,13 +87,13 @@ class BlockAllocator { int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure - BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *reserve_block_entry(void *user_context, const MemoryRequest &request); // Locates the "best-fit" block entry for the requested size, or nullptr if none was found - BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *find_block_entry(void *user_context, const MemoryRequest &request); - // Creates a new block entry and int the list - BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + // Creates a new block entry and adds it tos the list + BlockEntry *create_block_entry(void *user_context, const MemoryRequest &request); // Releases the 
block entry from being used, and makes it available for further allocations int release_block_entry(void *user_context, BlockEntry *block_entry); @@ -113,7 +114,7 @@ class BlockAllocator { bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; // Returns true if the given block is suitable for the request allocation - bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const; Config config; LinkedList block_list; @@ -162,7 +163,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "caching=" << halide_memory_caching_name(request.properties.caching) << " " << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + // Reserve a block entry for use + BlockEntry *block_entry = reserve_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" << (int32_t)(request.size) << " bytes)\n"; @@ -173,11 +175,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r halide_abort_if_false(user_context, block != nullptr); halide_abort_if_false(user_context, block->allocator != nullptr); + // Reserve an initial memory region for the block MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { // Unable to reserve region in an existing block ... create a new block and try again. - block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); + block_entry = create_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" << (int32_t)(request.size) << " bytes)\n"; @@ -299,8 +302,8 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { - if (!is_compatible_block(block, properties)) { +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const { + if (!is_compatible_block(block, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" << "block_resource=" << (void *)block << " " @@ -309,16 +312,16 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " - << "request_size=" << (uint32_t)size << " " - << "request_usage=" << halide_memory_usage_name(properties.usage) << " " - << "request_caching=" << halide_memory_caching_name(properties.caching) << " " - << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; } - if (dedicated && (block->reserved > 0)) { + if (request.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" << "block_resource=" << (void *)block << " " @@ -340,7 +343,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } size_t available = (block->memory.size - block->reserved); - if (available >= size) { + if (available >= request.size) { return true; } @@ -348,23 +351,23 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::find_block_entry(void *user_context, const MemoryRequest &request) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); - if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { + if (is_block_suitable_for_request(user_context, block, request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: found suitable block (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)block << " " << "block_size=" << (uint32_t)block->memory.size << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_entry; } @@ -375,37 +378,37 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: couldn't find suitable block! (" << "user_context=" << (void *)(user_context) << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } return block_entry; } BlockAllocator::BlockEntry * -BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::reserve_block_entry(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: reserving block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + BlockEntry *block_entry = find_block_entry(user_context, request); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: creating block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - block_entry = create_block_entry(user_context, properties, size, dedicated); + block_entry = create_block_entry(user_context, request); } if (block_entry) { @@ -449,7 +452,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator } BlockAllocator::BlockEntry * -BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::create_block_entry(void *user_context, const MemoryRequest &request) { if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" << (int32_t)(config.maximum_pool_size) << " bytes or " @@ -476,12 +479,16 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif + // Constrain the request to the a valid block allocation + MemoryRequest block_request = request; + conform(user_context, &block_request); + + // Create the block resource itself BlockResource *block = static_cast(block_entry->value); - block->memory.size = constrain_requested_size(size); + block->memory.size = block_request.size; block->memory.handle = nullptr; - block->memory.properties = properties; - block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); - block->memory.dedicated = dedicated; + block->memory.properties = block_request.properties; + block->memory.dedicated = block_request.dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); alloc_memory_block(user_context, block); @@ -561,6 +568,33 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { return actual_size; } +int BlockAllocator::conform(void *user_context, MemoryRequest *request) const { + + request->properties.nearest_multiple = max(config.nearest_multiple, request->properties.nearest_multiple); + + if (request->properties.nearest_multiple) { + size_t nm = request->properties.nearest_multiple; + request->size = (((request->size + nm - 1) / nm) * nm); // round up to nearest multiple + } + + if (config.minimum_block_size) { + request->size = ((request->size < config.minimum_block_size) ? + config.minimum_block_size : + request->size); + } + if (config.maximum_block_size) { + request->size = ((request->size > config.maximum_block_size) ? 
+ config.maximum_block_size : + request->size); + } + + if (allocators.block.conform) { + return allocators.block.conform(user_context, request); + } + + return 0; +} + bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { if (properties.caching != MemoryCaching::DefaultCaching) { if (properties.caching != block->memory.properties.caching) { diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index d41fa57304fb..0be6041519a1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -202,18 +202,22 @@ struct HalideSystemAllocatorFns { typedef int (*AllocateBlockFn)(void *, MemoryBlock *); typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*ConformBlockRequestFn)(void *, MemoryRequest *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; + ConformBlockRequestFn conform = nullptr; }; typedef int (*AllocateRegionFn)(void *, MemoryRegion *); typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*ConformBlockRegionFn)(void *, MemoryRequest *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; DeallocateRegionFn deallocate = nullptr; + ConformBlockRegionFn conform = nullptr; }; // -- diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 02c2cd7e6aa0..3588389c3747 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -46,10 +46,11 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -73,13 +74,13 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; + bool can_split(const BlockRegion *region, const MemoryRequest &request) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining - BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request); // Creates a new block region and adds it to the region list - BlockRegion *create_block_region(void *user_context, const MemoryProperties 
&properties, size_t offset, size_t size, bool dedicated); + BlockRegion *create_block_region(void *user_context, const MemoryRequest &request); // Creates a new block region and adds it to the region list int destroy_block_region(void *user_context, BlockRegion *region); @@ -137,30 +138,55 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); halide_abort_if_false(user_context, arena != nullptr); + MemoryRequest block_request = {}; + block_request.size = block->memory.size; + block_request.offset = 0; + block_request.alignment = block->memory.properties.alignment; + block_request.properties = block->memory.properties; + block_request.dedicated = block->memory.dedicated; block->allocator = this; - block->regions = create_block_region( - user_context, - block->memory.properties, - 0, block->memory.size, - block->memory.dedicated); + block->regions = create_block_region(user_context, block_request); + return 0; +} + +int RegionAllocator::conform(void *user_context, MemoryRequest *request) const { + if (allocators.region.conform) { + return allocators.region.conform(user_context, request); + } else { + size_t actual_alignment = conform_alignment(request->alignment, block->memory.properties.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, block->memory.properties.nearest_multiple); + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + } return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + + MemoryRequest region_request = request; + + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; +#endif + return nullptr; + } + size_t remaining = block->memory.size - block->reserved; - if (remaining < actual_size) { + if (remaining < region_request.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "-- requested size (" << (int32_t)(region_request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } - BlockRegion *block_region = find_block_region(user_context, request); + BlockRegion *block_region = find_block_region(user_context, region_request); if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" @@ -169,12 +195,12 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return nullptr; } - if (can_split(block_region, request.size, request.alignment)) { + if (can_split(block_region, region_request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; + << "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)"; #endif - split_block_region(user_context, block_region, request.size, request.alignment); + split_block_region(user_context, block_region, region_request); } alloc_block_region(user_context, block_region); @@ -237,8 +263,17 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! Unable to reserve memory ...\n"; +#endif + return false; + } + // skip incompatible block regions for this request - if (!is_compatible_block_region(region, request.properties)) { + if (!is_compatible_block_region(region, region_request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... incompatible properties! (" << " block_region=" << (void *)region @@ -248,16 +283,13 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); - // is the adjusted size larger than the current region? - if (actual_size > region->memory.size) { + if (region_request.size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... not enough space for adjusted size! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -265,12 +297,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } // will the adjusted size fit within the remaining unallocated space? 
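For reference, a standalone sketch of the conform-style rounding behind the "adjusted size" used in this check, with simplified signatures. Halide's actual conform_alignment(), aligned_offset() and conform_size() helpers in memory_resources.h take additional parameters; the arithmetic below only illustrates the idea of aligning the offset and rounding the size up to a nearest multiple.

#include <cassert>
#include <cstddef>

// Round an offset up to the given alignment (alignment must be a power of two).
size_t aligned_offset(size_t offset, size_t alignment) {
    return (offset + alignment - 1) & ~(alignment - 1);
}

// Round a size up to the nearest multiple of nearest_multiple.
size_t round_up(size_t size, size_t nearest_multiple) {
    return ((size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple;
}

int main() {
    // e.g. a 13-byte request at offset 5 with 8-byte alignment and a
    // 32-byte nearest_multiple becomes offset 8, size 32.
    assert(aligned_offset(5, 8) == 8);
    assert(round_up(13, 32) == 32);
    assert(round_up(33, 32) == 64);
    return 0;
}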
- if ((actual_size + block->reserved) <= block->memory.size) { + if ((region_request.size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " found suitable block region! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -411,13 +443,11 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const { + return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)); } -BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -434,33 +464,17 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t split_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); - size_t empty_size = block_region->memory.size - split_size; - -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment (" - << "requested_size=" << (uint32_t)size << " " - << "split_size=" << (uint32_t)split_size << " " - << "split_offset=" << (uint32_t)split_size << " " - << "empty_size=" << (uint32_t)empty_size << " " - << "requested_alignment=" << (uint32_t)alignment << " " - << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << "actual_alignment=" << (uint32_t)actual_alignment << ")"; -#endif + MemoryRequest split_request = request; + split_request.size = block_region->memory.size - request.size; + split_request.offset = block_region->memory.offset + request.size; #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; + << "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)"; #endif - BlockRegion *next_region = block_region->next_ptr; - BlockRegion *empty_region = create_block_region(user_context, - block_region->memory.properties, - split_offset, empty_size, - block_region->memory.dedicated); + 
BlockRegion *empty_region = create_block_region(user_context, split_request); halide_abort_if_false(user_context, empty_region != nullptr); empty_region->next_ptr = next_region; @@ -469,42 +483,52 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size -= empty_size; + block_region->memory.size -= empty_region->memory.size; return empty_region; } -BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; -#endif - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - - if (actual_size == 0) { - error(user_context) << "RegionAllocator: Failed to allocate new block region ... region size was zero!\n"; + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "alignment=" << (uint32_t)request.properties.alignment << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; +#endif + + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform request for new block region!\n"; +#endif + return nullptr; + } + + if (region_request.size == 0) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; +#endif return nullptr; } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { - error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#endif return nullptr; } block_region->memory.handle = nullptr; - block_region->memory.offset = actual_offset; - block_region->memory.size = actual_size; - block_region->memory.properties = properties; - block_region->memory.dedicated = dedicated; + block_region->memory.offset = region_request.offset; + block_region->memory.size = region_request.size; + block_region->memory.properties = region_request.properties; + block_region->memory.dedicated = region_request.dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; block_region->usage_count = 0; @@ -669,6 +693,8 @@ bool RegionAllocator::collect(void *user_context) { uint32_t collected_count = 0; uint32_t remaining_count = 0; + uint64_t available_bytes = 0; + uint64_t scanned_bytes = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " @@ -679,6 +705,8 @@ bool RegionAllocator::collect(void *user_context) { bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + scanned_bytes += block_region->memory.size; debug(user_context) << " checking region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -687,6 +715,7 @@ bool RegionAllocator::collect(void *user_context) { << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")"; +#endif if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -705,6 +734,9 @@ bool RegionAllocator::collect(void *user_context) { remaining_count++; #endif } +#ifdef DEBUG_RUNTIME_INTERNAL + available_bytes += is_available(block_region) ? 
block_region->memory.size : 0; +#endif if (is_last_block_region(user_context, block_region)) { break; } @@ -715,6 +747,8 @@ bool RegionAllocator::collect(void *user_context) { << "block_ptr=" << (void *)block << " " << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " << "block_reserved=" << (uint32_t)(block->reserved) << " " + << "scanned_bytes=" << (uint32_t)(scanned_bytes) << " " + << "available_bytes=" << (uint32_t)(available_bytes) << " " << ")"; #endif diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 96535f3446ba..055fbef72277 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,11 +58,12 @@ class VulkanMemoryAllocator { static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods - MemoryRegion *reserve(void *user_context, MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + int conform(void *user_context, MemoryRequest *request); //< conforms the given memory request into one that can be allocated + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,9 +87,11 @@ class VulkanMemoryAllocator { static int allocate_block(void *instance_ptr, MemoryBlock *block); static int deallocate_block(void *instance_ptr, MemoryBlock *block); + static int conform_block_request(void *instance_ptr, MemoryRequest *request); static int allocate_region(void *instance_ptr, MemoryRegion *region); static int deallocate_region(void *instance_ptr, MemoryRegion *region); + static int conform_region_request(void *instance_ptr, MemoryRequest *request); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -113,6 +116,8 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; + int lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements); + size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -180,8 +185,8 @@ int VulkanMemoryAllocator::initialize(void *user_context, block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; - allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; - allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block, VulkanMemoryAllocator::conform_block_request}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region, VulkanMemoryAllocator::conform_region_request}; BlockAllocator::Config block_allocator_config = {0}; 
block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; @@ -202,7 +207,7 @@ int VulkanMemoryAllocator::initialize(void *user_context, return halide_error_code_success; } -MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, const MemoryRequest &request) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " @@ -272,6 +277,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid memory range !\n"; return nullptr; } +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: MapMemory (" << "user_context=" << user_context << "\n" << " region_size=" << (uint32_t)region->size << "\n" @@ -279,8 +285,8 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n" << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n" << " memory_offset=" << (uint32_t)memory_offset << "\n" - << " memory_size=" << (uint32_t)memory_size << ") ...\n"; - + << " memory_size=" << (uint32_t)memory_size << "\n)\n"; +#endif VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -528,6 +534,79 @@ VulkanMemoryAllocator::default_config() { } // -- +int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements (" + << "user_context=" << user_context << " " + << "size=" << (uint32_t)block->size << ", " + << "usage_flags=" << usage_flags << ") ... 
\n"; +#endif + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + // Create a buffer to determine alignment requirements + VkBuffer buffer = {0}; + VkResult result = vkCreateBuffer(this->device, &create_info, this->alloc_callbacks, &buffer); + if (result != VK_SUCCESS) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer to find requirements!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; +#endif + return halide_error_code_device_malloc_failed; + } + + vkGetBufferMemoryRequirements(this->device, buffer, memory_requirements); + vkDestroyBuffer(this->device, buffer, this->alloc_callbacks); + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_block_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming block request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(request) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform block request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = instance->select_memory_usage(user_context, request->properties); + int error_code = instance->lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)request->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (request->dedicated ? 
"true" : "false") << ")\n"; +#endif + + request->size = memory_requirements.size; + request->properties.alignment = memory_requirements.alignment; + return halide_error_code_success; +} int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -587,53 +666,6 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; #endif - uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); - - VkBufferCreateInfo create_info = { - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type - nullptr, // struct extending this - 0, // create flags - sizeof(uint32_t), // buffer size (in bytes) - usage_flags, // buffer usage flags - VK_SHARING_MODE_EXCLUSIVE, // sharing mode - 0, nullptr}; - - // Create a buffer to determine alignment requirements - VkBuffer buffer = {0}; - result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); - if (result != VK_SUCCESS) { - debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return halide_error_code_device_malloc_failed; - } - - VkMemoryRequirements memory_requirements = {0}; - vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); - vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); - -#if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "required_size=" << (uint32_t)memory_requirements.size << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; -#endif - - // Enforce any alignment constrainst reported by the device limits for each usage type - if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; - } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; - } - // Some drivers appear to report a buffer alignment constraint (regardless of usage) that can be larger than either of the above - if (memory_requirements.alignment > block->properties.alignment) { - block->properties.alignment = memory_requirements.alignment; - } - if (memory_requirements.alignment > block->properties.nearest_multiple) { - block->properties.nearest_multiple = memory_requirements.alignment; - } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -814,6 +846,98 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- +int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) { + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. 
+ // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = select_memory_usage(user_context, request->properties); + int error_code = lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + // Enforce any alignment constraints reported by the device limits for each usage type + if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minStorageBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minStorageBufferOffsetAlignment; + } + } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minUniformBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minUniformBufferOffsetAlignment; + } + } + + // Ensure the request ends on an aligned address + if (request->alignment > config.nearest_multiple) { + request->properties.nearest_multiple = request->alignment; + } + + size_t actual_alignment = conform_alignment(request->alignment, memory_requirements.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, memory_requirements.size, actual_alignment, request->properties.nearest_multiple); + +#if defined(HL_VK_DEBUG_MEM) + if ((request->size != actual_size) || (request->alignment != actual_alignment) || (request->offset != actual_offset)) { + debug(nullptr) << "VulkanMemoryAllocator: Adjusting request to match requirements (\n" + << " size = " << (uint64_t)request->size << " => " << (uint64_t)actual_size << ",\n" + << " alignment = " << (uint64_t)request->alignment << " => " << (uint64_t)actual_alignment << ",\n" + << " offset = " << (uint64_t)request->offset << " => " << (uint64_t)actual_offset << ",\n" + << " required.size = " << (uint64_t)memory_requirements.size << ",\n" + << " required.alignment = " << (uint64_t)memory_requirements.alignment << "\n)\n"; + } +#endif + request->size = actual_size; + request->alignment = actual_alignment; + request->offset = actual_offset; + + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming region request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(region) << ") ... 
\n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform region request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanRegionAllocator: Conforming region request (" + << "size=" << (uint32_t)request->size << ", " + << "offset=" << (uint32_t)request->offset << ", " + << "dedicated=" << (request->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request->properties.usage) << " " + << "caching=" << halide_memory_caching_name(request->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request->properties.visibility) << ")\n"; +#endif + + return instance->conform(user_context, request); +} + int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -890,7 +1014,8 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg if (memory_requirements.size > region->size) { vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); #ifdef DEBUG_RUNTIME - debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" + << (uint64_t)region->size << " => " << (uint64_t)memory_requirements.size << " bytes) ...\n"; #endif create_info.size = memory_requirements.size; VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b2190f63b592..26ce8066e118 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -1,3 +1,7 @@ +// NOTE: Uncomment the following two defines to enable debug output +// #define DEBUG_RUNTIME +// #define DEBUG_RUNTIME_INTERNAL + #include "HalideRuntime.h" #include "common.h" @@ -39,6 +43,17 @@ int deallocate_block(void *user_context, MemoryBlock *block) { return halide_error_code_success; } +int conform_block(void *user_context, MemoryRequest *request) { + + debug(user_context) << "Test : conform_block (" + << "request_size=" << int32_t(request->size) << " " + << "request_offset=" << int32_t(request->offset) << " " + << "request_alignment=" << int32_t(request->alignment) << " " + << ") ..."; + + return halide_error_code_success; +} + int allocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)1; allocated_region_memory += region->size; @@ -65,17 +80,38 @@ int deallocate_region(void *user_context, MemoryRegion *region) { return halide_error_code_success; } +int conform_region(void *user_context, MemoryRequest *request) { + size_t actual_alignment = conform_alignment(request->alignment, 0); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, actual_alignment); + + debug(user_context) << "Test : conform_region (\n " + << "request_size=" << int32_t(request->size) << "\n " + << "request_offset=" << int32_t(request->offset) << "\n " + << "request_alignment=" << int32_t(request->alignment) << "\n " + << "actual_size=" << int32_t(actual_size) << "\n " + << "actual_offset=" << int32_t(actual_offset) << "\n " + << "actual_alignment=" << 
int32_t(actual_alignment) << "\n" + << ") ..."; + + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + return halide_error_code_success; +} + } // end namespace int main(int argc, char **argv) { void *user_context = (void *)1; SystemMemoryAllocatorFns system_allocator = {allocate_system, deallocate_system}; - MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; - MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; // test region allocator class interface { + // Use custom conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; BlockResource block_resource = {}; @@ -164,8 +200,104 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test region allocator conform request + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + // test zero size request + MemoryRequest request = {0}; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=" << int32_t(request.size) << " " + << "request.alignment=" << int32_t(request.alignment) << " " + << ") ..."; + + halide_abort_if_false(user_context, request.size == size_t(0)); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + size_t nm = padded_size; + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + // test round up size and offset to alignment + request.size = 1; + request.offset = 1; + request.alignment = 32; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size == 32); + halide_abort_if_false(user_context, request.offset == 32); + halide_abort_if_false(user_context, request.alignment == 32); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t os = 1; os < sz; ++os) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.offset = os; + 
request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.offset=(" << os << " => " << int32_t(request.offset) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.offset == aligned_offset(os, a)); + halide_abort_if_false(user_context, request.alignment == a); + } + } + } + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // test region allocator nearest_multiple padding { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; size_t padded_size = 32; @@ -245,6 +377,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -296,11 +431,58 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test conform request + { + uint32_t mbs = 1024; // min block size + BlockAllocator::Config config = {0}; + config.minimum_block_size = mbs; + + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 0); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : block_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(mbs, (((sz + a - 1) / a) * a))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + BlockAllocator::destroy(user_context, instance); + 
HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -340,6 +522,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); From 22868a4db5f3a3a142ed7bc457fd9fd9ee6bdd76 Mon Sep 17 00:00:00 2001 From: Prasoon Mishra Date: Thu, 7 Mar 2024 03:10:00 +0530 Subject: [PATCH 083/186] Add sobel in hexagon benchmarks app for CMake builds (#8127) * Add sobel in hexagon_benchmarks app for CMake builds Resolved compilation errors caused by the eliminate interleave pass, which changed the instruction from halide.hexagon.pack_satub.vuh to halide.hexagon.trunc_satub.vuh. The latter is only available in v65 or later. This commit ensures compatibility with v65 and later versions. * Minor fix to address the issue. --------- Co-authored-by: Steven Johnson --- apps/hexagon_benchmarks/CMakeLists.txt | 9 ++-- apps/hexagon_benchmarks/process.cpp | 5 +- src/HexagonOptimize.cpp | 66 ++++++++++++++++---------- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt index 9cbcc541b76a..c01ad22035bd 100644 --- a/apps/hexagon_benchmarks/CMakeLists.txt +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -22,23 +22,24 @@ endmacro() add_generator_and_library(dilate3x3) add_generator_and_library(gaussian5x5) add_generator_and_library(median3x3) +add_generator_and_library(sobel) # Main executable add_executable(process process.cpp) target_compile_options(process PRIVATE $<$:-O2>) if (Halide_TARGET MATCHES "hvx") - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL TARGET_HAS_HVX) else() - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL) endif() target_link_libraries(process PRIVATE Halide::Tools - dilate3x3 gaussian5x5 median3x3) + dilate3x3 gaussian5x5 median3x3 sobel) # Test that the app actually works! add_test(NAME hexagon_benchmarks COMMAND process -n 1) set_tests_properties(hexagon_benchmarks PROPERTIES LABELS hexagon_benchmarks PASS_REGULAR_EXPRESSION "Success!" 
- SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") + SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") \ No newline at end of file diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 87a492c577d1..def519963ad0 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -43,10 +43,11 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); + SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { @@ -85,4 +86,4 @@ int main(int argc, char **argv) { printf("Success!\n"); return 0; -} +} \ No newline at end of file diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index deabd95d1d1b..f11fa3348399 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1685,6 +1685,14 @@ class EliminateInterleaves : public IRMutator { return true; } + // Indicates the minimum Hexagon Vector Extension (HVX) target version required for using these instructions. + enum class HvxTarget { + v62orLater, // Use for Hexagon v62 target or later + v65orLater, // Use for Hexagon v65 target or later + v66orLater, // Use for Hexagon v66 target or later + }; + HvxTarget hvx_target; + Expr visit(const Call *op) override { vector args(op->args); @@ -1702,27 +1710,27 @@ class EliminateInterleaves : public IRMutator { // does not deinterleave, and then opportunistically select // the interleaving alternative when we can cancel out to the // interleave. - static std::map deinterleaving_alts = { - {"halide.hexagon.pack.vh", "halide.hexagon.trunc.vh"}, - {"halide.hexagon.pack.vw", "halide.hexagon.trunc.vw"}, - {"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"}, - {"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"}, - {"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"}, - {"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"}, - {"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"}, - {"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"}, - {"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"}, + static std::map> deinterleaving_alts = { + {"halide.hexagon.pack.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc.vh"}}, + {"halide.hexagon.pack.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc.vw"}}, + {"halide.hexagon.packhi.vh", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vh"}}, + {"halide.hexagon.packhi.vw", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vw"}}, + {"halide.hexagon.pack_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc_satub.vh"}}, + {"halide.hexagon.pack_satub.vuh", {HvxTarget::v65orLater, "halide.hexagon.trunc_satub.vuh"}}, + {"halide.hexagon.pack_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_sath.vw"}}, + {"halide.hexagon.pack_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vw"}}, + {"halide.hexagon.pack_satuh.vuw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vuw"}}, }; // The reverse mapping of the above. 
- static std::map interleaving_alts = { - {"halide.hexagon.trunc.vh", "halide.hexagon.pack.vh"}, - {"halide.hexagon.trunc.vw", "halide.hexagon.pack.vw"}, - {"halide.hexagon.trunclo.vh", "halide.hexagon.packhi.vh"}, - {"halide.hexagon.trunclo.vw", "halide.hexagon.packhi.vw"}, - {"halide.hexagon.trunc_satub.vh", "halide.hexagon.pack_satub.vh"}, - {"halide.hexagon.trunc_sath.vw", "halide.hexagon.pack_sath.vw"}, - {"halide.hexagon.trunc_satuh.vw", "halide.hexagon.pack_satuh.vw"}, + static std::map> interleaving_alts = { + {"halide.hexagon.trunc.vh", {HvxTarget::v62orLater, "halide.hexagon.pack.vh"}}, + {"halide.hexagon.trunc.vw", {HvxTarget::v62orLater, "halide.hexagon.pack.vw"}}, + {"halide.hexagon.trunclo.vh", {HvxTarget::v62orLater, "halide.hexagon.packhi.vh"}}, + {"halide.hexagon.trunclo.vw", {HvxTarget::v62orLater, "halide.hexagon.packhi.vw"}}, + {"halide.hexagon.trunc_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.pack_satub.vh"}}, + {"halide.hexagon.trunc_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_sath.vw"}}, + {"halide.hexagon.trunc_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_satuh.vw"}}, }; if (is_native_deinterleave(op) && yields_interleave(args[0])) { @@ -1738,7 +1746,8 @@ class EliminateInterleaves : public IRMutator { op->func, op->value_index, op->image, op->param); // Add the interleave back to the result of the call. return native_interleave(expr); - } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && + } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && hvx_target >= deinterleaving_alts[op->name].first && + yields_removable_interleave(args)) { // This call has a deinterleaving alternative, and the // arguments are interleaved, so we should use the @@ -1746,14 +1755,14 @@ class EliminateInterleaves : public IRMutator { for (Expr &i : args) { i = remove_interleave(i); } - return Call::make(op->type, deinterleaving_alts[op->name], args, op->call_type); - } else if (interleaving_alts.count(op->name) && is_native_deinterleave(args[0])) { + return Call::make(op->type, deinterleaving_alts[op->name].second, args, op->call_type); + } else if (interleaving_alts.count(op->name) && hvx_target >= interleaving_alts[op->name].first && is_native_deinterleave(args[0])) { // This is an interleaving alternative with a // deinterleave, which can be generated when we // deinterleave storage. Revert back to the interleaving // op so we can remove the deinterleave. Expr arg = args[0].as()->args[0]; - return Call::make(op->type, interleaving_alts[op->name], {arg}, op->call_type, + return Call::make(op->type, interleaving_alts[op->name].second, {arg}, op->call_type, op->func, op->value_index, op->image, op->param); } else if (changed) { return Call::make(op->type, op->name, args, op->call_type, @@ -1896,8 +1905,15 @@ class EliminateInterleaves : public IRMutator { using IRMutator::visit; public: - EliminateInterleaves(int native_vector_bytes) + EliminateInterleaves(const Target &t, int native_vector_bytes) : native_vector_bits(native_vector_bytes * 8), alignment_analyzer(native_vector_bytes) { + if (t.features_any_of({Target::HVX_v65})) { + hvx_target = HvxTarget::v65orLater; + } else if (t.features_any_of({Target::HVX_v66})) { + hvx_target = HvxTarget::v66orLater; + } else { + hvx_target = HvxTarget::v62orLater; + } } }; @@ -2233,7 +2249,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { << s << "\n"; // Try to eliminate any redundant interleave/deinterleave pairs. 
- s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s); + s = EliminateInterleaves(t, t.natural_vector_size(Int(8))).mutate(s); debug(4) << "Hexagon: Lowering after EliminateInterleaves\n" << s << "\n"; @@ -2246,4 +2262,4 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { } } // namespace Internal -} // namespace Halide +} // namespace Halide \ No newline at end of file From 8cc4f02c94184da567dd5b653ca377bd3523c5ae Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Mar 2024 02:13:56 +0000 Subject: [PATCH 084/186] Fix for top-of-tree LLVM (#8145) --- src/CodeGen_Internal.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 78fc4224fb61..697b9200fa33 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -610,7 +610,11 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options options.UseInitArray = true; options.FloatABIType = use_soft_float_abi ? llvm::FloatABI::Soft : llvm::FloatABI::Hard; +#if LLVM_VERSION >= 190 + options.MCOptions.X86RelaxRelocations = false; +#else options.RelaxELFRelocations = false; +#endif options.MCOptions.ABIName = mabi; } From 009fe7a15ffd6707ce15bc380e41ad66968d9bfa Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 8 Mar 2024 08:50:20 -0800 Subject: [PATCH 085/186] Handle loads of broadcasts in FlattenNestedRamps (#8139) With sufficiently perverse schedules, it's possible to end up with a load of a broadcast index (rather than a broadcast of a scalar load). This made FlattenNestedRamps divide by zero. Unfortunately this happened in a complex production pipeline, so I'm not entirely sure how to reproduce it. For that pipeline, this change fixes it and produces correct output. --- src/FlattenNestedRamps.cpp | 42 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index f48bd75c37a2..92bcf3870d5d 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -81,19 +81,19 @@ class FlattenRamps : public IRMutator { // If they are, we'll have a full vector of const_indices if ((int)const_indices.size() == lanes) { - // Compute the stride for the underlying strided load - int stride = 0; - for (int c : const_indices) { - stride = (int)gcd(stride, c); - } - for (int &c : const_indices) { - c /= stride; + int stride = 0, extent = 1; + if (max_constant_offset > 0) { + for (int c : const_indices) { + stride = (int)gcd(stride, c); + } + for (int &c : const_indices) { + c /= stride; + } + // Compute the number of elements loaded + extent = (int)((max_constant_offset / stride) + 1); } - // Compute the number of elements loaded - int extent = (int)((max_constant_offset / stride) + 1); - // If we're gathering from a very large range, it // might be better to just do the gather rather than // doing a big dense load and then shuffling. We @@ -105,12 +105,22 @@ class FlattenRamps : public IRMutator { // in the schedule somehow. const int max_unused_lane_factor = 4; if (extent < max_unused_lane_factor * lanes) { - Expr dense_index = Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); - Expr dense_load = - Load::make(op->type.with_lanes(extent), op->name, dense_index, - op->image, op->param, - const_true(extent), ModulusRemainder{}); - return Shuffle::make({dense_load}, const_indices); + if (max_constant_offset == 0) { + // It's a load of a broadcast. 
Convert it to a broadcast of a load + Expr load = Load::make(op->type.element_of(), op->name, min_lane, + op->image, op->param, + const_true(), ModulusRemainder{}); + return Broadcast::make(load, lanes); + } else { + // Turn it into a dense load and a shuffle + Expr dense_index = + Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); + Expr dense_load = + Load::make(op->type.with_lanes(extent), op->name, dense_index, + op->image, op->param, + const_true(extent), ModulusRemainder{}); + return Shuffle::make({dense_load}, const_indices); + } } } } From 3c2d8099451521d9f1e1eb3632b31b2d7bc29310 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 11 Mar 2024 17:05:44 -0700 Subject: [PATCH 086/186] Use python itself to get the extension suffix, not python-config (#8148) * Use python itself to get the extension suffix, not python-config * Add a comment --- apps/onnx/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/onnx/Makefile b/apps/onnx/Makefile index f714b0254b75..5188c1c85068 100644 --- a/apps/onnx/Makefile +++ b/apps/onnx/Makefile @@ -90,7 +90,12 @@ ifeq ($(UNAME), Darwin) # Keep OS X builds from complaining about missing libpython symbols PYBIND11_CFLAGS += -undefined dynamic_lookup endif -PY_EXT = $(shell $(PYTHON)-config --extension-suffix) +# Get the python extension module suffix from python itself. You can +# also do this with python-config, but that's not resistant to version +# mismatches between python and python-config. This can happen when +# using a virtualenv, because virtualenvs override python, but not +# python-config. +PY_EXT = $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_config_var("EXT_SUFFIX"))') PY_MODEL_EXT = model_cpp$(PY_EXT) PYCXXFLAGS = $(PYBIND11_CFLAGS) $(CXXFLAGS) -Wno-deprecated-register From bf0d61149dde511f39b950689c2a08af7078e88b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 Mar 2024 09:49:26 -0700 Subject: [PATCH 087/186] Rewrite the pass that adds mutexes for atomic nodes (#8105) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. 
Scopes are different enough from maps that the interface really needs to be distinct. * Pacify clang-tidy * Rewrite the pass that injects mutexes to support atomics For O(n) nested allocate nodes, this pass was quadratic in n, even if there was no use of atomics. This commit rewrites it to use a linear-time algorithm, and skips it entirely after the first validation pass if there aren't any atomic nodes. It also needlessly used IRGraphMutators, which slowed things down, didn't handle LargeBuffers (could overflow in the allocation), incorrectly thought every producer/consumer node was associated with an output buffer, and didn't print the realization name when printing the atomic node (the body of an atomic node is only atomic w.r.t. a specific realization). I noticed all this because it stuck out in a profile. For resnet 50, the rewrite that changed to a linear algorithm took this stage from 185ms down to 6.7ms, and then skipping it entirely when it doesn't find any atomic nodes added 1.5 for the single IRVisitor check. For local laplacian with 100 pyramid levels (which contains many nested allocate nodes due to a large number of skip connections), the times are 5846 ms -> 16 ms -> 4.6 ms This is built on top of #8103 * Fix unintentional mutation of interval in scope --------- Co-authored-by: Steven Johnson --- src/AddAtomicMutex.cpp | 216 ++++++++++++++------------- src/AddAtomicMutex.h | 2 +- src/IRPrinter.cpp | 9 +- src/Lower.cpp | 2 +- src/runtime/HalideRuntime.h | 2 +- src/runtime/fake_thread_pool.cpp | 2 +- src/runtime/synchronization_common.h | 2 +- 7 files changed, 119 insertions(+), 116 deletions(-) diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index a2bf990e38f6..cf3b0ae8bb89 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -1,5 +1,4 @@ #include "AddAtomicMutex.h" - #include "ExprUsesVar.h" #include "Func.h" #include "IREquality.h" @@ -11,14 +10,10 @@ namespace Halide { namespace Internal { -using std::map; -using std::set; -using std::string; - namespace { /** Collect names of all stores matching the producer name inside a statement. */ -class CollectProducerStoreNames : public IRGraphVisitor { +class CollectProducerStoreNames : public IRVisitor { public: CollectProducerStoreNames(const std::string &producer_name) : producer_name(producer_name) { @@ -27,12 +22,12 @@ class CollectProducerStoreNames : public IRGraphVisitor { Scope store_names; protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { - // This is a Store for the desginated Producer. + // This is a Store for the designated Producer. store_names.push(op->name); } } @@ -42,7 +37,7 @@ class CollectProducerStoreNames : public IRGraphVisitor { /** Find Store inside of an Atomic node for the designated producer * and return their indices. */ -class FindProducerStoreIndex : public IRGraphVisitor { +class FindProducerStoreIndex : public IRVisitor { public: FindProducerStoreIndex(const std::string &producer_name) : producer_name(producer_name) { @@ -51,11 +46,11 @@ class FindProducerStoreIndex : public IRGraphVisitor { Expr index; // The returned index. protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; // Need to also extract the let bindings of a Store index. void visit(const Let *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. 
+ IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -63,7 +58,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } } void visit(const LetStmt *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -72,7 +67,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { // This is a Store for the designated producer. @@ -94,11 +89,13 @@ class FindProducerStoreIndex : public IRGraphVisitor { /** Throws an assertion for cases where the indexing on left-hand-side of * an atomic update references to itself. * e.g. f(clamp(f(r), 0, 100)) = f(r) + 1 should be rejected. */ -class CheckAtomicValidity : public IRGraphVisitor { +class CheckAtomicValidity : public IRVisitor { protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Atomic *op) override { + any_atomic = true; + // Collect the names of all Store nodes inside. CollectProducerStoreNames collector(op->producer_name); op->body.accept(&collector); @@ -115,13 +112,16 @@ class CheckAtomicValidity : public IRGraphVisitor { } op->body.accept(this); } + +public: + bool any_atomic = false; }; /** Search if the value of a Store node has a variable pointing to a let binding, * where the let binding contains the Store location. Use for checking whether * we need a mutex lock for Atomic since some lowering pass before lifted a let * binding from the Store node (currently only SplitTuple would do this). */ -class FindAtomicLetBindings : public IRGraphVisitor { +class FindAtomicLetBindings : public IRVisitor { public: FindAtomicLetBindings(const Scope &store_names) : store_names(store_names) { @@ -133,18 +133,18 @@ class FindAtomicLetBindings : public IRGraphVisitor { using IRVisitor::visit; void visit(const Let *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } void visit(const LetStmt *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } @@ -159,19 +159,19 @@ class FindAtomicLetBindings : public IRGraphVisitor { } void visit(const Store *op) override { - include(op->predicate); + op->predicate.accept(this); + op->index.accept(this); if (store_names.contains(op->name)) { // If we are in a designated store and op->value has a let binding // that uses one of the store_names, we found a lifted let. - ScopedValue old_inside_store(inside_store, op->name); - include(op->value); + ScopedValue old_inside_store(inside_store, op->name); + op->value.accept(this); } else { - include(op->value); + op->value.accept(this); } - include(op->index); } - string inside_store; + std::string inside_store; const Scope &store_names; Scope let_bindings; }; @@ -179,7 +179,7 @@ class FindAtomicLetBindings : public IRGraphVisitor { /** Clear out the Atomic node's mutex usages if it doesn't need one. 
*/ class RemoveUnnecessaryMutexUse : public IRMutator { public: - set remove_mutex_lock_names; + std::set remove_mutex_lock_names; protected: using IRMutator::visit; @@ -200,30 +200,30 @@ class RemoveUnnecessaryMutexUse : public IRMutator { remove_mutex_lock_names.insert(op->mutex_name); Stmt body = mutate(op->body); return Atomic::make(op->producer_name, - string(), + std::string{}, std::move(body)); } } }; /** Find Store inside an Atomic that matches the provided store_names. */ -class FindStoreInAtomicMutex : public IRGraphVisitor { +class FindStoreInAtomicMutex : public IRVisitor { public: - using IRGraphVisitor::visit; + using IRVisitor::visit; FindStoreInAtomicMutex(const std::set &store_names) : store_names(store_names) { } bool found = false; - string producer_name; - string mutex_name; + std::string producer_name; + std::string mutex_name; protected: void visit(const Atomic *op) override { if (!found && !op->mutex_name.empty()) { ScopedValue old_in_atomic_mutex(in_atomic_mutex, true); - include(op->body); + op->body.accept(this); if (found) { // We found a Store inside Atomic with matching name, // record the mutex information. @@ -231,7 +231,7 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { mutex_name = op->mutex_name; } } else { - include(op->body); + op->body.accept(this); } } @@ -241,11 +241,11 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { found = true; } } - IRGraphVisitor::visit(op); + IRVisitor::visit(op); } bool in_atomic_mutex = false; - const set &store_names; + const std::set &store_names; }; /** Replace the indices in the Store nodes with the specified variable. */ @@ -276,26 +276,32 @@ class ReplaceStoreIndexWithVar : public IRMutator { /** Add mutex allocation & lock & unlock if required. */ class AddAtomicMutex : public IRMutator { public: - AddAtomicMutex(const map &env) - : env(env) { + AddAtomicMutex(const std::vector &o) { + for (const Function &f : o) { + outputs.emplace(f.name(), f); + } } protected: using IRMutator::visit; - const map &env; - // The set of producers that have allocated a mutex buffer - set allocated_mutexes; + // Maps from a producer name to a mutex name, for all encountered atomic + // nodes. + Scope needs_mutex_allocation; - Stmt allocate_mutex(const string &mutex_name, Expr extent, Stmt body) { + // Pipeline outputs + std::map outputs; + + Stmt allocate_mutex(const std::string &mutex_name, Expr extent, Stmt body) { Expr mutex_array = Call::make(type_of(), "halide_mutex_array_create", {std::move(extent)}, Call::Extern); + // Allocate a scalar of halide_mutex_array. // This generates halide_mutex_array mutex[1]; body = Allocate::make(mutex_name, - Handle(), + type_of(), MemoryType::Stack, {}, const_true(), @@ -309,37 +315,44 @@ class AddAtomicMutex : public IRMutator { // If this Allocate node is allocating a buffer for a producer, // and there is a Store node inside of an Atomic node requiring mutex lock // matching the name of the Allocate, allocate a mutex lock. - set store_names{op->name}; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. 
- return IRMutator::visit(op); + Stmt body = mutate(op->body); + + std::string producer_name; + if (ends_with(op->name, ".0")) { + producer_name = op->name.substr(0, op->name.size() - 2); + } else { + producer_name = op->name; } - allocated_mutexes.insert(finder.mutex_name); + if (const std::string *mutex_name = needs_mutex_allocation.find(producer_name)) { + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (const Expr &e : op->extents) { + extent = extent * e; + } - const string &mutex_name = finder.mutex_name; - Stmt body = mutate(op->body); - Expr extent = Expr(1); - for (const Expr &e : op->extents) { - extent = extent * e; + body = allocate_mutex(*mutex_name, extent, body); + + // At this stage in lowering it should be impossible to have an + // allocation that shadows the name of an outer allocation, but may as + // well handle it anyway by using a scope and popping at each allocate + // node. + needs_mutex_allocation.pop(producer_name); + } + + if (body.same_as(op->body)) { + return op; + } else { + return Allocate::make(op->name, + op->type, + op->memory_type, + op->extents, + op->condition, + std::move(body), + op->new_expr, + op->free_function, + op->padding); } - body = allocate_mutex(mutex_name, extent, body); - return Allocate::make(op->name, - op->type, - op->memory_type, - op->extents, - op->condition, - std::move(body), - op->new_expr, - op->free_function, - op->padding); } Stmt visit(const ProducerConsumer *op) override { @@ -348,50 +361,35 @@ class AddAtomicMutex : public IRMutator { // buffer at the producer node. if (!op->is_producer) { - // This is a consumer. + // This is a consumer return IRMutator::visit(op); } - // Find the corresponding output. - auto func_it = env.find(op->name); - if (func_it == env.end()) { - // Not an output. - return IRMutator::visit(op); - } - Func f = Func(func_it->second); - if (f.output_buffers().empty()) { - // Not an output. + auto it = outputs.find(op->name); + if (it == outputs.end()) { + // Not an output return IRMutator::visit(op); } - set store_names; - for (const auto &buffer : f.output_buffers()) { - store_names.insert(buffer.name()); - } + Function f = it->second; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } + Stmt body = mutate(op->body); - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + if (const std::string *mutex_name = needs_mutex_allocation.find(it->first)) { + // All output buffers in a Tuple have the same extent. + OutputImageParam output_buffer = Func(f).output_buffers()[0]; + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (int i = 0; i < output_buffer.dimensions(); i++) { + extent *= output_buffer.dim(i).extent(); + } + body = allocate_mutex(*mutex_name, extent, body); } - allocated_mutexes.insert(finder.mutex_name); - - // We assume all output buffers in a Tuple have the same extent. 
- OutputImageParam output_buffer = f.output_buffers()[0]; - Expr extent = Expr(1); - for (int i = 0; i < output_buffer.dimensions(); i++) { - extent = extent * output_buffer.dim(i).extent(); + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } - Stmt body = mutate(op->body); - body = allocate_mutex(finder.mutex_name, extent, body); - return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } Stmt visit(const Atomic *op) override { @@ -414,7 +412,7 @@ class AddAtomicMutex : public IRMutator { // Lift the index outside of the atomic node. // This is for avoiding side-effects inside those expressions // being evaluated twice. - string name = unique_name('t'); + std::string name = unique_name('t'); index_let = index; index = Variable::make(index.type(), name); body = ReplaceStoreIndexWithVar(op->producer_name, index).mutate(body); @@ -444,17 +442,21 @@ class AddAtomicMutex : public IRMutator { internal_assert(index.as() != nullptr); ret = LetStmt::make(index.as()->name, index_let, ret); } + needs_mutex_allocation.push(op->producer_name, op->mutex_name); + return ret; } }; } // namespace -Stmt add_atomic_mutex(Stmt s, const map &env) { +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs) { CheckAtomicValidity check; s.accept(&check); - s = RemoveUnnecessaryMutexUse().mutate(s); - s = AddAtomicMutex(env).mutate(s); + if (check.any_atomic) { + s = RemoveUnnecessaryMutexUse().mutate(s); + s = AddAtomicMutex(outputs).mutate(s); + } return s; } diff --git a/src/AddAtomicMutex.h b/src/AddAtomicMutex.h index c27b0346f349..5b11de621e97 100644 --- a/src/AddAtomicMutex.h +++ b/src/AddAtomicMutex.h @@ -23,7 +23,7 @@ namespace Internal { class Function; -Stmt add_atomic_mutex(Stmt s, const std::map &env); +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs); } // namespace Internal } // namespace Halide diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index bc03dd124d9a..a186be1874d7 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1112,11 +1112,12 @@ void IRPrinter::visit(const VectorReduce *op) { void IRPrinter::visit(const Atomic *op) { if (op->mutex_name.empty()) { - stream << get_indent() << "atomic {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ") {\n"; } else { - stream << get_indent() << "atomic ("; - stream << op->mutex_name; - stream << ") {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ", " + << op->mutex_name << ") {\n"; } indent += 2; print(op->body); diff --git a/src/Lower.cpp b/src/Lower.cpp index 3b357eb3061e..e39d55a65b9f 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -299,7 +299,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after storage flattening:", s); debug(1) << "Adding atomic mutex allocation...\n"; - s = add_atomic_mutex(s, env); + s = add_atomic_mutex(s, outputs); log("Lowering after adding atomic mutex allocation:", s); debug(1) << "Unpacking buffer arguments...\n"; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 62fbaeb66d43..1a19202745bb 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -195,7 +195,7 @@ extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mute /** Functions for constructing/destroying/locking/unlocking arrays of mutexes. 
*/ struct halide_mutex_array; //@{ -extern struct halide_mutex_array *halide_mutex_array_create(int sz); +extern struct halide_mutex_array *halide_mutex_array_create(uint64_t sz); extern void halide_mutex_array_destroy(void *user_context, void *array); extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); diff --git a/src/runtime/fake_thread_pool.cpp b/src/runtime/fake_thread_pool.cpp index 9c3cfddc5a47..531a16d1312e 100644 --- a/src/runtime/fake_thread_pool.cpp +++ b/src/runtime/fake_thread_pool.cpp @@ -96,7 +96,7 @@ WEAK void halide_mutex_unlock(halide_mutex *mutex) { // (e.g. correctness/multiple_scatter). Since we don't have threads, we don't // need to mutex to do anything, but returning a null would trigger an error // condition that would be misrepoted as out-of-memory. -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { return &halide_fake_mutex_array; } diff --git a/src/runtime/synchronization_common.h b/src/runtime/synchronization_common.h index cb244f360eeb..778c423e4046 100644 --- a/src/runtime/synchronization_common.h +++ b/src/runtime/synchronization_common.h @@ -908,7 +908,7 @@ struct halide_mutex_array { struct halide_mutex *array; }; -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { // TODO: If sz is huge, we should probably hash it down to something smaller // in the accessors below. Check for deadlocks before doing so. halide_mutex_array *array = (halide_mutex_array *)halide_malloc( From 4988ab5467b612bb6ce29914e5baf8bf70596ccb Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Wed, 13 Mar 2024 00:58:14 +0100 Subject: [PATCH 088/186] Feature: mark a Func as no_profiling, to prevent injection of profiling. (2nd implementation) (#8143) * Small feature to allow you to specify that a (typically small inner loop) Func should not be profiled. * Simplified the tuple name handling. * Optimize tuple name normalization in Profiling.cpp * Clang-format * Feedback on Function already being a pointer. Bump the Patch version of the serialization. 
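For illustration only, here is a minimal sketch of how a schedule might use the
new method (the ImageParam, Funcs and Vars below are hypothetical examples and
are not part of this change):

```
// Sketch, assuming Halide.h is included and we are using namespace Halide.
// "offset" does so little work per invocation that sampling it separately
// would mostly measure profiler overhead, so we opt it out; its time is then
// attributed to the enclosing profiled Func ("blur_y").
ImageParam input(UInt(8), 2);
Func offset("offset"), blur_y("blur_y");
Var x("x"), y("y");
offset(x, y) = input(x, y) + cast<uint8_t>(1);
blur_y(x, y) = cast<uint8_t>((cast<uint16_t>(offset(x, y)) + offset(x, y + 1)) / 2);
offset.compute_at(blur_y, y).no_profiling();  // exclude this Func from profiling
blur_y.compile_jit(get_jit_target_from_environment().with_feature(Target::Profile));
```

The intended effect is only that "offset" gets no entry of its own in the
profiler report; lowering is otherwise unchanged.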
--- src/Deserialization.cpp | 4 ++- src/Func.cpp | 5 +++ src/Func.h | 9 +++++ src/Function.cpp | 19 ++++++++--- src/Function.h | 7 ++++ src/Lower.cpp | 2 +- src/Profiling.cpp | 74 +++++++++++++++++++++++++++++++---------- src/Profiling.h | 5 ++- src/Serialization.cpp | 5 ++- src/halide_ir.fbs | 3 +- 10 files changed, 107 insertions(+), 26 deletions(-) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 551acfcdebf2..0a1403362621 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -504,12 +504,14 @@ void Deserializer::deserialize_function(const Serialize::Func *function, Functio const std::vector trace_tags = deserialize_vector(function->trace_tags(), &Deserializer::deserialize_string); + const bool no_profiling = function->no_profiling(); const bool frozen = function->frozen(); hl_function.update_with_deserialization(name, origin_name, output_types, required_types, required_dim, args, func_schedule, init_def, updates, debug_file, output_buffers, extern_arguments, extern_function_name, name_mangling, extern_function_device_api, extern_proxy_expr, - trace_loads, trace_stores, trace_realizations, trace_tags, frozen); + trace_loads, trace_stores, trace_realizations, trace_tags, + no_profiling, frozen); } Stmt Deserializer::deserialize_stmt(Serialize::Stmt type_code, const void *stmt) { diff --git a/src/Func.cpp b/src/Func.cpp index 7e0995cc33b5..1f480c99983c 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -3037,6 +3037,11 @@ Func &Func::add_trace_tag(const std::string &trace_tag) { return *this; } +Func &Func::no_profiling() { + func.do_not_profile(); + return *this; +} + void Func::debug_to_file(const string &filename) { invalidate_cache(); func.debug_file() = filename; diff --git a/src/Func.h b/src/Func.h index d4074ee18cc6..bced13f79481 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2559,6 +2559,15 @@ class Func { */ Func &add_trace_tag(const std::string &trace_tag); + /** Marks this function as a function that should not be profiled + * when using the target feature Profile or ProfileByTimer. + * This is useful when this function is does too little work at once + * such that the overhead of setting the profiling token might + * become significant, or that the measured time is not representative + * due to modern processors (instruction level parallelism, out-of-order + * execution). */ + Func &no_profiling(); + /** Get a handle on the internal halide function that this Func * represents. 
Useful if you want to do introspection on Halide * functions */ diff --git a/src/Function.cpp b/src/Function.cpp index 795d18136843..cbb4b61574d4 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -110,6 +110,8 @@ struct FunctionContents { bool trace_loads = false, trace_stores = false, trace_realizations = false; std::vector trace_tags; + bool no_profiling = false; + bool frozen = false; void accept(IRVisitor *visitor) const { @@ -352,6 +354,7 @@ void Function::update_with_deserialization(const std::string &name, bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen) { contents->name = name; contents->origin_name = origin_name; @@ -373,6 +376,7 @@ void Function::update_with_deserialization(const std::string &name, contents->trace_stores = trace_stores; contents->trace_realizations = trace_realizations; contents->trace_tags = trace_tags; + contents->no_profiling = no_profiling; contents->frozen = frozen; } @@ -509,6 +513,7 @@ void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const copy->trace_stores = contents->trace_stores; copy->trace_realizations = contents->trace_realizations; copy->trace_tags = contents->trace_tags; + copy->no_profiling = contents->no_profiling; copy->frozen = contents->frozen; copy->output_buffers = contents->output_buffers; copy->func_schedule = contents->func_schedule.deep_copy(copied_map); @@ -1139,10 +1144,6 @@ const std::vector &Function::get_trace_tags() const { return contents->trace_tags; } -void Function::freeze() { - contents->frozen = true; -} - void Function::lock_loop_levels() { auto &schedule = contents->func_schedule; schedule.compute_level().lock(); @@ -1166,6 +1167,16 @@ void Function::lock_loop_levels() { } } +void Function::do_not_profile() { + contents->no_profiling = true; +} +bool Function::should_not_profile() const { + return contents->no_profiling; +} + +void Function::freeze() { + contents->frozen = true; +} bool Function::frozen() const { return contents->frozen; } diff --git a/src/Function.h b/src/Function.h index 66b62a01f66b..49f68805d61e 100644 --- a/src/Function.h +++ b/src/Function.h @@ -88,6 +88,7 @@ class Function { bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen); /** Get a handle on the halide function contents that this Function @@ -290,6 +291,12 @@ class Function { * cannot be mutated further. */ void lock_loop_levels(); + /** Mark the function as too small for meaningful profiling. */ + void do_not_profile(); + + /** Check if the function is marked as one that should not be profiled. */ + bool should_not_profile() const; + /** Mark function as frozen, which means it cannot accept new * definitions. 
*/ void freeze(); diff --git a/src/Lower.cpp b/src/Lower.cpp index e39d55a65b9f..79d02323b3bf 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -408,7 +408,7 @@ void lower_impl(const vector &output_funcs, if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { debug(1) << "Injecting profiling...\n"; - s = inject_profiling(s, pipeline_name); + s = inject_profiling(s, pipeline_name, env); log("Lowering after injecting profiling:", s); } diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 2be058b3c8a6..414578299df6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -3,7 +3,7 @@ #include #include "CodeGen_Internal.h" -#include "ExprUsesVar.h" +#include "Function.h" #include "IRMutator.h" #include "IROperator.h" #include "InjectHostDevBufferCopies.h" @@ -71,13 +71,14 @@ class InjectProfiling : public IRMutator { vector stack; // What produce nodes are we currently inside of. string pipeline_name; + const map &env; bool in_fork = false; bool in_parallel = false; bool in_leaf_task = false; - InjectProfiling(const string &pipeline_name) - : pipeline_name(pipeline_name) { + InjectProfiling(const string &pipeline_name, const map &env) + : pipeline_name(pipeline_name), env(env) { stack.push_back(get_func_id("overhead")); // ID 0 is treated specially in the runtime as overhead internal_assert(stack.back() == 0); @@ -119,10 +120,28 @@ class InjectProfiling : public IRMutator { bool profiling_memory = true; // Strip down the tuple name, e.g. f.0 into f - string normalize_name(const string &name) { - vector v = split_string(name, "."); - internal_assert(!v.empty()); - return v[0]; + string normalize_name(const string &name) const { + size_t idx = name.find('.'); + if (idx != std::string::npos) { + internal_assert(idx != 0); + return name.substr(0, idx); + } else { + return name; + } + } + + Function lookup_function(const string &name) const { + auto it = env.find(name); + if (it != env.end()) { + return it->second; + } + string norm_name = normalize_name(name); + it = env.find(norm_name); + if (it != env.end()) { + return it->second; + } + internal_error << "No function in the environment found for name '" << name << "'.\n"; + return {}; } int get_func_id(const string &name) { @@ -185,7 +204,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Allocate *op) override { - int idx = get_func_id(op->name); auto [new_extents, changed] = mutate_with_changes(op->extents); Expr condition = mutate(op->condition); @@ -199,6 +217,13 @@ class InjectProfiling : public IRMutator { // always conditionally false. remove_dead_allocations() is called after // inject_profiling() so this is a possible scenario. if (!is_const_zero(size) && on_stack) { + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. 
+ } else { + idx = get_func_id(op->name); + } const uint64_t *int_size = as_const_uint(size); internal_assert(int_size != nullptr); // Stack size is always a const int func_stack_current[idx] += *int_size; @@ -212,6 +237,7 @@ class InjectProfiling : public IRMutator { vector tasks; bool track_heap_allocation = !is_const_zero(size) && !on_stack && profiling_memory; if (track_heap_allocation) { + int idx = get_func_id(op->name); debug(3) << " Allocation on heap: " << op->name << "(" << size << ") in pipeline " << pipeline_name << "\n"; @@ -245,8 +271,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Free *op) override { - int idx = get_func_id(op->name); - AllocSize alloc = func_alloc_sizes.get(op->name); internal_assert(alloc.size.type() == UInt(64)); func_alloc_sizes.pop(op->name); @@ -256,6 +280,7 @@ class InjectProfiling : public IRMutator { if (!is_const_zero(alloc.size)) { if (!alloc.on_stack) { if (profiling_memory) { + int idx = get_func_id(op->name); debug(3) << " Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n"; vector tasks{ @@ -271,6 +296,13 @@ class InjectProfiling : public IRMutator { const uint64_t *int_size = as_const_uint(alloc.size); internal_assert(int_size != nullptr); + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. + } else { + idx = get_func_id(op->name); + } func_stack_current[idx] -= *int_size; debug(3) << " Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n"; @@ -283,11 +315,19 @@ class InjectProfiling : public IRMutator { int idx; Stmt body; if (op->is_producer) { - idx = get_func_id(op->name); - stack.push_back(idx); - Stmt set_current = set_current_func(idx); - body = Block::make(set_current, mutate(op->body)); - stack.pop_back(); + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + body = mutate(op->body); + if (body.same_as(op->body)) { + return op; + } + } else { + idx = get_func_id(op->name); + stack.push_back(idx); + Stmt set_current = set_current_func(idx); + body = Block::make(set_current, mutate(op->body)); + stack.pop_back(); + } } else { // At the beginning of the consume step, set the current task // back to the outer one. @@ -498,8 +538,8 @@ class InjectProfiling : public IRMutator { } // namespace -Stmt inject_profiling(Stmt s, const string &pipeline_name) { - InjectProfiling profiling(pipeline_name); +Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map &env) { + InjectProfiling profiling(pipeline_name, env); s = profiling.mutate(s); int num_funcs = (int)(profiling.indices.size()); diff --git a/src/Profiling.h b/src/Profiling.h index a6040b9160af..afaa47fe6d6e 100644 --- a/src/Profiling.h +++ b/src/Profiling.h @@ -23,6 +23,7 @@ * mandelbrot: 0.006444ms (10%) peak: 505344 num: 104000 avg: 5376 * argmin: 0.027715ms (46%) stack: 20 */ +#include #include #include "Expr.h" @@ -30,6 +31,8 @@ namespace Halide { namespace Internal { +class Function; + /** Take a statement representing a halide pipeline insert * high-resolution timing into the generated code (via spawning a * thread that acts as a sampling profiler); summaries of execution @@ -37,7 +40,7 @@ namespace Internal { * storage flattening, but after all bounds inference. 
* */ -Stmt inject_profiling(Stmt, const std::string &); +Stmt inject_profiling(Stmt, const std::string &, const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 144d79af7e5e..c1cb3a6d1193 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1029,6 +1029,7 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde for (const auto &tag : function.get_trace_tags()) { trace_tags_serialized.push_back(serialize_string(builder, tag)); } + const bool no_profiling = function.should_not_profile(); const bool frozen = function.frozen(); auto func = Serialize::CreateFunc(builder, name_serialized, @@ -1050,7 +1051,9 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde trace_loads, trace_stores, trace_realizations, - builder.CreateVector(trace_tags_serialized), frozen); + builder.CreateVector(trace_tags_serialized), + no_profiling, + frozen); return func; } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 01a987b6f430..efc465cbee82 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -15,7 +15,7 @@ enum SerializationVersionMinor: int { Value = 0 } enum SerializationVersionPatch: int { - Value = 0 + Value = 1 } // from src/IR.cpp @@ -713,6 +713,7 @@ table Func { trace_stores: bool = false; trace_realizations: bool = false; trace_tags: [string]; + no_profiling: bool = false; frozen: bool = false; } From 83616f20c49c6f8e97403acd0add3df41753adeb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 Mar 2024 17:00:49 -0700 Subject: [PATCH 089/186] Fix three nits (#8137) 1) has_gpu_feature already includes Vulkan, so there's no need to check for it. 2) Use emplace(...) instead of insert(make_pair(...)) 3) Fixed a place that should be using a ScopedValue --- src/BoundsInference.cpp | 6 +++--- src/Lower.cpp | 5 +---- src/StorageFlattening.cpp | 8 ++------ src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 5965303197bc..21ca06e07285 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1152,7 +1152,7 @@ class BoundsInference : public IRMutator { map stage_name_to_func; if (producing >= 0) { - fused_group.insert(make_pair(f.name(), stage_index)); + fused_group.emplace(f.name(), stage_index); } if (!no_pipelines && producing >= 0 && !f.has_extern_definition()) { @@ -1164,12 +1164,12 @@ class BoundsInference : public IRMutator { if (!((pair.func_1 == stages[producing].name) && ((int)pair.stage_1 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_1, pair.stage_1, var)) { - fused_group.insert(make_pair(pair.func_1, pair.stage_1)); + fused_group.emplace(pair.func_1, pair.stage_1); } if (!((pair.func_2 == stages[producing].name) && ((int)pair.stage_2 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_2, pair.stage_2, var)) { - fused_group.insert(make_pair(pair.func_2, pair.stage_2)); + fused_group.emplace(pair.func_2, pair.stage_2); } } diff --git a/src/Lower.cpp b/src/Lower.cpp index 79d02323b3bf..f092e2e711ef 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -280,10 +280,7 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // Vulkan relies on GPU var canonicalization occurring before - // storage flattening. 
- if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan)) { + if (t.has_gpu_feature()) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index ba4cc9b8acca..5860a7e50d0f 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -535,13 +535,9 @@ class FlattenDimensions : public IRMutator { Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); it->loop_vars.push(op->name, loop_bounds); } - bool old_in_gpu = in_gpu; - if (op->for_type == ForType::GPUBlock || - op->for_type == ForType::GPUThread) { - in_gpu = true; - } + + ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); - in_gpu = old_in_gpu; for (auto &p : hoisted_storages) { p.loop_vars.pop(op->name); diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 2ce325538a86..e3cc2ec5e825 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -1359,7 +1359,7 @@ Partitioner::Partitioner(const map &_pipeline_bounds, for (int s = 0; s < num_stages; s++) { FStage stg(f.second, s); Group g(stg, {stg}); - groups.insert(make_pair(stg, g)); + groups.emplace(stg, g); } } From f841a27b0c3f0b2b45c756908773c530d47f482c Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Thu, 14 Mar 2024 12:53:17 -0700 Subject: [PATCH 090/186] Bound allocation extents for hoist_storage using loop variables one-by-one (#8154) * Bound allocation extents using loop variable one-by-one * Use emplace_back --- src/StorageFlattening.cpp | 19 ++++++++++++++----- test/correctness/hoist_storage.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 5860a7e50d0f..59278d50fe69 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -88,7 +88,7 @@ class FlattenDimensions : public IRMutator { struct HoistedStorageData { string name; vector hoisted_allocations; - Scope loop_vars; + vector> loop_vars; Scope scope; HoistedStorageData(const string &n) @@ -304,8 +304,17 @@ class FlattenDimensions : public IRMutator { } e = simplify(common_subexpression_elimination(e)); - Interval bounds = bounds_of_expr_in_scope(e, hoisted_storage_data.loop_vars); - return bounds.max; + // Find bounds of expression using the intervals of the loop variables. The loop variables may depend on + // the other loop variables, so we just call bounds_of_expr_in_scope for each loop variable separately + // in a reverse order. 
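+        // For example, with an outer loop y over [0, 9] and an inner loop x over
+        // [y, y + 3], bounding (x + 1) over x alone gives y + 4, and bounding
+        // that result over y gives the loop-independent upper bound 13.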
+ for (auto it = hoisted_storage_data.loop_vars.rbegin(); it != hoisted_storage_data.loop_vars.rend(); ++it) { + Scope one_loop_var; + one_loop_var.push(it->first, it->second); + Interval bounds = bounds_of_expr_in_scope(e, one_loop_var); + e = bounds.max; + } + + return e; }; vector bounded_extents; @@ -533,14 +542,14 @@ class FlattenDimensions : public IRMutator { expanded_min = simplify(expand_expr(expanded_min, it->scope)); expanded_extent = expand_expr(expanded_extent, it->scope); Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); - it->loop_vars.push(op->name, loop_bounds); + it->loop_vars.emplace_back(op->name, loop_bounds); } ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); for (auto &p : hoisted_storages) { - p.loop_vars.pop(op->name); + p.loop_vars.pop_back(); } return stmt; diff --git a/test/correctness/hoist_storage.cpp b/test/correctness/hoist_storage.cpp index ce98b421bc54..4e96dfee9f2d 100644 --- a/test/correctness/hoist_storage.cpp +++ b/test/correctness/hoist_storage.cpp @@ -604,6 +604,20 @@ int main(int argc, char **argv) { }); } + { + ImageParam input(UInt(8), 2); + Var x{"x"}, y{"y"}, yo{"yo"}, yi{"yi"}; + Func f[3]; + f[0] = BoundaryConditions::repeat_edge(input); + f[1](x, y) = ((f[0]((x / 2) + 2, (y / 2) + 2)) + (f[0](x + 1, y))); + f[2](x, y) = ((f[1](x * 2, (y * 2) + -2)) + (f[1](x + -1, y + -1))); + f[2].split(y, yo, yi, 16); + f[0].hoist_storage(f[2], yo).compute_at(f[1], x); + f[1].hoist_storage_root().compute_at(f[2], yi); + + f[2].compile_jit(); + } + printf("Success!\n"); return 0; } From 76a7dd4f7fb538deaf7c2ade56c02bc84e5221e8 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Fri, 15 Mar 2024 13:01:51 -0700 Subject: [PATCH 091/186] Support for ARM SVE2. (#8051) * Checkpoint SVE2 restart. * Remove dead code. Add new test. * Update cmake for new file. * Checkpoint progress on SVE2. * Checkpoint ARM SVE2 support. Passes correctness_simd_op_check_sve2 test at 128 and 256 bits. * Remove an opportunity for RISC V codegen to change due to SVE2 support. * Ensure SVE intrinsics get vscale vectors and non-SVE ones get fixed vectors. Use proper prefix for neon intrinsics. Comment cleanups. * Checkpoint SVE2 work. Generally passes test, though using both NEON and SVE2 with simd_op_check_sve2 fails as both posibilities need to be allowed for 128-bit or smaller operations. * Remove an unfavored implementation possibility. * Fix opcode recognition in test to handle some cases that show up. Change name of test class to avoid confusion. * Formatting fixes. Replace internal_error with nop return for CodeGen_LLVM::match_vector_type_scalable called on scalar. * Formatting fix. * Limit SVE2 test to LLVM 19. Remove dead code. * Fix a degenerate case asking for zero sized vectors via a HAlide type with lanes of zero, which is not correct. * Fix confusion about Neon64/Neon128 and make it clear this is just the width multiplier applied to intrinsics. * REmove extraneous commented out line. * Address some review feedback. Mostly comment fixes. * Fix missed conflict resolution. * Fix some TODOs in SVE code. Move utility function to Util.h and common code the other obvious use. * Formatting. * Add missed refactor change. * Add issue to TODO comment. * Remove TODOs that don't seem necessary. * Add issue for TODO. * Add issue for TODO. * Remove dubious looking FP to int code that was ifdef'ed out. Doesn't look like a TODO is needed anymore. * Add issues for TODOs. 
* Update simd_op_check_sve2.cpp * Make a deep copy of each piece of test IR so that we can parallelize * Fix two clang-tidy warnings * Remove try/catch block from simd-op-check-sve2 * Don't try to run SVE2 code if vector_bits doesn't match host. * Add support for fcvtm/p, make scalars go through pattern matching too (#8151) * Don't do arm neon instruction selection on scalars This revealed a bug. FindIntrinsics was not enabled for scalars anyway, so it was semi-pointless. --------- Co-authored-by: Zalman Stern Co-authored-by: Steven Johnson Co-authored-by: Andrew Adams --- src/CodeGen_ARM.cpp | 1388 ++++++++++++++++++----- src/CodeGen_LLVM.cpp | 230 +++- src/CodeGen_LLVM.h | 7 + src/Function.cpp | 6 +- src/IR.cpp | 1 + src/IR.h | 2 + src/IRMatch.cpp | 3 + src/LLVM_Output.cpp | 6 + src/StorageFolding.cpp | 5 +- src/Util.h | 11 + src/WasmExecutor.cpp | 11 +- src/runtime/HalideRuntime.h | 6 +- src/runtime/aarch64.ll | 76 +- src/runtime/errors.cpp | 8 + src/runtime/posix_math.ll | 28 +- src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/simd_op_check_arm.cpp | 7 + test/correctness/simd_op_check_sve2.cpp | 1387 ++++++++++++++++++++++ 19 files changed, 2836 insertions(+), 348 deletions(-) create mode 100644 test/correctness/simd_op_check_sve2.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7852532183bf..d0538d6ccca8 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -105,17 +105,30 @@ class CodeGen_ARM : public CodeGen_Posix { CodeGen_ARM(const Target &); protected: + using codegen_func_t = std::function &)>; using CodeGen_Posix::visit; - /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that - * takes one vector argument and splits it into two to call inner. */ - llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + /** Similar to llvm_type_of, but allows providing a VectorTypeConstraint to + * force Fixed or VScale vector results. */ + llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint); + + /** Define a wrapper LLVM func that takes some arguments which Halide defines + * and call inner LLVM intrinsic with an additional argument which LLVM requires. 
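+     * For SVE intrinsics this is typically the governing predicate (see the
+     * SveNoPredicate flag), or a fallback value for the inactive lanes
+     * (SveInactiveArg).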
*/ + llvm::Function *define_intrin_wrapper(const std::string &inner_name, + const Type &ret_type, + const std::string &mangled_name, + const std::vector &arg_types, + int intrinsic_flags, + bool sve_intrinsic); void init_module() override; void compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) override; - /** Nodes for which we want to emit specific neon intrinsics */ + void begin_func(LinkageType linkage, const std::string &simple_name, + const std::string &extern_name, const std::vector &args) override; + + /** Nodes for which we want to emit specific ARM vector intrinsics */ // @{ void visit(const Cast *) override; void visit(const Add *) override; @@ -125,15 +138,25 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Store *) override; void visit(const Load *) override; void visit(const Shuffle *) override; + void visit(const Ramp *) override; void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; void codegen_vector_reduce(const VectorReduce *, const Expr &) override; + bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &); + bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &); + bool codegen_across_vector_reduce(const VectorReduce *, const Expr &); // @} Type upgrade_type_for_arithmetic(const Type &t) const override; Type upgrade_type_for_argument_passing(const Type &t) const override; Type upgrade_type_for_storage(const Type &t) const override; + /** Helper function to perform codegen of vector operation in a way that + * total_lanes are divided into slices, codegen is performed for each slice + * and results are concatenated into total_lanes. + */ + Value *codegen_with_lanes(int slice_lanes, int total_lanes, const std::vector &args, codegen_func_t &cg_func); + /** Various patterns to peephole match against */ struct Pattern { string intrin; ///< Name of the intrinsic @@ -150,10 +173,12 @@ class CodeGen_ARM : public CodeGen_Posix { string mattrs() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; + int target_vscale() const override; // NEON can be disabled for older processors. - bool neon_intrinsics_disabled() { - return target.has_feature(Target::NoNEON); + bool simd_intrinsics_disabled() { + return target.has_feature(Target::NoNEON) && + !target.has_feature(Target::SVE2); } bool is_float16_and_has_feature(const Type &t) const { @@ -161,11 +186,28 @@ class CodeGen_ARM : public CodeGen_Posix { return t.code() == Type::Float && t.bits() == 16 && target.has_feature(Target::ARMFp16); } bool supports_call_as_float16(const Call *op) const override; + + /** Make predicate vector which starts with consecutive true followed by consecutive false */ + Expr make_vector_predicate_1s_0s(int true_lanes, int false_lanes) { + internal_assert((true_lanes + false_lanes) != 0) << "CodeGen_ARM::make_vector_predicate_1s_0s called with total of 0 lanes.\n"; + if (true_lanes == 0) { + return const_false(false_lanes); + } else if (false_lanes == 0) { + return const_true(true_lanes); + } else { + return Shuffle::make_concat({const_true(true_lanes), const_false(false_lanes)}); + } + } }; CodeGen_ARM::CodeGen_ARM(const Target &target) : CodeGen_Posix(target) { + // TODO(https://github.com/halide/Halide/issues/8088): See if + // use_llvm_vp_intrinsics can replace architecture specific code in this + // file, specifically in Load and Store visitors. 
Depends on quality of + // LLVM aarch64 backend lowering for these intrinsics on SVE2. + // RADDHN - Add and narrow with rounding // These must come before other narrowing rounding shift patterns casts.emplace_back("rounding_add_narrow", i8(rounding_shift_right(wild_i16x_ + wild_i16x_, 8))); @@ -211,6 +253,12 @@ CodeGen_ARM::CodeGen_ARM(const Target &target) casts.emplace_back("shift_right_narrow", i32(wild_i64x_ >> wild_u64_)); casts.emplace_back("shift_right_narrow", u32(wild_u64x_ >> wild_u64_)); + // VCVTP/M + casts.emplace_back("fp_to_int_floor", i32(floor(wild_f32x_))); + casts.emplace_back("fp_to_int_floor", u32(floor(wild_f32x_))); + casts.emplace_back("fp_to_int_ceil", i32(ceil(wild_f32x_))); + casts.emplace_back("fp_to_int_ceil", u32(ceil(wild_f32x_))); + // SQRSHL, UQRSHL - Saturating rounding shift left (by signed vector) // TODO: We need to match rounding shift right, and negate the RHS. @@ -299,26 +347,66 @@ struct ArmIntrinsic { SplitArg0 = 1 << 6, // This intrinsic requires splitting the argument into the low and high halves. NoPrefix = 1 << 7, // Don't prefix the intrinsic with llvm.* RequireFp16 = 1 << 8, // Available only if Target has ARMFp16 feature + Neon64Unavailable = 1 << 9, // Unavailable for 64 bit NEON + SveUnavailable = 1 << 10, // Unavailable for SVE + SveNoPredicate = 1 << 11, // In SVE intrinsics, additional predicate argument is required as default, unless this flag is set. + SveInactiveArg = 1 << 12, // This intrinsic needs the additional argument for fallback value for the lanes inactivated by predicate. + SveRequired = 1 << 13, // This intrinsic requires SVE. }; }; // clang-format off const ArmIntrinsic intrinsic_defs[] = { - {"vabs", "abs", UInt(8, 8), "abs", {Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vabs", "abs", UInt(16, 4), "abs", {Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, - - {"llvm.sqrt", "llvm.sqrt", Float(32, 2), "sqrt_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.sqrt", "llvm.sqrt", Float(64, 2), "sqrt_f64", {Float(64, 2)}}, - - {"llvm.roundeven", "llvm.roundeven", Float(16, 8), "round", {Float(16, 8)}, ArmIntrinsic::RequireFp16}, - {"llvm.roundeven", "llvm.roundeven", Float(32, 4), "round", {Float(32, 4)}}, - {"llvm.roundeven", "llvm.roundeven", Float(64, 2), "round", {Float(64, 2)}}, - {"llvm.roundeven.f16", "llvm.roundeven.f16", Float(16), "round", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle}, - {"llvm.roundeven.f32", "llvm.roundeven.f32", Float(32), "round", {Float(32)}, ArmIntrinsic::NoMangle}, - {"llvm.roundeven.f64", "llvm.roundeven.f64", Float(64), "round", {Float(64)}, ArmIntrinsic::NoMangle}, + // TODO(https://github.com/halide/Halide/issues/8093): + // Some of the Arm intrinsic have the same name between Neon and SVE2 but with different behavior. For example, + // widening, narrowing and pair-wise operations which are performed in even (top) and odd (bottom) lanes basis in SVE, + // while in high and low lanes in Neon. Therefore, peep-hole code-gen with those SVE2 intrinsic is not enabled for now, + // because additional interleaving/deinterleaveing would be required to restore the element order in a vector. 
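+    // For example, when widening an 8 x i16 vector {e0, e1, ..., e7}, NEON's
+    // smull/smull2 read the low half {e0..e3} and the high half {e4..e7},
+    // whereas SVE2's smullb/smullt read the even lanes {e0, e2, e4, e6} and
+    // the odd lanes {e1, e3, e5, e7}.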
+ + {"vabs", "abs", UInt(8, 8), "abs", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vabs", "abs", UInt(16, 4), "abs", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f16", "llvm.fabs.f16", Float(16), "abs", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f32", "llvm.fabs.f32", Float(32), "abs", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f64", "llvm.fabs.f64", Float(64), "abs", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.sqrt", "llvm.sqrt", Float(16, 4), "sqrt_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt", "llvm.sqrt", Float(32, 2), "sqrt_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt", "llvm.sqrt", Float(64, 2), "sqrt_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f16", "llvm.sqrt.f16", Float(16), "sqrt_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f32", "llvm.sqrt.f32", Float(32), "sqrt_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f64", "llvm.sqrt.f64", Float(64), "sqrt_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.floor", "llvm.floor", Float(16, 4), "floor_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor", "llvm.floor", Float(32, 2), "floor_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor", "llvm.floor", Float(64, 2), "floor_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f16", "llvm.floor.f16", Float(16), "floor_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f32", "llvm.floor.f32", Float(32), "floor_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f64", "llvm.floor.f64", Float(64), "floor_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.ceil", "llvm.ceil", Float(16, 4), "ceil_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil", "llvm.ceil", Float(32, 2), "ceil_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil", "llvm.ceil", Float(64, 2), "ceil_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f16", "llvm.ceil.f16", Float(16), "ceil_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f32", "llvm.ceil.f32", Float(32), "ceil_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f64", "llvm.ceil.f64", Float(64), "ceil_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.trunc", 
"llvm.trunc", Float(16, 4), "trunc_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc", "llvm.trunc", Float(32, 2), "trunc_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc", "llvm.trunc", Float(64, 2), "trunc_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f16", "llvm.trunc.f16", Float(16), "trunc_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f32", "llvm.trunc.f32", Float(32), "trunc_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f64", "llvm.trunc.f64", Float(64), "trunc_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.roundeven", "llvm.roundeven", Float(16, 4), "round", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven", "llvm.roundeven", Float(32, 2), "round", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven", "llvm.roundeven", Float(64, 2), "round", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f16", "llvm.roundeven.f16", Float(16), "round", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f32", "llvm.roundeven.f32", Float(32), "round", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f64", "llvm.roundeven.f64", Float(64), "round", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, // SABD, UABD - Absolute difference {"vabds", "sabd", UInt(8, 8), "absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -329,12 +417,12 @@ const ArmIntrinsic intrinsic_defs[] = { {"vabdu", "uabd", UInt(32, 2), "absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, // SMULL, UMULL - Widening multiply - {"vmulls", "smull", Int(16, 8), "widening_mul", {Int(8, 8), Int(8, 8)}}, - {"vmullu", "umull", UInt(16, 8), "widening_mul", {UInt(8, 8), UInt(8, 8)}}, - {"vmulls", "smull", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, - {"vmullu", "umull", UInt(32, 4), "widening_mul", {UInt(16, 4), UInt(16, 4)}}, - {"vmulls", "smull", Int(64, 2), "widening_mul", {Int(32, 2), Int(32, 2)}}, - {"vmullu", "umull", UInt(64, 2), "widening_mul", {UInt(32, 2), UInt(32, 2)}}, + {"vmulls", "smull", Int(16, 8), "widening_mul", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(16, 8), "widening_mul", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vmulls", "smull", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(32, 4), "widening_mul", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vmulls", "smull", Int(64, 2), "widening_mul", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(64, 2), "widening_mul", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::SveUnavailable}, // SQADD, UQADD - Saturating add // On arm32, the ARM version of this seems to be missing on some configurations. 
@@ -385,12 +473,30 @@ const ArmIntrinsic intrinsic_defs[] = { {"vminu", "umin", UInt(16, 4), "min", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::HalfWidth}, {"vmins", "smin", Int(32, 2), "min", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, {"vminu", "umin", UInt(32, 2), "min", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vmins", "fmin", Float(32, 2), "min", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "smin", Int(64, 2), "min", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::Neon64Unavailable}, + {nullptr, "umin", UInt(64, 2), "min", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::Neon64Unavailable}, {"vmins", "fmin", Float(16, 4), "min", {Float(16, 4), Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {"vmins", "fmin", Float(32, 2), "min", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "fmin", Float(64, 2), "min", {Float(64, 2), Float(64, 2)}}, // FCVTZS, FCVTZU - {nullptr, "fcvtzs", Int(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16}, - {nullptr, "fcvtzu", UInt(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16}, + {nullptr, "fcvtzs", Int(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzs", Int(32, 2), "fp_to_int", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(32, 2), "fp_to_int", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzs", Int(64, 2), "fp_to_int", {Float(64, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(64, 2), "fp_to_int", {Float(64, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + + // FCVTP/M. These only exist in armv8 and onwards, so we just skip them for + // arm-32. LLVM doesn't seem to have intrinsics for them for SVE. 
+ {nullptr, "fcvtpu", UInt(32, 4), "fp_to_int_ceil", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtmu", UInt(32, 4), "fp_to_int_floor", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtps", Int(32, 4), "fp_to_int_ceil", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtms", Int(32, 4), "fp_to_int_floor", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtpu", UInt(32, 2), "fp_to_int_ceil", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtmu", UInt(32, 2), "fp_to_int_floor", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtps", Int(32, 2), "fp_to_int_ceil", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtms", Int(32, 2), "fp_to_int_floor", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, // SMAX, UMAX, FMAX - Max {"vmaxs", "smax", Int(8, 8), "max", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -399,25 +505,34 @@ const ArmIntrinsic intrinsic_defs[] = { {"vmaxu", "umax", UInt(16, 4), "max", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::HalfWidth}, {"vmaxs", "smax", Int(32, 2), "max", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, {"vmaxu", "umax", UInt(32, 2), "max", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vmaxs", "fmax", Float(32, 2), "max", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "smax", Int(64, 2), "max", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::Neon64Unavailable}, + {nullptr, "umax", UInt(64, 2), "max", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::Neon64Unavailable}, {"vmaxs", "fmax", Float(16, 4), "max", {Float(16, 4), Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {"vmaxs", "fmax", Float(32, 2), "max", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "fmax", Float(64, 2), "max", {Float(64, 2), Float(64, 2)}}, + + // NEG, FNEG + {nullptr, "neg", Int(8, 16), "negate", {Int(8, 16)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(16, 8), "negate", {Int(16, 8)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(32, 4), "negate", {Int(32, 4)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(64, 2), "negate", {Int(64, 2)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, // SQNEG, UQNEG - Saturating negation - {"vqneg", "sqneg", Int(8, 8), "saturating_negate", {Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(16, 4), "saturating_negate", {Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(32, 2), "saturating_negate", {Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(64, 2), "saturating_negate", {Int(64, 2)}}, + {"vqneg", "sqneg", Int(8, 8), "saturating_negate", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(16, 4), "saturating_negate", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(32, 2), "saturating_negate", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(64, 2), "saturating_negate", {Int(64, 2)}, ArmIntrinsic::SveInactiveArg}, // SQXTN, UQXTN, SQXTUN - Saturating narrowing - {"vqmovns", "sqxtn", Int(8, 8), "saturating_narrow", {Int(16, 8)}}, - 
{"vqmovnu", "uqxtn", UInt(8, 8), "saturating_narrow", {UInt(16, 8)}}, - {"vqmovnsu", "sqxtun", UInt(8, 8), "saturating_narrow", {Int(16, 8)}}, - {"vqmovns", "sqxtn", Int(16, 4), "saturating_narrow", {Int(32, 4)}}, - {"vqmovnu", "uqxtn", UInt(16, 4), "saturating_narrow", {UInt(32, 4)}}, - {"vqmovnsu", "sqxtun", UInt(16, 4), "saturating_narrow", {Int(32, 4)}}, - {"vqmovns", "sqxtn", Int(32, 2), "saturating_narrow", {Int(64, 2)}}, - {"vqmovnu", "uqxtn", UInt(32, 2), "saturating_narrow", {UInt(64, 2)}}, - {"vqmovnsu", "sqxtun", UInt(32, 2), "saturating_narrow", {Int(64, 2)}}, + {"vqmovns", "sqxtn", Int(8, 8), "saturating_narrow", {Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(8, 8), "saturating_narrow", {UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(8, 8), "saturating_narrow", {Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovns", "sqxtn", Int(16, 4), "saturating_narrow", {Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(16, 4), "saturating_narrow", {UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(16, 4), "saturating_narrow", {Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovns", "sqxtn", Int(32, 2), "saturating_narrow", {Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(32, 2), "saturating_narrow", {UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(32, 2), "saturating_narrow", {Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // RSHRN - Rounding shift right narrow (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS except signed. @@ -440,52 +555,52 @@ const ArmIntrinsic intrinsic_defs[] = { // LLVM pattern matches these. // SQRSHL, UQRSHL - Saturating rounding shift left (by signed vector) - {"vqrshifts", "sqrshl", Int(8, 8), "saturating_rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(8, 8), "saturating_rounding_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(16, 4), "saturating_rounding_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(16, 4), "saturating_rounding_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(32, 2), "saturating_rounding_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(32, 2), "saturating_rounding_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(64, 2), "saturating_rounding_shift_left", {Int(64, 2), Int(64, 2)}}, - {"vqrshiftu", "uqrshl", UInt(64, 2), "saturating_rounding_shift_left", {UInt(64, 2), Int(64, 2)}}, + {"vqrshifts", "sqrshl", Int(8, 8), "saturating_rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(8, 8), "saturating_rounding_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", Int(16, 4), "saturating_rounding_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(16, 4), "saturating_rounding_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", Int(32, 2), "saturating_rounding_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(32, 2), "saturating_rounding_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", 
Int(64, 2), "saturating_rounding_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(64, 2), "saturating_rounding_shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // SQRSHRN, UQRSHRN, SQRSHRUN - Saturating rounding narrowing shift right (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS except signed. - {"vqrshiftns", nullptr, Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}}, - {"vqrshiftnu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), Int(16, 8)}}, - {"vqrshiftnsu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}}, - {"vqrshiftns", nullptr, Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}}, - {"vqrshiftnu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), Int(32, 4)}}, - {"vqrshiftnsu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}}, - {"vqrshiftns", nullptr, Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, - {"vqrshiftnu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), Int(64, 2)}}, - {"vqrshiftnsu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, + {"vqrshiftns", nullptr, Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftns", nullptr, Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftns", nullptr, Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // arm64 expects a 32-bit constant. 
- {nullptr, "sqrshrn", Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "uqrshrn", UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), UInt(32)}}, - {nullptr, "sqrshrun", UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "sqrshrn", Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "uqrshrn", UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), UInt(32)}}, - {nullptr, "sqrshrun", UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "sqrshrn", Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}}, - {nullptr, "uqrshrn", UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), UInt(32)}}, - {nullptr, "sqrshrun", UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}}, + {nullptr, "sqrshrn", Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrn", Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrn", Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, // SQSHL, UQSHL, SQSHLU - Saturating shift left by signed register. // There is also an immediate version of this - hopefully LLVM does this matching when appropriate. 
{"vqshifts", "sqshl", Int(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(8, 8), "saturating_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(16, 4), "saturating_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(32, 2), "saturating_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, {"vqshiftu", "uqshl", UInt(64, 2), "saturating_shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, - {"vqshiftsu", "sqshlu", UInt(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, + {"vqshiftsu", "sqshlu", UInt(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::SveUnavailable}, // SQSHRN, UQSHRN, SQRSHRUN Saturating narrowing shift right by an (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS. @@ -500,15 +615,15 @@ const ArmIntrinsic intrinsic_defs[] = { {"vqshiftnsu", nullptr, UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, // arm64 expects a 32-bit constant. 
- {nullptr, "sqshrn", Int(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "uqshrn", UInt(8, 8), "saturating_shift_right_narrow", {UInt(16, 8), UInt(32)}}, - {nullptr, "sqshrn", Int(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "uqshrn", UInt(16, 4), "saturating_shift_right_narrow", {UInt(32, 4), UInt(32)}}, - {nullptr, "sqshrn", Int(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}}, - {nullptr, "uqshrn", UInt(32, 2), "saturating_shift_right_narrow", {UInt(64, 2), UInt(32)}}, - {nullptr, "sqshrun", UInt(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "sqshrun", UInt(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "sqshrun", UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}}, + {nullptr, "sqshrn", Int(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(8, 8), "saturating_shift_right_narrow", {UInt(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrn", Int(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(16, 4), "saturating_shift_right_narrow", {UInt(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrn", Int(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(32, 2), "saturating_shift_right_narrow", {UInt(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, // SRSHL, URSHL - Rounding shift left (by signed vector) {"vrshifts", "srshl", Int(8, 8), "rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -521,14 +636,15 @@ const ArmIntrinsic intrinsic_defs[] = { {"vrshiftu", "urshl", UInt(64, 2), "rounding_shift_left", {UInt(64, 2), Int(64, 2)}}, // SSHL, USHL - Shift left (by signed vector) - {"vshifts", "sshl", Int(8, 8), "shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(8, 8), "shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(16, 4), "shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(16, 4), "shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(32, 2), "shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(32, 2), "shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(64, 2), "shift_left", {Int(64, 2), Int(64, 2)}}, - {"vshiftu", "ushl", UInt(64, 2), "shift_left", {UInt(64, 2), Int(64, 2)}}, + // In SVE, no equivalent is found, though there are rounding, saturating, or widening versions. 
+ {"vshifts", "sshl", Int(8, 8), "shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(8, 8), "shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(16, 4), "shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(16, 4), "shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(32, 2), "shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(32, 2), "shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(64, 2), "shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(64, 2), "shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // SRSHR, URSHR - Rounding shift right (by immediate in [1, output bits]) // LLVM wants these expressed as SRSHL by negative amounts. @@ -537,28 +653,28 @@ const ArmIntrinsic intrinsic_defs[] = { // LLVM pattern matches these for us. // RADDHN - Add and narrow with rounding. - {"vraddhn", "raddhn", Int(8, 8), "rounding_add_narrow", {Int(16, 8), Int(16, 8)}}, - {"vraddhn", "raddhn", UInt(8, 8), "rounding_add_narrow", {UInt(16, 8), UInt(16, 8)}}, - {"vraddhn", "raddhn", Int(16, 4), "rounding_add_narrow", {Int(32, 4), Int(32, 4)}}, - {"vraddhn", "raddhn", UInt(16, 4), "rounding_add_narrow", {UInt(32, 4), UInt(32, 4)}}, - {"vraddhn", "raddhn", Int(32, 2), "rounding_add_narrow", {Int(64, 2), Int(64, 2)}}, - {"vraddhn", "raddhn", UInt(32, 2), "rounding_add_narrow", {UInt(64, 2), UInt(64, 2)}}, + {"vraddhn", "raddhn", Int(8, 8), "rounding_add_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(8, 8), "rounding_add_narrow", {UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", Int(16, 4), "rounding_add_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(16, 4), "rounding_add_narrow", {UInt(32, 4), UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", Int(32, 2), "rounding_add_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(32, 2), "rounding_add_narrow", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, // RSUBHN - Sub and narrow with rounding. 
- {"vrsubhn", "rsubhn", Int(8, 8), "rounding_sub_narrow", {Int(16, 8), Int(16, 8)}}, - {"vrsubhn", "rsubhn", UInt(8, 8), "rounding_sub_narrow", {UInt(16, 8), UInt(16, 8)}}, - {"vrsubhn", "rsubhn", Int(16, 4), "rounding_sub_narrow", {Int(32, 4), Int(32, 4)}}, - {"vrsubhn", "rsubhn", UInt(16, 4), "rounding_sub_narrow", {UInt(32, 4), UInt(32, 4)}}, - {"vrsubhn", "rsubhn", Int(32, 2), "rounding_sub_narrow", {Int(64, 2), Int(64, 2)}}, - {"vrsubhn", "rsubhn", UInt(32, 2), "rounding_sub_narrow", {UInt(64, 2), UInt(64, 2)}}, + {"vrsubhn", "rsubhn", Int(8, 8), "rounding_sub_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(8, 8), "rounding_sub_narrow", {UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", Int(16, 4), "rounding_sub_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(16, 4), "rounding_sub_narrow", {UInt(32, 4), UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", Int(32, 2), "rounding_sub_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(32, 2), "rounding_sub_narrow", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, // SQDMULH - Saturating doubling multiply keep high half. - {"vqdmulh", "sqdmulh", Int(16, 4), "qdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqdmulh", "sqdmulh", Int(32, 2), "qdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, + {"vqdmulh", "sqdmulh", Int(16, 4), "qdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"vqdmulh", "sqdmulh", Int(32, 2), "qdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, // SQRDMULH - Saturating doubling multiply keep high half with rounding. - {"vqrdmulh", "sqrdmulh", Int(16, 4), "qrdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrdmulh", "sqrdmulh", Int(32, 2), "qrdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, + {"vqrdmulh", "sqrdmulh", Int(16, 4), "qrdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"vqrdmulh", "sqrdmulh", Int(32, 2), "qrdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, // PADD - Pairwise add. // 32-bit only has half-width versions. 
@@ -571,47 +687,49 @@ const ArmIntrinsic intrinsic_defs[] = { {"vpadd", nullptr, Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0}, {"vpadd", nullptr, Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::RequireFp16}, - {nullptr, "addp", Int(8, 8), "pairwise_add", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(8, 8), "pairwise_add", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", Int(16, 4), "pairwise_add", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(16, 4), "pairwise_add", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", Int(32, 2), "pairwise_add", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(32, 2), "pairwise_add", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "faddp", Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "faddp", Float(64, 2), "pairwise_add", {Float(64, 4)}, ArmIntrinsic::SplitArg0}, - {nullptr, "faddp", Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "addp", Int(8, 8), "pairwise_add", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(8, 8), "pairwise_add", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(16, 4), "pairwise_add", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(16, 4), "pairwise_add", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(32, 2), "pairwise_add", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(32, 2), "pairwise_add", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(64, 2), "pairwise_add", {Int(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(64, 2), "pairwise_add", {UInt(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(64, 2), "pairwise_add", {Float(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // SADDLP, UADDLP - Pairwise add long. 
- {"vpaddls", "saddlp", Int(16, 4), "pairwise_widening_add", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", UInt(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", Int(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddls", "saddlp", Int(32, 2), "pairwise_widening_add", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", UInt(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", Int(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddls", "saddlp", Int(64, 1), "pairwise_widening_add", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpaddlu", "uaddlp", UInt(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpaddlu", "uaddlp", Int(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, + {"vpaddls", "saddlp", Int(16, 4), "pairwise_widening_add", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddls", "saddlp", Int(32, 2), "pairwise_widening_add", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddls", "saddlp", Int(64, 1), "pairwise_widening_add", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, // SPADAL, UPADAL - Pairwise add and accumulate long. 
- {"vpadals", nullptr, Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, UInt(16, 4), "pairwise_widening_add_accumulate", {UInt(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadals", nullptr, Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, UInt(32, 2), "pairwise_widening_add_accumulate", {UInt(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadals", nullptr, Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpadalu", nullptr, UInt(64, 1), "pairwise_widening_add_accumulate", {UInt(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpadalu", nullptr, Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, + {"vpadals", "sadalp", Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(16, 4), "pairwise_widening_add_accumulate", {UInt(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadals", "sadalp", Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(32, 2), "pairwise_widening_add_accumulate", {UInt(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadals", "sadalp", Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(64, 1), "pairwise_widening_add_accumulate", {UInt(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, // SMAXP, UMAXP, FMAXP - Pairwise max. 
- {nullptr, "smaxp", Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(8, 8), "pairwise_max", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "smaxp", Int(16, 4), "pairwise_max", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(16, 4), "pairwise_max", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "smaxp", Int(32, 2), "pairwise_max", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(32, 2), "pairwise_max", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fmaxp", Float(32, 2), "pairwise_max", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fmaxp", Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "smaxp", Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(8, 8), "pairwise_max", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "smaxp", Int(16, 4), "pairwise_max", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(16, 4), "pairwise_max", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "smaxp", Int(32, 2), "pairwise_max", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(32, 2), "pairwise_max", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fmaxp", Float(32, 2), "pairwise_max", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fmaxp", Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // On arm32, we only have half-width versions of these. {"vpmaxs", nullptr, Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0}, @@ -624,14 +742,14 @@ const ArmIntrinsic intrinsic_defs[] = { {"vpmaxs", nullptr, Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::RequireFp16}, // SMINP, UMINP, FMINP - Pairwise min. 
- {nullptr, "sminp", Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(8, 8), "pairwise_min", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "sminp", Int(16, 4), "pairwise_min", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(16, 4), "pairwise_min", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "sminp", Int(32, 2), "pairwise_min", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(32, 2), "pairwise_min", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fminp", Float(32, 2), "pairwise_min", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fminp", Float(16, 4), "pairwise_min", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "sminp", Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(8, 8), "pairwise_min", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "sminp", Int(16, 4), "pairwise_min", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(16, 4), "pairwise_min", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "sminp", Int(32, 2), "pairwise_min", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(32, 2), "pairwise_min", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fminp", Float(32, 2), "pairwise_min", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fminp", Float(16, 4), "pairwise_min", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // On arm32, we only have half-width versions of these. {"vpmins", nullptr, Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0}, @@ -645,28 +763,35 @@ const ArmIntrinsic intrinsic_defs[] = { // SDOT, UDOT - Dot products. // Mangle this one manually, there aren't that many and it is a special case. 
- {nullptr, "sdot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v2i32.v8i8", UInt(32, 2), "dot_product", {UInt(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "sdot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v4i32.v16i8", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, + {nullptr, "sdot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v2i32.v8i8", UInt(32, 2), "dot_product", {UInt(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "sdot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v4i32.v16i8", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + // SVE versions. + {nullptr, "sdot.nxv4i32", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv4i32", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv4i32", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "sdot.nxv2i64", Int(64, 2), "dot_product", {Int(64, 2), Int(16, 8), Int(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv2i64", Int(64, 2), "dot_product", {Int(64, 2), UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv2i64", UInt(64, 2), "dot_product", {UInt(64, 2), UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, // ABDL - Widening absolute difference // The ARM backend folds both signed and unsigned widening casts of absd to a widening_absd, so we need to handle both signed and // unsigned input and return types. 
- {"vabdl_i8x8", "vabdl_i8x8", Int(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i8x8", "vabdl_i8x8", UInt(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u8x8", "vabdl_u8x8", Int(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u8x8", "vabdl_u8x8", UInt(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i16x4", "vabdl_i16x4", Int(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i16x4", "vabdl_i16x4", UInt(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u16x4", "vabdl_u16x4", Int(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u16x4", "vabdl_u16x4", UInt(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i32x2", "vabdl_i32x2", Int(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i32x2", "vabdl_i32x2", UInt(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u32x2", "vabdl_u32x2", Int(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u32x2", "vabdl_u32x2", UInt(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i8x8", "vabdl_i8x8", Int(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i8x8", "vabdl_i8x8", UInt(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u8x8", "vabdl_u8x8", Int(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u8x8", "vabdl_u8x8", UInt(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i16x4", "vabdl_i16x4", Int(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i16x4", "vabdl_i16x4", UInt(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u16x4", "vabdl_u16x4", Int(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u16x4", "vabdl_u16x4", UInt(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i32x2", "vabdl_i32x2", Int(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i32x2", "vabdl_i32x2", UInt(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u32x2", "vabdl_u32x2", Int(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u32x2", "vabdl_u32x2", 
UInt(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, }; // List of fp16 math functions which we can avoid "emulated" equivalent code generation. @@ -706,32 +831,103 @@ const std::map float16_transcendental_remapping = { }; // clang-format on -llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const string &name) { - llvm::FunctionType *inner_ty = inner->getFunctionType(); +llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, + VectorTypeConstraint constraint) { + llvm::Type *ret = llvm_type_of(t.element_of()); + if (!t.is_scalar() || scalars_are_vectors) { + int lanes = t.lanes(); + if (constraint == VectorTypeConstraint::VScale) { + lanes /= target_vscale(); + } + ret = get_vector_type(ret, lanes, constraint); + } + return ret; +} + +llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name, + const Type &ret_type, + const std::string &mangled_name, + const std::vector &arg_types, + int intrinsic_flags, + bool sve_intrinsic) { + + auto to_llvm_type = [&](const Type &t) { + return llvm_type_with_constraint(t, (intrinsic_flags & ArmIntrinsic::ScalarsAreVectors), + !sve_intrinsic ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + }; + + llvm::Type *llvm_ret_type = to_llvm_type(ret_type); + std::vector llvm_arg_types; + std::transform(arg_types.begin(), arg_types.end(), std::back_inserter(llvm_arg_types), to_llvm_type); + + const bool add_predicate = sve_intrinsic && !(intrinsic_flags & ArmIntrinsic::SveNoPredicate); + bool add_inactive_arg = sve_intrinsic && (intrinsic_flags & ArmIntrinsic::SveInactiveArg); + bool split_arg0 = intrinsic_flags & ArmIntrinsic::SplitArg0; + + if (!(add_inactive_arg || add_predicate || split_arg0)) { + // No need to wrap + return get_llvm_intrin(llvm_ret_type, mangled_name, llvm_arg_types); + } + + std::vector inner_llvm_arg_types; + std::vector inner_args; + internal_assert(!arg_types.empty()); + const int inner_lanes = split_arg0 ? arg_types[0].lanes() / 2 : arg_types[0].lanes(); + + if (add_inactive_arg) { + // The fallback value has the same type as ret value. + // We don't use this, so just pad it with 0. + inner_llvm_arg_types.push_back(llvm_ret_type); + + Value *zero = Constant::getNullValue(llvm_ret_type); + inner_args.push_back(zero); + } + if (add_predicate) { + llvm::Type *pred_type = to_llvm_type(Int(1, inner_lanes)); + inner_llvm_arg_types.push_back(pred_type); + // Halide does not have general support for predication so use + // constant true for all lanes. 
+ Value *ptrue = Constant::getAllOnesValue(pred_type); + inner_args.push_back(ptrue); + } + if (split_arg0) { + llvm::Type *split_arg_type = to_llvm_type(arg_types[0].with_lanes(inner_lanes)); + inner_llvm_arg_types.push_back(split_arg_type); + inner_llvm_arg_types.push_back(split_arg_type); + internal_assert(arg_types.size() == 1); + } else { + // Push back all argument typs which Halide defines + std::copy(llvm_arg_types.begin(), llvm_arg_types.end(), std::back_inserter(inner_llvm_arg_types)); + } - internal_assert(inner_ty->getNumParams() == 2); - llvm::Type *inner_arg0_ty = inner_ty->getParamType(0); - llvm::Type *inner_arg1_ty = inner_ty->getParamType(1); - int inner_arg0_lanes = get_vector_num_elements(inner_arg0_ty); - int inner_arg1_lanes = get_vector_num_elements(inner_arg1_ty); + llvm::Function *inner = get_llvm_intrin(llvm_ret_type, mangled_name, inner_llvm_arg_types); + llvm::FunctionType *inner_ty = inner->getFunctionType(); - llvm::Type *concat_arg_ty = - get_vector_type(inner_arg0_ty->getScalarType(), inner_arg0_lanes + inner_arg1_lanes); + llvm::FunctionType *wrapper_ty = llvm::FunctionType::get(inner_ty->getReturnType(), llvm_arg_types, false); - // Make a wrapper. - llvm::FunctionType *wrapper_ty = - llvm::FunctionType::get(inner_ty->getReturnType(), {concat_arg_ty}, false); + string wrapper_name = inner_name + unique_name("_wrapper"); llvm::Function *wrapper = - llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage, name, module.get()); + llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage, wrapper_name, module.get()); llvm::BasicBlock *block = llvm::BasicBlock::Create(module->getContext(), "entry", wrapper); IRBuilderBase::InsertPoint here = builder->saveIP(); builder->SetInsertPoint(block); + if (split_arg0) { + // Call the real intrinsic. + Value *low = slice_vector(wrapper->getArg(0), 0, inner_lanes); + Value *high = slice_vector(wrapper->getArg(0), inner_lanes, inner_lanes); + inner_args.push_back(low); + inner_args.push_back(high); + internal_assert(inner_llvm_arg_types.size() == 2); + } else { + for (auto *itr = wrapper->arg_begin(); itr != wrapper->arg_end(); ++itr) { + inner_args.push_back(itr); + } + } + // Call the real intrinsic. - Value *low = slice_vector(wrapper->getArg(0), 0, inner_arg0_lanes); - Value *high = slice_vector(wrapper->getArg(0), inner_arg0_lanes, inner_arg1_lanes); - Value *ret = builder->CreateCall(inner, {low, high}); + Value *ret = builder->CreateCall(inner, inner_args); builder->CreateRet(ret); // Always inline these wrappers. @@ -746,15 +942,32 @@ llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, c void CodeGen_ARM::init_module() { CodeGen_Posix::init_module(); - if (neon_intrinsics_disabled()) { + const bool has_neon = !target.has_feature(Target::NoNEON); + const bool has_sve = target.has_feature(Target::SVE2); + if (!(has_neon || has_sve)) { return; } - string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; + enum class SIMDFlavors { + NeonWidthX1, + NeonWidthX2, + SVE, + }; + + std::vector flavors; + if (has_neon) { + flavors.push_back(SIMDFlavors::NeonWidthX1); + flavors.push_back(SIMDFlavors::NeonWidthX2); + } + if (has_sve) { + flavors.push_back(SIMDFlavors::SVE); + } + for (const ArmIntrinsic &intrin : intrinsic_defs) { if (intrin.flags & ArmIntrinsic::RequireFp16 && !target.has_feature(Target::ARMFp16)) { continue; } + // Get the name of the intrinsic with the appropriate prefix. 
const char *intrin_name = nullptr; if (target.bits == 32) { @@ -765,21 +978,66 @@ void CodeGen_ARM::init_module() { if (!intrin_name) { continue; } - string full_name = intrin_name; - if (!starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoPrefix) == 0) { - full_name = prefix + full_name; - } - // We might have to generate versions of this intrinsic with multiple widths. - vector width_factors = {1}; - if (intrin.flags & ArmIntrinsic::HalfWidth) { - width_factors.push_back(2); - } + // This makes up to three passes defining intrinsics for 64-bit, + // 128-bit, and, if SVE is avaailable, whatever the SVE target width + // is. Some variants will not result in a definition getting added based + // on the target and the intrinsic flags. The intrinsic width may be + // scaled and one of two opcodes may be selected by different + // interations of this loop. + for (const auto flavor : flavors) { + const bool is_sve = (flavor == SIMDFlavors::SVE); + + // Skip intrinsics that are NEON or SVE only depending on whether compiling for SVE. + if (is_sve) { + if (intrin.flags & ArmIntrinsic::SveUnavailable) { + continue; + } + } else { + if (intrin.flags & ArmIntrinsic::SveRequired) { + continue; + } + } + if ((target.bits == 64) && + (intrin.flags & ArmIntrinsic::Neon64Unavailable) && + !is_sve) { + continue; + } + // Already declared in the x1 pass. + if ((flavor == SIMDFlavors::NeonWidthX2) && + !(intrin.flags & ArmIntrinsic::HalfWidth)) { + continue; + } + + string full_name = intrin_name; + const bool is_vanilla_intrinsic = starts_with(full_name, "llvm."); + if (!is_vanilla_intrinsic && (intrin.flags & ArmIntrinsic::NoPrefix) == 0) { + if (target.bits == 32) { + full_name = "llvm.arm.neon." + full_name; + } else { + full_name = (is_sve ? "llvm.aarch64.sve." : "llvm.aarch64.neon.") + full_name; + } + } + + int width_factor = 1; + if (!((intrin.ret_type.lanes <= 1) && (intrin.flags & ArmIntrinsic::NoMangle))) { + switch (flavor) { + case SIMDFlavors::NeonWidthX1: + width_factor = 1; + break; + case SIMDFlavors::NeonWidthX2: + width_factor = 2; + break; + case SIMDFlavors::SVE: + width_factor = (intrin.flags & ArmIntrinsic::HalfWidth) ? 2 : 1; + width_factor *= target_vscale(); + break; + } + } - for (int width_factor : width_factors) { Type ret_type = intrin.ret_type; ret_type = ret_type.with_lanes(ret_type.lanes() * width_factor); - internal_assert(ret_type.bits() * ret_type.lanes() <= 128) << full_name << "\n"; + internal_assert(ret_type.bits() * ret_type.lanes() <= 128 * width_factor) << full_name << "\n"; vector arg_types; arg_types.reserve(4); for (halide_type_t i : intrin.arg_types) { @@ -787,9 +1045,7 @@ void CodeGen_ARM::init_module() { break; } Type arg_type = i; - if (arg_type.is_vector()) { - arg_type = arg_type.with_lanes(arg_type.lanes() * width_factor); - } + arg_type = arg_type.with_lanes(arg_type.lanes() * width_factor); arg_types.emplace_back(arg_type); } @@ -799,7 +1055,7 @@ void CodeGen_ARM::init_module() { if (starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoMangle) == 0) { // Append LLVM name mangling for either the return type or the arguments, or both. 
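The mangling built here follows LLVM's overloaded-intrinsic naming: fixed-width vector types become ".v<lanes><i|f><bits>", scalable SVE vector types become ".nxv<lanes/vscale><i|f><bits>". A minimal sketch of the rule for reference (not the patch's code; the function and parameter names are made up):

    #include <sstream>
    #include <string>

    std::string mangle_vector_type(int lanes, int bits, bool is_float,
                                   bool scalable, int vscale) {
        std::ostringstream s;
        s << (scalable ? ".nxv" : ".v") << (scalable ? lanes / vscale : lanes)
          << (is_float ? "f" : "i") << bits;
        return s.str();
    }

    // e.g. mangle_vector_type(8, 16, false, false, 1) -> ".v8i16"    (NEON)
    //      mangle_vector_type(16, 32, false, true, 4) -> ".nxv4i32"  (SVE, vscale = 4)

For example, the saddlp entry above (MangleRetArgs, x2 width pass) should produce a name like "llvm.aarch64.neon.saddlp.v8i16.v16i8", while the SVE dot-product entries carry their ".nxv4i32" suffix explicitly in the table and are only prefixed.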
vector types; - if (intrin.flags & ArmIntrinsic::MangleArgs) { + if (intrin.flags & ArmIntrinsic::MangleArgs && !is_sve) { types = arg_types; } else if (intrin.flags & ArmIntrinsic::MangleRetArgs) { types = {ret_type}; @@ -808,7 +1064,9 @@ void CodeGen_ARM::init_module() { types = {ret_type}; } for (const Type &t : types) { - mangled_name_builder << ".v" << t.lanes(); + std::string llvm_vector_prefix = is_sve ? ".nxv" : ".v"; + int mangle_lanes = t.lanes() / (is_sve ? target_vscale() : 1); + mangled_name_builder << llvm_vector_prefix << mangle_lanes; if (t.is_int() || t.is_uint()) { mangled_name_builder << "i"; } else if (t.is_float()) { @@ -819,17 +1077,9 @@ void CodeGen_ARM::init_module() { } string mangled_name = mangled_name_builder.str(); - llvm::Function *intrin_impl = nullptr; - if (intrin.flags & ArmIntrinsic::SplitArg0) { - // This intrinsic needs a wrapper to split the argument. - string wrapper_name = intrin.name + unique_name("_wrapper"); - Type split_arg_type = arg_types[0].with_lanes(arg_types[0].lanes() / 2); - llvm::Function *to_wrap = get_llvm_intrin(ret_type, mangled_name, {split_arg_type, split_arg_type}); - intrin_impl = define_concat_args_wrapper(to_wrap, wrapper_name); - } else { - bool scalars_are_vectors = intrin.flags & ArmIntrinsic::ScalarsAreVectors; - intrin_impl = get_llvm_intrin(ret_type, mangled_name, arg_types, scalars_are_vectors); - } + llvm::Function *intrin_impl = define_intrin_wrapper( + intrin.name, ret_type, mangled_name, arg_types, + intrin.flags, is_sve); function_does_not_access_memory(intrin_impl); intrin_impl->addFnAttr(llvm::Attribute::NoUnwind); @@ -862,8 +1112,31 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, CodeGen_Posix::compile_func(func, simple_name, extern_name); } +void CodeGen_ARM::begin_func(LinkageType linkage, const std::string &simple_name, + const std::string &extern_name, const std::vector &args) { + CodeGen_Posix::begin_func(linkage, simple_name, extern_name, args); + + // TODO(https://github.com/halide/Halide/issues/8092): There is likely a + // better way to ensure this is only generated for the outermost function + // that is being compiled. Avoiding the assert on inner functions is both an + // efficiency and a correctness issue as the assertion code may not compile + // in all contexts. + if (linkage != LinkageType::Internal) { + int effective_vscale = target_vscale(); + if (effective_vscale != 0 && !target.has_feature(Target::NoAsserts)) { + // Make sure run-time vscale is equal to compile-time vscale + Expr runtime_vscale = Call::make(Int(32), Call::get_runtime_vscale, {}, Call::PureIntrinsic); + Value *val_runtime_vscale = codegen(runtime_vscale); + Value *val_compiletime_vscale = ConstantInt::get(i32_t, effective_vscale); + Value *cond = builder->CreateICmpEQ(val_runtime_vscale, val_compiletime_vscale); + create_assertion(cond, Call::make(Int(32), "halide_error_vscale_invalid", + {simple_name, runtime_vscale, Expr(effective_vscale)}, Call::Extern)); + } + } +} + void CodeGen_ARM::visit(const Cast *op) { - if (!neon_intrinsics_disabled() && op->type.is_vector()) { + if (!simd_intrinsics_disabled() && op->type.is_vector()) { vector matches; for (const Pattern &pattern : casts) { if (expr_match(pattern.pattern, op, matches)) { @@ -898,14 +1171,11 @@ void CodeGen_ARM::visit(const Cast *op) { } } - // LLVM fptoui generates fcvtzs if src is fp16 scalar else fcvtzu. - // To avoid that, we use neon intrinsic explicitly. 
- if (is_float16_and_has_feature(op->value.type())) { - if (op->type.is_int_or_uint() && op->type.bits() == 16) { - value = call_overloaded_intrin(op->type, "fp_to_int", {op->value}); - if (value) { - return; - } + // LLVM fptoui generates fcvtzs or fcvtzu in inconsistent way + if (op->value.type().is_float() && op->type.is_int_or_uint()) { + if (Value *v = call_overloaded_intrin(op->type, "fp_to_int", {op->value})) { + value = v; + return; } } @@ -913,7 +1183,7 @@ void CodeGen_ARM::visit(const Cast *op) { } void CodeGen_ARM::visit(const Add *op) { - if (neon_intrinsics_disabled() || + if (simd_intrinsics_disabled() || !op->type.is_vector() || !target.has_feature(Target::ARMDotProd) || !op->type.is_int_or_uint() || @@ -997,7 +1267,7 @@ void CodeGen_ARM::visit(const Add *op) { } void CodeGen_ARM::visit(const Sub *op) { - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1012,6 +1282,46 @@ void CodeGen_ARM::visit(const Sub *op) { } } + // Peep-hole (0 - b) pattern to generate "negate" instruction + if (is_const_zero(op->a)) { + if (target_vscale() != 0) { + if ((op->type.bits() >= 8 && op->type.is_int())) { + if (Value *v = call_overloaded_intrin(op->type, "negate", {op->b})) { + value = v; + return; + } + } else if (op->type.bits() >= 16 && op->type.is_float()) { + value = builder->CreateFNeg(codegen(op->b)); + return; + } + } else { + // llvm.neon.neg/fneg intrinsic doesn't seem to exist. Instead, + // llvm will generate floating point negate instructions if we ask for (-0.0f)-x + if (op->type.is_float() && + (op->type.bits() >= 32 || is_float16_and_has_feature(op->type))) { + Constant *a; + if (op->type.bits() == 16) { + a = ConstantFP::getNegativeZero(f16_t); + } else if (op->type.bits() == 32) { + a = ConstantFP::getNegativeZero(f32_t); + } else if (op->type.bits() == 64) { + a = ConstantFP::getNegativeZero(f64_t); + } else { + a = nullptr; + internal_error << "Unknown bit width for floating point type: " << op->type << "\n"; + } + + Value *b = codegen(op->b); + + if (op->type.lanes() > 1) { + a = get_splat(op->type.lanes(), a); + } + value = builder->CreateFSub(a, b); + return; + } + } + } + // llvm will generate floating point negate instructions if we ask for (-0.0f)-x if (op->type.is_float() && (op->type.bits() >= 32 || is_float16_and_has_feature(op->type)) && @@ -1042,7 +1352,7 @@ void CodeGen_ARM::visit(const Sub *op) { void CodeGen_ARM::visit(const Min *op) { // Use a 2-wide vector for scalar floats. - if (!neon_intrinsics_disabled() && (op->type == Float(32) || op->type.is_vector())) { + if (!simd_intrinsics_disabled() && (op->type.is_float() || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "min", {op->a, op->b}); if (value) { return; @@ -1054,7 +1364,7 @@ void CodeGen_ARM::visit(const Min *op) { void CodeGen_ARM::visit(const Max *op) { // Use a 2-wide vector for scalar floats. 
- if (!neon_intrinsics_disabled() && (op->type == Float(32) || op->type.is_vector())) { + if (!simd_intrinsics_disabled() && (op->type.is_float() || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "max", {op->a, op->b}); if (value) { return; @@ -1066,12 +1376,13 @@ void CodeGen_ARM::visit(const Max *op) { void CodeGen_ARM::visit(const Store *op) { // Predicated store - if (!is_const_one(op->predicate)) { + const bool is_predicated_store = !is_const_one(op->predicate); + if (is_predicated_store && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1079,8 +1390,8 @@ void CodeGen_ARM::visit(const Store *op) { // A dense store of an interleaving can be done using a vst2 intrinsic const Ramp *ramp = op->index.as(); - // We only deal with ramps here - if (!ramp) { + // We only deal with ramps here except for SVE2 + if (!ramp && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } @@ -1102,21 +1413,27 @@ void CodeGen_ARM::visit(const Store *op) { intrin_type = t; Type elt = t.element_of(); int vec_bits = t.bits() * t.lanes(); - if (elt == Float(32) || + if (elt == Float(32) || elt == Float(64) || is_float16_and_has_feature(elt) || - elt == Int(8) || elt == Int(16) || elt == Int(32) || - elt == UInt(8) || elt == UInt(16) || elt == UInt(32)) { + elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || + elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { + // TODO(zvookin): Handle vector_bits_*. if (vec_bits % 128 == 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(128 / t.bits()); + int target_vector_bits = target.vector_bits; + if (target_vector_bits == 0) { + target_vector_bits = 128; + } + intrin_type = intrin_type.with_lanes(target_vector_bits / t.bits()); } else if (vec_bits % 64 == 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(64 / t.bits()); + auto intrin_bits = (vec_bits % 128 == 0 || target.has_feature(Target::SVE2)) ? 128 : 64; + intrin_type = intrin_type.with_lanes(intrin_bits / t.bits()); } } } - if (is_const_one(ramp->stride) && + if (ramp && is_const_one(ramp->stride) && shuffle && shuffle->is_interleave() && type_ok_for_vst && 2 <= shuffle->vectors.size() && shuffle->vectors.size() <= 4) { @@ -1138,11 +1455,14 @@ void CodeGen_ARM::visit(const Store *op) { for (int i = 0; i < num_vecs; ++i) { args[i] = codegen(shuffle->vectors[i]); } + Value *store_pred_val = codegen(op->predicate); + + bool is_sve = target.has_feature(Target::SVE2); // Declare the function std::ostringstream instr; vector arg_types; - llvm::Type *intrin_llvm_type = llvm_type_of(intrin_type); + llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, false, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); #if LLVM_VERSION >= 170 const bool is_opaque = true; #else @@ -1160,27 +1480,38 @@ void CodeGen_ARM::visit(const Store *op) { arg_types.front() = i8_t->getPointerTo(); arg_types.back() = i32_t; } else { - instr << "llvm.aarch64.neon.st" - << num_vecs - << ".v" - << intrin_type.lanes() - << (t.is_float() ? 'f' : 'i') - << t.bits() - << ".p0"; - if (!is_opaque) { - instr << (t.is_float() ? 'f' : 'i') << t.bits(); + if (is_sve) { + instr << "llvm.aarch64.sve.st" + << num_vecs + << ".nxv" + << (intrin_type.lanes() / target_vscale()) + << (t.is_float() ? 
'f' : 'i') + << t.bits(); + arg_types = vector(num_vecs, intrin_llvm_type); + arg_types.emplace_back(get_vector_type(i1_t, intrin_type.lanes() / target_vscale(), VectorTypeConstraint::VScale)); // predicate + arg_types.emplace_back(llvm_type_of(intrin_type.element_of())->getPointerTo()); + } else { + instr << "llvm.aarch64.neon.st" + << num_vecs + << ".v" + << intrin_type.lanes() + << (t.is_float() ? 'f' : 'i') + << t.bits() + << ".p0"; + if (!is_opaque) { + instr << (t.is_float() ? 'f' : 'i') << t.bits(); + } + arg_types = vector(num_vecs + 1, intrin_llvm_type); + arg_types.back() = llvm_type_of(intrin_type.element_of())->getPointerTo(); } - arg_types = vector(num_vecs + 1, intrin_llvm_type); - arg_types.back() = llvm_type_of(intrin_type.element_of())->getPointerTo(); } llvm::FunctionType *fn_type = FunctionType::get(llvm::Type::getVoidTy(*context), arg_types, false); llvm::FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); internal_assert(fn); - // How many vst instructions do we need to generate? - int slices = t.lanes() / intrin_type.lanes(); + // SVE2 supports predication for smaller than whole vector size. + internal_assert(target.has_feature(Target::SVE2) || (t.lanes() >= intrin_type.lanes())); - internal_assert(slices >= 1); for (int i = 0; i < t.lanes(); i += intrin_type.lanes()) { Expr slice_base = simplify(ramp->base + i * num_vecs); Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_type.lanes() * num_vecs); @@ -1190,6 +1521,7 @@ void CodeGen_ARM::visit(const Store *op) { // Take a slice of each arg for (int j = 0; j < num_vecs; j++) { slice_args[j] = slice_vector(slice_args[j], i, intrin_type.lanes()); + slice_args[j] = convert_fixed_or_scalable_vector_type(slice_args[j], get_vector_type(slice_args[j]->getType()->getScalarType(), intrin_type.lanes())); } if (target.bits == 32) { @@ -1200,10 +1532,30 @@ void CodeGen_ARM::visit(const Store *op) { // Set the alignment argument slice_args.push_back(ConstantInt::get(i32_t, alignment)); } else { + if (is_sve) { + // Set the predicate argument + auto active_lanes = std::min(t.lanes() - i, intrin_type.lanes()); + Value *vpred_val; + if (is_predicated_store) { + vpred_val = slice_vector(store_pred_val, i, intrin_type.lanes()); + } else { + Expr vpred = make_vector_predicate_1s_0s(active_lanes, intrin_type.lanes() - active_lanes); + vpred_val = codegen(vpred); + } + slice_args.push_back(vpred_val); + } // Set the pointer argument slice_args.push_back(ptr); } + if (is_sve) { + for (auto &arg : slice_args) { + if (arg->getType()->isVectorTy()) { + arg = match_vector_type_scalable(arg, VectorTypeConstraint::VScale); + } + } + } + CallInst *store = builder->CreateCall(fn, slice_args); add_tbaa_metadata(store, op->name, slice_ramp); } @@ -1216,8 +1568,95 @@ void CodeGen_ARM::visit(const Store *op) { return; } + if (target.has_feature(Target::SVE2)) { + const IntImm *stride = ramp ? 
ramp->stride.as() : nullptr; + if (stride && stride->value == 1) { + // Basically we can deal with vanilla codegen, + // but to avoid LLVM error, process with the multiple of natural_lanes + const int natural_lanes = target.natural_vector_size(op->value.type()); + if (ramp->lanes % natural_lanes) { + int aligned_lanes = align_up(ramp->lanes, natural_lanes); + // Use predicate to prevent overrun + Expr vpred; + if (is_predicated_store) { + vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)}); + } else { + vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes); + } + auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes); + Expr padding = make_zero(op->value.type().with_lanes(aligned_lanes - ramp->lanes)); + Expr aligned_value = Shuffle::make_concat({op->value, padding}); + codegen(Store::make(op->name, aligned_value, aligned_index, op->param, vpred, op->alignment)); + return; + } + } else if (op->index.type().is_vector()) { + // Scatter + Type elt = op->value.type().element_of(); + + // Rewrite float16 case into reinterpret and Store in uint16, as it is unsupported in LLVM + if (is_float16_and_has_feature(elt)) { + Type u16_type = op->value.type().with_code(halide_type_uint); + Expr v = reinterpret(u16_type, op->value); + codegen(Store::make(op->name, v, op->index, op->param, op->predicate, op->alignment)); + return; + } + + const int store_lanes = op->value.type().lanes(); + const int index_bits = 32; + Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); + // The number of lanes is constrained by index vector type + const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int vscale_natural_lanes = natural_lanes / target_vscale(); + + Expr base = 0; + Value *elt_ptr = codegen_buffer_pointer(op->name, elt, base); + Value *val = codegen(op->value); + Value *index = codegen(op->index); + Value *store_pred_val = codegen(op->predicate); + + llvm::Type *slice_type = get_vector_type(llvm_type_of(elt), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *slice_index_type = get_vector_type(llvm_type_of(op->index.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *pred_type = get_vector_type(llvm_type_of(op->predicate.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + + std::ostringstream instr; + instr << "llvm.aarch64.sve.st1.scatter.uxtw." + << (elt.bits() != 8 ? "index." : "") // index is scaled into bytes + << "nxv" + << vscale_natural_lanes + << (elt == Float(32) || elt == Float(64) ? 
'f' : 'i') + << elt.bits(); + + vector arg_types{slice_type, pred_type, elt_ptr->getType(), slice_index_type}; + llvm::FunctionType *fn_type = FunctionType::get(void_t, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // We need to slice the result into native vector lanes to use intrinsic + for (int i = 0; i < store_lanes; i += natural_lanes) { + Value *slice_value = slice_vector(val, i, natural_lanes); + Value *slice_index = slice_vector(index, i, natural_lanes); + const int active_lanes = std::min(store_lanes - i, natural_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, natural_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type); + if (is_predicated_store) { + Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val); + } + + slice_value = match_vector_type_scalable(slice_value, VectorTypeConstraint::VScale); + vpred_val = match_vector_type_scalable(vpred_val, VectorTypeConstraint::VScale); + slice_index = match_vector_type_scalable(slice_index, VectorTypeConstraint::VScale); + CallInst *store = builder->CreateCall(fn, {slice_value, vpred_val, elt_ptr, slice_index}); + add_tbaa_metadata(store, op->name, op->index); + } + + return; + } + } + // If the stride is one or minus one, we can deal with that using vanilla codegen - const IntImm *stride = ramp->stride.as(); + const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && (stride->value == 1 || stride->value == -1)) { CodeGen_Posix::visit(op); return; @@ -1250,12 +1689,13 @@ void CodeGen_ARM::visit(const Store *op) { void CodeGen_ARM::visit(const Load *op) { // Predicated load - if (!is_const_one(op->predicate)) { + const bool is_predicated_load = !is_const_one(op->predicate); + if (is_predicated_load && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1263,14 +1703,15 @@ void CodeGen_ARM::visit(const Load *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here - if (!ramp) { + if (!ramp && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? 
ramp->stride.as() : nullptr; - if (stride && (-1 <= stride->value && stride->value <= 1)) { + if (stride && (-1 <= stride->value && stride->value <= 1) && + !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } @@ -1296,6 +1737,168 @@ void CodeGen_ARM::visit(const Load *op) { } } + if (target.has_feature(Target::SVE2)) { + if (stride && stride->value < 1) { + CodeGen_Posix::visit(op); + return; + } else if (stride && stride->value == 1) { + const int natural_lanes = target.natural_vector_size(op->type); + if (ramp->lanes % natural_lanes) { + // Load with lanes multiple of natural_lanes + int aligned_lanes = align_up(ramp->lanes, natural_lanes); + // Use predicate to prevent from overrun + Expr vpred; + if (is_predicated_load) { + vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)}); + } else { + vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes); + } + auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes); + auto aligned_type = op->type.with_lanes(aligned_lanes); + value = codegen(Load::make(aligned_type, op->name, aligned_index, op->image, op->param, vpred, op->alignment)); + value = slice_vector(value, 0, ramp->lanes); + return; + } else { + CodeGen_Posix::visit(op); + return; + } + } else if (stride && (2 <= stride->value && stride->value <= 4)) { + // Structured load ST2/ST3/ST4 of SVE + + Expr base = ramp->base; + ModulusRemainder align = op->alignment; + + int aligned_stride = gcd(stride->value, align.modulus); + int offset = 0; + if (aligned_stride == stride->value) { + offset = mod_imp((int)align.remainder, aligned_stride); + } else { + const Add *add = base.as(); + if (const IntImm *add_c = add ? add->b.as() : base.as()) { + offset = mod_imp(add_c->value, stride->value); + } + } + + if (offset) { + base = simplify(base - offset); + } + + Value *load_pred_val = codegen(op->predicate); + + // We need to slice the result in to native vector lanes to use sve intrin. + // LLVM will optimize redundant ld instructions afterwards + const int slice_lanes = target.natural_vector_size(op->type); + vector results; + for (int i = 0; i < op->type.lanes(); i += slice_lanes) { + int load_base_i = i * stride->value; + Expr slice_base = simplify(base + load_base_i); + Expr slice_index = Ramp::make(slice_base, stride, slice_lanes); + std::ostringstream instr; + instr << "llvm.aarch64.sve.ld" + << stride->value + << ".sret.nxv" + << slice_lanes + << (op->type.is_float() ? 
'f' : 'i') + << op->type.bits(); + llvm::Type *elt = llvm_type_of(op->type.element_of()); + llvm::Type *slice_type = get_vector_type(elt, slice_lanes); + StructType *sret_type = StructType::get(module->getContext(), std::vector(stride->value, slice_type)); + std::vector arg_types{get_vector_type(i1_t, slice_lanes), PointerType::get(elt, 0)}; + llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // Set the predicate argument + int active_lanes = std::min(op->type.lanes() - i, slice_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, slice_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, get_vector_type(vpred_val->getType()->getScalarType(), slice_lanes)); + if (is_predicated_load) { + Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, slice_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val); + } + + Value *elt_ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base); + CallInst *load_i = builder->CreateCall(fn, {vpred_val, elt_ptr}); + add_tbaa_metadata(load_i, op->name, slice_index); + // extract one element out of returned struct + Value *extracted = builder->CreateExtractValue(load_i, offset); + results.push_back(extracted); + } + + // Retrieve original lanes + value = concat_vectors(results); + value = slice_vector(value, 0, op->type.lanes()); + return; + } else if (op->index.type().is_vector()) { + // General Gather Load + + // Rewrite float16 case into load in uint16 and reinterpret, as it is unsupported in LLVM + if (is_float16_and_has_feature(op->type)) { + Type u16_type = op->type.with_code(halide_type_uint); + Expr equiv = Load::make(u16_type, op->name, op->index, op->image, op->param, op->predicate, op->alignment); + equiv = reinterpret(op->type, equiv); + equiv = common_subexpression_elimination(equiv); + value = codegen(equiv); + return; + } + + Type elt = op->type.element_of(); + const int load_lanes = op->type.lanes(); + const int index_bits = 32; + Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); + // The number of lanes is constrained by index vector type + const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int vscale_natural_lanes = natural_lanes / target_vscale(); + + Expr base = 0; + Value *elt_ptr = codegen_buffer_pointer(op->name, elt, base); + Value *index = codegen(op->index); + Value *load_pred_val = codegen(op->predicate); + + llvm::Type *slice_type = get_vector_type(llvm_type_of(elt), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *slice_index_type = get_vector_type(llvm_type_of(op->index.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *pred_type = get_vector_type(llvm_type_of(op->predicate.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + + std::ostringstream instr; + instr << "llvm.aarch64.sve.ld1.gather.uxtw." + << (elt.bits() != 8 ? "index." : "") // index is scaled into bytes + << "nxv" + << vscale_natural_lanes + << (elt == Float(32) || elt == Float(64) ? 
'f' : 'i') + << elt.bits(); + + llvm::FunctionType *fn_type = FunctionType::get(slice_type, {pred_type, elt_ptr->getType(), slice_index_type}, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // We need to slice the result in to native vector lanes to use intrinsic + vector results; + for (int i = 0; i < load_lanes; i += natural_lanes) { + Value *slice_index = slice_vector(index, i, natural_lanes); + + const int active_lanes = std::min(load_lanes - i, natural_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, natural_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + if (is_predicated_load) { + Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val); + } + + vpred_val = match_vector_type_scalable(vpred_val, VectorTypeConstraint::VScale); + slice_index = match_vector_type_scalable(slice_index, VectorTypeConstraint::VScale); + CallInst *gather = builder->CreateCall(fn, {vpred_val, elt_ptr, slice_index}); + add_tbaa_metadata(gather, op->name, op->index); + results.push_back(gather); + } + + // Retrieve original lanes + value = concat_vectors(results); + value = slice_vector(value, 0, load_lanes); + return; + } + } + CodeGen_Posix::visit(op); } @@ -1322,6 +1925,33 @@ void CodeGen_ARM::visit(const Shuffle *op) { } } +void CodeGen_ARM::visit(const Ramp *op) { + if (target_vscale() != 0 && op->type.is_int_or_uint()) { + if (is_const_zero(op->base) && is_const_one(op->stride)) { + codegen_func_t cg_func = [&](int lanes, const std::vector &args) { + internal_assert(args.empty()); + // Generate stepvector intrinsic for ScalableVector + return builder->CreateStepVector(llvm_type_of(op->type.with_lanes(lanes))); + }; + + // codgen with next-power-of-two lanes, because if we sliced into natural_lanes(e.g. 
4), + // it would produce {0,1,2,3,0,1,..} instead of {0,1,2,3,4,5,..} + const int ret_lanes = op->type.lanes(); + const int aligned_lanes = next_power_of_two(ret_lanes); + value = codegen_with_lanes(aligned_lanes, ret_lanes, {}, cg_func); + return; + } else { + Expr broadcast_base = Broadcast::make(op->base, op->lanes); + Expr broadcast_stride = Broadcast::make(op->stride, op->lanes); + Expr step_ramp = Ramp::make(make_zero(op->base.type()), make_one(op->base.type()), op->lanes); + value = codegen(broadcast_base + broadcast_stride * step_ramp); + return; + } + } + + CodeGen_Posix::visit(op); +} + void CodeGen_ARM::visit(const Call *op) { if (op->is_intrinsic(Call::sorted_avg)) { value = codegen(halving_add(op->args[0], op->args[1])); @@ -1407,7 +2037,6 @@ void CodeGen_ARM::visit(const Call *op) { for (const auto &i : cast_rewrites) { if (expr_match(i.first, op, matches)) { Expr replacement = substitute("*", matches[0], with_lanes(i.second, op->type.lanes())); - debug(3) << "rewriting cast to: " << replacement << " from " << Expr(op) << "\n"; value = codegen(replacement); return; } @@ -1464,14 +2093,28 @@ void CodeGen_ARM::visit(const LE *op) { } void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { - if (neon_intrinsics_disabled() || - op->op == VectorReduce::Or || - op->op == VectorReduce::And || - op->op == VectorReduce::Mul) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::codegen_vector_reduce(op, init); return; } + if (codegen_dot_product_vector_reduce(op, init)) { + return; + } + if (codegen_pairwise_vector_reduce(op, init)) { + return; + } + if (codegen_across_vector_reduce(op, init)) { + return; + } + CodeGen_Posix::codegen_vector_reduce(op, init); +} + +bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, const Expr &init) { + if (op->op != VectorReduce::Add) { + return false; + } + struct Pattern { VectorReduce::Operator reduce_op; int factor; @@ -1485,11 +2128,23 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd}, {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd}, {VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd}, + {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i64(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i64(widening_mul(wild_u16x_, wild_u16x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, u64(widening_mul(wild_u16x_, wild_u16x_)), "dot_product", Target::SVE2}, // A sum is the same as a dot product with a vector of ones, and this appears to // be a bit faster. 
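To make the comment above concrete, a small worked example of the identity being exploited:

    sum({1, 2, 3, 4})               = 1 + 2 + 3 + 4          = 10
    dot({1, 2, 3, 4}, {1, 1, 1, 1}) = 1*1 + 2*1 + 3*1 + 4*1  = 10

so the {1} in extra_operands below substitutes a broadcast one for the missing multiplicand and lets a plain 4-way sum reuse the SDOT/UDOT (or SVE dot) path.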
{VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::ARMDotProd, {1}}, {VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}}, {VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}}, + {VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i64(wild_i16x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i64(wild_u16x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, u64(wild_u16x_), "dot_product", Target::SVE2, {1}}, }; // clang-format on @@ -1507,7 +2162,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / p.factor); equiv = VectorReduce::make(op->op, equiv, op->type.lanes()); codegen_vector_reduce(equiv.as(), init); - return; + return true; } for (int i : p.extra_operands) { @@ -1518,6 +2173,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init if (!i.defined()) { i = make_zero(op->type); } + if (const Shuffle *s = matches[0].as()) { if (s->is_broadcast()) { // LLVM wants the broadcast as the second operand for the broadcasting @@ -1525,15 +2181,27 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init std::swap(matches[0], matches[1]); } } - value = call_overloaded_intrin(op->type, p.intrin, {i, matches[0], matches[1]}); - if (value) { - return; + + if (Value *v = call_overloaded_intrin(op->type, p.intrin, {i, matches[0], matches[1]})) { + value = v; + return true; } } } + return false; +} + +bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const Expr &init) { + if (op->op != VectorReduce::Add && + op->op != VectorReduce::Max && + op->op != VectorReduce::Min) { + return false; + } + // TODO: Move this to be patterns? The patterns are pretty trivial, but some // of the other logic is tricky. + int factor = op->value.type().lanes() / op->type.lanes(); const char *intrin = nullptr; vector intrin_args; Expr accumulator = init; @@ -1547,33 +2215,38 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init narrow = lossless_cast(narrow_type.with_code(Type::UInt), op->value); } if (narrow.defined()) { - if (init.defined() && target.bits == 32) { - // On 32-bit, we have an intrinsic for widening add-accumulate. + if (init.defined() && (target.bits == 32 || target.has_feature(Target::SVE2))) { + // On 32-bit or SVE2, we have an intrinsic for widening add-accumulate. // TODO: this could be written as a pattern with widen_right_add (#6951). intrin = "pairwise_widening_add_accumulate"; intrin_args = {accumulator, narrow}; accumulator = Expr(); + } else if (target.has_feature(Target::SVE2)) { + intrin = "pairwise_widening_add_accumulate"; + intrin_args = {Expr(0), narrow}; + accumulator = Expr(); } else { // On 64-bit, LLVM pattern matches widening add-accumulate if // we give it the widening add. 
intrin = "pairwise_widening_add"; intrin_args = {narrow}; } - } else { + } else if (!target.has_feature(Target::SVE2)) { + // Exclude SVE, as it process lanes in different order (even/odd wise) than NEON intrin = "pairwise_add"; intrin_args = {op->value}; } - } else if (op->op == VectorReduce::Min && factor == 2) { + } else if (op->op == VectorReduce::Min && factor == 2 && !target.has_feature(Target::SVE2)) { intrin = "pairwise_min"; intrin_args = {op->value}; - } else if (op->op == VectorReduce::Max && factor == 2) { + } else if (op->op == VectorReduce::Max && factor == 2 && !target.has_feature(Target::SVE2)) { intrin = "pairwise_max"; intrin_args = {op->value}; } if (intrin) { - value = call_overloaded_intrin(op->type, intrin, intrin_args); - if (value) { + if (Value *v = call_overloaded_intrin(op->type, intrin, intrin_args)) { + value = v; if (accumulator.defined()) { // We still have an initial value to take care of string n = unique_name('t'); @@ -1595,11 +2268,126 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init codegen(accumulator); sym_pop(n); } - return; + return true; } } - CodeGen_Posix::codegen_vector_reduce(op, init); + return false; +} + +bool CodeGen_ARM::codegen_across_vector_reduce(const VectorReduce *op, const Expr &init) { + if (target_vscale() == 0) { + // Leave this to vanilla codegen to emit "llvm.vector.reduce." intrinsic, + // which doesn't support scalable vector in LLVM 14 + return false; + } + + if (op->op != VectorReduce::Add && + op->op != VectorReduce::Max && + op->op != VectorReduce::Min) { + return false; + } + + Expr val = op->value; + const int output_lanes = op->type.lanes(); + const int native_lanes = target.natural_vector_size(op->type); + const int input_lanes = val.type().lanes(); + const int input_bits = op->type.bits(); + Type elt = op->type.element_of(); + + if (output_lanes != 1 || input_lanes < 2) { + return false; + } + + Expr (*binop)(Expr, Expr) = nullptr; + std::string op_name; + switch (op->op) { + case VectorReduce::Add: + binop = Add::make; + op_name = "add"; + break; + case VectorReduce::Min: + binop = Min::make; + op_name = "min"; + break; + case VectorReduce::Max: + binop = Max::make; + op_name = "max"; + break; + default: + internal_error << "unreachable"; + } + + if (input_lanes == native_lanes) { + std::stringstream name; // e.g. llvm.aarch64.sve.sminv.nxv4i32 + name << "llvm.aarch64.sve." + << (op->type.is_float() ? "f" : op->type.is_int() ? "s" : + "u") + << op_name << "v" + << ".nxv" << (native_lanes / target_vscale()) << (op->type.is_float() ? "f" : "i") << input_bits; + + // Integer add accumulation output is 64 bit only + const bool type_upgraded = op->op == VectorReduce::Add && op->type.is_int_or_uint(); + const int output_bits = type_upgraded ? 64 : input_bits; + Type intrin_ret_type = op->type.with_bits(output_bits); + + const string intrin_name = name.str(); + + Expr pred = const_true(native_lanes); + vector args{pred, op->value}; + + // Make sure the declaration exists, or the codegen for + // call will assume that the args should scalarize. 
+ if (!module->getFunction(intrin_name)) { + vector arg_types; + for (const Expr &e : args) { + arg_types.push_back(llvm_type_with_constraint(e.type(), false, VectorTypeConstraint::VScale)); + } + FunctionType *func_t = FunctionType::get(llvm_type_with_constraint(intrin_ret_type, false, VectorTypeConstraint::VScale), + arg_types, false); + llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); + } + + Expr equiv = Call::make(intrin_ret_type, intrin_name, args, Call::PureExtern); + if (type_upgraded) { + equiv = Cast::make(op->type, equiv); + } + if (init.defined()) { + equiv = binop(init, equiv); + } + equiv = common_subexpression_elimination(equiv); + equiv.accept(this); + return true; + + } else if (input_lanes < native_lanes) { + // Create equivalent where lanes==native_lanes by padding data which doesn't affect the result + Expr padding; + const int inactive_lanes = native_lanes - input_lanes; + + switch (op->op) { + case VectorReduce::Add: + padding = make_zero(elt.with_lanes(inactive_lanes)); + break; + case VectorReduce::Min: + padding = elt.with_lanes(inactive_lanes).min(); + break; + case VectorReduce::Max: + padding = elt.with_lanes(inactive_lanes).max(); + break; + default: + internal_error << "unreachable"; + } + + Expr equiv = VectorReduce::make(op->op, Shuffle::make_concat({val, padding}), 1); + if (init.defined()) { + equiv = binop(equiv, init); + } + equiv = common_subexpression_elimination(equiv); + equiv.accept(this); + return true; + } + + return false; } Type CodeGen_ARM::upgrade_type_for_arithmetic(const Type &t) const { @@ -1623,6 +2411,39 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const { return CodeGen_Posix::upgrade_type_for_storage(t); } +Value *CodeGen_ARM::codegen_with_lanes(int slice_lanes, int total_lanes, + const std::vector &args, codegen_func_t &cg_func) { + std::vector llvm_args; + // codegen args + for (const auto &arg : args) { + llvm_args.push_back(codegen(arg)); + } + + if (slice_lanes == total_lanes) { + // codegen op + return cg_func(slice_lanes, llvm_args); + } + + std::vector results; + for (int start = 0; start < total_lanes; start += slice_lanes) { + std::vector sliced_args; + for (auto &llvm_arg : llvm_args) { + Value *v = llvm_arg; + if (get_vector_num_elements(llvm_arg->getType()) == total_lanes) { + // Except for scalar argument which some ops have, arguments are sliced + v = slice_vector(llvm_arg, start, slice_lanes); + } + sliced_args.push_back(v); + } + // codegen op + value = cg_func(slice_lanes, sliced_args); + results.push_back(value); + } + // Restore the results into vector with total_lanes + value = concat_vectors(results); + return slice_vector(value, 0, total_lanes); +} + string CodeGen_ARM::mcpu_target() const { if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { @@ -1635,6 +2456,8 @@ string CodeGen_ARM::mcpu_target() const { return "cyclone"; } else if (target.os == Target::OSX) { return "apple-a12"; + } else if (target.has_feature(Target::SVE2)) { + return "cortex-x1"; } else { return "generic"; } @@ -1667,6 +2490,7 @@ string CodeGen_ARM::mattrs() const { } } else { // TODO: Should Halide's SVE flags be 64-bit only? + // TODO: Sound we ass "-neon" if NoNEON is set? Does this make any sense? 
if (target.has_feature(Target::SVE2)) { attrs.emplace_back("+sve2"); } else if (target.has_feature(Target::SVE)) { @@ -1689,7 +2513,21 @@ bool CodeGen_ARM::use_soft_float_abi() const { } int CodeGen_ARM::native_vector_bits() const { - return 128; + if (target.has_feature(Target::SVE) || target.has_feature(Target::SVE2)) { + return std::max(target.vector_bits, 128); + } else { + return 128; + } +} + +int CodeGen_ARM::target_vscale() const { + if (target.features_any_of({Target::SVE, Target::SVE2})) { + user_assert(target.vector_bits != 0) << "For SVE/SVE2 support, target_vector_bits= must be set in target.\n"; + user_assert((target.vector_bits % 128) == 0) << "For SVE/SVE2 support, target_vector_bits must be a multiple of 128.\n"; + return target.vector_bits / 128; + } + + return 0; } bool CodeGen_ARM::supports_call_as_float16(const Call *op) const { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 8922461524c5..1871460569c3 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -657,7 +657,11 @@ void CodeGen_LLVM::end_func(const std::vector &args) { } } - internal_assert(!verifyFunction(*function, &llvm::errs())); + bool valid = !verifyFunction(*function, &llvm::errs()); + if (!valid) { + function->print(dbgs()); + } + internal_assert(valid) << "Generated function does not pass LLVM's verifyFunction.\n"; current_function_args.clear(); } @@ -1348,10 +1352,6 @@ bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } -int next_power_of_two(int x) { - return static_cast(1) << static_cast(std::ceil(std::log2(x))); -} - } // namespace Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { @@ -1449,16 +1449,16 @@ void CodeGen_LLVM::visit(const Cast *op) { } value = codegen(op->value); - llvm::Type *llvm_dst = llvm_type_of(dst); + llvm::Type *llvm_dst = llvm_type_of(dst.element_of()); + if (value->getType()->isVectorTy()) { + llvm_dst = VectorType::get(llvm_dst, dyn_cast(value->getType())->getElementCount()); + } if (dst.is_handle() && src.is_handle()) { value = builder->CreateBitCast(value, llvm_dst); } else if (dst.is_handle() || src.is_handle()) { internal_error << "Can't cast from " << src << " to " << dst << "\n"; } else if (!src.is_float() && !dst.is_float()) { - // Widening integer casts either zero extend or sign extend, - // depending on the source type. Narrowing integer casts - // always truncate. 
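// Worked example of the destination-type construction above (types here are
// illustrative): llvm_dst starts as the scalar element type of dst, and if the
// operand is a vector it is re-wrapped with the operand's own ElementCount, so
// scalable operands stay scalable. Casting a value of LLVM type
// <vscale x 4 x i16> to a Halide i32 vector yields llvm_dst == <vscale x 4 x i32>,
// whereas a fixed <8 x i16> operand yields <8 x i32>.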
value = builder->CreateIntCast(value, llvm_dst, src.is_int()); } else if (src.is_float() && dst.is_int()) { value = builder->CreateFPToSI(value, llvm_dst); @@ -1879,6 +1879,11 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); + if (a->getType()->isVectorTy()) { + cmp = match_vector_type_scalable(cmp, a); + b = match_vector_type_scalable(b, a); + } + if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), NoMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) { value = builder->CreateSelect(cmp, a, b); @@ -2266,6 +2271,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *vpred = codegen(op->predicate); Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); + vpred = match_vector_type_scalable(vpred, value); int alignment = value_type.bytes(); int native_bytes = native_vector_bits() / 8; @@ -2357,7 +2363,6 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri llvm::Value *vpred, bool slice_to_native, llvm::Value *stride) { debug(4) << "Vectorize predicated dense vector load:\n\t" << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; - int align_bytes = type.bytes(); // The size of a single element int native_bits = native_vector_bits(); @@ -2402,7 +2407,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); - Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; + Value *slice_mask = (vpred != nullptr) ? match_vector_type_scalable(slice_vector(vpred, i, slice_lanes), slice_type) : nullptr; MaskVariant vp_slice_mask = slice_mask ? MaskVariant(slice_mask) : AllEnabledMask(); Instruction *load_inst = nullptr; @@ -3304,6 +3309,8 @@ void CodeGen_LLVM::visit(const Call *op) { value = codegen(lower_extract_bits(op)); } else if (op->is_intrinsic(Call::concat_bits)) { value = codegen(lower_concat_bits(op)); + } else if (op->is_intrinsic(Call::get_runtime_vscale)) { + value = builder->CreateVScale(ConstantInt::get(i32_t, 1)); } else if (op->is_intrinsic()) { Expr lowered = lower_intrinsic(op); if (!lowered.defined()) { @@ -3478,6 +3485,11 @@ void CodeGen_LLVM::visit(const Call *op) { << halide_arg << "\n"; args[i] = builder->CreatePointerCast(args[i], t); } + } else if (args[i]->getType()->isVectorTy()) { + llvm::Type *t = func_t->getParamType(i); + if (t->isVectorTy()) { + args[i] = match_vector_type_scalable(args[i], t); + } } } } @@ -4274,14 +4286,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini break; case VectorReduce::Min: name = "fmin"; - // TODO(zvookin): Not correct for stricT_float. See: https://github.com/halide/Halide/issues/7118 + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.max(); } break; case VectorReduce::Max: name = "fmax"; - // TODO(zvookin): Not correct for stricT_float. See: https://github.com/halide/Halide/issues/7118 + // TODO(zvookin): Not correct for strict_float. 
See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.min(); } @@ -4752,16 +4764,45 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { - if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - // TODO: Change this to call convert_fixed_or_scalable_vector_type and - // remove normalize_fixed_scalable_vector_type, fixed_to_scalable_vector_type, - // and scalable_to_fixed_vector_type - arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); - } - if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - // There can be some mismatches in types, such as when passing scalar Halide type T - // to LLVM vector type <1 x T>. - arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); + llvm::Type *arg_type = arg_values[i]->getType(); + llvm::Type *formal_param_type = intrin_type->getParamType(i); + if (arg_type != formal_param_type) { + bool both_vectors = isa(arg_type) && isa(formal_param_type); + bool arg_is_fixed = isa(arg_type); + bool formal_is_fixed = isa(formal_param_type); + + // Apparently the bitcast in the else branch below can + // change the scalar type and vector length together so + // long as the total bits are the same. E.g. on HVX, + // <128 x i16> to <64 x i32>. This is probably a bug, but + // it seems to be allowed so it is also supported in the + // fixed/vscale matching path. + if (both_vectors && (arg_is_fixed != formal_is_fixed) && (effective_vscale != 0)) { + bool scalar_types_match = arg_type->getScalarType() == formal_param_type->getScalarType(); + if (arg_is_fixed && !scalar_types_match) { + unsigned fixed_count = dyn_cast(formal_param_type)->getElementCount().getKnownMinValue() * effective_vscale; + llvm::Type *match_scalar_type = llvm::VectorType::get(formal_param_type->getScalarType(), fixed_count, false); + arg_values[i] = builder->CreateBitCast(arg_values[i], match_scalar_type); + } + llvm::ElementCount ec = dyn_cast(arg_values[i]->getType())->getElementCount(); + int mid_count = formal_is_fixed ? (ec.getKnownMinValue() * effective_vscale) : (ec.getFixedValue() / effective_vscale); + llvm::Type *match_vector_flavor_type = llvm::VectorType::get(arg_values[i]->getType()->getScalarType(), mid_count, !formal_is_fixed); + arg_values[i] = convert_fixed_or_scalable_vector_type(arg_values[i], match_vector_flavor_type); + if (formal_is_fixed && !scalar_types_match) { + arg_values[i] = builder->CreateBitCast(arg_values[i], formal_param_type); + } + } else { + // TODO(https://github.com/halide/Halide/issues/8117): That this + // can happen is probably a bug. It will crash in module + // validation for anything LLVM doesn't support. Better to + // regularize the Halide IR by inserting an intentional cast or + // to add extra intrinsics patterns. At the very least, some + // extra validation should be added here. + + // There can be some mismatches in types, such as when passing + // scalar Halide type T to LLVM vector type <1 x T>. 
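// Worked example of the fixed/scalable matching path above, assuming
// effective_vscale == 2 (a 256-bit SVE target); the concrete types are
// illustrative:
//   actual argument:   <8 x i32>           (fixed vector from Halide codegen)
//   formal parameter:  <vscale x 4 x i32>  (scalable type of the SVE intrinsic)
// The scalar types already match, so no bitcast is needed; mid_count becomes
// 8 / effective_vscale == 4, and convert_fixed_or_scalable_vector_type turns
// the fixed <8 x i32> into <vscale x 4 x i32>, which at vscale == 2 describes
// exactly the same 256 bits of data.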
+ arg_values[i] = builder->CreateBitCast(arg_values[i], formal_param_type); + } } } @@ -4785,16 +4826,45 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { return builder->CreateExtractElement(vec, (uint64_t)start); } - vector indices(size); - for (int i = 0; i < size; i++) { - int idx = start + i; - if (idx >= 0 && idx < vec_lanes) { - indices[i] = idx; - } else { - indices[i] = -1; + bool is_fixed = isa(vec->getType()); + + // TODO(https://github.com/halide/Halide/issues/8118): It is likely worth + // looking into using llvm.vector.{extract,insert} for this case + // too. However that would need to be validated performance wise for all + // architectures. + if (is_fixed) { + vector indices(size); + for (int i = 0; i < size; i++) { + int idx = start + i; + if (idx >= 0 && idx < vec_lanes) { + indices[i] = idx; + } else { + indices[i] = -1; + } } + return shuffle_vectors(vec, indices); + } else { + // Extract a fixed vector with all the values in the source. + // Then insert back into a vector extended to size. This will + // be a scalable vector if size can be scalable, fixed + // otherwise. + llvm::Type *scalar_type = vec->getType()->getScalarType(); + + int intermediate_lanes = std::min(size, vec_lanes - start); + llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed); + + vec = builder->CreateExtractVector(intermediate_type, vec, ConstantInt::get(i64_t, start)); + + // Insert vector into a poison vector and return. + int effective_size = is_fixed ? size : (size / effective_vscale); + llvm::VectorType *result_type = dyn_cast(get_vector_type(scalar_type, effective_size, + is_fixed ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale)); + Constant *poison = PoisonValue::get(scalar_type); + llvm::Value *result_vec = ConstantVector::getSplat(result_type->getElementCount(), poison); + vec = builder->CreateInsertVector(result_type, result_vec, vec, ConstantInt::get(i64_t, 0)); + + return vec; } - return shuffle_vectors(vec, indices); } Value *CodeGen_LLVM::concat_vectors(const vector &v) { @@ -4831,6 +4901,11 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { } int w_matched = std::max(w1, w2); + if (v1->getType() != v2->getType()) { + // arbitrary decision here to convert v2 to type of v1 rather than + // target fixed or scalable. + v2 = convert_fixed_or_scalable_vector_type(v2, v1->getType()); + } internal_assert(v1->getType() == v2->getType()); vector indices(w1 + w2); @@ -4903,8 +4978,11 @@ std::pair CodeGen_LLVM::find_vector_runtime_function(cons while (l < lanes) { l *= 2; } - for (int i = l; i > 1; i /= 2) { - sizes_to_try.push_back(i); + + // This will be 1 for non-vscale architectures. + int vscale_divisor = std::max(effective_vscale, 1); + for (int i = l; i > vscale_divisor; i /= 2) { + sizes_to_try.push_back(i / vscale_divisor); } // If none of those match, we'll also try doubling @@ -4913,10 +4991,11 @@ std::pair CodeGen_LLVM::find_vector_runtime_function(cons // vector implementation). sizes_to_try.push_back(l * 2); + std::string vec_prefix = effective_vscale != 0 ? 
"nx" : "x"; for (int l : sizes_to_try) { - llvm::Function *vec_fn = module->getFunction(name + "x" + std::to_string(l)); + llvm::Function *vec_fn = module->getFunction(name + vec_prefix + std::to_string(l)); if (vec_fn) { - return {vec_fn, l}; + return {vec_fn, l * vscale_divisor}; } } @@ -4982,6 +5061,42 @@ llvm::Value *CodeGen_LLVM::normalize_fixed_scalable_vector_type(llvm::Type *desi return result; } +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint) { + if (constraint == VectorTypeConstraint::None) { + return value; + } + + llvm::Type *value_type = value->getType(); + if (!isa(value_type)) { + return value; + } + + bool value_fixed = isa(value_type); + bool guide_fixed = (constraint == VectorTypeConstraint::Fixed); + if (value_fixed != guide_fixed) { + int value_scaled_elements = get_vector_num_elements(value_type); + if (!guide_fixed) { + value_scaled_elements /= effective_vscale; + } + llvm::Type *desired_type = get_vector_type(value_type->getScalarType(), value_scaled_elements, + guide_fixed ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + value = convert_fixed_or_scalable_vector_type(value, desired_type); + } + + return value; +} + +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::Type *guide_type) { + if (!isa(guide_type)) { + return value; + } + return match_vector_type_scalable(value, isa(guide_type) ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); +} + +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::Value *guide) { + return match_vector_type_scalable(value, guide->getType()); +} + llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg, llvm::Type *desired_type) { llvm::Type *arg_type = arg->getType(); @@ -5007,13 +5122,21 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar if (isa(arg_type) && isa(result_type)) { use_insert = true; + if (arg_elements > result_elements) { + arg = slice_vector(arg, 0, result_elements); + } + arg_elements = result_elements; } else if (isa(result_type) && isa(arg_type)) { use_insert = false; + if (arg_elements < result_elements) { + arg = slice_vector(arg, 0, result_elements); + } + arg_elements = result_elements; } else { // Use extract to make smaller, insert to make bigger. // A somewhat arbitary decision. - use_insert = (arg_elements > result_elements); + use_insert = (arg_elements < result_elements); } std::string intrin_name = "llvm.vector."; @@ -5165,10 +5288,27 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, bool scalable = false; switch (type_constraint) { case VectorTypeConstraint::None: - scalable = effective_vscale != 0 && - ((n % effective_vscale) == 0); - if (scalable) { - n = n / effective_vscale; + if (effective_vscale > 0) { + bool wide_enough = true; + // TODO(https://github.com/halide/Halide/issues/8119): Architecture + // specific code should not go here. Ideally part of this can go + // away via LLVM fixes and modifying intrinsic selection to handle + // scalable vs. fixed vectors. Making this method virtual is + // possibly expensive. + if (target.arch == Target::ARM) { + if (!target.has_feature(Target::NoNEON)) { + // force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this. 
+ int bit_size = std::max((int)t->getScalarSizeInBits(), 8); + wide_enough = (bit_size * n) > 128; + } else { + // TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1. + wide_enough = (n / effective_vscale) > 1; + } + } + scalable = wide_enough && ((n % effective_vscale) == 0); + if (scalable) { + n = n / effective_vscale; + } } break; case VectorTypeConstraint::Fixed: @@ -5190,10 +5330,12 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, bool scalable = false; switch (type_constraint) { case VectorTypeConstraint::None: - scalable = effective_vscale != 0 && - ((lanes % effective_vscale) == 0); - if (scalable) { - lanes = lanes / effective_vscale; + if (effective_vscale > 0) { + bool wide_enough = (lanes / effective_vscale) > 1; + scalable = wide_enough && ((lanes % effective_vscale) == 0); + if (scalable) { + lanes = lanes / effective_vscale; + } } break; case VectorTypeConstraint::Fixed: diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index b3e9cdabd498..908929e54373 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -579,6 +579,13 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + /** Make sure a value type has the same scalable/fixed vector type as a guide. */ + // @{ + llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint); + llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide); + llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide); + // @} + /** Support for generating LLVM vector predication intrinsics * ("@llvm.vp.*" and "@llvm.experimental.vp.*") */ diff --git a/src/Function.cpp b/src/Function.cpp index cbb4b61574d4..b72a39e1c90a 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -491,8 +491,10 @@ ExternFuncArgument deep_copy_extern_func_argument_helper(const ExternFuncArgumen } // namespace void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const { - internal_assert(copy.defined() && contents.defined()) - << "Cannot deep-copy undefined Function\n"; + internal_assert(copy.defined()) + << "Cannot deep-copy to undefined Function\n"; + internal_assert(contents.defined()) + << "Cannot deep-copy from undefined Function\n"; // Add reference to this Function's deep-copy to the map in case of // self-reference, e.g. self-reference in an Definition. diff --git a/src/IR.cpp b/src/IR.cpp index c0bdb718291d..81cf0a0f41ff 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -690,6 +690,7 @@ const char *const intrinsic_op_names[] = { "widening_shift_left", "widening_shift_right", "widening_sub", + "get_runtime_vscale", }; static_assert(sizeof(intrinsic_op_names) / sizeof(intrinsic_op_names[0]) == Call::IntrinsicOpCount, diff --git a/src/IR.h b/src/IR.h index 252e4588db03..31aa3f195e43 100644 --- a/src/IR.h +++ b/src/IR.h @@ -629,6 +629,8 @@ struct Call : public ExprNode { widening_shift_right, widening_sub, + get_runtime_vscale, + IntrinsicOpCount // Sentinel: keep last. 
}; diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 3e5d95d787e6..10521f82ac03 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -262,6 +262,9 @@ class IRMatch : public IRVisitor { if (result && e && types_match(op->type, e->type)) { expr = e->value; op->value.accept(this); + } else if (op->lanes == 0 && types_match(op->value.type(), expr.type())) { + // zero lanes means any number of lanes, so match scalars too. + op->value.accept(this); } else { result = false; } diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp index 6b54aeef0e97..e40441b388f0 100644 --- a/src/LLVM_Output.cpp +++ b/src/LLVM_Output.cpp @@ -331,6 +331,12 @@ std::unique_ptr clone_module(const llvm::Module &module_in) { // Read it back in. llvm::MemoryBufferRef buffer_ref(llvm::StringRef(clone_buffer.data(), clone_buffer.size()), "clone_buffer"); auto cloned_module = llvm::parseBitcodeFile(buffer_ref, module_in.getContext()); + + // TODO(): Add support for returning the error. + if (!cloned_module) { + llvm::dbgs() << cloned_module.takeError(); + module_in.print(llvm::dbgs(), nullptr, false, true); + } internal_assert(cloned_module); return std::move(cloned_module.get()); diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index fd7a12d66995..a207b3ce63f5 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -10,6 +10,7 @@ #include "Monotonic.h" #include "Simplify.h" #include "Substitute.h" +#include "Util.h" #include namespace Halide { @@ -17,10 +18,6 @@ namespace Internal { namespace { -int64_t next_power_of_two(int64_t x) { - return static_cast(1) << static_cast(std::ceil(std::log2(x))); -} - using std::map; using std::string; using std::vector; diff --git a/src/Util.h b/src/Util.h index 15c297796911..bce0a7f1d015 100644 --- a/src/Util.h +++ b/src/Util.h @@ -13,6 +13,7 @@ /** \file * Various utility functions used internally Halide. */ +#include #include #include #include @@ -532,6 +533,16 @@ int clz64(uint64_t x); int ctz64(uint64_t x); // @} +/** Return an integer 2^n, for some n, which is >= x. Argument x must be > 0. 
*/ +inline int64_t next_power_of_two(int64_t x) { + return static_cast(1) << static_cast(std::ceil(std::log2(x))); +} + +template +inline T align_up(T x, int n) { + return (x + n - 1) / n * n; +} + } // namespace Internal } // namespace Halide diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index b99efdc6d67e..bfe66213f44f 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -101,11 +101,6 @@ struct debug_sink { // BDMalloc // --------------------- -template -inline T align_up(T p, int alignment = 32) { - return (p + alignment - 1) & ~(alignment - 1); -} - // Debugging our Malloc is extremely noisy and usually undesired #define BDMALLOC_DEBUG_LEVEL 0 @@ -318,7 +313,7 @@ std::vector compile_to_wasm(const Module &module, const std::string &fn_na stack_size += cg->get_requested_alloca_total(); } - stack_size = align_up(stack_size); + stack_size = align_up(stack_size, 32); wdebug(1) << "Requesting stack size of " << stack_size << "\n"; std::unique_ptr llvm_module = @@ -708,7 +703,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(WabtContext &wabt_context, const halide_buffer_t const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; - const size_t host_offset = align_up(mem_needed_base); + const size_t host_offset = align_up(mem_needed_base, 32); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; @@ -1613,7 +1608,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(const Local &context, const halide_buff const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; - const size_t host_offset = align_up(mem_needed_base); + const size_t host_offset = align_up(mem_needed_base, 32); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 1a19202745bb..1d0843be0329 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1246,6 +1246,10 @@ enum halide_error_code_t { /** A factor used to split a loop was discovered to be zero or negative at * runtime. */ halide_error_code_split_factor_not_positive = -46, + + /** "vscale" value of Scalable Vector detected in runtime does not match + * the vscale value used in compilation. */ + halide_error_code_vscale_invalid = -47, }; /** Halide calls the functions below on various error conditions. The @@ -1321,7 +1325,7 @@ extern int halide_error_storage_bound_too_small(void *user_context, const char * int provided_size, int required_size); extern int halide_error_device_crop_failed(void *user_context); extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor); - +extern int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale); // @} /** Optional features a compilation Target can have. 
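The Util.h helpers above replace the WasmExecutor-local align_up, which relied on bit masking and therefore only worked for power-of-two alignments, with a division-based version that rounds up to any positive modulus. A minimal standalone sketch of their semantics; the main() harness and the example values below are mine, not part of the patch:

#include <cassert>
#include <cmath>
#include <cstdint>

// Illustrative copies of the helpers added to src/Util.h above.
inline int64_t next_power_of_two(int64_t x) {
    return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

template<typename T>
inline T align_up(T x, int n) {
    return (x + n - 1) / n * n;
}

int main() {
    assert(next_power_of_two(1) == 1);
    assert(next_power_of_two(100) == 128);
    // Division-based rounding works for any positive n, not just powers of two.
    assert(align_up(37, 32) == 64);  // matches the old bit-masking behaviour
    assert(align_up(30, 24) == 48);  // a bit mask would have produced 32 here
    return 0;
}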
diff --git a/src/runtime/aarch64.ll b/src/runtime/aarch64.ll index 9ae3b8e46ac2..c68a4f05fb42 100644 --- a/src/runtime/aarch64.ll +++ b/src/runtime/aarch64.ll @@ -48,25 +48,34 @@ define weak_odr <2 x i64> @vabdl_u32x2(<2 x i32> %a, <2 x i32> %b) nounwind alwa declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %x) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %x) nounwind readnone; +declare float @llvm.aarch64.neon.frecpe.f32(float) declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %x) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %x) nounwind readnone; +declare float @llvm.aarch64.neon.frsqrte.f32(float) declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone; +declare float @llvm.aarch64.neon.frecps.f32(float, float) declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone; +declare float @llvm.aarch64.neon.frsqrts.f32(float, float) + declare <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %x) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %x) nounwind readnone; +declare half @llvm.aarch64.neon.frecpe.f16(half) declare <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %x) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %x) nounwind readnone; +declare half @llvm.aarch64.neon.frsqrte.f16(half) declare <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone; +declare half @llvm.aarch64.neon.frecps.f16(half, half) declare <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone; +declare half @llvm.aarch64.neon.frsqrts.f16(half, half) define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline { - %vec = insertelement <2 x float> poison, float %x, i32 0 - %approx = tail call <2 x float> @fast_inverse_f32x2(<2 x float> %vec) - %result = extractelement <2 x float> %approx, i32 0 + %approx = tail call float @llvm.aarch64.neon.frecpe.f32(float %x) + %correction = tail call float @llvm.aarch64.neon.frecps.f32(float %approx, float %x) + %result = fmul float %approx, %correction ret float %result } @@ -85,9 +94,9 @@ define weak_odr <4 x float> @fast_inverse_f32x4(<4 x float> %x) nounwind alwaysi } define weak_odr half @fast_inverse_f16(half %x) nounwind alwaysinline { - %vec = insertelement <4 x half> poison, half %x, i32 0 - %approx = tail call <4 x half> @fast_inverse_f16x4(<4 x half> %vec) - %result = extractelement <4 x half> %approx, i32 0 + %approx = tail call half @llvm.aarch64.neon.frecpe.f16(half %x) + %correction = tail call half @llvm.aarch64.neon.frecps.f16(half %approx, half %x) + %result = fmul half %approx, %correction ret half %result } @@ -106,9 +115,10 @@ define weak_odr <8 x half> @fast_inverse_f16x8(<8 x half> %x) nounwind alwaysinl } define weak_odr float @fast_inverse_sqrt_f32(float %x) nounwind alwaysinline { - %vec = insertelement <2 x float> poison, float %x, i32 0 - %approx = tail call <2 x float> @fast_inverse_sqrt_f32x2(<2 
x float> %vec) - %result = extractelement <2 x float> %approx, i32 0 + %approx = tail call float @llvm.aarch64.neon.frsqrte.f32(float %x) + %approx2 = fmul float %approx, %approx + %correction = tail call float @llvm.aarch64.neon.frsqrts.f32(float %approx2, float %x) + %result = fmul float %approx, %correction ret float %result } @@ -129,9 +139,10 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind al } define weak_odr half @fast_inverse_sqrt_f16(half %x) nounwind alwaysinline { - %vec = insertelement <4 x half> poison, half %x, i32 0 - %approx = tail call <4 x half> @fast_inverse_sqrt_f16x4(<4 x half> %vec) - %result = extractelement <4 x half> %approx, i32 0 + %approx = tail call half @llvm.aarch64.neon.frsqrte.f16(half %x) + %approx2 = fmul half %approx, %approx + %correction = tail call half @llvm.aarch64.neon.frsqrts.f16(half %approx2, half %x) + %result = fmul half %approx, %correction ret half %result } @@ -149,4 +160,43 @@ define weak_odr <8 x half> @fast_inverse_sqrt_f16x8(<8 x half> %x) nounwind alwa %correction = tail call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %approx2, <8 x half> %x) %result = fmul <8 x half> %approx, %correction ret <8 x half> %result -} \ No newline at end of file +} + +declare @llvm.aarch64.sve.frecpe.x.nxv4f32( %x) nounwind readnone; +declare @llvm.aarch64.sve.frsqrte.x.nxv4f32( %x) nounwind readnone; +declare @llvm.aarch64.sve.frecps.x.nxv4f32( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frsqrts.x.nxv4f32( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frecpe.x.nxv8f16( %x) nounwind readnone; +declare @llvm.aarch64.sve.frsqrte.x.nxv8f16( %x) nounwind readnone; +declare @llvm.aarch64.sve.frecps.x.nxv8f16( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frsqrts.x.nxv8f16( %x, %y) nounwind readnone; + +define weak_odr @fast_inverse_f32nx4( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frecpe.x.nxv4f32( %x) + %correction = tail call @llvm.aarch64.sve.frecps.x.nxv4f32( %approx, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_f16nx8( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frecpe.x.nxv8f16( %x) + %correction = tail call @llvm.aarch64.sve.frecps.x.nxv8f16( %approx, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_sqrt_f32nx4( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frsqrte.x.nxv4f32( %x) + %approx2 = fmul %approx, %approx + %correction = tail call @llvm.aarch64.sve.frsqrts.x.nxv4f32( %approx2, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_sqrt_f16nx8( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frsqrte.x.nxv8f16( %x) + %approx2 = fmul %approx, %approx + %correction = tail call @llvm.aarch64.sve.frsqrts.x.nxv8f16( %approx2, %x) + %result = fmul %approx, %correction + ret %result +} diff --git a/src/runtime/errors.cpp b/src/runtime/errors.cpp index 0879cc4a7c60..acb640c44b52 100644 --- a/src/runtime/errors.cpp +++ b/src/runtime/errors.cpp @@ -300,4 +300,12 @@ WEAK int halide_error_split_factor_not_positive(void *user_context, const char * return halide_error_code_split_factor_not_positive; } +WEAK int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale) { + error(user_context) + << "The function " << func_name + << " is compiled with the assumption that vscale of Scalable Vector is " << compiletime_vscale 
+ << ". However, the detected runtime vscale is " << runtime_vscale << "."; + return halide_error_code_vscale_invalid; +} + } // extern "C" diff --git a/src/runtime/posix_math.ll b/src/runtime/posix_math.ll index 236652279615..ee6c2571f4eb 100644 --- a/src/runtime/posix_math.ll +++ b/src/runtime/posix_math.ll @@ -322,4 +322,30 @@ define weak_odr double @neg_inf_f64() nounwind uwtable readnone alwaysinline { define weak_odr double @nan_f64() nounwind uwtable readnone alwaysinline { ret double 0x7FF8000000000000 -} \ No newline at end of file +} + +; In case scalable vector with un-natural vector size, LLVM doesn't auto-vectorize the above scalar version +define weak_odr @inf_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0x7FF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @neg_inf_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0xFFF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @nan_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0x7FF8000000000000, i32 0), undef, zeroinitializer) +} + + +define weak_odr @inf_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0x7FF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @neg_inf_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0xFFF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @nan_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0x7FF8000000000000, i32 0), undef, zeroinitializer) +} diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index db8ada2f4b8e..7955e8749df7 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -89,6 +89,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_error_unaligned_host_ptr, (void *)&halide_error_storage_bound_too_small, (void *)&halide_error_device_crop_failed, + (void *)&halide_error_vscale_invalid, (void *)&halide_float16_bits_to_double, (void *)&halide_float16_bits_to_float, (void *)&halide_free, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9b934b768cdd..604ceda468f5 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -277,6 +277,7 @@ tests(GROUPS correctness simd_op_check_hvx.cpp simd_op_check_powerpc.cpp simd_op_check_riscv.cpp + simd_op_check_sve2.cpp simd_op_check_wasm.cpp simd_op_check_x86.cpp simplified_away_embedded_image.cpp diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index e8762a6ea2d8..3ebf5071569e 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -230,6 +230,13 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vcvt.s32.f32" : "fcvtzs", 2 * w, i32(f32_1)); // skip the fixed point conversions for now + if (!arm32) { + check("fcvtmu *v", 2 * w, u32(floor(f32_1))); + check("fcvtpu *v", 2 * w, u32(ceil(f32_1))); + check("fcvtms *v", 2 * w, i32(floor(f32_1))); + check("fcvtps *v", 2 * w, i32(ceil(f32_1))); + } + // VDIV - F, D Divide // This doesn't actually get vectorized in 32-bit. Not sure cortex processors can do vectorized division. check(arm32 ? 
"vdiv.f32" : "fdiv", 2 * w, f32_1 / f32_2); diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp new file mode 100644 index 000000000000..1a176dbccecd --- /dev/null +++ b/test/correctness/simd_op_check_sve2.cpp @@ -0,0 +1,1387 @@ +#include "simd_op_check.h" + +#include "Halide.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Halide; +using namespace Halide::ConciseCasts; +using namespace std; + +namespace { + +using CastFuncTy = function; + +class SimdOpCheckArmSve : public SimdOpCheckTest { +public: + SimdOpCheckArmSve(Target t, int w = 384, int h = 32) + : SimdOpCheckTest(t, w, h), debug_mode(Internal::get_env_variable("HL_DEBUG_SIMDOPCHECK")) { + + // Determine and hold can_run_the_code + // TODO: Since features of Arm CPU cannot be obtained automatically from get_host_target(), + // it is necessary to set some feature (e.g. "arm_fp16") explicitly to HL_JIT_TARGET. + // Halide throws error if there is unacceptable mismatch between jit_target and host_target. + + Target host = get_host_target(); + Target jit_target = get_jit_target_from_environment(); + cout << "host is: " << host.to_string() << endl; + cout << "HL_TARGET is: " << target.to_string() << endl; + cout << "HL_JIT_TARGET is: " << jit_target.to_string() << endl; + + auto is_same_triple = [](const Target &t1, const Target &t2) -> bool { + return t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os && t1.vector_bits == t2.vector_bits; + }; + + can_run_the_code = is_same_triple(host, target) && is_same_triple(jit_target, target); + + // A bunch of feature flags also need to match between the + // compiled code and the host in order to run the code. + for (Target::Feature f : {Target::ARMv7s, Target::ARMFp16, Target::NoNEON, Target::SVE2}) { + if (target.has_feature(f) != jit_target.has_feature(f)) { + can_run_the_code = false; + } + } + if (!can_run_the_code) { + cout << "[WARN] To perform verification of realization, " + << R"(the target triple "arm--" and key feature "arm_fp16")" + << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl; + } + } + + bool can_run_code() const override { + // If we can meet the condition about target, run the error checking Halide::Func. 
+ return can_run_the_code; + } + + void add_tests() override { + check_arm_integer(); + check_arm_float(); + check_arm_load_store(); + check_arm_pairwise(); + } + +private: + void check_arm_integer() { + // clang-format off + vector> test_params{ + {8, in_i8, in_u8, in_f16, in_i16, in_u16, i8, i8_sat, i16, i8, i8_sat, u8, u8_sat, u16, u8, u8_sat}, + {16, in_i16, in_u16, in_f16, in_i32, in_u32, i16, i16_sat, i32, i8, i8_sat, u16, u16_sat, u32, u8, u8_sat}, + {32, in_i32, in_u32, in_f32, in_i64, in_u64, i32, i32_sat, i64, i16, i16_sat, u32, u32_sat, u64, u16, u16_sat}, + {64, in_i64, in_u64, in_f64, in_i64, in_u64, i64, i64_sat, i64, i32, i32_sat, u64, u64_sat, u64, u32, u32_sat}, + }; + // clang-format on + + for (const auto &[bits, in_i, in_u, in_f, in_i_wide, in_u_wide, + cast_i, satcast_i, widen_i, narrow_i, satnarrow_i, + cast_u, satcast_u, widen_u, narrow_u, satnarrow_u] : test_params) { + + Expr i_1 = in_i(x), i_2 = in_i(x + 16), i_3 = in_i(x + 32); + Expr u_1 = in_u(x), u_2 = in_u(x + 16), u_3 = in_u(x + 32); + Expr i_wide_1 = in_i_wide(x), i_wide_2 = in_i_wide(x + 16); + Expr u_wide_1 = in_u_wide(x), u_wide_2 = in_u_wide(x + 16); + Expr f_1 = in_f(x); + + // TODO: reconcile this comment and logic and figure out + // whether we're test 192 and 256 for NEON and which bit + // widths other that the target one for SVE2. + // + // In general neon ops have the 64-bit version, the 128-bit + // version (ending in q), and the widening version that takes + // 64-bit args and produces a 128-bit result (ending in l). We try + // to peephole match any with vector, so we just try 64-bits, 128 + // bits, 192 bits, and 256 bits for everything. + std::vector simd_bit_widths; + if (has_neon()) { + simd_bit_widths.push_back(64); + simd_bit_widths.push_back(128); + } + if (has_sve() && ((target.vector_bits > 128) || !has_neon())) { + simd_bit_widths.push_back(target.vector_bits); + } + for (auto &total_bits : simd_bit_widths) { + const int vf = total_bits / bits; + + // Due to workaround for SVE LLVM issues, in case of vector of half length of natural_lanes, + // there is some inconsistency in generated SVE insturction about the number of lanes. + // So the verification of lanes is skipped for this specific case. + const int instr_lanes = (total_bits == 64 && has_sve()) ? 
+ Instruction::ANY_LANES : + Instruction::get_instr_lanes(bits, vf, target); + const int widen_lanes = Instruction::get_instr_lanes(bits * 2, vf, target); + const int narrow_lanes = Instruction::get_instr_lanes(bits, vf * 2, target); + + AddTestFunctor add_all(*this, bits, instr_lanes, vf); + AddTestFunctor add_all_vec(*this, bits, instr_lanes, vf, vf != 1); + AddTestFunctor add_8_16_32(*this, bits, instr_lanes, vf, bits != 64); + AddTestFunctor add_16_32_64(*this, bits, instr_lanes, vf, bits != 8); + AddTestFunctor add_16_32(*this, bits, instr_lanes, vf, bits == 16 || bits == 32); + AddTestFunctor add_32(*this, bits, instr_lanes, vf, bits == 32); + + AddTestFunctor add_8_16_32_widen(*this, bits, widen_lanes, vf, bits != 64 && !has_sve()); + + AddTestFunctor add_16_32_64_narrow(*this, bits, narrow_lanes, vf * 2, bits != 8 && !has_sve()); + AddTestFunctor add_16_32_narrow(*this, bits, narrow_lanes, vf * 2, (bits == 16 || bits == 32) && !has_sve()); + AddTestFunctor add_16_narrow(*this, bits, narrow_lanes, vf * 2, bits == 16 && !has_sve()); + + // VABA I - Absolute Difference and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_8_16_32(sel_op("vaba.s", "saba"), i_1 + absd(i_2, i_3)); + add_8_16_32(sel_op("vaba.u", "uaba"), u_1 + absd(u_2, u_3)); + } + + // VABAL I - Absolute Difference and Accumulate Long + add_8_16_32_widen(sel_op("vabal.s", "sabal"), i_wide_1 + absd(i_2, i_3)); + add_8_16_32_widen(sel_op("vabal.u", "uabal"), u_wide_1 + absd(u_2, u_3)); + + // VABD I, F - Absolute Difference + add_8_16_32(sel_op("vabd.s", "sabd"), absd(i_2, i_3)); + add_8_16_32(sel_op("vabd.u", "uabd"), absd(u_2, u_3)); + + // Via widening, taking abs, then narrowing + add_8_16_32(sel_op("vabd.s", "sabd"), cast_u(abs(widen_i(i_2) - i_3))); + add_8_16_32(sel_op("vabd.u", "uabd"), cast_u(abs(widen_i(u_2) - u_3))); + + // VABDL I - Absolute Difference Long + add_8_16_32_widen(sel_op("vabdl.s", "sabdl"), widen_i(absd(i_2, i_3))); + add_8_16_32_widen(sel_op("vabdl.u", "uabdl"), widen_u(absd(u_2, u_3))); + + // Via widening then taking an abs + add_8_16_32_widen(sel_op("vabdl.s", "sabdl"), abs(widen_i(i_2) - widen_i(i_3))); + add_8_16_32_widen(sel_op("vabdl.u", "uabdl"), abs(widen_i(u_2) - widen_i(u_3))); + + // VABS I, F F, D Absolute + add_8_16_32(sel_op("vabs.s", "abs"), abs(i_1)); + + // VADD I, F F, D Add + add_all_vec(sel_op("vadd.i", "add"), i_1 + i_2); + add_all_vec(sel_op("vadd.i", "add"), u_1 + u_2); + + // VADDHN I - Add and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vaddhn.i", "addhn"), narrow_i((i_1 + i_2) >> (bits / 2))); + add_16_32_64_narrow(sel_op("vaddhn.i", "addhn"), narrow_u((u_1 + u_2) >> (bits / 2))); + + // VADDL I - Add Long + add_8_16_32_widen(sel_op("vaddl.s", "saddl"), widen_i(i_1) + widen_i(i_2)); + add_8_16_32_widen(sel_op("vaddl.u", "uaddl"), widen_u(u_1) + widen_u(u_2)); + + // VADDW I - Add Wide + add_8_16_32_widen(sel_op("vaddw.s", "saddw"), i_1 + i_wide_1); + add_8_16_32_widen(sel_op("vaddw.u", "uaddw"), u_1 + u_wide_1); + + // VAND X - Bitwise AND + // Not implemented in front-end yet + // VBIC I - Bitwise Clear + // VBIF X - Bitwise Insert if False + // VBIT X - Bitwise Insert if True + // skip these ones + + // VCEQ I, F - Compare Equal + add_8_16_32(sel_op("vceq.i", "cmeq", "cmpeq"), select(i_1 == i_2, cast_i(1), cast_i(2))); + add_8_16_32(sel_op("vceq.i", "cmeq", "cmpeq"), select(u_1 == u_2, cast_u(1), cast_u(2))); +#if 0 + // VCGE I, F - Compare Greater Than or Equal + // Halide flips these to less than instead + 
check("vcge.s8", 16, select(i8_1 >= i8_2, i8(1), i8(2))); + check("vcge.u8", 16, select(u8_1 >= u8_2, u8(1), u8(2))); + check("vcge.s16", 8, select(i16_1 >= i16_2, i16(1), i16(2))); + check("vcge.u16", 8, select(u16_1 >= u16_2, u16(1), u16(2))); + check("vcge.s32", 4, select(i32_1 >= i32_2, i32(1), i32(2))); + check("vcge.u32", 4, select(u32_1 >= u32_2, u32(1), u32(2))); + check("vcge.f32", 4, select(f32_1 >= f32_2, 1.0f, 2.0f)); + check("vcge.s8", 8, select(i8_1 >= i8_2, i8(1), i8(2))); + check("vcge.u8", 8, select(u8_1 >= u8_2, u8(1), u8(2))); + check("vcge.s16", 4, select(i16_1 >= i16_2, i16(1), i16(2))); + check("vcge.u16", 4, select(u16_1 >= u16_2, u16(1), u16(2))); + check("vcge.s32", 2, select(i32_1 >= i32_2, i32(1), i32(2))); + check("vcge.u32", 2, select(u32_1 >= u32_2, u32(1), u32(2))); + check("vcge.f32", 2, select(f32_1 >= f32_2, 1.0f, 2.0f)); +#endif + // VCGT I, F - Compare Greater Than + add_8_16_32(sel_op("vcgt.s", "cmgt", "cmpgt"), select(i_1 > i_2, cast_i(1), cast_i(2))); + add_8_16_32(sel_op("vcgt.u", "cmhi", "cmphi"), select(u_1 > u_2, cast_u(1), cast_u(2))); +#if 0 + // VCLS I - Count Leading Sign Bits + // We don't currently match these, but it wouldn't be hard to do. + check(arm32 ? "vcls.s8" : "cls", 8 * w, max(count_leading_zeros(i8_1), count_leading_zeros(~i8_1))); + check(arm32 ? "vcls.s16" : "cls", 8 * w, max(count_leading_zeros(i16_1), count_leading_zeros(~i16_1))); + check(arm32 ? "vcls.s32" : "cls", 8 * w, max(count_leading_zeros(i32_1), count_leading_zeros(~i32_1))); +#endif + // VCLZ I - Count Leading Zeros + add_8_16_32(sel_op("vclz.i", "clz"), count_leading_zeros(i_1)); + add_8_16_32(sel_op("vclz.i", "clz"), count_leading_zeros(u_1)); + + // VCMP - F, D Compare Setting Flags + // We skip this + + // VCNT I - Count Number of Set Bits + if (!has_sve()) { + // In NEON, there is only cnt for bytes, and then horizontal adds. + add_8_16_32({{sel_op("vcnt.", "cnt"), 8, total_bits == 64 ? 8 : 16}}, vf, popcount(i_1)); + add_8_16_32({{sel_op("vcnt.", "cnt"), 8, total_bits == 64 ? 8 : 16}}, vf, popcount(u_1)); + } else { + add_8_16_32("cnt", popcount(i_1)); + add_8_16_32("cnt", popcount(u_1)); + } + + // VDUP X - Duplicate + add_8_16_32(sel_op("vdup.", "dup", "mov"), cast_i(y)); + add_8_16_32(sel_op("vdup.", "dup", "mov"), cast_u(y)); + + // VEOR X - Bitwise Exclusive OR + // check("veor", 4, bool1 ^ bool2); + + // VEXT I - Extract Elements and Concatenate + // unaligned loads with known offsets should use vext +#if 0 + // We currently don't do this. + check("vext.8", 16, in_i8(x+1)); + check("vext.16", 8, in_i16(x+1)); + check("vext.32", 4, in_i32(x+1)); +#endif + // VHADD I - Halving Add + add_8_16_32(sel_op("vhadd.s", "shadd"), cast_i((widen_i(i_1) + widen_i(i_2)) / 2)); + add_8_16_32(sel_op("vhadd.u", "uhadd"), cast_u((widen_u(u_1) + widen_u(u_2)) / 2)); + + // Halide doesn't define overflow behavior for i32 so we + // can use vhadd instruction. We can't use it for unsigned u8,i16,u16,u32. 
+ add_32(sel_op("vhadd.s", "shadd"), (i_1 + i_2) / 2); + + // VHSUB I - Halving Subtract + add_8_16_32(sel_op("vhsub.s", "shsub"), cast_i((widen_i(i_1) - widen_i(i_2)) / 2)); + add_8_16_32(sel_op("vhsub.u", "uhsub"), cast_u((widen_u(u_1) - widen_u(u_2)) / 2)); + + add_32(sel_op("vhsub.s", "shsub"), (i_1 - i_2) / 2); + + // VMAX I, F - Maximum + add_8_16_32(sel_op("vmax.s", "smax"), max(i_1, i_2)); + add_8_16_32(sel_op("vmax.u", "umax"), max(u_1, u_2)); + + // VMIN I, F - Minimum + add_8_16_32(sel_op("vmin.s", "smin"), min(i_1, i_2)); + add_8_16_32(sel_op("vmin.u", "umin"), min(u_1, u_2)); + + // VMLA I, F F, D Multiply Accumulate + add_8_16_32("mla signed", sel_op("vmla.i", "mla", "(mad|mla)"), i_1 + i_2 * i_3); + add_8_16_32("mla unsigned", sel_op("vmla.i", "mla", "(mad|mla)"), u_1 + u_2 * u_3); + // VMLS I, F F, D Multiply Subtract + add_8_16_32("mls signed", sel_op("vmls.i", "mls", "(mls|msb)"), i_1 - i_2 * i_3); + add_8_16_32("mls unsigned", sel_op("vmls.i", "mls", "(mls|msb)"), u_1 - u_2 * u_3); + + // VMLAL I - Multiply Accumulate Long + // Try to trick LLVM into generating a zext instead of a sext by making + // LLVM think the operand never has a leading 1 bit. zext breaks LLVM's + // pattern matching of mlal. + add_8_16_32_widen(sel_op("vmlal.s", "smlal"), i_wide_1 + widen_i(i_2 & 0x3) * i_3); + add_8_16_32_widen(sel_op("vmlal.u", "umlal"), u_wide_1 + widen_u(u_2) * u_3); + + // VMLSL I - Multiply Subtract Long + add_8_16_32_widen(sel_op("vmlsl.s", "smlsl"), i_wide_1 - widen_i(i_2 & 0x3) * i_3); + add_8_16_32_widen(sel_op("vmlsl.u", "umlsl"), u_wide_1 - widen_u(u_2) * u_3); + + // VMOV X F, D Move Register or Immediate + // This is for loading immediates, which we won't do in the inner loop anyway + + // VMOVL I - Move Long + // For aarch64, llvm does a widening shift by 0 instead of using the sxtl instruction. + add_8_16_32_widen(sel_op("vmovl.s", "sshll"), widen_i(i_1)); + add_8_16_32_widen(sel_op("vmovl.u", "ushll"), widen_u(u_1)); + add_8_16_32_widen(sel_op("vmovl.u", "ushll"), widen_i(u_1)); + + // VMOVN I - Move and Narrow + if (Halide::Internal::get_llvm_version() >= 140 && total_bits >= 128) { + if (is_arm32()) { + add_16_32_64_narrow("vmovn.i", narrow_i(i_1)); + add_16_32_64_narrow("vmovn.i", narrow_u(u_1)); + } else { + add_16_32_64({{"uzp1", bits / 2, narrow_lanes * 2}}, vf * 2, narrow_i(i_1)); + add_16_32_64({{"uzp1", bits / 2, narrow_lanes * 2}}, vf * 2, narrow_u(u_1)); + } + } else { + add_16_32_64_narrow(sel_op("vmovn.i", "xtn"), narrow_i(i_1)); + add_16_32_64_narrow(sel_op("vmovn.i", "xtn"), narrow_u(u_1)); + } + + // VMRS X F, D Move Advanced SIMD or VFP Register to ARM compute Engine + // VMSR X F, D Move ARM Core Register to Advanced SIMD or VFP + // trust llvm to use this correctly + + // VMUL I, F, P F, D Multiply + add_8_16_32(sel_op("vmul.i", "mul"), i_2 * i_1); + add_8_16_32(sel_op("vmul.i", "mul"), u_2 * u_1); + + // VMULL I, F, P - Multiply Long + add_8_16_32_widen(sel_op("vmull.s", "smull"), widen_i(i_1) * i_2); + add_8_16_32_widen(sel_op("vmull.u", "umull"), widen_u(u_1) * u_2); + + // integer division by a constant should use fixed point unsigned + // multiplication, which is done by using a widening multiply + // followed by a narrowing + add_8_16_32_widen(sel_op("vmull.u", "umull"), i_1 / 37); + add_8_16_32_widen(sel_op("vmull.u", "umull"), u_1 / 37); + + // VMVN X - Bitwise NOT + // check("vmvn", ~bool1); + + // VNEG I, F F, D Negate + add_8_16_32(sel_op("vneg.s", "neg"), -i_1); + +#if 0 + // These are vfp, not neon. 
They only work on scalars + check("vnmla.f32", 4, -(f32_1 + f32_2*f32_3)); + check("vnmla.f64", 2, -(f64_1 + f64_2*f64_3)); + check("vnmls.f32", 4, -(f32_1 - f32_2*f32_3)); + check("vnmls.f64", 2, -(f64_1 - f64_2*f64_3)); + check("vnmul.f32", 4, -(f32_1*f32_2)); + check("vnmul.f64", 2, -(f64_1*f64_2)); + + // Of questionable value. Catching abs calls is annoying, and the + // slow path is only one more op (for the max). + check("vqabs.s8", 16, abs(max(i8_1, -max_i8))); + check("vqabs.s8", 8, abs(max(i8_1, -max_i8))); + check("vqabs.s16", 8, abs(max(i16_1, -max_i16))); + check("vqabs.s16", 4, abs(max(i16_1, -max_i16))); + check("vqabs.s32", 4, abs(max(i32_1, -max_i32))); + check("vqabs.s32", 2, abs(max(i32_1, -max_i32))); +#endif + // VQADD I - Saturating Add + add_8_16_32(sel_op("vqadd.s", "sqadd"), satcast_i(widen_i(i_1) + widen_i(i_2))); + const Expr max_u = UInt(bits).max(); + add_8_16_32(sel_op("vqadd.u", "uqadd"), cast_u(min(widen_u(u_1) + widen_u(u_2), max_u))); + + // Check the case where we add a constant that could be narrowed + add_8_16_32(sel_op("vqadd.u", "uqadd"), cast_u(min(widen_u(u_1) + 17, max_u))); + + // Can't do larger ones because we can't represent the intermediate 128-bit wide ops. + + // VQDMLAL I - Saturating Double Multiply Accumulate Long + // VQDMLSL I - Saturating Double Multiply Subtract Long + // We don't do these, but it would be possible. + + // VQDMULH I - Saturating Doubling Multiply Returning High Half + // VQDMULL I - Saturating Doubling Multiply Long + add_16_32(sel_op("vqdmulh.s", "sqdmulh"), satcast_i((widen_i(i_1) * widen_i(i_2)) >> (bits - 1))); + + // VQMOVN I - Saturating Move and Narrow + // VQMOVUN I - Saturating Move and Unsigned Narrow + add_16_32_64_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_1)); + add_16_32_64_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(i_1)); + const Expr max_u_narrow = UInt(bits / 2).max(); + add_16_32_64_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u_1, max_u_narrow))); + // Double saturating narrow + add_16_32_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_wide_1)); + add_16_32_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u_wide_1, max_u_narrow))); + add_16_32_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_wide_1)); + add_16_32_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(i_wide_1)); + // Triple saturating narrow + Expr i64_1 = in_i64(x), u64_1 = in_u64(x), f32_1 = in_f32(x), f64_1 = in_f64(x); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i64_1)); + add_16_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u64_1, max_u_narrow))); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(f32_1)); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(f64_1)); + add_16_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(f32_1)); + add_16_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(f64_1)); + + // VQNEG I - Saturating Negate + const Expr max_i = Int(bits).max(); + add_8_16_32(sel_op("vqneg.s", "sqneg"), -max(i_1, -max_i)); + + // VQRDMULH I - Saturating Rounding Doubling Multiply Returning High Half + // Note: division in Halide always rounds down (not towards + // zero). Otherwise these patterns would be more complicated. 
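// A worked instance of the pattern below for bits == 16: the expression is
// sat16((i32(a) * i32(b) + (1 << 14)) / (1 << 15)), i.e. a doubling multiply
// that rounds and keeps the high half. For a == b == 0x4000 (0.5 in Q15):
// 0x4000 * 0x4000 == 0x10000000; adding 0x4000 and dividing by 0x8000 gives
// 0x2000 (0.25 in Q15), which matches sqrdmulh. The only saturating case is
// a == b == -0x8000, where the exact result 0x8000 clamps to 0x7FFF.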
+ add_16_32(sel_op("vqrdmulh.s", "sqrdmulh"), satcast_i((widen_i(i_1) * widen_i(i_2) + (1 << (bits - 2))) / (widen_i(1) << (bits - 1)))); + + // VQRSHRN I - Saturating Rounding Shift Right Narrow + // VQRSHRUN I - Saturating Rounding Shift Right Unsigned Narrow + add_16_32_64_narrow(sel_op("vqrshrn.s", "sqrshrn"), satnarrow_i((widen_i(i_1) + 8) / 16)); + add_16_32_64_narrow(sel_op("vqrshrun.s", "sqrshrun"), satnarrow_u((widen_i(i_1) + 8) / 16)); + add_16_32_narrow(sel_op("vqrshrn.u", "uqrshrn"), narrow_u(min((widen_u(u_1) + 8) / 16, max_u_narrow))); + + // VQSHL I - Saturating Shift Left + add_8_16_32(sel_op("vqshl.s", "sqshl"), satcast_i(widen_i(i_1) * 16)); + add_8_16_32(sel_op("vqshl.u", "uqshl"), cast_u(min(widen_u(u_1) * 16, max_u))); + + // VQSHLU I - Saturating Shift Left Unsigned + if (!has_sve()) { + add_8_16_32(sel_op("vqshlu.s", "sqshlu"), satcast_u(widen_i(i_1) * 16)); + } + + // VQSHRN I - Saturating Shift Right Narrow + // VQSHRUN I - Saturating Shift Right Unsigned Narrow + add_16_32_64_narrow(sel_op("vqshrn.s", "sqshrn"), satnarrow_i(i_1 / 16)); + add_16_32_64_narrow(sel_op("vqshrun.s", "sqshrun"), satnarrow_u(i_1 / 16)); + add_16_32_narrow(sel_op("vqshrn.u", "uqshrn"), narrow_u(min(u_1 / 16, max_u_narrow))); + + // VQSUB I - Saturating Subtract + add_8_16_32(sel_op("vqsub.s", "sqsub"), satcast_i(widen_i(i_1) - widen_i(i_2))); + + // N.B. Saturating subtracts are expressed by widening to a igned* type + add_8_16_32(sel_op("vqsub.u", "uqsub"), satcast_u(widen_i(u_1) - widen_i(u_2))); + + // VRADDHN I - Rounding Add and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vraddhn.i", "raddhn"), narrow_i((widen_i(i_1 + i_2) + (Expr(cast_i(1)) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vraddhn.i", "raddhn"), narrow_u((widen_u(u_1 + u_2) + (Expr(cast_u(1)) << (bits / 2 - 1))) >> (bits / 2))); + + // VREV16 X - Reverse in Halfwords + // VREV32 X - Reverse in Words + // VREV64 X - Reverse in Doublewords + + // These reverse within each halfword, word, and doubleword + // respectively. Sometimes llvm generates them, and sometimes + // it generates vtbl instructions. 
+ + // VRHADD I - Rounding Halving Add + add_8_16_32(sel_op("vrhadd.s", "srhadd"), cast_i((widen_i(i_1) + widen_i(i_2) + 1) / 2)); + add_8_16_32(sel_op("vrhadd.u", "urhadd"), cast_u((widen_u(u_1) + widen_u(u_2) + 1) / 2)); + + // VRSHL I - Rounding Shift Left + Expr shift = (i_2 % bits) - (bits / 2); + Expr round_s = (cast_i(1) >> min(shift, 0)) / 2; + Expr round_u = (cast_u(1) >> min(shift, 0)) / 2; + add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift)); + add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift)); + + round_s = (cast_i(1) << max(shift, 0)) / 2; + round_u = (cast_u(1) << max(shift, 0)) / 2; + add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift)); + add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift)); + + // VRSHR I - Rounding Shift Right + add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1)); + add_8_16_32(sel_op("vrshr.u", "urshr", "urshl"), cast_u((widen_u(u_1) + 1) >> 1)); + + // VRSHRN I - Rounding Shift Right Narrow + if (Halide::Internal::get_llvm_version() >= 140) { + // LLVM14 converts RSHRN/RSHRN2 to RADDHN/RADDHN2 when the shift amount is half the width of the vector element + // See https://reviews.llvm.org/D116166 + add_16_32_narrow(sel_op("vrshrn.i", "raddhn"), narrow_i((widen_i(i_1) + (cast_i(1) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vrshrn.i", "raddhn"), narrow_u((widen_u(u_1) + (cast_u(1) << (bits / 2 - 1))) >> (bits / 2))); + } + add_16_32_64_narrow(sel_op("vrshrn.i", "rshrn"), narrow_i((widen_i(i_1) + (1 << (bits / 4))) >> (bits / 4 + 1))); + add_16_32_narrow(sel_op("vrshrn.i", "rshrn"), narrow_u((widen_u(u_1) + (1 << (bits / 4))) >> (bits / 4 + 1))); + + // VRSRA I - Rounding Shift Right and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_8_16_32(sel_op("vrsra.s", "srsra"), i_2 + cast_i((widen_i(i_1) + 1) >> 1)); + add_8_16_32(sel_op("vrsra.u", "ursra"), i_2 + cast_u((widen_u(u_1) + 1) >> 1)); + } + + // VRSUBHN I - Rounding Subtract and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vrsubhn.i", "rsubhn"), narrow_i((widen_i(i_1 - i_2) + (Expr(cast_i(1)) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vrsubhn.i", "rsubhn"), narrow_u((widen_u(u_1 - u_2) + (Expr(cast_u(1)) << (bits / 2 - 1))) >> (bits / 2))); + + // VSHL I - Shift Left + add_all_vec(sel_op("vshl.i", "shl", "lsl"), i_1 * 16); + add_all_vec(sel_op("vshl.i", "shl", "lsl"), u_1 * 16); + + if (!has_sve()) { // No equivalent instruction in SVE. 
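// For context on the four checks below: shift == (i_2 % bits) - (bits / 2),
// and Halide's % always yields a value in [0, bits), so shift ranges over
// [-bits/2, bits/2 - 1]. Halide defines shifting by a negative amount as a
// shift in the opposite direction, and NEON's sshl/ushl take a per-lane signed
// shift count with the same convention (negative means shift right), so both
// i_1 << shift and i_1 >> shift can map onto the single sshl/ushl instruction.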
+ add_all_vec(sel_op("vshl.s", "sshl"), i_1 << shift); + add_all_vec(sel_op("vshl.s", "sshl"), i_1 >> shift); + add_all_vec(sel_op("vshl.u", "ushl"), u_1 << shift); + add_all_vec(sel_op("vshl.u", "ushl"), u_1 >> shift); + } + + // VSHLL I - Shift Left Long + add_8_16_32_widen(sel_op("vshll.s", "sshll"), widen_i(i_1) * 16); + add_8_16_32_widen(sel_op("vshll.u", "ushll"), widen_u(u_1) * 16); + + // VSHR I - Shift Right + add_all_vec(sel_op("vshr.s", "sshr", "asr"), i_1 / 16); + add_all_vec(sel_op("vshr.u", "ushr", "lsr"), u_1 / 16); + + // VSHRN I - Shift Right Narrow + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_i(i_1 >> (bits / 2))); + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_u(u_1 >> (bits / 2))); + + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_i(i_1 / 16)); + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_u(u_1 / 16)); + + // VSLI X - Shift Left and Insert + // I guess this could be used for (x*256) | (y & 255)? We don't do bitwise ops on integers, so skip it. + + // VSRA I - Shift Right and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_all_vec(sel_op("vsra.s", "ssra"), i_2 + i_1 / 16); + add_all_vec(sel_op("vsra.u", "usra"), u_2 + u_1 / 16); + } + + // VSRI X - Shift Right and Insert + // See VSLI + + // VSUB I, F F, D Subtract + add_all_vec(sel_op("vsub.i", "sub"), i_1 - i_2); + add_all_vec(sel_op("vsub.i", "sub"), u_1 - u_2); + + // VSUBHN I - Subtract and Narrow + add_16_32_64_narrow(sel_op("vsubhn.i", "subhn"), narrow_i((i_1 - i_2) >> (bits / 2))); + add_16_32_64_narrow(sel_op("vsubhn.i", "subhn"), narrow_u((u_1 - u_2) >> (bits / 2))); + + // VSUBL I - Subtract Long + add_8_16_32_widen(sel_op("vsubl.s", "ssubl"), widen_i(i_1) - widen_i(i_2)); + add_8_16_32_widen(sel_op("vsubl.u", "usubl"), widen_u(u_1) - widen_u(u_2)); + + add_8_16_32_widen(sel_op("vsubl.s", "ssubl"), widen_i(i_1) - widen_i(in_i(0))); + add_8_16_32_widen(sel_op("vsubl.u", "usubl"), widen_u(u_1) - widen_u(in_u(0))); + + // VSUBW I - Subtract Wide + add_8_16_32_widen(sel_op("vsubw.s", "ssubw"), i_wide_1 - i_1); + add_8_16_32_widen(sel_op("vsubw.u", "usubw"), u_wide_1 - u_1); + } + } + } + + void check_arm_float() { + vector> test_params{ + {16, in_f16, in_u16, in_i16, f16}, + {32, in_f32, in_u32, in_i32, f32}, + {64, in_f64, in_u64, in_i64, f64}, + }; + + for (const auto &[bits, in_f, in_u, in_i, cast_f] : test_params) { + Expr f_1 = in_f(x), f_2 = in_f(x + 16), f_3 = in_f(x + 32); + Expr u_1 = in_u(x); + Expr i_1 = in_i(x); + + // Arithmetic which could throw FP exception could return NaN, which results in output mismatch. 
+ // To avoid that, we need a positive value within certain range + Func in_f_clamped; + in_f_clamped(x) = clamp(in_f(x), cast_f(1e-3f), cast_f(1.0f)); + in_f_clamped.compute_root(); // To prevent LLVM optimization which results in a different instruction + Expr f_1_clamped = in_f_clamped(x); + Expr f_2_clamped = in_f_clamped(x + 16); + + if (bits == 16 && !is_float16_supported()) { + continue; + } + + vector total_bits_params = {256}; // {64, 128, 192, 256}; + if (bits != 64) { + // Add scalar case to verify float16 native operation + total_bits_params.push_back(bits); + } + + for (auto total_bits : total_bits_params) { + const int vf = total_bits / bits; + const bool is_vector = vf > 1; + + const int instr_lanes = Instruction::get_instr_lanes(bits, vf, target); + const int force_vectorized_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target); + + AddTestFunctor add(*this, bits, instr_lanes, vf); + AddTestFunctor add_arm32_f32(*this, bits, vf, is_arm32() && bits == 32); + AddTestFunctor add_arm64(*this, bits, instr_lanes, vf, !is_arm32()); + + add({{sel_op("vabs.f", "fabs"), bits, force_vectorized_lanes}}, vf, abs(f_1)); + add(sel_op("vadd.f", "fadd"), f_1 + f_2); + add(sel_op("vsub.f", "fsub"), f_1 - f_2); + add(sel_op("vmul.f", "fmul"), f_1 * f_2); + add("fdiv", sel_op("vdiv.f", "fdiv", "(fdiv|fdivr)"), f_1 / f_2_clamped); + auto fneg_lanes = has_sve() ? force_vectorized_lanes : instr_lanes; + add({{sel_op("vneg.f", "fneg"), bits, fneg_lanes}}, vf, -f_1); + add({{sel_op("vsqrt.f", "fsqrt"), bits, force_vectorized_lanes}}, vf, sqrt(f_1_clamped)); + + add_arm32_f32(is_vector ? "vceq.f" : "vcmp.f", select(f_1 == f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm32_f32(is_vector ? "vcgt.f" : "vcmp.f", select(f_1 > f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm64(is_vector ? "fcmeq" : "fcmp", select(f_1 == f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm64(is_vector ? "fcmgt" : "fcmp", select(f_1 > f_2, cast_f(1.0f), cast_f(2.0f))); + + add_arm32_f32("vcvt.f32.u", cast_f(u_1)); + add_arm32_f32("vcvt.f32.s", cast_f(i_1)); + add_arm32_f32("vcvt.u32.f", cast(UInt(bits), f_1)); + add_arm32_f32("vcvt.s32.f", cast(Int(bits), f_1)); + // The max of Float(16) is less than that of UInt(16), which generates "nan" in emulator + Expr float_max = Float(bits).max(); + add_arm64("ucvtf", cast_f(min(float_max, u_1))); + add_arm64("scvtf", cast_f(i_1)); + add_arm64({{"fcvtzu", bits, force_vectorized_lanes}}, vf, cast(UInt(bits), f_1)); + add_arm64({{"fcvtzs", bits, force_vectorized_lanes}}, vf, cast(Int(bits), f_1)); + add_arm64({{"frintn", bits, force_vectorized_lanes}}, vf, round(f_1)); + add_arm64({{"frintm", bits, force_vectorized_lanes}}, vf, floor(f_1)); + add_arm64({{"frintp", bits, force_vectorized_lanes}}, vf, ceil(f_1)); + add_arm64({{"frintz", bits, force_vectorized_lanes}}, vf, trunc(f_1)); + + add_arm32_f32({{"vmax.f", bits, force_vectorized_lanes}}, vf, max(f_1, f_2)); + add_arm32_f32({{"vmin.f", bits, force_vectorized_lanes}}, vf, min(f_1, f_2)); + + add_arm64({{"fmax", bits, force_vectorized_lanes}}, vf, max(f_1, f_2)); + add_arm64({{"fmin", bits, force_vectorized_lanes}}, vf, min(f_1, f_2)); + if (bits != 64 && total_bits != 192) { + // Halide relies on LLVM optimization for this pattern, and in some case it doesn't work + add_arm64("fmla", is_vector ? (has_sve() ? "(fmla|fmad)" : "fmla") : "fmadd", f_1 + f_2 * f_3); + add_arm64("fmls", is_vector ? (has_sve() ? 
"(fmls|fmsb)" : "fmls") : "fmsub", f_1 - f_2 * f_3); + } + if (bits != 64) { + add_arm64(vector{"frecpe", "frecps"}, fast_inverse(f_1_clamped)); + add_arm64(vector{"frsqrte", "frsqrts"}, fast_inverse_sqrt(f_1_clamped)); + } + + if (bits == 16) { + // Some of the math ops (exp,log,pow) for fp16 are converted into "xxx_fp32" call + // and then lowered to Internal::halide_xxx() function. + // In case the target has FP16 feature, native type conversion between fp16 and fp32 should be generated + // instead of emulated equivalent code with other types. + if (is_vector && !has_sve()) { + add_arm64("exp", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, exp(f_1_clamped)); + add_arm64("log", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, log(f_1_clamped)); + add_arm64("pow", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, pow(f_1_clamped, f_2_clamped)); + } else { + add_arm64("exp", "fcvt", exp(f_1_clamped)); + add_arm64("log", "fcvt", log(f_1_clamped)); + add_arm64("pow", "fcvt", pow(f_1_clamped, f_2_clamped)); + } + } + + // No corresponding instructions exists for is_nan, is_inf, is_finite. + // The instructions expected to be generated depends on CodeGen_LLVM::visit(const Call *op) + add_arm64("nan", is_vector ? sel_op("", "fcmge", "fcmuo") : "fcmp", is_nan(f_1)); + add_arm64("inf", {{"fabs", bits, force_vectorized_lanes}}, vf, is_inf(f_1)); + add_arm64("finite", {{"fabs", bits, force_vectorized_lanes}}, vf, is_inf(f_1)); + } + + if (bits == 16) { + // Actually, the following ops are not vectorized because SIMD instruction is unavailable. + // The purpose of the test is just to confirm no error. + // In case the target has FP16 feature, native type conversion between fp16 and fp32 should be generated + // instead of emulated equivalent code with other types. + AddTestFunctor add_f16(*this, 16, 1); + + add_f16("sinf", {{"bl", "sinf"}, {"fcvt", 16, 1}}, 1, sin(f_1_clamped)); + add_f16("asinf", {{"bl", "asinf"}, {"fcvt", 16, 1}}, 1, asin(f_1_clamped)); + add_f16("cosf", {{"bl", "cosf"}, {"fcvt", 16, 1}}, 1, cos(f_1_clamped)); + add_f16("acosf", {{"bl", "acosf"}, {"fcvt", 16, 1}}, 1, acos(f_1_clamped)); + add_f16("tanf", {{"bl", "tanf"}, {"fcvt", 16, 1}}, 1, tan(f_1_clamped)); + add_f16("atanf", {{"bl", "atanf"}, {"fcvt", 16, 1}}, 1, atan(f_1_clamped)); + add_f16("atan2f", {{"bl", "atan2f"}, {"fcvt", 16, 1}}, 1, atan2(f_1_clamped, f_2_clamped)); + add_f16("sinhf", {{"bl", "sinhf"}, {"fcvt", 16, 1}}, 1, sinh(f_1_clamped)); + add_f16("asinhf", {{"bl", "asinhf"}, {"fcvt", 16, 1}}, 1, asinh(f_1_clamped)); + add_f16("coshf", {{"bl", "coshf"}, {"fcvt", 16, 1}}, 1, cosh(f_1_clamped)); + add_f16("acoshf", {{"bl", "acoshf"}, {"fcvt", 16, 1}}, 1, acosh(max(f_1, cast_f(1.0f)))); + add_f16("tanhf", {{"bl", "tanhf"}, {"fcvt", 16, 1}}, 1, tanh(f_1_clamped)); + add_f16("atanhf", {{"bl", "atanhf"}, {"fcvt", 16, 1}}, 1, atanh(clamp(f_1, cast_f(-0.5f), cast_f(0.5f)))); + } + } + } + + void check_arm_load_store() { + vector> test_params = { + {Int(8), in_i8}, {Int(16), in_i16}, {Int(32), in_i32}, {Int(64), in_i64}, {UInt(8), in_u8}, {UInt(16), in_u16}, {UInt(32), in_u32}, {UInt(64), in_u64}, {Float(16), in_f16}, {Float(32), in_f32}, {Float(64), in_f64}}; + + for (const auto &[elt, in_im] : test_params) { + const int bits = elt.bits(); + if ((elt == Float(16) && !is_float16_supported()) || + (is_arm32() && bits == 64)) { + continue; + } + + // LD/ST - Load/Store + for (int width = 64; width <= 64 * 4; width *= 2) { + const int total_lanes = width / bits; + const int instr_lanes = min(total_lanes, 128 / bits); + if (instr_lanes < 2) 
continue; // bail out scalar op
+
+ // In case of arm32, instruction selection looks inconsistent due to optimization by LLVM
+ AddTestFunctor add(*this, bits, total_lanes, target.bits == 64);
+ // NOTE: if the expr is too simple, LLVM might generate "bl memcpy"
+ Expr load_store_1 = in_im(x) * 3;
+
+ if (has_sve()) {
+ // in native width, ld1b/st1b is used regardless of data type
+ const bool allow_byte_ls = (width == target.vector_bits);
+ add({get_sve_ls_instr("ld1", bits, bits, "", allow_byte_ls ? "b" : "")}, total_lanes, load_store_1);
+ add({get_sve_ls_instr("st1", bits, bits, "", allow_byte_ls ? "b" : "")}, total_lanes, load_store_1);
+ } else {
+ // vector register is not used for simple load/store
+ string reg_prefix = (width <= 64) ? "d" : "q";
+ add({{"st[rp]", reg_prefix + R"(\d\d?)"}}, total_lanes, load_store_1);
+ add({{"ld[rp]", reg_prefix + R"(\d\d?)"}}, total_lanes, load_store_1);
+ }
+ }
+
+ // LD2/ST2 - Load/Store two-element structures
+ int base_vec_bits = has_sve() ? target.vector_bits : 128;
+ for (int width = base_vec_bits; width <= base_vec_bits * 4; width *= 2) {
+ const int total_lanes = width / bits;
+ const int vector_lanes = total_lanes / 2;
+ const int instr_lanes = min(vector_lanes, base_vec_bits / bits);
+ if (instr_lanes < 2) continue; // bail out scalar op
+
+ AddTestFunctor add_ldn(*this, bits, vector_lanes);
+ AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes);
+
+ Func tmp1, tmp2;
+ tmp1(x) = cast(elt, x);
+ tmp1.compute_root();
+ tmp2(x, y) = select(x % 2 == 0, tmp1(x / 2), tmp1(x / 2 + 16));
+ tmp2.compute_root().vectorize(x, total_lanes);
+ Expr load_2 = in_im(x * 2) + in_im(x * 2 + 1);
+ Expr store_2 = tmp2(0, 0) + tmp2(0, 127);
+
+ if (has_sve()) {
+ // TODO(issue needed): Added strided load support.
+#if 0
+ add_ldn({get_sve_ls_instr("ld2", bits)}, vector_lanes, load_2);
+#endif
+ add_stn({get_sve_ls_instr("st2", bits)}, total_lanes, store_2);
+ } else {
+ add_ldn(sel_op("vld2.", "ld2"), load_2);
+ add_stn(sel_op("vst2.", "st2"), store_2);
+ }
+ }
+
+ // Also check when the two expressions interleaved have a common
+ // subexpression, which results in a vector var being lifted out.
+ for (int width = base_vec_bits; width <= base_vec_bits * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 2; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + Expr e = (tmp1(x / 2) * 2 + 7) / 4; + tmp2(x, y) = select(x % 2 == 0, e * 3, e + 17); + tmp2.compute_root().vectorize(x, total_lanes); + Expr store_2 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + add_stn({get_sve_ls_instr("st2", bits)}, total_lanes, store_2); + } else { + add_stn(sel_op("vst2.", "st2"), store_2); + } + } + + // LD3/ST3 - Store three-element structures + for (int width = 192; width <= 192 * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 3; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_ldn(*this, bits, vector_lanes); + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + tmp2(x, y) = select(x % 3 == 0, tmp1(x / 3), + x % 3 == 1, tmp1(x / 3 + 16), + tmp1(x / 3 + 32)); + tmp2.compute_root().vectorize(x, total_lanes); + Expr load_3 = in_im(x * 3) + in_im(x * 3 + 1) + in_im(x * 3 + 2); + Expr store_3 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + // TODO(issue needed): Added strided load support. +#if 0 + add_ldn({get_sve_ls_instr("ld3", bits)}, vector_lanes, load_3); + add_stn({get_sve_ls_instr("st3", bits)}, total_lanes, store_3); +#endif + } else { + add_ldn(sel_op("vld3.", "ld3"), load_3); + add_stn(sel_op("vst3.", "st3"), store_3); + } + } + + // LD4/ST4 - Store four-element structures + for (int width = 256; width <= 256 * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 4; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_ldn(*this, bits, vector_lanes); + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + tmp2(x, y) = select(x % 4 == 0, tmp1(x / 4), + x % 4 == 1, tmp1(x / 4 + 16), + x % 4 == 2, tmp1(x / 4 + 32), + tmp1(x / 4 + 48)); + tmp2.compute_root().vectorize(x, total_lanes); + Expr load_4 = in_im(x * 4) + in_im(x * 4 + 1) + in_im(x * 4 + 2) + in_im(x * 4 + 3); + Expr store_4 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + // TODO(issue needed): Added strided load support. 
+#if 0 + add_ldn({get_sve_ls_instr("ld4", bits)}, vector_lanes, load_4); + add_stn({get_sve_ls_instr("st4", bits)}, total_lanes, store_4); +#endif + } else { + add_ldn(sel_op("vld4.", "ld4"), load_4); + add_stn(sel_op("vst4.", "st4"), store_4); + } + } + + // SVE Gather/Scatter + if (has_sve()) { + for (int width = 64; width <= 64 * 4; width *= 2) { + const int total_lanes = width / bits; + const int instr_lanes = min(total_lanes, 128 / bits); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add(*this, bits, total_lanes); + Expr index = clamp(cast(in_im(x)), 0, W - 1); + Func tmp; + tmp(x, y) = cast(elt, y); + tmp(x, index) = cast(elt, 1); + tmp.compute_root().update().vectorize(x, total_lanes); + Expr gather = in_im(index); + Expr scatter = tmp(0, 0) + tmp(0, 127); + + const int index_bits = std::max(32, bits); + add({get_sve_ls_instr("ld1", bits, index_bits, "uxtw")}, total_lanes, gather); + add({get_sve_ls_instr("st1", bits, index_bits, "uxtw")}, total_lanes, scatter); + } + } + } + } + + void check_arm_pairwise() { + // A summation reduction that starts at something + // non-trivial, to avoid llvm simplifying accumulating + // widening summations into just widening summations. + auto sum_ = [&](Expr e) { + Func f; + f(x) = cast(e.type(), 123); + f(x) += e; + return f(x); + }; + + // Tests for integer type + { + vector> test_params{ + {8, in_i8, in_u8, i16, i32, u16, u32}, + {16, in_i16, in_u16, i32, i64, u32, u64}, + {32, in_i32, in_u32, i64, i64, u64, u64}, + {64, in_i64, in_u64, i64, i64, u64, u64}, + }; + // clang-format on + + for (const auto &[bits, in_i, in_u, widen_i, widenx4_i, widen_u, widenx4_u] : test_params) { + + for (auto &total_bits : {64, 128}) { + const int vf = total_bits / bits; + const int instr_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target); + AddTestFunctor add(*this, bits, instr_lanes, vf, !(is_arm32() && bits == 64)); // 64 bit is unavailable in neon 32 bit + AddTestFunctor add_8_16_32(*this, bits, instr_lanes, vf, bits != 64); + const int widen_lanes = Instruction::get_instr_lanes(bits, vf * 2, target); + AddTestFunctor add_widen(*this, bits, widen_lanes, vf, bits != 64); + + if (!has_sve()) { + // VPADD I, F - Pairwise Add + // VPMAX I, F - Pairwise Maximum + // VPMIN I, F - Pairwise Minimum + for (int f : {2, 4}) { + RDom r(0, f); + + add(sel_op("vpadd.i", "addp"), sum_(in_i(f * x + r))); + add(sel_op("vpadd.i", "addp"), sum_(in_u(f * x + r))); + add_8_16_32(sel_op("vpmax.s", "smaxp"), maximum(in_i(f * x + r))); + add_8_16_32(sel_op("vpmax.u", "umaxp"), maximum(in_u(f * x + r))); + add_8_16_32(sel_op("vpmin.s", "sminp"), minimum(in_i(f * x + r))); + add_8_16_32(sel_op("vpmin.u", "uminp"), minimum(in_u(f * x + r))); + } + } + + // VPADAL I - Pairwise Add and Accumulate Long + // VPADDL I - Pairwise Add Long + { + int f = 2; + RDom r(0, f); + + // If we're reducing by a factor of two, we can + // use the forms with an accumulator + add_widen(sel_op("vpadal.s", "sadalp"), sum_(widen_i(in_i(f * x + r)))); + add_widen(sel_op("vpadal.u", "uadalp"), sum_(widen_i(in_u(f * x + r)))); + add_widen(sel_op("vpadal.u", "uadalp"), sum_(widen_u(in_u(f * x + r)))); + } + { + int f = 4; + RDom r(0, f); + + // If we're reducing by more than that, that's not + // possible. + // In case of SVE, addlp is unavailable, so adalp is used with accumulator=0 instead. 
+ add_widen(sel_op("vpaddl.s", "saddlp", "sadalp"), sum_(widen_i(in_i(f * x + r)))); + add_widen(sel_op("vpaddl.u", "uaddlp", "uadalp"), sum_(widen_i(in_u(f * x + r)))); + add_widen(sel_op("vpaddl.u", "uaddlp", "uadalp"), sum_(widen_u(in_u(f * x + r)))); + } + + const bool is_arm_dot_prod_available = (!is_arm32() && target.has_feature(Target::ARMDotProd) && bits == 8) || + (has_sve() && (bits == 8 || bits == 16)); + if ((bits == 8 || bits == 16) && !is_arm_dot_prod_available) { // udot/sdot is applied if available + int f = 4; + RDom r(0, f); + // If we're widening the type by a factor of four + // as well as reducing by a factor of four, we + // expect vpaddl followed by vpadal + // Note that when going from u8 to i32 like this, + // the vpaddl is unsigned and the vpadal is a + // signed, because the intermediate type is u16 + const int widenx4_lanes = Instruction::get_instr_lanes(bits * 2, vf, target); + string op_addl, op_adal; + op_addl = sel_op("vpaddl.s", "saddlp"); + op_adal = sel_op("vpadal.s", "sadalp"); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_i(in_i(f * x + r)))); + op_addl = sel_op("vpaddl.u", "uaddlp"); + op_adal = sel_op("vpadal.u", "uadalp"); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_i(in_u(f * x + r)))); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_u(in_u(f * x + r)))); + } + + // UDOT/SDOT + if (is_arm_dot_prod_available) { + const int factor_32bit = vf / 4; + for (int f : {4, 8}) { + // checks vector register for narrow src data type (i.e. 8 or 16 bit) + const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_32bit, target); + AddTestFunctor add_dot(*this, bits, lanes_src, factor_32bit); + RDom r(0, f); + + add_dot("udot", sum(widenx4_u(in_u(f * x + r)) * in_u(f * x + r + 32))); + add_dot("sdot", sum(widenx4_i(in_i(f * x + r)) * in_i(f * x + r + 32))); + if (f == 4) { + // This doesn't generate for higher reduction factors because the + // intermediate is 16-bit instead of 32-bit. It seems like it would + // be slower to fix this (because the intermediate sum would be + // 32-bit instead of 16-bit). 
+ add_dot("udot", sum(widenx4_u(in_u(f * x + r)))); + add_dot("sdot", sum(widenx4_i(in_i(f * x + r)))); + } + } + } + } + } + } + + // Tests for Float type + { + // clang-format off + vector> test_params{ + {16, in_f16}, + {32, in_f32}, + {64, in_f64}, + }; + // clang-format on + if (!has_sve()) { + for (const auto &[bits, in_f] : test_params) { + for (auto &total_bits : {64, 128}) { + const int vf = total_bits / bits; + if (vf < 2) continue; + AddTestFunctor add(*this, bits, vf); + AddTestFunctor add_16_32(*this, bits, vf, bits != 64); + + if (bits == 16 && !is_float16_supported()) { + continue; + } + + for (int f : {2, 4}) { + RDom r(0, f); + + add(sel_op("vadd.f", "faddp"), sum_(in_f(f * x + r))); + add_16_32(sel_op("vmax.f", "fmaxp"), maximum(in_f(f * x + r))); + add_16_32(sel_op("vmin.f", "fminp"), minimum(in_f(f * x + r))); + } + } + } + } + } + } + + struct ArmTask { + vector instrs; + }; + + struct Instruction { + string opcode; + optional operand; + optional bits; + optional pattern_lanes; + static inline const int ANY_LANES = -1; + + // matching pattern for opcode/operand is directly set + Instruction(const string &opcode, const string &operand) + : opcode(opcode), operand(operand), bits(nullopt), pattern_lanes(nullopt) { + } + + // matching pattern for opcode/operand is generated from bits/lanes + Instruction(const string &opcode, int bits, int lanes) + : opcode(opcode), operand(nullopt), bits(bits), pattern_lanes(lanes) { + } + + string generate_pattern(const Target &target) const { + bool is_arm32 = target.bits == 32; + bool has_sve = target.has_feature(Target::SVE2); + + string opcode_pattern; + string operand_pattern; + if (bits && pattern_lanes) { + if (is_arm32) { + opcode_pattern = get_opcode_neon32(); + operand_pattern = get_reg_neon32(); + } else if (!has_sve) { + opcode_pattern = opcode; + operand_pattern = get_reg_neon64(); + } else { + opcode_pattern = opcode; + operand_pattern = get_reg_sve(); + } + } else { + opcode_pattern = opcode; + operand_pattern = operand.value_or(""); + } + // e.g "add v15.h " -> "\s*add\s.*\bv\d\d?\.h\b.*" + return opcode_pattern + R"(\s.*\b)" + operand_pattern + R"(\b.*)"; + } + + // TODO Fix this for SVE2 + static int natural_lanes(int bits) { + return 128 / bits; + } + + static int get_instr_lanes(int bits, int vec_factor, const Target &target) { + return min(natural_lanes(bits), vec_factor); + } + + static int get_force_vectorized_instr_lanes(int bits, int vec_factor, const Target &target) { + // For some cases, where scalar operation is forced to vectorize + if (target.has_feature(Target::SVE2)) { + if (vec_factor == 1) { + return 1; + } else { + return natural_lanes(bits); + } + } else { + int min_lanes = std::max(2, natural_lanes(bits) / 2); // 64 bit wide VL + return max(min_lanes, get_instr_lanes(bits, vec_factor, target)); + } + } + + string get_opcode_neon32() const { + return opcode + to_string(bits.value()); + } + + const char *get_bits_designator() const { + static const map designators{ + // NOTE: vector or float only + {8, "b"}, + {16, "h"}, + {32, "s"}, + {64, "d"}, + }; + auto iter = designators.find(bits.value()); + assert(iter != designators.end()); + return iter->second; + } + + string get_reg_sve() const { + if (pattern_lanes == ANY_LANES) { + return R"((z\d\d?\.[bhsd])|(s\d\d?))"; + } else { + const char *bits_designator = get_bits_designator(); + // TODO(need issue): This should only match the scalar register, and likely a NEON instruction opcode. 
+ // Generating a full SVE vector instruction for a scalar operation is inefficient. However this is + // happening and fixing it involves changing intrinsic selection. Likely to use NEON intrinsics where + // applicable. For now, accept both a scalar operation and a vector one. + std::string scalar_reg_pattern = (pattern_lanes > 1) ? "" : std::string("|(") + bits_designator + R"(\d\d?))"; // e.g. "h15" + + return std::string(R"(((z\d\d?\.)") + bits_designator + ")|(" + + R"(v\d\d?\.)" + to_string(pattern_lanes.value()) + bits_designator + ")" + scalar_reg_pattern + ")"; + } + } + + string get_reg_neon32() const { + return ""; + } + + string get_reg_neon64() const { + const char *bits_designator = get_bits_designator(); + if (pattern_lanes == 1) { + return std::string(bits_designator) + R"(\d\d?)"; // e.g. "h15" + } else if (pattern_lanes == ANY_LANES) { + return R"(v\d\d?\.[bhsd])"; + } else { + return R"(v\d\d?\.)" + to_string(pattern_lanes.value()) + bits_designator; // e.g. "v15.4h" + } + } + }; + + Instruction get_sve_ls_instr(const string &base_opcode, int opcode_bits, int operand_bits, const string &additional = "", const string &optional_type = "") { + static const map opcode_suffix_map = {{8, "b"}, {16, "h"}, {32, "w"}, {64, "d"}}; + static const map operand_suffix_map = {{8, "b"}, {16, "h"}, {32, "s"}, {64, "d"}}; + string opcode_size_specifier; + string operand_size_specifier; + if (!optional_type.empty()) { + opcode_size_specifier = "["; + operand_size_specifier = "["; + } + opcode_size_specifier += opcode_suffix_map.at(opcode_bits); + operand_size_specifier += operand_suffix_map.at(operand_bits); + if (!optional_type.empty()) { + opcode_size_specifier += optional_type; + opcode_size_specifier += "]"; + operand_size_specifier += optional_type; + operand_size_specifier += "]"; + } + const string opcode = base_opcode + opcode_size_specifier; + string operand = R"(z\d\d?\.)" + operand_size_specifier; + if (!additional.empty()) { + operand += ", " + additional; + } + return Instruction(opcode, operand); + } + + Instruction get_sve_ls_instr(const string &base_opcode, int bits) { + return get_sve_ls_instr(base_opcode, bits, bits, ""); + } + + // Helper functor to add test case + class AddTestFunctor { + public: + AddTestFunctor(SimdOpCheckArmSve &p, + int default_bits, + int default_instr_lanes, + int default_vec_factor, + bool is_enabled = true /* false to skip testing */) + : parent(p), default_bits(default_bits), default_instr_lanes(default_instr_lanes), + default_vec_factor(default_vec_factor), is_enabled(is_enabled){}; + + AddTestFunctor(SimdOpCheckArmSve &p, + int default_bits, + // default_instr_lanes is inferred from bits and vec_factor + int default_vec_factor, + bool is_enabled = true /* false to skip testing */) + : parent(p), default_bits(default_bits), + default_instr_lanes(Instruction::get_instr_lanes(default_bits, default_vec_factor, p.target)), + default_vec_factor(default_vec_factor), is_enabled(is_enabled){}; + + // Constructs single Instruction with default parameters + void operator()(const string &opcode, Expr e) { + // Use opcode for name + (*this)(opcode, opcode, e); + } + + // Constructs single Instruction with default parameters except for custom name + void operator()(const string &op_name, const string &opcode, Expr e) { + create_and_register(op_name, {Instruction{opcode, default_bits, default_instr_lanes}}, default_vec_factor, e); + } + + // Constructs multiple Instruction with default parameters + void operator()(const vector &opcodes, Expr e) { + 
assert(!opcodes.empty()); + (*this)(opcodes[0], opcodes, e); + } + + // Constructs multiple Instruction with default parameters except for custom name + void operator()(const string &op_name, const vector &opcodes, Expr e) { + vector instrs; + for (const auto &opcode : opcodes) { + instrs.emplace_back(opcode, default_bits, default_instr_lanes); + } + create_and_register(op_name, instrs, default_vec_factor, e); + } + + // Set single or multiple Instructions of custom parameters + void operator()(const vector &instructions, int vec_factor, Expr e) { + // Use the 1st opcode for name + assert(!instructions.empty()); + string op_name = instructions[0].opcode; + (*this)(op_name, instructions, vec_factor, e); + } + + // Set single or multiple Instructions of custom parameters, with custom name + void operator()(const string &op_name, const vector &instructions, int vec_factor, Expr e) { + create_and_register(op_name, instructions, vec_factor, e); + } + + private: + void create_and_register(const string &op_name, const vector &instructions, int vec_factor, Expr e) { + if (!is_enabled) return; + + // Generate regular expression for the instruction we check + vector instr_patterns; + transform(instructions.begin(), instructions.end(), back_inserter(instr_patterns), + [t = parent.target](const Instruction &instr) { return instr.generate_pattern(t); }); + + std::stringstream type_name_stream; + type_name_stream << e.type(); + std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor); + auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size()); + + // Bail out after generating the unique_name, so that names are + // unique across different processes and don't depend on filter + // settings. + if (!parent.wildcard_match(parent.filter, decorated_op_name)) return; + + // Create a deep copy of the expr and all Funcs referenced by it, so + // that no IR is shared between tests. This is required by the base + // class, and is why we can parallelize. + { + using namespace Halide::Internal; + class FindOutputs : public IRVisitor { + using IRVisitor::visit; + void visit(const Call *op) override { + if (op->func.defined()) { + outputs.insert(op->func); + } + IRVisitor::visit(op); + } + + public: + std::set outputs; + } finder; + e.accept(&finder); + std::vector outputs(finder.outputs.begin(), finder.outputs.end()); + auto env = deep_copy(outputs, build_environment(outputs)).second; + class DeepCopy : public IRMutator { + std::map copied; + using IRMutator::visit; + Expr visit(const Call *op) override { + if (op->func.defined()) { + auto it = env.find(op->name); + if (it != env.end()) { + return Func(it->second)(mutate(op->args)); + } + } + return IRMutator::visit(op); + } + const std::map &env; + + public: + DeepCopy(const std::map &env) + : env(env) { + } + } copier(env); + e = copier.mutate(e); + } + + // Create Task and register + parent.tasks.emplace_back(Task{decorated_op_name, unique_name, vec_factor, e}); + parent.arm_tasks.emplace(unique_name, ArmTask{std::move(instr_patterns)}); + } + + SimdOpCheckArmSve &parent; + int default_bits; + int default_instr_lanes; + int default_vec_factor; + bool is_enabled; + }; + + void compile_and_check(Func error, const string &op, const string &name, int vector_width, const std::vector &arg_types, ostringstream &error_msg) override { + // This is necessary as LLVM validation errors, crashes, etc. don't tell which op crashed. 
+ cout << "Starting op " << op << "\n"; + string fn_name = "test_" + name; + string file_name = output_directory + fn_name; + + auto ext = Internal::get_output_info(target); + std::map outputs = { + {OutputFileType::llvm_assembly, file_name + ext.at(OutputFileType::llvm_assembly).extension}, + {OutputFileType::c_header, file_name + ext.at(OutputFileType::c_header).extension}, + {OutputFileType::object, file_name + ext.at(OutputFileType::object).extension}, + {OutputFileType::assembly, file_name + ".s"}, + }; + + error.compile_to(outputs, arg_types, fn_name, target); + + std::ifstream asm_file; + asm_file.open(file_name + ".s"); + + auto arm_task = arm_tasks.find(name); + assert(arm_task != arm_tasks.end()); + + std::ostringstream msg; + msg << op << " did not generate for target=" << target.to_string() + << " vector_width=" << vector_width << ". Instead we got:\n"; + + string line; + vector matched_lines; + vector &patterns = arm_task->second.instrs; + while (getline(asm_file, line) && !patterns.empty()) { + msg << line << "\n"; + auto pattern = patterns.begin(); + while (pattern != patterns.end()) { + smatch match; + if (regex_search(line, match, regex(*pattern))) { + pattern = patterns.erase(pattern); + matched_lines.emplace_back(match[0]); + } else { + ++pattern; + } + } + } + + if (!patterns.empty()) { + error_msg << "Failed: " << msg.str() << "\n"; + error_msg << "The following instruction patterns were not found:\n"; + for (auto &p : patterns) { + error_msg << p << "\n"; + } + } else if (debug_mode == "1") { + for (auto &l : matched_lines) { + error_msg << " " << setw(20) << name << ", vf=" << setw(2) << vector_width << ", "; + error_msg << l << endl; + } + } + } + + inline const string &sel_op(const string &neon32, const string &neon64) { + return is_arm32() ? neon32 : neon64; + } + + inline const string &sel_op(const string &neon32, const string &neon64, const string &sve) { + return is_arm32() ? neon32 : + target.has_feature(Target::SVE) || target.has_feature(Target::SVE2) ? sve : + neon64; + } + + inline bool is_arm32() const { + return target.bits == 32; + }; + inline bool has_neon() const { + return !target.has_feature(Target::NoNEON); + }; + inline bool has_sve() const { + return target.has_feature(Target::SVE2); + }; + + bool is_float16_supported() const { + return (target.bits == 64) && target.has_feature(Target::ARMFp16); + } + + bool can_run_the_code; + string debug_mode; + std::unordered_map arm_tasks; + const Var x{"x"}, y{"y"}; +}; +} // namespace + +int main(int argc, char **argv) { + if (Halide::Internal::get_llvm_version() < 190) { + std::cout << "[SKIP] simd_op_check_sve2 requires LLVM 19 or later.\n"; + return 0; + } + + return SimdOpCheckTest::main( + argc, argv, + { + Target("arm-64-linux-sve2-no_neon-vector_bits_128"), + Target("arm-64-linux-sve2-no_neon-vector_bits_256"), + }); +} From a132246ced07adc59c7b3631009464e5a14e0abb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 15 Mar 2024 14:04:44 -0700 Subject: [PATCH 092/186] Fix two compute_with bugs. (#8152) * Fix two compute_with bugs. This PR fixes a bug in compute_with, and another bug I found while fixing it (we could really use a compute_with fuzzer). The first bug is that you can get into situations where the bounds of a producer func will refer directly to the loop variable of a consumer func, where the consumer is in a compute_with fused group. In main, that loop variable may not be defined because fused loop names have been rewritten to include the token ".fused.". 
This PR adds let stmts to define it just inside the fused loop body. The second bug is that not all parent loops in compute_with fused groups were having their bounds expanded to cover the region to be computed of all children, because the logic for deciding which loops to expand only considered the non-specialized pure definition. So e.g. compute_with applied to an update stage would fail to compute values of the child Func where they do not overlap with the parent Func. This PR visits all definitions of the parent Func of the fused group, instead of just the unspecialized pure definition of the parent Func. Fixes #8149 * clang-tidy --- src/ScheduleFunctions.cpp | 224 ++++++++++++++++++++---------- test/correctness/compute_with.cpp | 87 +++++++++++- 2 files changed, 236 insertions(+), 75 deletions(-) diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index aa45841253b7..8fa2fd71a7a2 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -1021,81 +1021,126 @@ class CollectBounds : public IRVisitor { } }; -class SubstituteFusedBounds : public IRMutator { -public: - const map &replacements; - explicit SubstituteFusedBounds(const map &r) - : replacements(r) { +// Rename a loop var in a compute_with cluster to include '.fused.', to +// disambiguate its bounds from the original loop bounds. The '.fused.' token is +// injected somewhere that's not going to change the results of var_name_match, +// so that it's unchanged as a scheduling point. +string fused_name(const string &var) { + size_t last_dot = var.rfind('.'); + internal_assert(last_dot != string::npos); + return var.substr(0, last_dot) + ".fused." + var.substr(last_dot + 1); +} + +// The bounds of every loop exist in 'replacements' should be replaced. The +// loop is also renamed by adding '.fused' in the original name before the +// variable name. +Stmt substitute_fused_bounds(Stmt s, const map &replacements) { + if (!s.defined() || replacements.empty()) { + return s; } -private: - using IRMutator::visit; + class SubstituteFusedBounds : public IRMutator { + const map &replacements; - Stmt visit(const For *op) override { - const auto *min_var = op->min.as(); - const auto *extent_var = op->extent.as(); - if (min_var && extent_var) { - Expr min_val, extent_val; - { - const auto &it = replacements.find(min_var->name); - if (it != replacements.end()) { - min_val = it->second; + using IRMutator::visit; + + Stmt visit(const For *op) override { + const auto *min_var = op->min.as(); + const auto *extent_var = op->extent.as(); + if (min_var && extent_var) { + Expr min_val, extent_val; + { + const auto &it = replacements.find(min_var->name); + if (it != replacements.end()) { + min_val = it->second; + } } - } - { - const auto &it = replacements.find(extent_var->name); - if (it != replacements.end()) { - extent_val = it->second; + { + const auto &it = replacements.find(extent_var->name); + if (it != replacements.end()) { + extent_val = it->second; + } + } + if (!min_val.defined() || !extent_val.defined()) { + return IRMutator::visit(op); + } + + Stmt body = mutate(op->body); + + string new_var = fused_name(op->name); + + ForType for_type = op->for_type; + DeviceAPI device_api = op->device_api; + if (is_const_one(extent_val)) { + // This is the child loop of a fused group. The real loop of the + // fused group is the loop of the parent function of the fused + // group. 
This child loop is just a scheduling point, and should + // never be a device transition, so we rewrite it to be a simple + // serial loop of extent 1." + for_type = ForType::Serial; + device_api = DeviceAPI::None; } + + Stmt stmt = For::make(new_var, Variable::make(Int(32), new_var + ".loop_min"), + Variable::make(Int(32), new_var + ".loop_extent"), + for_type, op->partition_policy, device_api, body); + + // Add let stmts defining the bound of the renamed for-loop. + stmt = LetStmt::make(new_var + ".loop_min", min_val, stmt); + stmt = LetStmt::make(new_var + ".loop_max", simplify(min_val + extent_val - 1), stmt); + stmt = LetStmt::make(new_var + ".loop_extent", extent_val, stmt); + // Replace any reference to the old loop name with the new one. + stmt = substitute(op->name, Variable::make(Int(32), new_var), stmt); + return stmt; + } else { + return IRMutator::visit(op); } - if (!min_val.defined() || !extent_val.defined()) { + } + + public: + explicit SubstituteFusedBounds(const map &r) + : replacements(r) { + } + } subs(replacements); + + return subs.mutate(s); +} + +// Add letstmts inside each parent loop that define the corresponding child loop +// vars as equal to it. Bounds inference might need a child loop var. +Stmt add_loop_var_aliases(Stmt s, const map> &loop_var_aliases) { + if (!s.defined() || loop_var_aliases.empty()) { + return s; + } + + class AddLoopVarAliases : public IRMutator { + const map> &loop_var_aliases; + + using IRMutator::visit; + + Stmt visit(const For *op) override { + auto it = loop_var_aliases.find(op->name); + if (it == loop_var_aliases.end()) { return IRMutator::visit(op); } + Expr var = Variable::make(Int(32), op->name); Stmt body = mutate(op->body); - - size_t last_dot = op->name.rfind('.'); - internal_assert(last_dot != string::npos); - string new_var = op->name.substr(0, last_dot) + ".fused." + op->name.substr(last_dot + 1); - - ForType for_type = op->for_type; - DeviceAPI device_api = op->device_api; - if (is_const_one(extent_val)) { - // This is the child loop of a fused group. The real loop of the - // fused group is the loop of the parent function of the fused - // group. This child loop is just a scheduling point, and should - // never be a device transition, so we rewrite it to be a simple - // serial loop of extent 1." - for_type = ForType::Serial; - device_api = DeviceAPI::None; + for (const string &alias : it->second) { + body = LetStmt::make(alias, var, body); } - Stmt stmt = For::make(new_var, Variable::make(Int(32), new_var + ".loop_min"), - Variable::make(Int(32), new_var + ".loop_extent"), - for_type, op->partition_policy, device_api, body); + return For::make(op->name, op->min, op->extent, op->for_type, + op->partition_policy, op->device_api, std::move(body)); + } - // Add let stmts defining the bound of the renamed for-loop. - stmt = LetStmt::make(new_var + ".loop_min", min_val, stmt); - stmt = LetStmt::make(new_var + ".loop_max", simplify(min_val + extent_val - 1), stmt); - stmt = LetStmt::make(new_var + ".loop_extent", extent_val, stmt); - // Replace any reference to the old loop name with the new one. - stmt = substitute(op->name, Variable::make(Int(32), new_var), stmt); - return stmt; - } else { - return IRMutator::visit(op); + public: + explicit AddLoopVarAliases(const map> &a) + : loop_var_aliases(a) { } - } -}; + } add_aliases(loop_var_aliases); -// The bounds of every loop exist in 'replacements' should be replaced. The -// loop is also renamed by adding '.fused' in the original name before the -// variable name. 
-Stmt substitute_fused_bounds(Stmt s, const map &replacements) { - if (!s.defined() || replacements.empty()) { - return s; - } else { - return SubstituteFusedBounds(replacements).mutate(s); - } + return add_aliases.mutate(s); } // Shift the iteration domain of a loop nest by some factor. @@ -1460,7 +1505,9 @@ class InjectFunctionRealization : public IRMutator { } Stmt build_produce_definition(const Function &f, const string &prefix, const Definition &def, bool is_update, - map &replacements, vector> &add_lets) { + map &replacements, + vector> &add_lets, + map> &aliases) { const vector &dims = def.schedule().dims(); // From inner to outer const LoopLevel &fuse_level = def.schedule().fuse_level().level; @@ -1499,6 +1546,10 @@ class InjectFunctionRealization : public IRMutator { replacements.emplace(var + ".loop_extent", make_const(Int(32), 1)); replacements.emplace(var + ".loop_min", val); replacements.emplace(var + ".loop_max", val); + + string var_fused = fused_name(var_orig); + aliases[var_fused].emplace(std::move(var_orig)); + aliases[var_fused].emplace(std::move(var)); } } @@ -1550,18 +1601,17 @@ class InjectFunctionRealization : public IRMutator { // Replace the bounds of the parent fused loop (i.e. the first one to be // realized in the group) with union of the bounds of the fused group. - Stmt replace_parent_bound_with_union_bound(const Function &f, Stmt produce, const map &bounds) { - string prefix = f.name() + ".s0"; - const Definition &def = f.definition(); + Stmt replace_parent_bound_with_union_bound(const string &func, int stage, + const Definition &def, Stmt produce, + const map &bounds, + map &replacements) { - if (!def.defined()) { + if (def.schedule().fused_pairs().empty()) { return produce; } const vector &dims = def.schedule().dims(); // From inner to outer - map replacements; - vector dependence = collect_all_dependence(def); // Compute the union of the bounds of the fused loops. @@ -1582,6 +1632,8 @@ class InjectFunctionRealization : public IRMutator { // the parent, e.g. y.yi and yi. int dim2_idx = (int)(dims_2.size() - (dims.size() - i)); internal_assert(dim2_idx < (int)dims_2.size()); + string var_1 = func + ".s" + std::to_string(stage) + + "." + dims[i].var; string var_2 = pair.func_2 + ".s" + std::to_string(pair.stage_2) + "." + dims_2[dim2_idx].var; @@ -1592,7 +1644,6 @@ class InjectFunctionRealization : public IRMutator { Expr max_2 = bounds.find(var_2 + ".loop_max")->second; Expr extent_2 = bounds.find(var_2 + ".loop_extent")->second; - string var_1 = prefix + "." + dims[i].var; internal_assert(bounds.count(var_1 + ".loop_min")); internal_assert(bounds.count(var_1 + ".loop_max")); internal_assert(bounds.count(var_1 + ".loop_extent")); @@ -1616,8 +1667,26 @@ class InjectFunctionRealization : public IRMutator { } } - // Now, replace the bounds of the parent fused loops with the union bounds. + // Now, replace the bounds of the parent fused loops with the union + // bounds. 
+ for (const auto &spec : def.specializations()) { + produce = replace_parent_bound_with_union_bound(func, stage, spec.definition, produce, bounds, replacements); + } + + return produce; + } + + Stmt replace_parent_bound_with_union_bound(const Function &f, Stmt produce, + const map &bounds) { + map replacements; + + int stage = 0; + produce = replace_parent_bound_with_union_bound(f.name(), stage++, f.definition(), produce, bounds, replacements); + for (const Definition &def : f.updates()) { + produce = replace_parent_bound_with_union_bound(f.name(), stage++, def, produce, bounds, replacements); + } produce = substitute_fused_bounds(produce, replacements); + return produce; } @@ -1748,22 +1817,23 @@ class InjectFunctionRealization : public IRMutator { Stmt producer; map replacements; vector> add_lets; + map> aliases; for (const auto &func_stage : stage_order) { const auto &f = func_stage.first; if (f.has_extern_definition() && (func_stage.second == 0)) { - const Stmt &produceDef = Internal::build_extern_produce(env, f, target); - producer = inject_stmt(producer, produceDef, LoopLevel::inlined().lock()); + const Stmt &produce_def = Internal::build_extern_produce(env, f, target); + producer = inject_stmt(producer, produce_def, LoopLevel::inlined().lock()); continue; } string def_prefix = f.name() + ".s" + std::to_string(func_stage.second) + "."; const auto &def = (func_stage.second == 0) ? f.definition() : f.updates()[func_stage.second - 1]; - const Stmt &produceDef = build_produce_definition(f, def_prefix, def, func_stage.second > 0, - replacements, add_lets); - producer = inject_stmt(producer, produceDef, def.schedule().fuse_level().level); + const Stmt &produce_def = build_produce_definition(f, def_prefix, def, func_stage.second > 0, + replacements, add_lets, aliases); + producer = inject_stmt(producer, produce_def, def.schedule().fuse_level().level); } internal_assert(producer.defined()); @@ -1799,8 +1869,14 @@ class InjectFunctionRealization : public IRMutator { // Replace the bounds of parent fused loop with union of bounds of // the fused loops. + Function group_parent = funcs.back(); producer = replace_parent_bound_with_union_bound(funcs.back(), producer, bounds); + // Define the old loop var names as equal to the corresponding parent + // fused loop var. Bounds inference might refer directly to the original + // loop vars. + producer = add_loop_var_aliases(producer, aliases); + // Add the producer nodes. for (const auto &i : funcs) { producer = ProducerConsumer::make_produce(i.name(), producer); diff --git a/test/correctness/compute_with.cpp b/test/correctness/compute_with.cpp index 053570a2f5c0..0152642028eb 100644 --- a/test/correctness/compute_with.cpp +++ b/test/correctness/compute_with.cpp @@ -2204,6 +2204,89 @@ int two_compute_at_test() { return 0; } +// Test for the issue described in https://github.com/halide/Halide/issues/8149. 
+int child_var_dependent_bounds_test() { + Func f{"f"}, g{"g"}; + Var x{"x"}, y{"y"}; + RDom r(0, 10, "r"); + + Func f_inter{"f_inter"}, g_inter{"g_inter"}; + + f_inter(x, y) = x; + f_inter(x, y) += 1; + f(x) = x; + f(x) += f_inter(x, r); + + g_inter(x, y) = x; + g_inter(x, y) += 1; + g(x) = x; + g(x) += g_inter(x, r); + + f_inter.compute_at(f, r); + g_inter.compute_at(f, r); + g.update().compute_with(f.update(), r); + f.update().unscheduled(); + + Pipeline p({f, g}); + + p.compile_jit(); + Buffer f_buf(10), g_buf(10); + + f_buf.set_min(2); + p.realize({f_buf, g_buf}); + f_buf.set_min(0); + + for (int i = 0; i < 10; i++) { + int correct_f = 10 + 11 * (i + 2); + int correct_g = 10 + 11 * i; + if (f_buf(i) != correct_f) { + printf("f(%d) = %d instead of %d\n", i, f_buf(i), correct_f); + } + if (g_buf(i) != correct_g) { + printf("g(%d) = %d instead of %d\n", i, g_buf(i), correct_f); + } + } + + return 0; +} + +int overlapping_updates_test() { + Func f{"f"}, g{"g"}; + Var x{"x"}; + + f(x) = 0; + f(x) += x; + g(x) = 0; + g(x) += x; + + g.update().compute_with(f.update(), x); + f.update().unscheduled(); + + Pipeline p({f, g}); + + p.compile_jit(); + Buffer f_buf(10), g_buf(10); + + f_buf.set_min(2); + p.realize({f_buf, g_buf}); + f_buf.set_min(0); + + for (int i = 0; i < 10; i++) { + int correct_f = i + 2; + int correct_g = i; + if (f_buf(i) != correct_f) { + printf("f(%d) = %d instead of %d\n", i, f_buf(i), correct_f); + return 1; + } + if (g_buf(i) != correct_g) { + printf("g(%d) = %d instead of %d\n", i, g_buf(i), correct_f); + return 1; + } + } + + return 0; +} + } // namespace int main(int argc, char **argv) { @@ -2247,7 +2330,9 @@ int main(int argc, char **argv) { {"different arg number compute_at test", different_arg_num_compute_at_test}, {"store_at different levels test", store_at_different_levels_test}, {"rvar bounds test", rvar_bounds_test}, - {"two_compute_at test", two_compute_at_test}, + {"two compute at test", two_compute_at_test}, + {"overlapping updates test", overlapping_updates_test}, + {"child var dependent bounds test", child_var_dependent_bounds_test}, }; using Sharder = Halide::Internal::Test::Sharder; From 8864e8ac1c0bb460f0034e9c46f7f944afad3a19 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Tue, 19 Mar 2024 02:09:09 +0300 Subject: [PATCH 093/186] Python bindings: `add_python_test()`: do set `HL_JIT_TARGET` too (#8156) This one took quite a bit of digging. I wanted to enable opencl tests on debian package, and `boundary_conditions.py`+`division.py` were failing when run with `HL_TARGET=host OCL_ICD_VENDORS=no-opencl-please.missing` env variables with `clGetPlatformIDs failed`, which made no sense to me. Empty `HL_JIT_TARGET` results in `opencl` being detected, unsurprisingly. 
--- python_bindings/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/CMakeLists.txt b/python_bindings/CMakeLists.txt index 590ecc432e10..25f61fe7dcdd 100644 --- a/python_bindings/CMakeLists.txt +++ b/python_bindings/CMakeLists.txt @@ -68,7 +68,7 @@ function(add_python_test) list(PREPEND ARG_PYTHONPATH "$/..") list(TRANSFORM ARG_PYTHONPATH PREPEND "PYTHONPATH=path_list_prepend:") - list(PREPEND ARG_ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + list(PREPEND ARG_ENVIRONMENT "HL_TARGET=${Halide_TARGET};HL_JIT_TARGET=${Halide_TARGET}") cmake_path(GET ARG_FILE STEM test_name) set(test_name "${ARG_LABEL}_${test_name}") From a4158c0bf062440e91cbd0b2d5690bc7d82ea568 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 3 Apr 2024 12:28:25 -0700 Subject: [PATCH 094/186] fix ub in lower rounding shift right (#8173) * Avoid out-of-range shifts in lower_rounding_shift_left/right Consider `lower_rounding_shift_right(a, (uint8)0)` The term b - 1 becomes 255, and now you have an out-of-range shift, which causes the simplifier to inject a signed_integer_overflow intrinsic, and compilation to fail. This is a little annoying because if b == 0, b_positive is a zero mask, so the result isn't used anyway (this is also why this change is legal). In llvm, it's a poison value, not UB, so masking it off works. If the simplifier were smarter, it might just drop the signed_integer_overflow intrinsic on detecting that it was being bitwise-and-ed with zero. But the safest thing to do is not overflow. saturating_add/sub are typically as cheap as add/sub. 99.9% of the time b is some positive constant anyway, so it's going to get constant-folded. * Add test --- src/FindIntrinsics.cpp | 14 ++++++++------ test/correctness/intrinsics.cpp | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index d453d0134c29..d7b053981ac8 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -1274,10 +1274,11 @@ Expr lower_widening_shift_right(const Expr &a, const Expr &b) { } Expr lower_rounding_shift_left(const Expr &a, const Expr &b) { - // Shift left, then add one to the result if bits were dropped - // (because b < 0) and the most significant dropped bit was a one. + // Shift left, then add one to the result if bits were dropped (because b < 0) + // and the most significant dropped bit was a one. We must take care not + // to introduce UB in the shifts, even if the result would be masked off. Expr b_negative = select(b < 0, make_one(a.type()), make_zero(a.type())); - return simplify((a << b) + (b_negative & (a << (b + 1)))); + return simplify((a << b) + (b_negative & (a << saturating_add(b, make_one(b.type()))))); } Expr lower_rounding_shift_right(const Expr &a, const Expr &b) { @@ -1289,10 +1290,11 @@ Expr lower_rounding_shift_right(const Expr &a, const Expr &b) { Expr round = simplify(cast(a.type(), (1 << shift) - 1)); return rounding_halving_add(a, round) >> shift; } - // Shift right, then add one to the result if bits were dropped - // (because b > 0) and the most significant dropped bit was a one. + // Shift right, then add one to the result if bits were dropped (because b > 0) + // and the most significant dropped bit was a one. We must take care not to + // introduce UB in the shifts, even if the result would be masked off. 
Expr b_positive = select(b > 0, make_one(a.type()), make_zero(a.type())); - return simplify((a >> b) + (b_positive & (a >> (b - 1)))); + return simplify((a >> b) + (b_positive & (a >> saturating_sub(b, make_one(b.type()))))); } Expr lower_saturating_add(const Expr &a, const Expr &b) { diff --git a/test/correctness/intrinsics.cpp b/test/correctness/intrinsics.cpp index 339a5c2525e5..e5119bd5e1be 100644 --- a/test/correctness/intrinsics.cpp +++ b/test/correctness/intrinsics.cpp @@ -361,6 +361,22 @@ int main(int argc, char **argv) { g.compile_jit(); } + // Rounding shifts by extreme values, when lowered, used to have the + // potential to overflow and turn into out-of-range shifts. The simplifier + // detected this and injected a signed_integer_overflow intrinsic, which + // then threw an error in codegen, even though the rounding shift calls are + // well-defined. + { + Func f, g; + + f(x) = cast(x); + f.compute_root(); + + g(x) = rounding_shift_right(x, 0) + rounding_shift_left(x, 8); + + g.compile_jit(); + } + printf("Success!\n"); return 0; } From 3b8a532538ab8f4fa81b0d74ac7ab5449826e099 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Apr 2024 10:19:13 -0700 Subject: [PATCH 095/186] Add some missing _Float16 support (#8174) (Changes extracted from https://github.com/halide/Halide/pull/8169, which may or may not land in its current form) Some missing support for _Float16 that will likely be handy: - Allow _Float16 to be detected for Clang 15 (since my local XCode Clang 15 definitely supports it) - Expr(_Float16) - HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(_Float16); - Add _Float16 to the convert matrix in halide_image_io.h --- src/Expr.h | 5 ++ src/Type.h | 3 + src/runtime/HalideRuntime.h | 2 +- tools/halide_image_io.h | 118 ++++++++++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/Expr.h b/src/Expr.h index 31850fc56001..b9832c104de8 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -298,6 +298,11 @@ struct Expr : public Internal::IRHandle { Expr(bfloat16_t x) : IRHandle(Internal::FloatImm::make(BFloat(16), (double)x)) { } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + explicit Expr(_Float16 x) + : IRHandle(Internal::FloatImm::make(Float(16), (double)x)) { + } +#endif Expr(float x) : IRHandle(Internal::FloatImm::make(Float(32), x)) { } diff --git a/src/Type.h b/src/Type.h index af5447350810..c8a397b3f0a7 100644 --- a/src/Type.h +++ b/src/Type.h @@ -166,6 +166,9 @@ HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::float16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::bfloat16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_task_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_loop_task_t); +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(_Float16); +#endif HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(float); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(double); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_buffer_t); diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 1d0843be0329..0379c1f9ab47 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -91,7 +91,7 @@ extern "C" { // Ideally there would be a better way to detect if the type // is supported, even in a compiler independent fashion, but // coming up with one has proven elusive. 
-#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) && !defined(__i386__) +#if defined(__clang__) && (__clang_major__ >= 15) && !defined(__EMSCRIPTEN__) && !defined(__i386__) #if defined(__is_identifier) #if !__is_identifier(_Float16) #define HALIDE_CPP_COMPILER_HAS_FLOAT16 diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index e039f7c2e798..1e0cbff01897 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -116,6 +116,12 @@ template<> inline bool convert(const int64_t &in) { return in != 0; } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline bool convert(const _Float16 &in) { + return (float)in != 0; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline bool convert(const float &in) { return in != 0; @@ -165,6 +171,12 @@ template<> inline uint8_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint8_t convert(const _Float16 &in) { + return (uint8_t)std::lround((float)in * 255.0f); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint8_t convert(const float &in) { return (uint8_t)std::lround(in * 255.0f); @@ -211,6 +223,12 @@ template<> inline uint16_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint16_t convert(const _Float16 &in) { + return (uint16_t)std::lround((float)in * 65535.0f); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint16_t convert(const float &in) { return (uint16_t)std::lround(in * 65535.0f); @@ -257,6 +275,12 @@ template<> inline uint32_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint32_t convert(const _Float16 &in) { + return (uint32_t)std::llround((float)in * 4294967295.0); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint32_t convert(const float &in) { return (uint32_t)std::llround(in * 4294967295.0); @@ -303,6 +327,12 @@ template<> inline uint64_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint64_t convert(const _Float16 &in) { + return convert((uint32_t)std::llround((float)in * 4294967295.0)); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint64_t convert(const float &in) { return convert((uint32_t)std::llround(in * 4294967295.0)); @@ -349,6 +379,12 @@ template<> inline int8_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int8_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int8_t convert(const float &in) { return convert(in); @@ -395,6 +431,12 @@ template<> inline int16_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int16_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int16_t convert(const float &in) { return convert(in); @@ -441,6 +483,12 @@ template<> inline int32_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int32_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int32_t convert(const float &in) { return convert(in); @@ -487,6 +535,12 @@ template<> inline int64_t convert(const int64_t &in) { return convert(in); } +#ifdef 
HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int64_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int64_t convert(const float &in) { return convert(in); @@ -496,6 +550,58 @@ inline int64_t convert(const double &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +// Convert to f16 +template<> +inline _Float16 convert(const bool &in) { + return in; +} +template<> +inline _Float16 convert(const uint8_t &in) { + return (_Float16)(in / 255.0f); +} +template<> +inline _Float16 convert(const uint16_t &in) { + return (_Float16)(in / 65535.0f); +} +template<> +inline _Float16 convert(const uint32_t &in) { + return (_Float16)(in / 4294967295.0); +} +template<> +inline _Float16 convert(const uint64_t &in) { + return convert<_Float16, uint32_t>(uint32_t(in >> 32)); +} +template<> +inline _Float16 convert(const int8_t &in) { + return convert<_Float16, uint8_t>(in); +} +template<> +inline _Float16 convert(const int16_t &in) { + return convert<_Float16, uint16_t>(in); +} +template<> +inline _Float16 convert(const int32_t &in) { + return convert<_Float16, uint64_t>(in); +} +template<> +inline _Float16 convert(const int64_t &in) { + return convert<_Float16, uint64_t>(in); +} +template<> +inline _Float16 convert(const _Float16 &in) { + return in; +} +template<> +inline _Float16 convert(const float &in) { + return (_Float16)in; +} +template<> +inline _Float16 convert(const double &in) { + return (_Float16)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 + // Convert to f32 template<> inline float convert(const bool &in) { @@ -533,6 +639,12 @@ template<> inline float convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline float convert(const _Float16 &in) { + return (float)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline float convert(const float &in) { return in; @@ -579,6 +691,12 @@ template<> inline double convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline double convert(const _Float16 &in) { + return (double)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline double convert(const float &in) { return (double)in; From 7d9935740ca1c8790b494c670a79f163f4a4c168 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 5 Apr 2024 09:07:05 -0700 Subject: [PATCH 096/186] Add conversion code for Float16 that was missed in #8174 (#8178) * Add conversion code for Float16 that was missed in #8174 * Don't sniff for _Float16 when building ASAN * Update HalideRuntime.h --- src/runtime/HalideRuntime.h | 16 ++++++++++++++++ tools/halide_image_io.h | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 0379c1f9ab47..1d66ab02b368 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -86,6 +86,20 @@ extern "C" { #ifndef COMPILING_HALIDE_RUNTIME +// ASAN builds can cause linker errors for Float16, so sniff for that and +// don't enable it by default. +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define HALIDE_RUNTIME_ASAN_DETECTED +#endif +#endif + +#if defined(__SANITIZE_ADDRESS__) && !defined(HALIDE_RUNTIME_ASAN_DETECTED) +#define HALIDE_RUNTIME_ASAN_DETECTED +#endif + +#if !defined(HALIDE_RUNTIME_ASAN_DETECTED) + // clang had _Float16 added as a reserved name in clang 8, but // doesn't actually support it on most platforms until clang 15. 
// Ideally there would be a better way to detect if the type @@ -108,6 +122,8 @@ extern "C" { #endif #endif +#endif // !HALIDE_RUNTIME_ASAN_DETECTED + #endif // !COMPILING_HALIDE_RUNTIME /** \file diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index 1e0cbff01897..ff23c30aa995 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -2227,6 +2227,10 @@ struct ImageTypeConversion { const halide_type_t src_type = src.type(); switch (src_type.element_of().as_u32()) { +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + case halide_type_t(halide_type_float, 16).as_u32(): + return convert_image(src.template as<_Float16, AnyDims>()); +#endif case halide_type_t(halide_type_float, 32).as_u32(): return convert_image(src.template as()); case halide_type_t(halide_type_float, 64).as_u32(): @@ -2272,6 +2276,10 @@ struct ImageTypeConversion { // Call the appropriate static-to-static conversion routine // based on the desired dst type. switch (dst_type.element_of().as_u32()) { +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + case halide_type_t(halide_type_float, 16).as_u32(): + return convert_image<_Float16>(src); +#endif case halide_type_t(halide_type_float, 32).as_u32(): return convert_image(src); case halide_type_t(halide_type_float, 64).as_u32(): From a46204408f0762479473f0c478327c0a5b7553f1 Mon Sep 17 00:00:00 2001 From: Alexander Root <32245479+rootjalex@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:38:46 -0700 Subject: [PATCH 097/186] Tighten bounds of abs() (#8168) * Tighten bounds of abs() * make abs bounds tight for non-int32 too * make int32 min expression match non-int32 min expression --- dependencies/llvm/CMakeLists.txt | 2 +- src/Bounds.cpp | 30 ++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index a4aef94b08de..d070caf53b19 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -21,7 +21,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") message(STATUS "Using ClangConfig.cmake in: ${Clang_DIR}") if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) - message(FATAL_ERROR "LLVM version must be 15.0 or newer") + message(FATAL_ERROR "LLVM version must be 16.0 or newer") endif () if (LLVM_PACKAGE_VERSION VERSION_GREATER 19.0) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 16fd69f3e8fb..d7d337dacfdf 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1237,18 +1237,29 @@ class Bounds : public IRVisitor { if (op->is_intrinsic(Call::abs)) { Interval a = arg_bounds.get(0); - interval.min = make_zero(t); + if (a.is_bounded()) { if (equal(a.min, a.max)) { interval = Interval::single_point(Call::make(t, Call::abs, {a.max}, Call::PureIntrinsic)); } else if (op->args[0].type().is_int() && op->args[0].type().bits() >= 32) { - interval.max = Max::make(Cast::make(t, -a.min), Cast::make(t, a.max)); + interval.min = Cast::make(t, Max::make(a.min, -Min::make(make_zero(a.min.type()), a.max))); + interval.max = Cast::make(t, Max::make(-a.min, a.max)); } else { + interval.min = Cast::make(t, Max::make(a.min, -Min::make(make_zero(a.min.type()), a.max))); a.min = Call::make(t, Call::abs, {a.min}, Call::PureIntrinsic); a.max = Call::make(t, Call::abs, {a.max}, Call::PureIntrinsic); interval.max = Max::make(a.min, a.max); } } else { + if (a.has_lower_bound()) { + // If a is strictly positive, then abs(a) is strictly positive. 
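Spelled out (editorial aside, not in the patch), the tightened interval for abs(a) when the argument has known bounds [lo, hi] is, before the cast to the result type:

```
//   abs(a).min = max(lo, -min(0, hi))
//   abs(a).max = max(-lo, hi)
// e.g. a in [2, 7]  gives abs(a) in [2, 7]   (previously [0, 7])
//      a in [-3, 7] gives abs(a) in [0, 7]
```

This matches the checks added to the bounds test further down.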
+ interval.min = Cast::make(t, Max::make(make_zero(a.min.type()), a.min)); + } else if (a.has_upper_bound()) { + // If a is strictly negative, then abs(a) is strictly positive. + interval.min = Cast::make(t, -Min::make(make_zero(a.max.type()), a.max)); + } else { + interval.min = make_zero(t); + } // If the argument is unbounded on one side, then the max is unbounded. interval.max = Interval::pos_inf(); } @@ -3651,6 +3662,21 @@ void bounds_test() { check(scope, cast(x), 0.0f, 10.0f); check(scope, cast(abs(cast(x))), 0, 10); + check(scope, abs(2 + x), u32(2), u32(12)); + check(scope, abs(x - 11), u32(1), u32(11)); + check(scope, abs(x - 5), u32(0), u32(5)); + check(scope, abs(2 + cast(x)), 2.f, 12.f); + check(scope, abs(cast(x) - 11), 1.f, 11.f); + check(scope, abs(cast(x) - 5), 0.f, 5.f); + check(scope, abs(2 + cast(x)), u8(2), u8(12)); + check(scope, abs(cast(x) - 11), u8(1), u8(11)); + check(scope, abs(cast(x) - 5), u8(0), u8(5)); + scope.push("x", Interval(123, Interval::pos_inf())); + check(scope, abs(x), u32(123), Interval::pos_inf()); + scope.pop("x"); + scope.push("x", Interval(Interval::neg_inf(), -123)); + check(scope, abs(x), u32(123), Interval::pos_inf()); + scope.pop("x"); // Check some vectors check(scope, Ramp::make(x * 2, 5, 5), 0, 40); From 14ae0826dc93f0dcc40465f0bcd6b742fda3f656 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 5 Apr 2024 09:39:07 -0700 Subject: [PATCH 098/186] Clarify the meaning of Shuffle::is_broadcast() (#8158) * Fix horrifying bug in lossless_cast of a subtract * A 'broadcast' shuffle is more complex than it seems I was poking at the Shuffle node, and checking its usage, and it seems that despite the comment, Shuffles that return true for is_broadcast are not the same as a Broadcast node. Instead of repeating the input vector some number of times, it repeats a shuffle of the input vector. This means IRPrinter was incorrect. None of the other usages were bad. This PR makes this clearer in the comment, and fixes IRPrinter. * Revert accidental change --- src/IR.h | 9 ++++----- src/IRPrinter.cpp | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/IR.h b/src/IR.h index 31aa3f195e43..d3f6af596f31 100644 --- a/src/IR.h +++ b/src/IR.h @@ -879,11 +879,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; - /** Check if this shuffle can be represented as a broadcast. - * For example: - * A uint8 shuffle of with 4*n lanes and indices: - * 0, 1, 2, 3, 0, 1, 2, 3, ....., 0, 1, 2, 3 - * can be represented as a uint32 broadcast with n lanes (factor = 4). */ + /** Check if this shuffle can be represented as a repeating pattern that + * repeats the same shuffle of the single input vector some number of times. 
+ * For example: 0, 3, 1, 1, 0, 3, 1, 1, ....., 0, 3, 1, 1 + */ bool is_broadcast() const; int broadcast_factor() const; diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index a186be1874d7..fb40de78f14a 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1084,10 +1084,6 @@ void IRPrinter::visit(const Shuffle *op) { << ", " << op->slice_stride() << ", " << op->indices.size() << ")"; - } else if (op->is_broadcast()) { - stream << "broadcast("; - print_list(op->vectors); - stream << ", " << op->broadcast_factor() << ")"; } else { stream << "shuffle("; print_list(op->vectors); From 35f0c29a1930b118edab98b6d22ccad12fe6b3c6 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sat, 6 Apr 2024 08:17:25 -0700 Subject: [PATCH 099/186] Add .npy support to halide_image_io (#8175) * Add .npy support to halide_image_io The .npy format is NumPy's native format for storing multidimensional arrays (aka tensors/buffers). Being able to load/save in this format makes it (potentially) a lot easier to interchange data with the Python ecosystem, as well as providing a file format that support floating-point data more robustly than any of the others that we current support. This adds load/save support for a useful subset: - We support the int/uint/float types common in Halide (except for f16/bf16 for now) - We don't support reading or writing files that are in `fortran_order` - We don't support any object/struct/etc files, only numeric primitives - We only support loading files that are in the host's endianness (typically little-endian) Note that at present this doesn't support f16 / bf16 formats, but that could likely be added with minimal difficulty. The tricky bit of this is that the reading code has to parse a (limited) Python dict in text form. Please review that part carefully. TODO: we could probably add this as an option for `debug_to_file()` without too much pain in a followup PR. * clang-tidy * clang-tidy * Address review comments * Allow for "keys" as well as 'keys' * Add float16 support * Use old-school parser * clang-tidy --- test/correctness/image_io.cpp | 62 ++++-- tools/halide_image_io.h | 359 +++++++++++++++++++++++++++++----- 2 files changed, 357 insertions(+), 64 deletions(-) diff --git a/test/correctness/image_io.cpp b/test/correctness/image_io.cpp index 132dac492f82..4921aa6f8a02 100644 --- a/test/correctness/image_io.cpp +++ b/test/correctness/image_io.cpp @@ -25,7 +25,10 @@ void test_round_trip(Buffer buf, std::string format) { reloaded.translate(d, buf.dim(d).min() - reloaded.dim(d).min()); } - Tools::save_image(reloaded, Internal::get_test_tmp_dir() + "test_reloaded." + format); + o = std::ostringstream(); + o << Internal::get_test_tmp_dir() << "test_" << halide_type_of() << "x" << buf.channels() << ".reloaded." << format; + filename = o.str(); + Tools::save_image(reloaded, filename); // Check they're not too different. 
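A round-trip sketch from user code (editorial note, not part of the patch; the file name is made up), using the existing halide_image_io entry points, which now dispatch on the .npy extension:

```
#include "HalideBuffer.h"
#include "halide_image_io.h"

int main() {
    Halide::Runtime::Buffer<float> buf(640, 480, 3);
    buf.fill(0.5f);
    // Writes a float32 array (descr "<f4" on a little-endian host).
    Halide::Tools::save_image(buf, "example.npy");
    // Loads it back; the static element type must match what was stored.
    Halide::Runtime::Buffer<float> reloaded =
        Halide::Tools::load_image("example.npy");
    return 0;
}
```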
RDom r(reloaded); @@ -33,15 +36,15 @@ void test_round_trip(Buffer buf, std::string format) { for (int i = 0; i < r.dimensions(); ++i) { args.push_back(r[i]); } - uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(reloaded(args))))); + double diff = evaluate(maximum(abs(cast(buf(args)) - cast(reloaded(args))))); - uint32_t max_diff = 0; + double max_diff = 0.00001; if (format == "jpg") { max_diff = 32; } if (diff > max_diff) { - printf("test_round_trip: Difference of %d when saved and loaded as %s\n", diff, format.c_str()); - abort(); + printf("test_round_trip: Difference of %f when saved and loaded as %s\n", diff, format.c_str()); + exit(1); } } @@ -62,7 +65,7 @@ void test_convert_image_s2s(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_s2s: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -85,7 +88,7 @@ void test_convert_image_d2s(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_d2s: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -110,7 +113,7 @@ void test_convert_image_s2d(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_s2d: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -135,7 +138,7 @@ void test_convert_image_d2d(Buffer<> buf_d) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_d2d: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -166,8 +169,8 @@ void do_test() { // Make some colored noise Func f; Var x, y, c, w; - const float one = std::numeric_limits::max(); - f(x, y, c) = cast(clamp(make_noise(10)(x, y, c), 0.0f, 1.0f) * one); + const Expr one = std::is_floating_point::value ? 
Expr(1.0) : Expr(std::numeric_limits::max()); + f(x, y, c) = cast(clamp(make_noise(10)(x, y, c), Expr(0.0), Expr(1.0)) * one); Buffer color_buf = f.realize({width, height, 3}); @@ -176,16 +179,19 @@ void do_test() { color_buf.crop(0, inset, width - inset * 2); color_buf.crop(1, inset, height - inset * 2); - test_convert_image_s2s(color_buf); - test_convert_image_s2d(color_buf); - test_convert_image_d2s(color_buf); - test_convert_image_d2d(color_buf); + const auto ht = halide_type_of(); + if (ht == halide_type_t(halide_type_uint, 8) || ht == halide_type_t(halide_type_uint, 16)) { + test_convert_image_s2s(color_buf); + test_convert_image_s2d(color_buf); + test_convert_image_d2s(color_buf); + test_convert_image_d2d(color_buf); + } Buffer luma_buf(width, height, 1); luma_buf.copy_from(color_buf); luma_buf.slice(2); - std::vector formats = {"ppm", "pgm", "tmp", "mat", "tiff"}; + std::vector formats = {"npy", "ppm", "pgm", "tmp", "mat", "tiff"}; #ifndef HALIDE_NO_JPEG formats.push_back("jpg"); #endif @@ -193,7 +199,14 @@ void do_test() { formats.push_back("png"); #endif for (std::string format : formats) { - if (format == "jpg" && halide_type_of() != halide_type_t(halide_type_uint, 8)) { + // .npy is the only format here that supports float16 + if (halide_type_of() == halide_type_t(halide_type_float, 16) && format != "npy") { + continue; + } + if ((format == "jpg" || format == "pgm" || format == "ppm") && ht != halide_type_t(halide_type_uint, 8)) { + continue; + } + if (format == "png" && ht != halide_type_t(halide_type_uint, 8) && ht != halide_type_t(halide_type_uint, 16)) { continue; } if (format == "tmp") { @@ -238,7 +251,7 @@ void test_mat_header() { std::ifstream fs(filename.c_str(), std::ifstream::binary); if (!fs) { std::cout << "Cannot read " << filename << "\n"; - abort(); + exit(1); } fs.seekg(0, fs.end); // .mat file begins with a 128 bytes header and a 8 bytes @@ -251,13 +264,24 @@ void test_mat_header() { fs.close(); if (file_size != stored_file_size) { std::cout << "Wrong file size written for " << filename << ". Expected " << file_size << ", got" << stored_file_size << "\n"; - abort(); + exit(1); } } int main(int argc, char **argv) { + do_test(); + do_test(); + do_test(); + do_test(); do_test(); do_test(); + do_test(); + do_test(); + do_test(); +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + do_test<_Float16>(); +#endif + do_test(); test_mat_header(); printf("Success!\n"); return 0; diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index ff23c30aa995..1a0d250b746f 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -1166,6 +1166,317 @@ bool save_ppm(ImageType &im, const std::string &filename) { return Internal::save_pnm(im, 3, filename); } +// -------------- .npy file format +// Based on documentation at https://numpy.org/devdocs/reference/generated/numpy.lib.format.html +// and elsewhere + +#if (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN) || defined(HALIDE_FORCE_BIG_ENDIAN) +constexpr bool host_is_big_endian = true; +#else +constexpr bool host_is_big_endian = false; +#endif + +constexpr char little_endian_char = '<'; +constexpr char big_endian_char = '>'; +constexpr char no_endian_char = '|'; +constexpr char host_endian_char = (host_is_big_endian ? 
big_endian_char : little_endian_char); + +struct npy_dtype_info_t { + char byte_order; + char type_code; + char type_bytes; + + std::string descr() const { + return std::string(1, byte_order) + std::string(1, type_code) + std::to_string((int)type_bytes); + } +}; + +inline static const std::array, 11> npy_dtypes = {{ + {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(float)}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(double)}}, + {halide_type_of(), {no_endian_char, 'i', sizeof(int8_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int16_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int32_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int64_t)}}, + {halide_type_of(), {no_endian_char, 'u', sizeof(uint8_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint16_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint32_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint64_t)}}, +}}; + +inline static const std::array npy_magic_string = {'\x93', 'N', 'U', 'M', 'P', 'Y'}; +inline static const std::array npy_v1_bytes = {'\x01', '\x00'}; + +inline std::string trim_whitespace(const std::string &s) { + const size_t first = s.find_first_not_of(" \t\n"); + if (first == std::string::npos) { + return ""; + } + const size_t last = s.find_last_not_of(" \t\n"); + return s.substr(first, (last - first + 1)); +} + +struct NpyHeader { + char type_code; + int type_bytes; + std::vector extents; + + bool parse(const std::string &header) { + const char *ptr = &header[0]; + if (*ptr++ != '{') { + return false; + } + while (true) { + char endian; + int consumed; + if (std::sscanf(ptr, "'descr': '%c%c%d'%n", &endian, &type_code, &type_bytes, &consumed) == 3) { + if (endian != '<' && endian != '|') { + return false; + } + ptr += consumed; + } else if (std::strncmp(ptr, "'fortran_order': False", 22) == 0) { + ptr += 22; + } else if (std::strncmp(ptr, "'shape': (", 10) == 0) { + ptr += 10; + int n; + while (std::sscanf(ptr, "%d%n", &n, &consumed) == 1) { + extents.push_back(n); + ptr += consumed; + if (*ptr == ',') { + ptr++; + } + if (*ptr == ' ') { + ptr++; + } + } + if (*ptr++ != ')') { + return false; + } + } else if (*ptr == '}') { + return true; + } else { + return false; + } + if (*ptr == ',') { + ptr++; + } + if (*ptr == ' ') { + ptr++; + } + assert(ptr <= &header.back()); + } + } +}; + +// return true iff the buffer storage has no padding between +// any elements, and is in strictly planar order. +template +bool buffer_is_compact_planar(ImageType &im) { + const halide_type_t im_type = im.type(); + const size_t elem_size = (im_type.bits / 8); + if (((const uint8_t *)im.begin() + (im.number_of_elements() * elem_size)) != (const uint8_t *)im.end()) { + return false; + } + for (int d = 1; d < im.dimensions(); ++d) { + if (im.dim(d - 1).stride() > im.dim(d).stride()) { + return false; + } + // Strides can only match if the previous dimension has extent 1 + // (this can happen when artificially adding dimension(s), e.g. 
+ // to write a .tmp file) + if (im.dim(d - 1).stride() == im.dim(d).stride() && im.dim(d - 1).extent() != 1) { + return false; + } + } + return true; +} + +template +bool load_npy(const std::string &filename, ImageType *im) { + static_assert(!ImageType::has_static_halide_type, ""); + + FileOpener f(filename, "rb"); + if (!check(f.f != nullptr, "File could not be opened for reading")) { + return false; + } + + char magic_and_version[8]; + if (!check(f.read_bytes(magic_and_version, 8), "Could not read .npy header")) { + return false; + } + if (memcmp(magic_and_version, npy_magic_string.data(), npy_magic_string.size()) != 0) { + return check(false, "Bad .npy magic string"); + } + if ((magic_and_version[6] != 1 && magic_and_version[6] != 2 && magic_and_version[6] != 3) || magic_and_version[7] != 0) { + return check(false, "Bad .npy version"); + } + size_t header_len; + uint8_t header_len_le[4]; + if (magic_and_version[6] == 1) { + if (!check(f.read_bytes(header_len_le, 2), "Could not read .npy header")) { + return false; + } + header_len = (header_len_le[0] << 0) | (header_len_le[1] << 8); + if (!check((6 + 2 + 2 + header_len) % 64 == 0, ".npy header is not aligned properly")) { + return false; + } + } else { + if (!check(f.read_bytes(header_len_le, 4), "Could not read .npy header")) { + return false; + } + header_len = (header_len_le[0] << 0) | (header_len_le[1] << 8) | (header_len_le[2] << 16) | (header_len_le[3] << 24); + if (!check((6 + 2 + 4 + header_len) % 64 == 0, ".npy header is not aligned properly")) { + return false; + } + } + + std::string header(header_len + 1, ' '); + if (!check(f.read_bytes(header.data(), header_len), "Could not read .npy header string")) { + return false; + } + + NpyHeader h; + if (!check(h.parse(header), "Could not parse .npy header dict")) { + return false; + } + + halide_type_t im_type((halide_type_code_t)0, 0, 0); + for (const auto &d : npy_dtypes) { + if (h.type_code == d.second.type_code && h.type_bytes == d.second.type_bytes) { + im_type = d.first; + break; + } + } + if (!check(im_type.bits != 0, "Unsupported type in load_npy")) { + return false; + } + + *im = ImageType(im_type, h.extents); + + // This should never fail unless the default Buffer<> constructor behavior changes. + if (!check(buffer_is_compact_planar(*im), "load_npy() requires compact planar images")) { + return false; + } + + if (!check(f.read_bytes(im->begin(), im->size_in_bytes()), "Count not read .npy payload")) { + return false; + } + + im->set_host_dirty(); + return true; +} + +template +bool write_planar_payload(ImageType &im, FileOpener &f) { + if (im.dimensions() == 0 || buffer_is_compact_planar(im)) { + // Contiguous buffer! Write it all in one swell foop. + if (!check(f.write_bytes(im.begin(), im.size_in_bytes()), "Count not write planar payload")) { + return false; + } + } else { + // We have to do this the hard way. 
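For orientation (editorial note, not from the patch), the version-1 header emitted by the save path below looks roughly like this for a 640x480x3 float32 buffer; the concrete extents are hypothetical:

```
// offset 0:  "\x93NUMPY"   6-byte magic
// offset 6:  0x01 0x00     format version 1.0
// offset 8:  header_len    little-endian uint16
// offset 10: {'descr': '<f4', 'fortran_order': False, 'shape': (640,480,3)}
//            padded with spaces so that 10 + header_len is a multiple of 64,
//            then followed immediately by the raw element data.
```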
+ int d = im.dimensions() - 1; + for (int i = im.dim(d).min(); i <= im.dim(d).max(); i++) { + auto slice = im.sliced(d, i); + if (!write_planar_payload(slice, f)) { + return false; + } + } + } + return true; +} + +template +bool save_npy(ImageType &im, const std::string &filename) { + static_assert(!ImageType::has_static_halide_type, ""); + + if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) { + return false; + } + + const halide_type_t im_type = im.type(); + npy_dtype_info_t di = {0, 0, 0}; + for (const auto &d : npy_dtypes) { + if (d.first == im_type) { + di = d.second; + break; + } + } + if (!check(di.byte_order != 0, "Unsupported type in save_npy")) { + return false; + } + + std::string shape = "("; + for (int d = 0; d < im.dimensions(); ++d) { + if (d > 0) { + shape += ","; + } + shape += std::to_string(im.dim(d).extent()); + if (im.dimensions() == 1) { + shape += ","; // special-case for single-element tuples + } + } + shape += ")"; + + std::string header_dict_str = "{'descr': '" + di.descr() + "', 'fortran_order': False, 'shape': " + shape + "}\n"; + + const size_t unpadded_length = npy_magic_string.size() + npy_v1_bytes.size() + 2 + header_dict_str.size(); + const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1); + const size_t padding = padded_length - unpadded_length; + header_dict_str += std::string(padding, ' '); + + if (!check(header_dict_str.size() <= 65535, "Header is too large for v1 .npy file")) { + return false; + } + const uint16_t header_len = (uint16_t)(header_dict_str.size()); + const uint8_t header_len_le[2] = { + (uint8_t)((header_len >> 0) & 0xff), + (uint8_t)((header_len >> 8) & 0xff)}; + + FileOpener f(filename, "wb"); + if (!check(f.write_bytes(npy_magic_string.data(), npy_magic_string.size()), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(npy_v1_bytes.data(), npy_v1_bytes.size()), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(header_len_le, 2), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(header_dict_str.data(), header_dict_str.size()), ".npy write failed")) { + return false; + } + + if (!write_planar_payload(im, f)) { + return false; + } + + return true; +} + +inline const std::set &query_npy() { + auto build_set = []() -> std::set { + // NumPy doesn't support bfloat16, not sure if they plan to, + // so we don't attempt to support it here + std::set s; + for (halide_type_code_t code : {halide_type_int, halide_type_uint, halide_type_float}) { + for (int bits : {8, 16, 32, 64}) { + if (code == halide_type_float && bits < 16) { + continue; + } + for (int dims : {1, 2, 3, 4}) { + s.insert({halide_type_t(code, bits), dims}); + } + } + } + return s; + }; + + static std::set info = build_set(); + return info; +} + #ifndef HALIDE_NO_JPEG template @@ -1293,29 +1604,6 @@ inline const halide_type_t *tmp_code_to_halide_type() { return tmp_code_to_halide_type_; } -// return true iff the buffer storage has no padding between -// any elements, and is in strictly planar order. 
-template -bool buffer_is_compact_planar(ImageType &im) { - const halide_type_t im_type = im.type(); - const size_t elem_size = (im_type.bits / 8); - if (((const uint8_t *)im.begin() + (im.number_of_elements() * elem_size)) != (const uint8_t *)im.end()) { - return false; - } - for (int d = 1; d < im.dimensions(); ++d) { - if (im.dim(d - 1).stride() > im.dim(d).stride()) { - return false; - } - // Strides can only match if the previous dimension has extent 1 - // (this can happen when artificially adding dimension(s), e.g. - // to write a .tmp file) - if (im.dim(d - 1).stride() == im.dim(d).stride() && im.dim(d - 1).extent() != 1) { - return false; - } - } - return true; -} - // ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack) template bool load_tmp(const std::string &filename, ImageType *im) { @@ -1371,26 +1659,6 @@ inline const std::set &query_tmp() { return info; } -template -bool write_planar_payload(ImageType &im, FileOpener &f) { - if (im.dimensions() == 0 || buffer_is_compact_planar(im)) { - // Contiguous buffer! Write it all in one swell foop. - if (!check(f.write_bytes(im.begin(), im.size_in_bytes()), "Count not write .tmp payload")) { - return false; - } - } else { - // We have to do this the hard way. - int d = im.dimensions() - 1; - for (int i = im.dim(d).min(); i <= im.dim(d).max(); i++) { - auto slice = im.sliced(d, i); - if (!write_planar_payload(slice, f)) { - return false; - } - } - } - return true; -} - // ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack) template bool save_tmp(ImageType &im, const std::string &filename) { @@ -2121,6 +2389,7 @@ bool find_imageio(const std::string &filename, ImageIO *result {"jpeg", {load_jpg, save_jpg, query_jpg}}, {"jpg", {load_jpg, save_jpg, query_jpg}}, #endif + {"npy", {load_npy, save_npy, query_npy}}, {"pgm", {load_pgm, save_pgm, query_pgm}}, #ifndef HALIDE_NO_PNG {"png", {load_png, save_png, query_png}}, @@ -2441,7 +2710,7 @@ class load_image { operator ImageType() { using DynamicImageType = typename Internal::ImageTypeWithElemType::type; DynamicImageType im_d; - (void)load(filename, &im_d); + Internal::CheckFail(load(filename, &im_d), "load() failed"); Internal::CheckFail(ImageType::can_convert_from(im_d), "Type mismatch assigning the result of load_image. " "Did you mean to use load_and_convert_image?"); @@ -2464,7 +2733,7 @@ class load_and_convert_image { inline operator ImageType() { using DynamicImageType = typename Internal::ImageTypeWithElemType::type; DynamicImageType im_d; - (void)load(filename, &im_d); + Internal::CheckFail(load(filename, &im_d), "load() failed"); const halide_type_t expected_type = ImageType::static_halide_type(); if (im_d.type() == expected_type) { return im_d.template as(); From e3d3c8cacfe6d664a8994166d0998f362bf55ce8 Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Mon, 8 Apr 2024 17:29:33 +0200 Subject: [PATCH 100/186] Fix unused variable. 
(#8180) --- src/FindCalls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FindCalls.cpp b/src/FindCalls.cpp index 9345c89dcac5..1fca6de1175c 100644 --- a/src/FindCalls.cpp +++ b/src/FindCalls.cpp @@ -55,7 +55,7 @@ void populate_environment_helper(const Function &f, auto insert_func = [](const Function &f, std::map *env, std::vector *order) { - auto [it, inserted] = env->emplace(f.name(), f); + bool inserted = env->emplace(f.name(), f).second; if (inserted) { order->push_back(f); } From 8f3f6cff6996afe993883d4fbb3bf99f2f700fb1 Mon Sep 17 00:00:00 2001 From: Fabian Schuetze Date: Thu, 11 Apr 2024 18:58:36 +0200 Subject: [PATCH 101/186] Update Hexagon Install Instructions (#8182) update Hexagon install instructions --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c5dfe5507a8b..839785441292 100644 --- a/README.md +++ b/README.md @@ -406,15 +406,12 @@ branch.) ### 2. Download and install the Hexagon SDK and Hexagon Tools -Go to https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools - -1. Select the Hexagon Series 600 Software and download & run QPM and install - the Hexagon SDK 4.3.0 version or later for Linux. -2. untar the installer -3. Run the extracted installer to install the Hexagon SDK and Hexagon Tools, - selecting Installation of Hexagon SDK into `/location/of/SDK/Hexagon_SDK/4.x` - and the Hexagon tools into `/location/of/SDK/Hexagon_Tools/8.x` -4. Set an environment variable to point to the SDK installation location +Go to https://qpm.qualcomm.com/#/main/home + +1. Go to Tools, and download Qualcomm Package Manager 3. Install the package manager on your machine. +2. Run the installed Qualcomm Package Manager and install the Qualcomm Hexagon SDK 5.x (or 4.x). + The SDK can be selected from the Qualcomm Hexagon SDK Products. +3. Set an environment variable to point to the SDK installation location ``` export SDK_LOC=/location/of/SDK ``` From dc837074c4ca73583c3541ea54438d7fda84fdf9 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 11 Apr 2024 11:04:42 -0700 Subject: [PATCH 102/186] Add .npy support to debug_to_file() (#8177) * Add .npy support to halide_image_io The .npy format is NumPy's native format for storing multidimensional arrays (aka tensors/buffers). Being able to load/save in this format makes it (potentially) a lot easier to interchange data with the Python ecosystem, as well as providing a file format that support floating-point data more robustly than any of the others that we current support. This adds load/save support for a useful subset: - We support the int/uint/float types common in Halide (except for f16/bf16 for now) - We don't support reading or writing files that are in `fortran_order` - We don't support any object/struct/etc files, only numeric primitives - We only support loading files that are in the host's endianness (typically little-endian) Note that at present this doesn't support f16 / bf16 formats, but that could likely be added with minimal difficulty. The tricky bit of this is that the reading code has to parse a (limited) Python dict in text form. Please review that part carefully. TODO: we could probably add this as an option for `debug_to_file()` without too much pain in a followup PR. * clang-tidy * clang-tidy * Address review comments * Allow for "keys" as well as 'keys' * Add .npy support to debug_to_file() Built on top of https://github.com/halide/Halide/pull/8175, this adds .npy as an option. 
This is actually pretty great because it's easy to do something like ``` ss = numpy.load("my_file.npy") print(ss) ``` in Python and get nicely-formatted output, which can sometimes be a lot easier for debugging that inserting lots of print() statements (see https://github.com/halide/Halide/issues/8176) Did a drive-by change to the correctness test to use this format instead of .mat. * Add float16 support * Add support for Float16 images in npy * Assume little-endian * Remove redundant halide_error() * naming convention * naming convention * Test both mat and npy * Don't call halide_error() * Use old-school parser * clang-tidy --- src/DebugToFile.cpp | 4 + src/runtime/write_debug_image.cpp | 140 ++++++++++++++++++++++++--- test/correctness/debug_to_file.cpp | 147 +++++++++++++++-------------- 3 files changed, 207 insertions(+), 84 deletions(-) diff --git a/src/DebugToFile.cpp b/src/DebugToFile.cpp index 8147e4cfe7f1..8510b806a132 100644 --- a/src/DebugToFile.cpp +++ b/src/DebugToFile.cpp @@ -42,6 +42,8 @@ class DebugToFile : public IRMutator { num_elements *= bound.extent; } + // TODO: why do we bother with this? halide_debug_to_file() + // can infer the type-and-size it needs from the buffer's type field. int type_code = 0; Type t = op->types[0]; if (t == Float(32)) { @@ -64,6 +66,8 @@ class DebugToFile : public IRMutator { type_code = 8; } else if (t == Int(64)) { type_code = 9; + } else if (t == Float(16)) { + type_code = 10; } else { user_error << "Type " << t << " not supported for debug_to_file\n"; } diff --git a/src/runtime/write_debug_image.cpp b/src/runtime/write_debug_image.cpp index f51017c1fbb4..a5f8816db2c7 100644 --- a/src/runtime/write_debug_image.cpp +++ b/src/runtime/write_debug_image.cpp @@ -1,13 +1,16 @@ #include "HalideRuntime.h" -// We support three formats, tiff, mat, and tmp. +// We support four formats, npy, tiff, mat, and tmp. // // All formats support arbitrary types, and are easy to write in a // small amount of code. // +// npy: +// - Arbitrary dimensionality, type +// - Readable by NumPy and other Python tools // TIFF: // - 2/3-D only -// - Readable by the most tools +// - Readable by a lot of tools // mat: // - Arbitrary dimensionality, type // - Readable by matlab, ImageStack, and many other tools @@ -26,20 +29,22 @@ namespace Internal { // Mappings from the type_code passed in to the type codes of the // formats. See "type_code" in DebugToFile.cpp +constexpr int kNumTypeCodes = 11; + // TIFF sample type values are: // 1 => Unsigned int // 2 => Signed int // 3 => Floating-point -WEAK int16_t pixel_type_to_tiff_sample_type[] = { +WEAK int16_t pixel_type_to_tiff_sample_type[kNumTypeCodes] = { // float, double, uint8, int8, ... uint64, int64 - 3, 3, 1, 2, 1, 2, 1, 2, 1, 2}; + 3, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0}; // See the .mat level 5 documentation for matlab class codes. -WEAK uint8_t pixel_type_to_matlab_class_code[] = { - 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; +WEAK uint8_t pixel_type_to_matlab_class_code[kNumTypeCodes] = { + 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0}; -WEAK uint8_t pixel_type_to_matlab_type_code[] = { - 7, 9, 2, 1, 4, 3, 6, 5, 13, 12}; +WEAK uint8_t pixel_type_to_matlab_type_code[kNumTypeCodes] = { + 7, 9, 2, 1, 4, 3, 6, 5, 13, 12, 0}; #pragma pack(push) #pragma pack(2) @@ -125,6 +130,39 @@ struct ScopedFile { } }; +// Halide runtime has lots of assumptions that we are always little-endian, +// so we'll hardcode this here; leaving in the logic to make it clear. 
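A sketch of the user-facing effect (editorial note, not part of the patch; the output path is hypothetical): the scheduling directive can now point at a .npy path, and the dump is then trivial to inspect from Python.

```
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x, y;
    f(x, y) = x + y;
    // Each realization of f is also dumped to f_debug.npy (relative paths
    // resolve against the current working directory).
    f.compute_root().debug_to_file("f_debug.npy");
    Buffer<int32_t> out = f.realize({16, 16});
    return 0;
}
```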
+constexpr bool host_is_big_endian = false; +constexpr char little_endian_char = '<'; +constexpr char big_endian_char = '>'; +constexpr char no_endian_char = '|'; +constexpr char host_endian_char = (host_is_big_endian ? big_endian_char : little_endian_char); + +struct npy_dtype_info_t { + char byte_order; + char kind; + size_t item_size; +}; + +struct htype_to_dtype { + halide_type_t htype; + npy_dtype_info_t dtype; +}; + +WEAK htype_to_dtype npy_dtypes[] = { + {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(float)}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(double)}}, + {halide_type_of(), {no_endian_char, 'i', sizeof(int8_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int16_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int32_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int64_t)}}, + {halide_type_of(), {no_endian_char, 'u', sizeof(uint8_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint16_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint32_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint64_t)}}, +}; + } // namespace Internal } // namespace Runtime } // namespace Halide @@ -142,11 +180,15 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam return halide_error_code_bad_dimensions; } - if (auto result = halide_copy_to_host(user_context, buf); - result != halide_error_code_success) { + if (auto result = halide_copy_to_host(user_context, buf); result != halide_error_code_success) { + // halide_error() has already been called return result; } + // Note: all calls to this function are wrapped in an assert that identifies + // the function that failed, so calling halide_error() anywhere after this is redundant + // and actually unhelpful. 
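For reference (editorial note, not from the patch), the table above yields NumPy descr strings like the following on a little-endian host:

```
// uint8   -> "|u1"   (single byte, so byte order is irrelevant)
// int16   -> "<i2"
// float16 -> "<f2"
// float32 -> "<f4"
```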
+ ScopedFile f(filename, "wb"); if (!f.open()) { return halide_error_code_debug_to_file_failed; @@ -167,7 +209,73 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam uint32_t final_padding_bytes = 0; - if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) { + if (ends_with(filename, ".npy")) { + npy_dtype_info_t di = {0, 0, 0}; + for (const auto &d : npy_dtypes) { + if (d.htype == buf->type) { + di = d.dtype; + break; + } + } + if (di.byte_order == 0) { + return halide_error_code_debug_to_file_failed; + } + + constexpr int max_dict_string_size = 1024; + char dict_string_buf[max_dict_string_size]; + char *dst = dict_string_buf; + char *end = dict_string_buf + max_dict_string_size - 1; + + dst = halide_string_to_string(dst, end, "{'descr': '"); + *dst++ = di.byte_order; + *dst++ = di.kind; + dst = halide_int64_to_string(dst, end, di.item_size, 1); + dst = halide_string_to_string(dst, end, "', 'fortran_order': False, 'shape': ("); + for (int d = 0; d < buf->dimensions; ++d) { + if (d > 0) { + dst = halide_string_to_string(dst, end, ","); + } + dst = halide_int64_to_string(dst, end, buf->dim[d].extent, 1); + if (buf->dimensions == 1) { + dst = halide_string_to_string(dst, end, ","); // special-case for single-element tuples + } + } + dst = halide_string_to_string(dst, end, ")}\n"); + if (dst >= end) { + // bloody unlikely, but just in case + return halide_error_code_debug_to_file_failed; + } + + const char *npy_magic_string_and_version = "\x93NUMPY\x01\x00"; + + const size_t unpadded_length = 8 + 2 + (dst - dict_string_buf); + const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1); + const size_t padding = padded_length - unpadded_length; + memset(dst, ' ', padding); + dst += padding; + + const size_t header_len = dst - dict_string_buf; + if (header_len > 65535) { + return halide_error_code_debug_to_file_failed; + } + const uint8_t header_len_le[2] = { + (uint8_t)((header_len >> 0) & 0xff), + (uint8_t)((header_len >> 8) & 0xff)}; + + if (!f.write(npy_magic_string_and_version, 8)) { + return halide_error_code_debug_to_file_failed; + } + if (!f.write(header_len_le, 2)) { + return halide_error_code_debug_to_file_failed; + } + if (!f.write(dict_string_buf, dst - dict_string_buf)) { + return halide_error_code_debug_to_file_failed; + } + } else if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + int32_t channels; int32_t width = shape[0].extent; int32_t height = shape[1].extent; @@ -243,6 +351,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam } } } else if (ends_with(filename, ".mat")) { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + // Construct a name for the array from the filename const char *end = filename; while (*end) { @@ -279,7 +391,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam // level 5 .mat files have a size limit. (Padding itself should never cause the overflow. // Code written this way for safety.) 
if (((uint64_t)payload_bytes + final_padding_bytes) >> 32) { - halide_error(user_context, "Can't debug_to_file to a .mat file greater than 4GB\n"); return halide_error_code_debug_to_file_failed; } @@ -325,6 +436,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam return halide_error_code_debug_to_file_failed; } } else { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + int32_t header[] = {shape[0].extent, shape[1].extent, shape[2].extent, @@ -370,7 +485,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam const uint64_t zero = 0; if (final_padding_bytes) { if (final_padding_bytes > sizeof(zero)) { - halide_error(user_context, "Unexpectedly large final_padding_bytes"); return halide_error_code_debug_to_file_failed; } if (!f.write(&zero, final_padding_bytes)) { diff --git a/test/correctness/debug_to_file.cpp b/test/correctness/debug_to_file.cpp index 2b0aee28e8c0..780428c3389f 100644 --- a/test/correctness/debug_to_file.cpp +++ b/test/correctness/debug_to_file.cpp @@ -15,88 +15,93 @@ int main(int argc, char **argv) { return 0; } - std::string f_mat = Internal::get_test_tmp_dir() + "f.mat"; - std::string g_mat = Internal::get_test_tmp_dir() + "g.mat"; - std::string h_mat = Internal::get_test_tmp_dir() + "h.mat"; - - Internal::ensure_no_file_exists(f_mat); - Internal::ensure_no_file_exists(g_mat); - Internal::ensure_no_file_exists(h_mat); - - { - Func f, g, h, j; - Var x, y, z; - f(x, y, z) = cast(x + y + z); - g(x, y) = cast(f(x, y, 0) + f(x + 1, y, 1)); - h(x, y) = cast(f(x, y, -1) + g(x, y)); - - Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature()) { - Var xi, yi; - f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_mat); - g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_mat); - h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_mat); - } else { - f.compute_root().debug_to_file(f_mat); - g.compute_root().debug_to_file(g_mat); - h.compute_root().debug_to_file(h_mat); - } + std::vector formats = {"npy", "mat"}; + for (const auto &format : formats) { + std::cout << "Testing format " << format << "...\n"; + + std::string f_path = Internal::get_test_tmp_dir() + "f." + format; + std::string g_path = Internal::get_test_tmp_dir() + "g." + format; + std::string h_path = Internal::get_test_tmp_dir() + "h." 
+ format; + + Internal::ensure_no_file_exists(f_path); + Internal::ensure_no_file_exists(g_path); + Internal::ensure_no_file_exists(h_path); + + { + Func f, g, h, j; + Var x, y, z; + f(x, y, z) = cast(x + y + z); + g(x, y) = cast(f(x, y, 0) + f(x + 1, y, 1)); + h(x, y) = cast(f(x, y, -1) + g(x, y)); + + Target target = get_jit_target_from_environment(); + if (target.has_gpu_feature()) { + Var xi, yi; + f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_path); + g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_path); + h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_path); + } else { + f.compute_root().debug_to_file(f_path); + g.compute_root().debug_to_file(g_path); + h.compute_root().debug_to_file(h_path); + } - Buffer im = h.realize({10, 10}, target); - } + Buffer im = h.realize({10, 10}, target); + } - { - Internal::assert_file_exists(f_mat); - Internal::assert_file_exists(g_mat); - Internal::assert_file_exists(h_mat); + { + Internal::assert_file_exists(f_path); + Internal::assert_file_exists(g_path); + Internal::assert_file_exists(h_path); + + Buffer f = Tools::load_image(f_path); + assert(f.dimensions() == 3 && + f.dim(0).extent() == 11 && + f.dim(1).extent() == 10 && + f.dim(2).extent() == 3); + + for (int z = 0; z < 3; z++) { + for (int y = 0; y < 10; y++) { + for (int x = 0; x < 11; x++) { + int32_t val = f(x, y, z); + // The min coord gets lost on debug_to_file, so f should be shifted up by one. + if (val != x + y + z - 1) { + printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y); + return 1; + } + } + } + } - Buffer f = Tools::load_image(f_mat); - assert(f.dimensions() == 3 && - f.dim(0).extent() == 11 && - f.dim(1).extent() == 10 && - f.dim(2).extent() == 3); + Buffer g = Tools::load_image(g_path); + assert(g.dimensions() == 2 && + g.dim(0).extent() == 10 && + g.dim(1).extent() == 10); - for (int z = 0; z < 3; z++) { for (int y = 0; y < 10; y++) { - for (int x = 0; x < 11; x++) { - int32_t val = f(x, y, z); - // The min coord gets lost on debug_to_file, so f should be shifted up by one. 
- if (val != x + y + z - 1) { - printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y); + for (int x = 0; x < 10; x++) { + float val = g(x, y); + float correct = (float)(f(x, y, 1) + f(x + 1, y, 2)); + if (val != correct) { + printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct); return 1; } } } - } - Buffer g = Tools::load_image(g_mat); - assert(g.dimensions() == 2 && - g.dim(0).extent() == 10 && - g.dim(1).extent() == 10); - - for (int y = 0; y < 10; y++) { - for (int x = 0; x < 10; x++) { - float val = g(x, y); - float correct = (float)(f(x, y, 1) + f(x + 1, y, 2)); - if (val != correct) { - printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct); - return 1; - } - } - } + Buffer h = Tools::load_image(h_path); + assert(h.dimensions() == 2 && + h.dim(0).extent() == 10 && + h.dim(1).extent() == 10); - Buffer h = Tools::load_image(h_mat); - assert(h.dimensions() == 2 && - h.dim(0).extent() == 10 && - h.dim(1).extent() == 10); - - for (int y = 0; y < 10; y++) { - for (int x = 0; x < 10; x++) { - int32_t val = h(x, y); - int32_t correct = f(x, y, 0) + g(x, y); - if (val != correct) { - printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct); - return 1; + for (int y = 0; y < 10; y++) { + for (int x = 0; x < 10; x++) { + int32_t val = h(x, y); + int32_t correct = f(x, y, 0) + g(x, y); + if (val != correct) { + printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct); + return 1; + } } } } From f4c78317887b6df4d2486e1f81e81f9012943f0f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 11 Apr 2024 15:07:20 -0700 Subject: [PATCH 103/186] Don't print on parallel task entry/exit with -debug flag (#8185) Fixes #8184 --- src/LowerParallelTasks.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LowerParallelTasks.cpp b/src/LowerParallelTasks.cpp index a035b2af6d1f..70f47885528c 100644 --- a/src/LowerParallelTasks.cpp +++ b/src/LowerParallelTasks.cpp @@ -302,9 +302,6 @@ struct LowerParallelTasks : public IRMutator { // TODO(zvookin): Figure out how we want to handle name mangling of closures. // For now, the C++ backend makes them extern "C" so they have to be NameMangling::C. 
LoweredFunc closure_func{new_function_name, closure_args, std::move(wrapped_body), LinkageType::Internal, NameMangling::C}; - if (target.has_feature(Target::Debug)) { - debug_arguments(&closure_func, target); - } closure_implementations.emplace_back(std::move(closure_func)); } From 7994e7030976f9fcd321a4d1d5f76f4582e01905 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 16 Apr 2024 14:27:43 -0700 Subject: [PATCH 104/186] Fix corner case in if_then_else simplification (#8189) Co-authored-by: Steven Johnson --- src/Simplify_Call.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Simplify_Call.cpp b/src/Simplify_Call.cpp index 33d11ccb8d06..29bc75aa2bb2 100644 --- a/src/Simplify_Call.cpp +++ b/src/Simplify_Call.cpp @@ -576,7 +576,11 @@ Expr Simplify::visit(const Call *op, ExprInfo *bounds) { } in_unreachable = false; if (true_unreachable) { - return false_value; + if (false_value.defined()) { + return false_value; + } else { + return make_zero(op->type); + } } else if (false_unreachable) { return true_value; } From 4e0b313fa7f6d3897f960dd322cfd13daed97c98 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 18 Apr 2024 12:48:59 -0700 Subject: [PATCH 105/186] Rewrite IREquality to use a more compact stack instead of deep recursion (#8198) * Rewrite IREquality to use a more compact stack instead of deep recursion Deletes a bunch of code and speeds up lowering time of local laplacian with 20 pyramid levels by ~2.5% * clang-tidy * Fold in the version of equal in IRMatch.h/cpp * Add missing switch breaks * Add missing comments * Elaborate on why we treat NaNs as equal --- src/Associativity.cpp | 2 +- src/Bounds.cpp | 4 +- src/CSE.cpp | 18 +- src/IREquality.cpp | 1179 ++++++++++++++++--------------------- src/IREquality.h | 239 ++++---- src/IRMatch.cpp | 144 ----- src/IRMatch.h | 12 - src/ParallelRVar.cpp | 2 +- src/RDom.cpp | 2 +- src/ScheduleFunctions.cpp | 2 +- 10 files changed, 660 insertions(+), 944 deletions(-) diff --git a/src/Associativity.cpp b/src/Associativity.cpp index 39a0011391a6..6baa9e5fa7c6 100644 --- a/src/Associativity.cpp +++ b/src/Associativity.cpp @@ -145,7 +145,7 @@ bool associative_op_pattern_match(const Expr &e, debug(5) << "Adding result: " << iter.first << " -> " << iter.second << "\n"; match.emplace(iter.first, iter.second); } else { - if (!equal(iter.first, match_iter->first) || !equal(iter.second, match_iter->second)) { + if (iter.first != match_iter->first || !equal(iter.second, match_iter->second)) { return false; } } diff --git a/src/Bounds.cpp b/src/Bounds.cpp index d7d337dacfdf..a8ed2deba0d2 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -79,9 +79,9 @@ int static_sign(const Expr &x) { return -1; } else { Expr zero = make_zero(x.type()); - if (equal(const_true(), simplify(x > zero))) { + if (is_const_one(simplify(x > zero))) { return 1; - } else if (equal(const_true(), simplify(x < zero))) { + } else if (is_const_one(simplify(x < zero))) { return -1; } } diff --git a/src/CSE.cpp b/src/CSE.cpp index d8ecd619db81..0905562c4e63 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -76,7 +76,7 @@ class GVN : public IRMutator { Expr expr; int use_count = 0; // All consumer Exprs for which this is the last child Expr. 
- map uses; + map uses; Entry(const Expr &e) : expr(e) { } @@ -84,25 +84,15 @@ class GVN : public IRMutator { vector> entries; map shallow_numbering, output_numbering; - map leaves; + map leaves; - int number = -1; - - IRCompareCache cache; - - GVN() - : number(0), cache(8) { - } + int number = 0; Stmt mutate(const Stmt &s) override { internal_error << "Can't call GVN on a Stmt: " << s << "\n"; return Stmt(); } - ExprWithCompareCache with_cache(const Expr &e) { - return ExprWithCompareCache(e, &cache); - } - Expr mutate(const Expr &e) override { // Early out if we've already seen this exact Expr. { @@ -123,7 +113,7 @@ class GVN : public IRMutator { // that child has an identical parent to this one. auto &use_map = number == -1 ? leaves : entries[number]->uses; - auto p = use_map.emplace(with_cache(new_e), (int)entries.size()); + auto p = use_map.emplace(new_e, (int)entries.size()); auto iter = p.first; bool novel = p.second; if (novel) { diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 0d21ca1e26b5..bb64c1035590 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -10,713 +10,561 @@ using std::vector; namespace { -/** The class that does the work of comparing two IR nodes. */ -class IRComparer : public IRVisitor { -public: - /** Different possible results of a comparison. Unknown should - * only occur internally due to a cache miss. */ - enum CmpResult { Unknown, - Equal, - LessThan, - GreaterThan }; - - /** The result of the comparison. Should be Equal, LessThan, or GreaterThan. */ - CmpResult result = Equal; - - /** Compare two expressions or statements and return the - * result. Returns the result immediately if it is already - * non-zero. */ - // @{ - CmpResult compare_expr(const Expr &a, const Expr &b); - CmpResult compare_stmt(const Stmt &a, const Stmt &b); - // @} - - /** If the expressions you're comparing may contain many repeated - * subexpressions, it's worth passing in a cache to use. - * Currently this is only done in common-subexpression - * elimination. 
*/ - IRComparer(IRCompareCache *c = nullptr) - : cache(c) { - } - -private: - Expr expr; - Stmt stmt; - IRCompareCache *cache; - - CmpResult compare_names(const std::string &a, const std::string &b); - CmpResult compare_types(Type a, Type b); - CmpResult compare_expr_vector(const std::vector &a, const std::vector &b); - - // Compare two things that already have a well-defined operator< - template - CmpResult compare_scalar(T a, T b); - - void visit(const IntImm *) override; - void visit(const UIntImm *) override; - void visit(const FloatImm *) override; - void visit(const StringImm *) override; - void visit(const Cast *) override; - void visit(const Reinterpret *) override; - void visit(const Variable *) override; - void visit(const Add *) override; - void visit(const Sub *) override; - void visit(const Mul *) override; - void visit(const Div *) override; - void visit(const Mod *) override; - void visit(const Min *) override; - void visit(const Max *) override; - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GT *) override; - void visit(const GE *) override; - void visit(const And *) override; - void visit(const Or *) override; - void visit(const Not *) override; - void visit(const Select *) override; - void visit(const Load *) override; - void visit(const Ramp *) override; - void visit(const Broadcast *) override; - void visit(const Call *) override; - void visit(const Let *) override; - void visit(const LetStmt *) override; - void visit(const AssertStmt *) override; - void visit(const ProducerConsumer *) override; - void visit(const For *) override; - void visit(const Acquire *) override; - void visit(const Store *) override; - void visit(const Provide *) override; - void visit(const Allocate *) override; - void visit(const Free *) override; - void visit(const Realize *) override; - void visit(const Block *) override; - void visit(const Fork *) override; - void visit(const IfThenElse *) override; - void visit(const Evaluate *) override; - void visit(const Shuffle *) override; - void visit(const Prefetch *) override; - void visit(const Atomic *) override; - void visit(const VectorReduce *) override; - void visit(const HoistedStorage *) override; -}; - -template -IRComparer::CmpResult IRComparer::compare_scalar(T a, T b) { - if (result != Equal) { - return result; - } - - if constexpr (std::is_floating_point_v) { - // NaNs are equal to each other and less than non-nans - if (std::isnan(a) && std::isnan(b)) { - result = Equal; - return result; - } - if (std::isnan(a)) { - result = LessThan; - return result; +enum class Order { Equal, + LessThan, + GreaterThan }; + +// A helper class for comparing two pieces of IR with the minimum amount of +// recursion. +template +struct Comparer { + + // Points to any cache in use for comparing Expr graphs. Will be non-null + // exactly when cache_size > 0 + const IRNode **cache; + + // The compare method below does the actual work, but it needs to call out + // to a variety of template helper functions to compare specific types. We + // make the syntax in the giant switch statement in the compare method much + // simpler if we just give these helper functions access to the state in the + // compare method: The stack pointers, the currently-considered piece of + // IR, and the result of the comparison so far. 
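The callers' view is unchanged by this refactor: structural comparison still goes through the existing equal() helper declared in IREquality.h. A minimal sketch (editorial, not part of the patch):

```
#include "Halide.h"

int main() {
    Halide::Var x("x");
    Halide::Expr a = x + 1;
    Halide::Expr b = x + 1;
    bool same = Halide::Internal::equal(a, b);        // true: same structure
    bool different = Halide::Internal::equal(a, x);   // false
    return (same && !different) ? 0 : 1;
}
```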
+ const IRNode **stack_end = nullptr, **stack_ptr = nullptr; + const IRNode *next_a = nullptr, *next_b = nullptr; + Order result = Order::Equal; + + Comparer(const IRNode **cache) + : cache(cache) { + } + + // Compare the given member variable of next_a and next_b. If it's an Expr + // or Stmt, it's guaranteed to be defined. + template + HALIDE_ALWAYS_INLINE void cmp(MemberType Node::*member_ptr) { + if (result == Order::Equal) { + cmp(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr); } - if (std::isnan(b)) { - result = GreaterThan; - return result; - } - } - - if (a < b) { - result = LessThan; - } else if (a > b) { - result = GreaterThan; - } - - return result; -} - -IRComparer::CmpResult IRComparer::compare_expr(const Expr &a, const Expr &b) { - if (result != Equal) { - return result; - } - - if (a.same_as(b)) { - result = Equal; - return result; - } - - // Undefined values are equal to each other and less than defined values - if (!a.defined() && !b.defined()) { - result = Equal; - return result; - } - - if (!a.defined()) { - result = LessThan; - return result; } - if (!b.defined()) { - result = GreaterThan; - return result; - } - - // If in the future we have hashes for Exprs, this is a good place - // to compare the hashes: - // if (compare_scalar(a.hash(), b.hash()) != Equal) { - // return result; - // } - - if (compare_scalar(a->node_type, b->node_type) != Equal) { - return result; - } - - if (compare_types(a.type(), b.type()) != Equal) { - return result; - } - - // Check the cache - perhaps these exprs have already been compared and found equal. - if (cache && cache->contains(a, b)) { - result = Equal; - return result; - } - - expr = a; - b.accept(this); - - if (cache && result == Equal) { - cache->insert(a, b); - } - - return result; -} - -IRComparer::CmpResult IRComparer::compare_stmt(const Stmt &a, const Stmt &b) { - if (result != Equal) { - return result; - } - - if (a.same_as(b)) { - result = Equal; - return result; - } - - if (!a.defined() && !b.defined()) { - result = Equal; - return result; - } - - if (!a.defined()) { - result = LessThan; - return result; - } - - if (!b.defined()) { - result = GreaterThan; - return result; - } - - if (compare_scalar(a->node_type, b->node_type) != Equal) { - return result; - } - - stmt = a; - b.accept(this); - - return result; -} - -IRComparer::CmpResult IRComparer::compare_types(Type a, Type b) { - if (result != Equal) { - return result; - } - - compare_scalar(a.code(), b.code()); - compare_scalar(a.bits(), b.bits()); - compare_scalar(a.lanes(), b.lanes()); - - if (result != Equal) { - return result; - } - - const halide_handle_cplusplus_type *ha = a.handle_type; - const halide_handle_cplusplus_type *hb = b.handle_type; - - if (ha == hb) { - // Same handle type, or both not handles, or both void * - return result; + // The same as above, but with no guarantee. + template + HALIDE_ALWAYS_INLINE void cmp_if_defined(MemberType Node::*member_ptr) { + if (result == Order::Equal) { + cmp_if_defined(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr); + } } - if (ha == nullptr) { - // void* < T* - result = LessThan; - return result; - } + size_t hash(const IRNode *a, const IRNode *b) { + // A simple hash designed to get enough information into the low bits to + // avoid too many collisions, while being robust to weird things like + // having strided set of Exprs. 
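        // [Editor's note: illustrative only.] cache_size is assumed to be a
        // power of two, so the final mask below acts as h % cache_size. For
        // example, with cache_size = 256 the cache array holds 256 * 2
        // pointers; a pair (a, b) hashes to a slot h in [0, 255] and occupies
        // cache[2*h] and cache[2*h + 1]. A colliding pair simply overwrites
        // the slot, which is harmless: a cache miss only means the pair gets
        // compared again.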
+ uintptr_t pa = (uintptr_t)a; + uintptr_t pb = (uintptr_t)b; + uintptr_t h = (((pa * 17) ^ (pb * 13)) >> 4); + h ^= h >> 8; + h = h & (cache_size - 1); + return h; + } + + // See if we've already processed this pair of IR nodes + bool cache_contains(const IRNode *a, const IRNode *b) { + size_t h = hash(a, b); + const IRNode **c = cache + h * 2; + return (c[0] == a && c[1] == b); + } + + // Mark a pair of IR nodes as already processed. We don't do this until + // we're done processing their children, because there aren't going to be + // any queries to match a node with one of its children, because nodes can't + // be their own ancestors. Inserting it into the cache too soon just means + // it's going to be evicted before we need it. + void cache_insert(const IRNode *a, const IRNode *b) { + size_t h = hash(a, b); + const IRNode **c = cache + h * 2; + c[0] = a; + c[1] = b; + } + + // Compare two known-to-be-defined IR nodes. Well... don't actually compare + // them because that would be a recursive call. Just push them onto the + // pending tasks stack. + void cmp(const IRHandle &a, const IRHandle &b) { + if (cache_size > 0 && cache_contains(a.get(), b.get())) { + return; + } - if (hb == nullptr) { - // T* > void* - result = GreaterThan; - return result; + if (a.get() == b.get()) { + } else if (stack_ptr == stack_end) { + // Out of stack space. Make a recursive call to buy some more stack. + Comparer sub_comparer(cache); + result = sub_comparer.compare(*(a.get()), *(b.get())); + } else { + *stack_ptr++ = a.get(); + *stack_ptr++ = b.get(); + } } - // They're both non-void handle types with distinct type info - // structs. We now need to distinguish between different C++ - // pointer types (e.g. char * vs const float *). If would be nice - // if the structs were unique per C++ type. Then comparing the - // pointers above would be sufficient. Unfortunately, different - // shared libraries in the same process each create a distinct - // struct for the same type. We therefore have to do a deep - // comparison of the type info fields. - - compare_scalar(ha->reference_type, hb->reference_type); - compare_names(ha->inner_name.name, hb->inner_name.name); - compare_scalar(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type); - compare_scalar(ha->namespaces.size(), hb->namespaces.size()); - compare_scalar(ha->enclosing_types.size(), hb->enclosing_types.size()); - compare_scalar(ha->cpp_type_modifiers.size(), hb->cpp_type_modifiers.size()); - - if (result != Equal) { - return result; + // Compare two IR nodes, which may or may not be defined. 
+ HALIDE_ALWAYS_INLINE + void cmp_if_defined(const IRHandle &a, const IRHandle &b) { + if (a.defined() < b.defined()) { + result = Order::LessThan; + } else if (a.defined() > b.defined()) { + result = Order::GreaterThan; + } else if (a.defined() && b.defined()) { + cmp(a, b); + } } - for (size_t i = 0; i < ha->namespaces.size(); i++) { - compare_names(ha->namespaces[i], hb->namespaces[i]); + template + void cmp(const std::vector &a, const std::vector &b) { + if (a.size() < b.size()) { + result = Order::LessThan; + } else if (a.size() > b.size()) { + result = Order::GreaterThan; + } else { + for (size_t i = 0; i < a.size() && result == Order::Equal; i++) { + cmp(a[i], b[i]); + } + } } - if (result != Equal) { - return result; + HALIDE_ALWAYS_INLINE + void cmp(const Range &a, const Range &b) { + cmp(a.min, b.min); + cmp(a.extent, b.extent); + } + + HALIDE_ALWAYS_INLINE + void cmp(const ModulusRemainder &a, const ModulusRemainder &b) { + cmp(a.modulus, b.modulus); + cmp(a.remainder, b.remainder); + } + + void cmp(const halide_handle_cplusplus_type *ha, + const halide_handle_cplusplus_type *hb) { + if (ha == hb) { + return; + } else if (!ha) { + result = Order::LessThan; + } else if (!hb) { + result = Order::GreaterThan; + } else { + // They're both non-void handle types with distinct type info + // structs. We now need to distinguish between different C++ + // pointer types (e.g. char * vs const float *). If would be nice + // if the structs were unique per C++ type. Then comparing the + // pointers above would be sufficient. Unfortunately, different + // shared libraries in the same process each create a distinct + // struct for the same type. We therefore have to do a deep + // comparison of the type info fields. + cmp(ha->reference_type, hb->reference_type); + cmp(ha->inner_name.name, hb->inner_name.name); + cmp(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type); + cmp(ha->namespaces, hb->namespaces); + cmp(ha->enclosing_types, hb->enclosing_types); + cmp(ha->cpp_type_modifiers, hb->cpp_type_modifiers); + } } - for (size_t i = 0; i < ha->enclosing_types.size(); i++) { - compare_scalar(ha->enclosing_types[i].cpp_type_type, - hb->enclosing_types[i].cpp_type_type); - compare_names(ha->enclosing_types[i].name, - hb->enclosing_types[i].name); + HALIDE_ALWAYS_INLINE + void cmp(const Type &a, const Type &b) { + uint32_t ta = ((halide_type_t)a).as_u32(); + uint32_t tb = ((halide_type_t)b).as_u32(); + if (ta < tb) { + result = Order::LessThan; + } else if (ta > tb) { + result = Order::GreaterThan; + } else { + if (a.handle_type || b.handle_type) { + cmp(a.handle_type, b.handle_type); + } + } } - if (result != Equal) { - return result; + void cmp(const PrefetchDirective &a, const PrefetchDirective &b) { + cmp(a.name, b.name); + cmp(a.at, b.at); + cmp(a.from, b.from); + cmp(a.offset, b.offset); + cmp(a.strategy, b.strategy); } - for (size_t i = 0; i < ha->cpp_type_modifiers.size(); i++) { - compare_scalar(ha->cpp_type_modifiers[i], - hb->cpp_type_modifiers[i]); + HALIDE_ALWAYS_INLINE + void cmp(double a, double b) { + // Floating point scalars need special handling, due to NaNs. + if (std::isnan(a) && std::isnan(b)) { + // Under numeric rules, NaNs aren't equal, but we're not actually + // comparing numbers here. We are comparing IR nodes to see if + // they'll compile to the same thing. Two NaN FloatImms will compile + // to the same thing, so they should be considered equal in this + // context, so we leave comparison state unchanged. 
+ // + // Note however that we consider -0 equal to 0 here, because + // otherwise you get tedious problems like std::nearbyint(-0.5) with + // round-to-nearest mode leaving it platform-dependent whether you + // get -0 or 0. So if we say -0 != 0, our constant folding would be + // platform-dependent. + } else if (std::isnan(a)) { + result = Order::LessThan; + } else if (std::isnan(b)) { + result = Order::GreaterThan; + } else if (a < b) { + result = Order::LessThan; + } else if (b < a) { + result = Order::GreaterThan; + } } - return result; -} - -IRComparer::CmpResult IRComparer::compare_names(const string &a, const string &b) { - if (result != Equal) { - return result; + HALIDE_ALWAYS_INLINE + void cmp(const std::string &a, const std::string &b) { + int r = a.compare(b); + if (r < 0) { + result = Order::LessThan; + } else if (r > 0) { + result = Order::GreaterThan; + } } - int string_cmp = a.compare(b); - if (string_cmp < 0) { - result = LessThan; - } else if (string_cmp > 0) { - result = GreaterThan; + // The method to use whenever we can just use operator< and get a bool. + template && + std::is_same_v() < std::declval()), bool>>> + HALIDE_NEVER_INLINE void cmp(const T &a, const T &b) { + if (a < b) { + result = Order::LessThan; + } else if (b < a) { + result = Order::GreaterThan; + } } - return result; -} + Order compare(const IRNode &root_a, const IRNode &root_b) { + constexpr size_t stack_size = 64; // 1 kb + const IRNode *stack_storage[stack_size * 2]; // Intentionally uninitialized + + stack_ptr = stack_storage; + stack_end = stack_storage + stack_size * 2; + result = Order::Equal; + + *stack_ptr++ = &root_a; + *stack_ptr++ = &root_b; + + while (result == Order::Equal && stack_ptr > stack_storage) { + stack_ptr -= 2; + next_a = stack_ptr[0]; + next_b = stack_ptr[1]; + + if (next_a == next_b) { + continue; + } + + if (cache_size > 0 && (((uintptr_t)next_a) & 1)) { + // If we are using a cache, we want to keep the nodes on the + // stack while processing their children, but mark them with a + // tombstone. We'll flip the low bit to 1 for our tombstone. We + // want to insert them into the cache when the tombstone is + // handled. This if statement triggers if we just hit a + // tombstone. + cache_insert((const IRNode *)((uintptr_t)next_a ^ 1), next_b); + continue; + } + + cmp(next_a->node_type, next_b->node_type); + if (result != Order::Equal) { + break; + } + + if (next_a->node_type < IRNodeType::LetStmt) { + cmp(&BaseExprNode::type); + } + + if (cache_size > 0) { + // Keep the parent nodes on the stack, but mark them with a + // tombstone bit. 
+ stack_ptr[0] = (const IRNode *)(((uintptr_t)next_a) | 1); + stack_ptr += 2; + } + + switch (next_a->node_type) { + case IRNodeType::IntImm: + cmp(&IntImm::value); + break; + case IRNodeType::UIntImm: + cmp(&UIntImm::value); + break; + case IRNodeType::FloatImm: + cmp(&FloatImm::value); + break; + case IRNodeType::StringImm: + cmp(&StringImm::value); + break; + case IRNodeType::Broadcast: + cmp(&Broadcast::value); + break; + case IRNodeType::Cast: + cmp(&Cast::value); + break; + case IRNodeType::Reinterpret: + cmp(&Cast::value); + break; + case IRNodeType::Variable: + cmp(&Variable::name); + break; + case IRNodeType::Add: + cmp(&Add::a); + cmp(&Add::b); + break; + case IRNodeType::Sub: + cmp(&Sub::a); + cmp(&Sub::b); + break; + case IRNodeType::Mod: + cmp(&Mod::a); + cmp(&Mod::b); + break; + case IRNodeType::Mul: + cmp(&Mul::a); + cmp(&Mul::b); + break; + case IRNodeType::Div: + cmp(&Div::a); + cmp(&Div::b); + break; + case IRNodeType::Min: + cmp(&Min::a); + cmp(&Min::b); + break; + case IRNodeType::Max: + cmp(&Max::a); + cmp(&Max::b); + break; + case IRNodeType::EQ: + cmp(&EQ::a); + cmp(&EQ::b); + break; + case IRNodeType::NE: + cmp(&NE::a); + cmp(&NE::b); + break; + case IRNodeType::LT: + cmp(<::a); + cmp(<::b); + break; + case IRNodeType::LE: + cmp(&LE::a); + cmp(&LE::b); + break; + case IRNodeType::GT: + cmp(>::a); + cmp(>::b); + case IRNodeType::GE: + cmp(&GE::a); + cmp(&GE::b); + break; + case IRNodeType::And: + cmp(&And::a); + cmp(&And::b); + break; + case IRNodeType::Or: + cmp(&Or::a); + cmp(&Or::b); + break; + case IRNodeType::Not: + cmp(&Not::a); + break; + case IRNodeType::Select: + cmp(&Select::condition); + cmp(&Select::true_value); + cmp(&Select::false_value); + break; + case IRNodeType::Load: + cmp(&Load::name); + cmp(&Load::alignment); + cmp(&Load::index); + cmp(&Load::predicate); + break; + case IRNodeType::Ramp: + cmp(&Ramp::stride); + cmp(&Ramp::base); + break; + case IRNodeType::Call: + cmp(&Call::name); + cmp(&Call::call_type); + cmp(&Call::value_index); + cmp(&Call::args); + break; + case IRNodeType::Let: + cmp(&Let::name); + cmp(&Let::value); + cmp(&Let::body); + break; + case IRNodeType::Shuffle: + cmp(&Shuffle::indices); + cmp(&Shuffle::vectors); + break; + case IRNodeType::VectorReduce: + cmp(&VectorReduce::op); + cmp(&VectorReduce::value); + break; + case IRNodeType::LetStmt: + cmp(&LetStmt::name); + cmp(&LetStmt::value); + cmp(&LetStmt::body); + break; + case IRNodeType::AssertStmt: + cmp(&AssertStmt::condition); + cmp(&AssertStmt::message); + break; + case IRNodeType::ProducerConsumer: + cmp(&ProducerConsumer::name); + cmp(&ProducerConsumer::is_producer); + cmp(&ProducerConsumer::body); + break; + case IRNodeType::For: + cmp(&For::name); + cmp(&For::for_type); + cmp(&For::device_api); + cmp(&For::partition_policy); + cmp(&For::min); + cmp(&For::extent); + cmp(&For::body); + break; + case IRNodeType::Acquire: + cmp(&Acquire::semaphore); + cmp(&Acquire::count); + cmp(&Acquire::body); + break; + case IRNodeType::Store: + cmp(&Store::name); + cmp(&Store::alignment); + cmp(&Store::predicate); + cmp(&Store::value); + cmp(&Store::index); + break; + case IRNodeType::Provide: + cmp(&Provide::name); + cmp(&Provide::args); + cmp(&Provide::values); + break; + case IRNodeType::Allocate: + cmp(&Allocate::name); + cmp(&Allocate::type); + cmp(&Allocate::free_function); + cmp_if_defined(&Allocate::new_expr); + cmp(&Allocate::condition); + cmp(&Allocate::extents); + cmp(&Allocate::body); + break; + case IRNodeType::Free: + cmp(&Free::name); + break; + case IRNodeType::Realize: 
+ cmp(&Realize::name); + cmp(&Realize::types); + cmp(&Realize::bounds); + cmp(&Realize::body); + cmp(&Realize::condition); + break; + case IRNodeType::Block: + cmp(&Block::first); + cmp(&Block::rest); + break; + case IRNodeType::Fork: + cmp(&Fork::first); + cmp(&Fork::rest); + break; + case IRNodeType::IfThenElse: + cmp(&IfThenElse::condition); + cmp(&IfThenElse::then_case); + cmp_if_defined(&IfThenElse::else_case); + break; + case IRNodeType::Evaluate: + cmp(&Evaluate::value); + break; + case IRNodeType::Prefetch: + cmp(&Prefetch::name); + cmp(&Prefetch::types); + cmp(&Prefetch::prefetch); + cmp(&Prefetch::bounds); + cmp(&Prefetch::condition); + cmp(&Prefetch::body); + break; + case IRNodeType::Atomic: + cmp(&Atomic::producer_name); + cmp(&Atomic::mutex_name); + cmp(&Atomic::body); + break; + case IRNodeType::HoistedStorage: + cmp(&HoistedStorage::name); + cmp(&HoistedStorage::body); + break; + } + } -IRComparer::CmpResult IRComparer::compare_expr_vector(const vector &a, const vector &b) { - if (result != Equal) { + // Don't hold onto pointers to this stack frame. + stack_ptr = stack_end = nullptr; return result; } - - compare_scalar(a.size(), b.size()); - for (size_t i = 0; (i < a.size()) && result == Equal; i++) { - compare_expr(a[i], b[i]); - } - - return result; -} - -void IRComparer::visit(const IntImm *op) { - const IntImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const UIntImm *op) { - const UIntImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const FloatImm *op) { - const FloatImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const StringImm *op) { - const StringImm *e = expr.as(); - compare_names(e->value, op->value); -} - -void IRComparer::visit(const Cast *op) { - compare_expr(expr.as()->value, op->value); -} - -void IRComparer::visit(const Reinterpret *op) { - compare_expr(expr.as()->value, op->value); -} - -void IRComparer::visit(const Variable *op) { - const Variable *e = expr.as(); - compare_names(e->name, op->name); -} - -namespace { -template -void visit_binary_operator(IRComparer *cmp, const T *op, Expr expr) { - const T *e = expr.as(); - cmp->compare_expr(e->a, op->a); - cmp->compare_expr(e->b, op->b); -} -} // namespace - -void IRComparer::visit(const Add *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Sub *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Mul *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Div *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Mod *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Min *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Max *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const EQ *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const NE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const LT *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const LE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const GT *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const GE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const And *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Or *op) { - visit_binary_operator(this, 
 op, expr);
-}
-
-void IRComparer::visit(const Not *op) {
-    const Not *e = expr.as<Not>();
-    compare_expr(e->a, op->a);
-}
-
-void IRComparer::visit(const Select *op) {
-    const Select *e = expr.as