diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt index 9cbcc541b76a..c01ad22035bd 100644 --- a/apps/hexagon_benchmarks/CMakeLists.txt +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -22,23 +22,24 @@ endmacro() add_generator_and_library(dilate3x3) add_generator_and_library(gaussian5x5) add_generator_and_library(median3x3) +add_generator_and_library(sobel) # Main executable add_executable(process process.cpp) target_compile_options(process PRIVATE $<$:-O2>) if (Halide_TARGET MATCHES "hvx") - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL TARGET_HAS_HVX) else() - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL) endif() target_link_libraries(process PRIVATE Halide::Tools - dilate3x3 gaussian5x5 median3x3) + dilate3x3 gaussian5x5 median3x3 sobel) # Test that the app actually works! add_test(NAME hexagon_benchmarks COMMAND process -n 1) set_tests_properties(hexagon_benchmarks PROPERTIES LABELS hexagon_benchmarks PASS_REGULAR_EXPRESSION "Success!" - SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") + SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") \ No newline at end of file diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 87a492c577d1..def519963ad0 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -43,10 +43,11 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); + SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { @@ -85,4 +86,4 @@ int main(int argc, char **argv) { printf("Success!\n"); return 0; -} +} \ No newline at end of file diff --git a/apps/onnx/Makefile b/apps/onnx/Makefile index f714b0254b75..5188c1c85068 100644 --- a/apps/onnx/Makefile +++ b/apps/onnx/Makefile @@ -90,7 +90,12 @@ ifeq ($(UNAME), Darwin) # Keep OS X builds from complaining about missing libpython symbols PYBIND11_CFLAGS += -undefined dynamic_lookup endif -PY_EXT = $(shell $(PYTHON)-config --extension-suffix) +# Get the python extension module suffix from python itself. You can +# also do this with python-config, but that's not resistant to version +# mismatches between python and python-config. This can happen when +# using a virtualenv, because virtualenvs override python, but not +# python-config. 
+PY_EXT = $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_config_var("EXT_SUFFIX"))') PY_MODEL_EXT = model_cpp$(PY_EXT) PYCXXFLAGS = $(PYBIND11_CFLAGS) $(CXXFLAGS) -Wno-deprecated-register diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index a2bf990e38f6..cf3b0ae8bb89 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -1,5 +1,4 @@ #include "AddAtomicMutex.h" - #include "ExprUsesVar.h" #include "Func.h" #include "IREquality.h" @@ -11,14 +10,10 @@ namespace Halide { namespace Internal { -using std::map; -using std::set; -using std::string; - namespace { /** Collect names of all stores matching the producer name inside a statement. */ -class CollectProducerStoreNames : public IRGraphVisitor { +class CollectProducerStoreNames : public IRVisitor { public: CollectProducerStoreNames(const std::string &producer_name) : producer_name(producer_name) { @@ -27,12 +22,12 @@ class CollectProducerStoreNames : public IRGraphVisitor { Scope store_names; protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { - // This is a Store for the desginated Producer. + // This is a Store for the designated Producer. store_names.push(op->name); } } @@ -42,7 +37,7 @@ class CollectProducerStoreNames : public IRGraphVisitor { /** Find Store inside of an Atomic node for the designated producer * and return their indices. */ -class FindProducerStoreIndex : public IRGraphVisitor { +class FindProducerStoreIndex : public IRVisitor { public: FindProducerStoreIndex(const std::string &producer_name) : producer_name(producer_name) { @@ -51,11 +46,11 @@ class FindProducerStoreIndex : public IRGraphVisitor { Expr index; // The returned index. protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; // Need to also extract the let bindings of a Store index. void visit(const Let *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -63,7 +58,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } } void visit(const LetStmt *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -72,7 +67,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { // This is a Store for the designated producer. @@ -94,11 +89,13 @@ class FindProducerStoreIndex : public IRGraphVisitor { /** Throws an assertion for cases where the indexing on left-hand-side of * an atomic update references to itself. * e.g. f(clamp(f(r), 0, 100)) = f(r) + 1 should be rejected. */ -class CheckAtomicValidity : public IRGraphVisitor { +class CheckAtomicValidity : public IRVisitor { protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Atomic *op) override { + any_atomic = true; + // Collect the names of all Store nodes inside. 
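The doc comment on CheckAtomicValidity above is easier to read next to a user-level example. The sketch below is illustrative only (the input `in` and the bin count are assumed, not taken from this patch): the update shown is the accepted atomic scatter, and the trailing comment restates the self-referencing indexing the pass rejects.

```cpp
#include "Halide.h"
using namespace Halide;

void atomic_histogram_sketch() {
    ImageParam in(UInt(8), 1, "in");  // assumed input
    Func hist("hist");
    Var x("x");
    RDom r(0, 1024);

    hist(x) = 0;
    hist(clamp(cast<int>(in(r)), 0, 255)) += 1;  // index never reads hist itself
    hist.update().atomic().parallel(r);          // accepted: a plain atomic scatter

    // The rejected shape is an update whose *index* loads the function being
    // updated, e.g. f(clamp(f(r), 0, 100)) = f(r) + 1. No per-element lock can
    // make the load in the index agree with the store, so lowering errors out.
}
```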
CollectProducerStoreNames collector(op->producer_name); op->body.accept(&collector); @@ -115,13 +112,16 @@ class CheckAtomicValidity : public IRGraphVisitor { } op->body.accept(this); } + +public: + bool any_atomic = false; }; /** Search if the value of a Store node has a variable pointing to a let binding, * where the let binding contains the Store location. Use for checking whether * we need a mutex lock for Atomic since some lowering pass before lifted a let * binding from the Store node (currently only SplitTuple would do this). */ -class FindAtomicLetBindings : public IRGraphVisitor { +class FindAtomicLetBindings : public IRVisitor { public: FindAtomicLetBindings(const Scope &store_names) : store_names(store_names) { @@ -133,18 +133,18 @@ class FindAtomicLetBindings : public IRGraphVisitor { using IRVisitor::visit; void visit(const Let *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } void visit(const LetStmt *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } @@ -159,19 +159,19 @@ class FindAtomicLetBindings : public IRGraphVisitor { } void visit(const Store *op) override { - include(op->predicate); + op->predicate.accept(this); + op->index.accept(this); if (store_names.contains(op->name)) { // If we are in a designated store and op->value has a let binding // that uses one of the store_names, we found a lifted let. - ScopedValue old_inside_store(inside_store, op->name); - include(op->value); + ScopedValue old_inside_store(inside_store, op->name); + op->value.accept(this); } else { - include(op->value); + op->value.accept(this); } - include(op->index); } - string inside_store; + std::string inside_store; const Scope &store_names; Scope let_bindings; }; @@ -179,7 +179,7 @@ class FindAtomicLetBindings : public IRGraphVisitor { /** Clear out the Atomic node's mutex usages if it doesn't need one. */ class RemoveUnnecessaryMutexUse : public IRMutator { public: - set remove_mutex_lock_names; + std::set remove_mutex_lock_names; protected: using IRMutator::visit; @@ -200,30 +200,30 @@ class RemoveUnnecessaryMutexUse : public IRMutator { remove_mutex_lock_names.insert(op->mutex_name); Stmt body = mutate(op->body); return Atomic::make(op->producer_name, - string(), + std::string{}, std::move(body)); } } }; /** Find Store inside an Atomic that matches the provided store_names. */ -class FindStoreInAtomicMutex : public IRGraphVisitor { +class FindStoreInAtomicMutex : public IRVisitor { public: - using IRGraphVisitor::visit; + using IRVisitor::visit; FindStoreInAtomicMutex(const std::set &store_names) : store_names(store_names) { } bool found = false; - string producer_name; - string mutex_name; + std::string producer_name; + std::string mutex_name; protected: void visit(const Atomic *op) override { if (!found && !op->mutex_name.empty()) { ScopedValue old_in_atomic_mutex(in_atomic_mutex, true); - include(op->body); + op->body.accept(this); if (found) { // We found a Store inside Atomic with matching name, // record the mutex information. 
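FindAtomicLetBindings is what decides between the lock-free path and the mutex path. A hedged, user-level illustration (my own example, not code from this patch): a tuple reduction such as an argmax reads its previous value, SplitTuple lifts that read into a let binding over the Store, and that lifted let is exactly what this visitor detects, so the Atomic keeps its per-element mutex.

```cpp
#include "Halide.h"
using namespace Halide;

void atomic_argmax_sketch() {
    ImageParam in(Float(32), 1, "in");  // assumed input
    Func arg_max("arg_max");
    RDom r(0, 1000);

    arg_max() = Tuple(0, in(0));
    arg_max() = Tuple(select(in(r) > arg_max()[1], r, arg_max()[0]),
                      max(in(r), arg_max()[1]));

    // The update reads arg_max's own previous value, so after SplitTuple the
    // Atomic body contains a lifted let loading the store location; the
    // mutex therefore survives RemoveUnnecessaryMutexUse.
    arg_max.update().atomic().parallel(r);
}
```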
@@ -231,7 +231,7 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { mutex_name = op->mutex_name; } } else { - include(op->body); + op->body.accept(this); } } @@ -241,11 +241,11 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { found = true; } } - IRGraphVisitor::visit(op); + IRVisitor::visit(op); } bool in_atomic_mutex = false; - const set &store_names; + const std::set &store_names; }; /** Replace the indices in the Store nodes with the specified variable. */ @@ -276,26 +276,32 @@ class ReplaceStoreIndexWithVar : public IRMutator { /** Add mutex allocation & lock & unlock if required. */ class AddAtomicMutex : public IRMutator { public: - AddAtomicMutex(const map &env) - : env(env) { + AddAtomicMutex(const std::vector &o) { + for (const Function &f : o) { + outputs.emplace(f.name(), f); + } } protected: using IRMutator::visit; - const map &env; - // The set of producers that have allocated a mutex buffer - set allocated_mutexes; + // Maps from a producer name to a mutex name, for all encountered atomic + // nodes. + Scope needs_mutex_allocation; - Stmt allocate_mutex(const string &mutex_name, Expr extent, Stmt body) { + // Pipeline outputs + std::map outputs; + + Stmt allocate_mutex(const std::string &mutex_name, Expr extent, Stmt body) { Expr mutex_array = Call::make(type_of(), "halide_mutex_array_create", {std::move(extent)}, Call::Extern); + // Allocate a scalar of halide_mutex_array. // This generates halide_mutex_array mutex[1]; body = Allocate::make(mutex_name, - Handle(), + type_of(), MemoryType::Stack, {}, const_true(), @@ -309,37 +315,44 @@ class AddAtomicMutex : public IRMutator { // If this Allocate node is allocating a buffer for a producer, // and there is a Store node inside of an Atomic node requiring mutex lock // matching the name of the Allocate, allocate a mutex lock. - set store_names{op->name}; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + Stmt body = mutate(op->body); + + std::string producer_name; + if (ends_with(op->name, ".0")) { + producer_name = op->name.substr(0, op->name.size() - 2); + } else { + producer_name = op->name; } - allocated_mutexes.insert(finder.mutex_name); + if (const std::string *mutex_name = needs_mutex_allocation.find(producer_name)) { + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (const Expr &e : op->extents) { + extent = extent * e; + } - const string &mutex_name = finder.mutex_name; - Stmt body = mutate(op->body); - Expr extent = Expr(1); - for (const Expr &e : op->extents) { - extent = extent * e; + body = allocate_mutex(*mutex_name, extent, body); + + // At this stage in lowering it should be impossible to have an + // allocation that shadows the name of an outer allocation, but may as + // well handle it anyway by using a scope and popping at each allocate + // node. 
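A short standalone sketch (illustrative, not code from the patch) of why the mutex-array extent above is now accumulated in uint64_t, per the "to handle LargeBuffers" comment: the product of a large allocation's extents can exceed INT32_MAX.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // e.g. a 2-D allocation of 65536 x 65536 elements
    std::vector<uint64_t> extents = {65536, 65536};
    uint64_t extent = 1;  // same seed as the cast to uint64_t above
    for (uint64_t e : extents) {
        extent *= e;
    }
    printf("extent = %llu (INT32_MAX = %d)\n",
           (unsigned long long)extent, INT32_MAX);  // 4294967296 vs 2147483647
    return 0;
}
```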
+ needs_mutex_allocation.pop(producer_name); + } + + if (body.same_as(op->body)) { + return op; + } else { + return Allocate::make(op->name, + op->type, + op->memory_type, + op->extents, + op->condition, + std::move(body), + op->new_expr, + op->free_function, + op->padding); } - body = allocate_mutex(mutex_name, extent, body); - return Allocate::make(op->name, - op->type, - op->memory_type, - op->extents, - op->condition, - std::move(body), - op->new_expr, - op->free_function, - op->padding); } Stmt visit(const ProducerConsumer *op) override { @@ -348,50 +361,35 @@ class AddAtomicMutex : public IRMutator { // buffer at the producer node. if (!op->is_producer) { - // This is a consumer. + // This is a consumer return IRMutator::visit(op); } - // Find the corresponding output. - auto func_it = env.find(op->name); - if (func_it == env.end()) { - // Not an output. - return IRMutator::visit(op); - } - Func f = Func(func_it->second); - if (f.output_buffers().empty()) { - // Not an output. + auto it = outputs.find(op->name); + if (it == outputs.end()) { + // Not an output return IRMutator::visit(op); } - set store_names; - for (const auto &buffer : f.output_buffers()) { - store_names.insert(buffer.name()); - } + Function f = it->second; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } + Stmt body = mutate(op->body); - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + if (const std::string *mutex_name = needs_mutex_allocation.find(it->first)) { + // All output buffers in a Tuple have the same extent. + OutputImageParam output_buffer = Func(f).output_buffers()[0]; + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (int i = 0; i < output_buffer.dimensions(); i++) { + extent *= output_buffer.dim(i).extent(); + } + body = allocate_mutex(*mutex_name, extent, body); } - allocated_mutexes.insert(finder.mutex_name); - - // We assume all output buffers in a Tuple have the same extent. - OutputImageParam output_buffer = f.output_buffers()[0]; - Expr extent = Expr(1); - for (int i = 0; i < output_buffer.dimensions(); i++) { - extent = extent * output_buffer.dim(i).extent(); + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } - Stmt body = mutate(op->body); - body = allocate_mutex(finder.mutex_name, extent, body); - return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } Stmt visit(const Atomic *op) override { @@ -414,7 +412,7 @@ class AddAtomicMutex : public IRMutator { // Lift the index outside of the atomic node. // This is for avoiding side-effects inside those expressions // being evaluated twice. 
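A plain-C++ sketch of the evaluate-once guarantee the comment above is after; `next_slot` is an invented stand-in for a store index with side effects, and the local `t` plays the role of the lifted let (the unique_name('t') variable in the hunk that follows).

```cpp
#include <cassert>

static int evaluations = 0;
static int next_slot() {  // stand-in for a store index with side effects
    ++evaluations;
    return 7;
}

int main() {
    int t = next_slot();  // the "let": the index is evaluated exactly once
    int lock_index = t;   // value handed to halide_mutex_array_lock()
    int store_index = t;  // value used by the Store node itself
    assert(evaluations == 1);
    assert(lock_index == store_index);
    return 0;
}
```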
- string name = unique_name('t'); + std::string name = unique_name('t'); index_let = index; index = Variable::make(index.type(), name); body = ReplaceStoreIndexWithVar(op->producer_name, index).mutate(body); @@ -444,17 +442,21 @@ class AddAtomicMutex : public IRMutator { internal_assert(index.as() != nullptr); ret = LetStmt::make(index.as()->name, index_let, ret); } + needs_mutex_allocation.push(op->producer_name, op->mutex_name); + return ret; } }; } // namespace -Stmt add_atomic_mutex(Stmt s, const map &env) { +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs) { CheckAtomicValidity check; s.accept(&check); - s = RemoveUnnecessaryMutexUse().mutate(s); - s = AddAtomicMutex(env).mutate(s); + if (check.any_atomic) { + s = RemoveUnnecessaryMutexUse().mutate(s); + s = AddAtomicMutex(outputs).mutate(s); + } return s; } diff --git a/src/AddAtomicMutex.h b/src/AddAtomicMutex.h index c27b0346f349..5b11de621e97 100644 --- a/src/AddAtomicMutex.h +++ b/src/AddAtomicMutex.h @@ -23,7 +23,7 @@ namespace Internal { class Function; -Stmt add_atomic_mutex(Stmt s, const std::map &env); +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs); } // namespace Internal } // namespace Halide diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 5965303197bc..21ca06e07285 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1152,7 +1152,7 @@ class BoundsInference : public IRMutator { map stage_name_to_func; if (producing >= 0) { - fused_group.insert(make_pair(f.name(), stage_index)); + fused_group.emplace(f.name(), stage_index); } if (!no_pipelines && producing >= 0 && !f.has_extern_definition()) { @@ -1164,12 +1164,12 @@ class BoundsInference : public IRMutator { if (!((pair.func_1 == stages[producing].name) && ((int)pair.stage_1 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_1, pair.stage_1, var)) { - fused_group.insert(make_pair(pair.func_1, pair.stage_1)); + fused_group.emplace(pair.func_1, pair.stage_1); } if (!((pair.func_2 == stages[producing].name) && ((int)pair.stage_2 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_2, pair.stage_2, var)) { - fused_group.insert(make_pair(pair.func_2, pair.stage_2)); + fused_group.emplace(pair.func_2, pair.stage_2); } } diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 78fc4224fb61..697b9200fa33 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -610,7 +610,11 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options options.UseInitArray = true; options.FloatABIType = use_soft_float_abi ? 
llvm::FloatABI::Soft : llvm::FloatABI::Hard; +#if LLVM_VERSION >= 190 + options.MCOptions.X86RelaxRelocations = false; +#else options.RelaxELFRelocations = false; +#endif options.MCOptions.ABIName = mabi; } diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 551acfcdebf2..0a1403362621 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -504,12 +504,14 @@ void Deserializer::deserialize_function(const Serialize::Func *function, Functio const std::vector trace_tags = deserialize_vector(function->trace_tags(), &Deserializer::deserialize_string); + const bool no_profiling = function->no_profiling(); const bool frozen = function->frozen(); hl_function.update_with_deserialization(name, origin_name, output_types, required_types, required_dim, args, func_schedule, init_def, updates, debug_file, output_buffers, extern_arguments, extern_function_name, name_mangling, extern_function_device_api, extern_proxy_expr, - trace_loads, trace_stores, trace_realizations, trace_tags, frozen); + trace_loads, trace_stores, trace_realizations, trace_tags, + no_profiling, frozen); } Stmt Deserializer::deserialize_stmt(Serialize::Stmt type_code, const void *stmt) { diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index f48bd75c37a2..92bcf3870d5d 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -81,19 +81,19 @@ class FlattenRamps : public IRMutator { // If they are, we'll have a full vector of const_indices if ((int)const_indices.size() == lanes) { - // Compute the stride for the underlying strided load - int stride = 0; - for (int c : const_indices) { - stride = (int)gcd(stride, c); - } - for (int &c : const_indices) { - c /= stride; + int stride = 0, extent = 1; + if (max_constant_offset > 0) { + for (int c : const_indices) { + stride = (int)gcd(stride, c); + } + for (int &c : const_indices) { + c /= stride; + } + // Compute the number of elements loaded + extent = (int)((max_constant_offset / stride) + 1); } - // Compute the number of elements loaded - int extent = (int)((max_constant_offset / stride) + 1); - // If we're gathering from a very large range, it // might be better to just do the gather rather than // doing a big dense load and then shuffling. We @@ -105,12 +105,22 @@ class FlattenRamps : public IRMutator { // in the schedule somehow. const int max_unused_lane_factor = 4; if (extent < max_unused_lane_factor * lanes) { - Expr dense_index = Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); - Expr dense_load = - Load::make(op->type.with_lanes(extent), op->name, dense_index, - op->image, op->param, - const_true(extent), ModulusRemainder{}); - return Shuffle::make({dense_load}, const_indices); + if (max_constant_offset == 0) { + // It's a load of a broadcast. 
Convert it to a broadcast of a load + Expr load = Load::make(op->type.element_of(), op->name, min_lane, + op->image, op->param, + const_true(), ModulusRemainder{}); + return Broadcast::make(load, lanes); + } else { + // Turn it into a dense load and a shuffle + Expr dense_index = + Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); + Expr dense_load = + Load::make(op->type.with_lanes(extent), op->name, dense_index, + op->image, op->param, + const_true(extent), ModulusRemainder{}); + return Shuffle::make({dense_load}, const_indices); + } } } } diff --git a/src/Func.cpp b/src/Func.cpp index 7e0995cc33b5..1f480c99983c 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -3037,6 +3037,11 @@ Func &Func::add_trace_tag(const std::string &trace_tag) { return *this; } +Func &Func::no_profiling() { + func.do_not_profile(); + return *this; +} + void Func::debug_to_file(const string &filename) { invalidate_cache(); func.debug_file() = filename; diff --git a/src/Func.h b/src/Func.h index d4074ee18cc6..bced13f79481 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2559,6 +2559,15 @@ class Func { */ Func &add_trace_tag(const std::string &trace_tag); + /** Marks this function as a function that should not be profiled + * when using the target feature Profile or ProfileByTimer. + * This is useful when this function is does too little work at once + * such that the overhead of setting the profiling token might + * become significant, or that the measured time is not representative + * due to modern processors (instruction level parallelism, out-of-order + * execution). */ + Func &no_profiling(); + /** Get a handle on the internal halide function that this Func * represents. Useful if you want to do introspection on Halide * functions */ diff --git a/src/Function.cpp b/src/Function.cpp index 88f5b851e986..b72a39e1c90a 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -110,6 +110,8 @@ struct FunctionContents { bool trace_loads = false, trace_stores = false, trace_realizations = false; std::vector trace_tags; + bool no_profiling = false; + bool frozen = false; void accept(IRVisitor *visitor) const { @@ -352,6 +354,7 @@ void Function::update_with_deserialization(const std::string &name, bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen) { contents->name = name; contents->origin_name = origin_name; @@ -373,6 +376,7 @@ void Function::update_with_deserialization(const std::string &name, contents->trace_stores = trace_stores; contents->trace_realizations = trace_realizations; contents->trace_tags = trace_tags; + contents->no_profiling = no_profiling; contents->frozen = frozen; } @@ -511,6 +515,7 @@ void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const copy->trace_stores = contents->trace_stores; copy->trace_realizations = contents->trace_realizations; copy->trace_tags = contents->trace_tags; + copy->no_profiling = contents->no_profiling; copy->frozen = contents->frozen; copy->output_buffers = contents->output_buffers; copy->func_schedule = contents->func_schedule.deep_copy(copied_map); @@ -1141,10 +1146,6 @@ const std::vector &Function::get_trace_tags() const { return contents->trace_tags; } -void Function::freeze() { - contents->frozen = true; -} - void Function::lock_loop_levels() { auto &schedule = contents->func_schedule; schedule.compute_level().lock(); @@ -1168,6 +1169,16 @@ void Function::lock_loop_levels() { } } +void Function::do_not_profile() { + contents->no_profiling = true; +} +bool 
Function::should_not_profile() const { + return contents->no_profiling; +} + +void Function::freeze() { + contents->frozen = true; +} bool Function::frozen() const { return contents->frozen; } diff --git a/src/Function.h b/src/Function.h index 66b62a01f66b..49f68805d61e 100644 --- a/src/Function.h +++ b/src/Function.h @@ -88,6 +88,7 @@ class Function { bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen); /** Get a handle on the halide function contents that this Function @@ -290,6 +291,12 @@ class Function { * cannot be mutated further. */ void lock_loop_levels(); + /** Mark the function as too small for meaningful profiling. */ + void do_not_profile(); + + /** Check if the function is marked as one that should not be profiled. */ + bool should_not_profile() const; + /** Mark function as frozen, which means it cannot accept new * definitions. */ void freeze(); diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index deabd95d1d1b..f11fa3348399 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1685,6 +1685,14 @@ class EliminateInterleaves : public IRMutator { return true; } + // Indicates the minimum Hexagon Vector Extension (HVX) target version required for using these instructions. + enum class HvxTarget { + v62orLater, // Use for Hexagon v62 target or later + v65orLater, // Use for Hexagon v65 target or later + v66orLater, // Use for Hexagon v66 target or later + }; + HvxTarget hvx_target; + Expr visit(const Call *op) override { vector args(op->args); @@ -1702,27 +1710,27 @@ class EliminateInterleaves : public IRMutator { // does not deinterleave, and then opportunistically select // the interleaving alternative when we can cancel out to the // interleave. - static std::map deinterleaving_alts = { - {"halide.hexagon.pack.vh", "halide.hexagon.trunc.vh"}, - {"halide.hexagon.pack.vw", "halide.hexagon.trunc.vw"}, - {"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"}, - {"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"}, - {"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"}, - {"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"}, - {"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"}, - {"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"}, - {"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"}, + static std::map> deinterleaving_alts = { + {"halide.hexagon.pack.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc.vh"}}, + {"halide.hexagon.pack.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc.vw"}}, + {"halide.hexagon.packhi.vh", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vh"}}, + {"halide.hexagon.packhi.vw", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vw"}}, + {"halide.hexagon.pack_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc_satub.vh"}}, + {"halide.hexagon.pack_satub.vuh", {HvxTarget::v65orLater, "halide.hexagon.trunc_satub.vuh"}}, + {"halide.hexagon.pack_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_sath.vw"}}, + {"halide.hexagon.pack_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vw"}}, + {"halide.hexagon.pack_satuh.vuw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vuw"}}, }; // The reverse mapping of the above. 
- static std::map interleaving_alts = { - {"halide.hexagon.trunc.vh", "halide.hexagon.pack.vh"}, - {"halide.hexagon.trunc.vw", "halide.hexagon.pack.vw"}, - {"halide.hexagon.trunclo.vh", "halide.hexagon.packhi.vh"}, - {"halide.hexagon.trunclo.vw", "halide.hexagon.packhi.vw"}, - {"halide.hexagon.trunc_satub.vh", "halide.hexagon.pack_satub.vh"}, - {"halide.hexagon.trunc_sath.vw", "halide.hexagon.pack_sath.vw"}, - {"halide.hexagon.trunc_satuh.vw", "halide.hexagon.pack_satuh.vw"}, + static std::map> interleaving_alts = { + {"halide.hexagon.trunc.vh", {HvxTarget::v62orLater, "halide.hexagon.pack.vh"}}, + {"halide.hexagon.trunc.vw", {HvxTarget::v62orLater, "halide.hexagon.pack.vw"}}, + {"halide.hexagon.trunclo.vh", {HvxTarget::v62orLater, "halide.hexagon.packhi.vh"}}, + {"halide.hexagon.trunclo.vw", {HvxTarget::v62orLater, "halide.hexagon.packhi.vw"}}, + {"halide.hexagon.trunc_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.pack_satub.vh"}}, + {"halide.hexagon.trunc_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_sath.vw"}}, + {"halide.hexagon.trunc_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_satuh.vw"}}, }; if (is_native_deinterleave(op) && yields_interleave(args[0])) { @@ -1738,7 +1746,8 @@ class EliminateInterleaves : public IRMutator { op->func, op->value_index, op->image, op->param); // Add the interleave back to the result of the call. return native_interleave(expr); - } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && + } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && hvx_target >= deinterleaving_alts[op->name].first && + yields_removable_interleave(args)) { // This call has a deinterleaving alternative, and the // arguments are interleaved, so we should use the @@ -1746,14 +1755,14 @@ class EliminateInterleaves : public IRMutator { for (Expr &i : args) { i = remove_interleave(i); } - return Call::make(op->type, deinterleaving_alts[op->name], args, op->call_type); - } else if (interleaving_alts.count(op->name) && is_native_deinterleave(args[0])) { + return Call::make(op->type, deinterleaving_alts[op->name].second, args, op->call_type); + } else if (interleaving_alts.count(op->name) && hvx_target >= interleaving_alts[op->name].first && is_native_deinterleave(args[0])) { // This is an interleaving alternative with a // deinterleave, which can be generated when we // deinterleave storage. Revert back to the interleaving // op so we can remove the deinterleave. Expr arg = args[0].as()->args[0]; - return Call::make(op->type, interleaving_alts[op->name], {arg}, op->call_type, + return Call::make(op->type, interleaving_alts[op->name].second, {arg}, op->call_type, op->func, op->value_index, op->image, op->param); } else if (changed) { return Call::make(op->type, op->name, args, op->call_type, @@ -1896,8 +1905,15 @@ class EliminateInterleaves : public IRMutator { using IRMutator::visit; public: - EliminateInterleaves(int native_vector_bytes) + EliminateInterleaves(const Target &t, int native_vector_bytes) : native_vector_bits(native_vector_bytes * 8), alignment_analyzer(native_vector_bytes) { + if (t.features_any_of({Target::HVX_v65})) { + hvx_target = HvxTarget::v65orLater; + } else if (t.features_any_of({Target::HVX_v66})) { + hvx_target = HvxTarget::v66orLater; + } else { + hvx_target = HvxTarget::v62orLater; + } } }; @@ -2233,7 +2249,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { << s << "\n"; // Try to eliminate any redundant interleave/deinterleave pairs. 
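A hedged sketch of the classification the new EliminateInterleaves constructor performs, written here as a free function for clarity: the target's HVX feature flags are mapped onto the minimum-version buckets that gate the deinterleaving_alts/interleaving_alts tables above.

```cpp
#include "Halide.h"
using Halide::Target;

enum class HvxTarget { v62orLater, v65orLater, v66orLater };

// Same feature-to-bucket mapping as the constructor in this hunk.
HvxTarget classify_hvx(const Target &t) {
    if (t.features_any_of({Target::HVX_v65})) {
        return HvxTarget::v65orLater;
    } else if (t.features_any_of({Target::HVX_v66})) {
        return HvxTarget::v66orLater;
    }
    return HvxTarget::v62orLater;  // baseline for any other HVX target
}

// e.g. only a target classified as v65orLater may rewrite
// halide.hexagon.trunc_satub.vuh into its interleaving pack alternative.
```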
- s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s); + s = EliminateInterleaves(t, t.natural_vector_size(Int(8))).mutate(s); debug(4) << "Hexagon: Lowering after EliminateInterleaves\n" << s << "\n"; @@ -2246,4 +2262,4 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { } } // namespace Internal -} // namespace Halide +} // namespace Halide \ No newline at end of file diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index bc03dd124d9a..a186be1874d7 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1112,11 +1112,12 @@ void IRPrinter::visit(const VectorReduce *op) { void IRPrinter::visit(const Atomic *op) { if (op->mutex_name.empty()) { - stream << get_indent() << "atomic {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ") {\n"; } else { - stream << get_indent() << "atomic ("; - stream << op->mutex_name; - stream << ") {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ", " + << op->mutex_name << ") {\n"; } indent += 2; print(op->body); diff --git a/src/Lower.cpp b/src/Lower.cpp index 3b357eb3061e..f092e2e711ef 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -280,10 +280,7 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // Vulkan relies on GPU var canonicalization occurring before - // storage flattening. - if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan)) { + if (t.has_gpu_feature()) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); @@ -299,7 +296,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after storage flattening:", s); debug(1) << "Adding atomic mutex allocation...\n"; - s = add_atomic_mutex(s, env); + s = add_atomic_mutex(s, outputs); log("Lowering after adding atomic mutex allocation:", s); debug(1) << "Unpacking buffer arguments...\n"; @@ -408,7 +405,7 @@ void lower_impl(const vector &output_funcs, if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { debug(1) << "Injecting profiling...\n"; - s = inject_profiling(s, pipeline_name); + s = inject_profiling(s, pipeline_name, env); log("Lowering after injecting profiling:", s); } diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 2be058b3c8a6..414578299df6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -3,7 +3,7 @@ #include #include "CodeGen_Internal.h" -#include "ExprUsesVar.h" +#include "Function.h" #include "IRMutator.h" #include "IROperator.h" #include "InjectHostDevBufferCopies.h" @@ -71,13 +71,14 @@ class InjectProfiling : public IRMutator { vector stack; // What produce nodes are we currently inside of. string pipeline_name; + const map &env; bool in_fork = false; bool in_parallel = false; bool in_leaf_task = false; - InjectProfiling(const string &pipeline_name) - : pipeline_name(pipeline_name) { + InjectProfiling(const string &pipeline_name, const map &env) + : pipeline_name(pipeline_name), env(env) { stack.push_back(get_func_id("overhead")); // ID 0 is treated specially in the runtime as overhead internal_assert(stack.back() == 0); @@ -119,10 +120,28 @@ class InjectProfiling : public IRMutator { bool profiling_memory = true; // Strip down the tuple name, e.g. 
f.0 into f - string normalize_name(const string &name) { - vector v = split_string(name, "."); - internal_assert(!v.empty()); - return v[0]; + string normalize_name(const string &name) const { + size_t idx = name.find('.'); + if (idx != std::string::npos) { + internal_assert(idx != 0); + return name.substr(0, idx); + } else { + return name; + } + } + + Function lookup_function(const string &name) const { + auto it = env.find(name); + if (it != env.end()) { + return it->second; + } + string norm_name = normalize_name(name); + it = env.find(norm_name); + if (it != env.end()) { + return it->second; + } + internal_error << "No function in the environment found for name '" << name << "'.\n"; + return {}; } int get_func_id(const string &name) { @@ -185,7 +204,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Allocate *op) override { - int idx = get_func_id(op->name); auto [new_extents, changed] = mutate_with_changes(op->extents); Expr condition = mutate(op->condition); @@ -199,6 +217,13 @@ class InjectProfiling : public IRMutator { // always conditionally false. remove_dead_allocations() is called after // inject_profiling() so this is a possible scenario. if (!is_const_zero(size) && on_stack) { + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. + } else { + idx = get_func_id(op->name); + } const uint64_t *int_size = as_const_uint(size); internal_assert(int_size != nullptr); // Stack size is always a const int func_stack_current[idx] += *int_size; @@ -212,6 +237,7 @@ class InjectProfiling : public IRMutator { vector tasks; bool track_heap_allocation = !is_const_zero(size) && !on_stack && profiling_memory; if (track_heap_allocation) { + int idx = get_func_id(op->name); debug(3) << " Allocation on heap: " << op->name << "(" << size << ") in pipeline " << pipeline_name << "\n"; @@ -245,8 +271,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Free *op) override { - int idx = get_func_id(op->name); - AllocSize alloc = func_alloc_sizes.get(op->name); internal_assert(alloc.size.type() == UInt(64)); func_alloc_sizes.pop(op->name); @@ -256,6 +280,7 @@ class InjectProfiling : public IRMutator { if (!is_const_zero(alloc.size)) { if (!alloc.on_stack) { if (profiling_memory) { + int idx = get_func_id(op->name); debug(3) << " Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n"; vector tasks{ @@ -271,6 +296,13 @@ class InjectProfiling : public IRMutator { const uint64_t *int_size = as_const_uint(alloc.size); internal_assert(int_size != nullptr); + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. 
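The rewritten normalize_name() above just keeps everything before the first dot, without the split_string allocation. A standalone check of the two cases it must handle (an illustration, not a test from the patch):

```cpp
#include <cassert>
#include <string>

static std::string normalize_name(const std::string &name) {
    size_t idx = name.find('.');
    return idx == std::string::npos ? name : name.substr(0, idx);
}

int main() {
    assert(normalize_name("f.0") == "f");      // tuple component -> producer name
    assert(normalize_name("conv") == "conv");  // bare names pass through
    return 0;
}
```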
+ } else { + idx = get_func_id(op->name); + } func_stack_current[idx] -= *int_size; debug(3) << " Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n"; @@ -283,11 +315,19 @@ class InjectProfiling : public IRMutator { int idx; Stmt body; if (op->is_producer) { - idx = get_func_id(op->name); - stack.push_back(idx); - Stmt set_current = set_current_func(idx); - body = Block::make(set_current, mutate(op->body)); - stack.pop_back(); + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + body = mutate(op->body); + if (body.same_as(op->body)) { + return op; + } + } else { + idx = get_func_id(op->name); + stack.push_back(idx); + Stmt set_current = set_current_func(idx); + body = Block::make(set_current, mutate(op->body)); + stack.pop_back(); + } } else { // At the beginning of the consume step, set the current task // back to the outer one. @@ -498,8 +538,8 @@ class InjectProfiling : public IRMutator { } // namespace -Stmt inject_profiling(Stmt s, const string &pipeline_name) { - InjectProfiling profiling(pipeline_name); +Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map &env) { + InjectProfiling profiling(pipeline_name, env); s = profiling.mutate(s); int num_funcs = (int)(profiling.indices.size()); diff --git a/src/Profiling.h b/src/Profiling.h index a6040b9160af..afaa47fe6d6e 100644 --- a/src/Profiling.h +++ b/src/Profiling.h @@ -23,6 +23,7 @@ * mandelbrot: 0.006444ms (10%) peak: 505344 num: 104000 avg: 5376 * argmin: 0.027715ms (46%) stack: 20 */ +#include #include #include "Expr.h" @@ -30,6 +31,8 @@ namespace Halide { namespace Internal { +class Function; + /** Take a statement representing a halide pipeline insert * high-resolution timing into the generated code (via spawning a * thread that acts as a sampling profiler); summaries of execution @@ -37,7 +40,7 @@ namespace Internal { * storage flattening, but after all bounds inference. 
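On the user side, the profiling exclusion added in this patch is driven from the Func::no_profiling() scheduling call documented in Func.h above. A hedged usage sketch; the pipeline and names are invented for illustration:

```cpp
#include "Halide.h"
using namespace Halide;

void no_profiling_schedule_sketch() {
    ImageParam in(Float(32), 2, "in");
    Var x("x"), y("y");
    Func blur_x("blur_x"), blur_y("blur_y");

    Func clamped = BoundaryConditions::repeat_edge(in);
    blur_x(x, y) = (clamped(x - 1, y) + clamped(x, y) + clamped(x + 1, y)) / 3.0f;
    blur_y(x, y) = (blur_x(x, y - 1) + blur_x(x, y) + blur_x(x, y + 1)) / 3.0f;

    // blur_x does very little work per scanline, so keep it out of the
    // profiler report; InjectProfiling then attributes its stack usage to
    // the deepest enclosing profiled producer instead.
    blur_x.compute_at(blur_y, y).no_profiling();
    blur_y.compute_root();

    Target t = get_host_target().with_feature(Target::Profile);
    (void)t;  // blur_y.compile_jit(t) would produce a report without blur_x
}
```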
* */ -Stmt inject_profiling(Stmt, const std::string &); +Stmt inject_profiling(Stmt, const std::string &, const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 144d79af7e5e..c1cb3a6d1193 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1029,6 +1029,7 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde for (const auto &tag : function.get_trace_tags()) { trace_tags_serialized.push_back(serialize_string(builder, tag)); } + const bool no_profiling = function.should_not_profile(); const bool frozen = function.frozen(); auto func = Serialize::CreateFunc(builder, name_serialized, @@ -1050,7 +1051,9 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde trace_loads, trace_stores, trace_realizations, - builder.CreateVector(trace_tags_serialized), frozen); + builder.CreateVector(trace_tags_serialized), + no_profiling, + frozen); return func; } diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index ba4cc9b8acca..5860a7e50d0f 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -535,13 +535,9 @@ class FlattenDimensions : public IRMutator { Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); it->loop_vars.push(op->name, loop_bounds); } - bool old_in_gpu = in_gpu; - if (op->for_type == ForType::GPUBlock || - op->for_type == ForType::GPUThread) { - in_gpu = true; - } + + ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); - in_gpu = old_in_gpu; for (auto &p : hoisted_storages) { p.loop_vars.pop(op->name); diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 2ce325538a86..e3cc2ec5e825 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -1359,7 +1359,7 @@ Partitioner::Partitioner(const map &_pipeline_bounds, for (int s = 0; s < num_stages; s++) { FStage stg(f.second, s); Group g(stg, {stg}); - groups.insert(make_pair(stg, g)); + groups.emplace(stg, g); } } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 01a987b6f430..efc465cbee82 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -15,7 +15,7 @@ enum SerializationVersionMinor: int { Value = 0 } enum SerializationVersionPatch: int { - Value = 0 + Value = 1 } // from src/IR.cpp @@ -713,6 +713,7 @@ table Func { trace_stores: bool = false; trace_realizations: bool = false; trace_tags: [string]; + no_profiling: bool = false; frozen: bool = false; } diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 9408c59da167..1d0843be0329 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -195,7 +195,7 @@ extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mute /** Functions for constructing/destroying/locking/unlocking arrays of mutexes. 
*/ struct halide_mutex_array; //@{ -extern struct halide_mutex_array *halide_mutex_array_create(int sz); +extern struct halide_mutex_array *halide_mutex_array_create(uint64_t sz); extern void halide_mutex_array_destroy(void *user_context, void *array); extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); diff --git a/src/runtime/fake_thread_pool.cpp b/src/runtime/fake_thread_pool.cpp index 9c3cfddc5a47..531a16d1312e 100644 --- a/src/runtime/fake_thread_pool.cpp +++ b/src/runtime/fake_thread_pool.cpp @@ -96,7 +96,7 @@ WEAK void halide_mutex_unlock(halide_mutex *mutex) { // (e.g. correctness/multiple_scatter). Since we don't have threads, we don't // need to mutex to do anything, but returning a null would trigger an error // condition that would be misrepoted as out-of-memory. -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { return &halide_fake_mutex_array; } diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index feee56a4e531..89b1a929e79b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,10 +55,11 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,13 +87,13 @@ class BlockAllocator { int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure - BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *reserve_block_entry(void *user_context, const MemoryRequest &request); // Locates the "best-fit" block entry for the requested size, or nullptr if none was found - BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *find_block_entry(void *user_context, const MemoryRequest &request); - // Creates a new block entry and int the list - BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + // Creates a new block entry and adds it tos the list + BlockEntry *create_block_entry(void *user_context, const MemoryRequest &request); // Releases the block entry from being used, and makes it available for further allocations int release_block_entry(void 
*user_context, BlockEntry *block_entry); @@ -113,7 +114,7 @@ class BlockAllocator { bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; // Returns true if the given block is suitable for the request allocation - bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const; Config config; LinkedList block_list; @@ -162,7 +163,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "caching=" << halide_memory_caching_name(request.properties.caching) << " " << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + // Reserve a block entry for use + BlockEntry *block_entry = reserve_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" << (int32_t)(request.size) << " bytes)\n"; @@ -173,11 +175,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r halide_abort_if_false(user_context, block != nullptr); halide_abort_if_false(user_context, block->allocator != nullptr); + // Reserve an initial memory region for the block MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { // Unable to reserve region in an existing block ... create a new block and try again. - block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); + block_entry = create_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" << (int32_t)(request.size) << " bytes)\n"; @@ -299,8 +302,8 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { - if (!is_compatible_block(block, properties)) { +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const { + if (!is_compatible_block(block, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" << "block_resource=" << (void *)block << " " @@ -309,16 +312,16 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " - << "request_size=" << (uint32_t)size << " " - << "request_usage=" << halide_memory_usage_name(properties.usage) << " " - << "request_caching=" << halide_memory_caching_name(properties.caching) << " " - << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; } - if (dedicated && (block->reserved > 0)) { + if (request.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" << "block_resource=" << (void *)block << " " @@ -340,7 +343,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } size_t available = (block->memory.size - block->reserved); - if (available >= size) { + if (available >= request.size) { return true; } @@ -348,23 +351,23 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::find_block_entry(void *user_context, const MemoryRequest &request) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); - if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { + if (is_block_suitable_for_request(user_context, block, request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: found suitable block (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)block << " " << "block_size=" << (uint32_t)block->memory.size << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_entry; } @@ -375,37 +378,37 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: couldn't find suitable block! (" << "user_context=" << (void *)(user_context) << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } return block_entry; } BlockAllocator::BlockEntry * -BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::reserve_block_entry(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: reserving block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + BlockEntry *block_entry = find_block_entry(user_context, request); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: creating block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - block_entry = create_block_entry(user_context, properties, size, dedicated); + block_entry = create_block_entry(user_context, request); } if (block_entry) { @@ -449,7 +452,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator } BlockAllocator::BlockEntry * -BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::create_block_entry(void *user_context, const MemoryRequest &request) { if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" << (int32_t)(config.maximum_pool_size) << " bytes or " @@ -476,12 +479,16 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif + // Constrain the request to the a valid block allocation + MemoryRequest block_request = request; + conform(user_context, &block_request); + + // Create the block resource itself BlockResource *block = static_cast(block_entry->value); - block->memory.size = constrain_requested_size(size); + block->memory.size = block_request.size; block->memory.handle = nullptr; - block->memory.properties = properties; - block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); - block->memory.dedicated = dedicated; + block->memory.properties = block_request.properties; + block->memory.dedicated = block_request.dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); alloc_memory_block(user_context, block); @@ -561,6 +568,33 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { return actual_size; } +int BlockAllocator::conform(void *user_context, MemoryRequest *request) const { + + request->properties.nearest_multiple = max(config.nearest_multiple, request->properties.nearest_multiple); + + if (request->properties.nearest_multiple) { + size_t nm = request->properties.nearest_multiple; + request->size = (((request->size + nm - 1) / nm) * nm); // round up to nearest multiple + } + + if (config.minimum_block_size) { + request->size = ((request->size < config.minimum_block_size) ? + config.minimum_block_size : + request->size); + } + if (config.maximum_block_size) { + request->size = ((request->size > config.maximum_block_size) ? 
+ config.maximum_block_size : + request->size); + } + + if (allocators.block.conform) { + return allocators.block.conform(user_context, request); + } + + return 0; +} + bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { if (properties.caching != MemoryCaching::DefaultCaching) { if (properties.caching != block->memory.properties.caching) { diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index d41fa57304fb..0be6041519a1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -202,18 +202,22 @@ struct HalideSystemAllocatorFns { typedef int (*AllocateBlockFn)(void *, MemoryBlock *); typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*ConformBlockRequestFn)(void *, MemoryRequest *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; + ConformBlockRequestFn conform = nullptr; }; typedef int (*AllocateRegionFn)(void *, MemoryRegion *); typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*ConformBlockRegionFn)(void *, MemoryRequest *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; DeallocateRegionFn deallocate = nullptr; + ConformBlockRegionFn conform = nullptr; }; // -- diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 02c2cd7e6aa0..3588389c3747 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -46,10 +46,11 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -73,13 +74,13 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; + bool can_split(const BlockRegion *region, const MemoryRequest &request) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining - BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request); // Creates a new block region and adds it to the region list - BlockRegion *create_block_region(void *user_context, const MemoryProperties 
&properties, size_t offset, size_t size, bool dedicated); + BlockRegion *create_block_region(void *user_context, const MemoryRequest &request); // Creates a new block region and adds it to the region list int destroy_block_region(void *user_context, BlockRegion *region); @@ -137,30 +138,55 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); halide_abort_if_false(user_context, arena != nullptr); + MemoryRequest block_request = {}; + block_request.size = block->memory.size; + block_request.offset = 0; + block_request.alignment = block->memory.properties.alignment; + block_request.properties = block->memory.properties; + block_request.dedicated = block->memory.dedicated; block->allocator = this; - block->regions = create_block_region( - user_context, - block->memory.properties, - 0, block->memory.size, - block->memory.dedicated); + block->regions = create_block_region(user_context, block_request); + return 0; +} + +int RegionAllocator::conform(void *user_context, MemoryRequest *request) const { + if (allocators.region.conform) { + return allocators.region.conform(user_context, request); + } else { + size_t actual_alignment = conform_alignment(request->alignment, block->memory.properties.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, block->memory.properties.nearest_multiple); + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + } return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + + MemoryRequest region_request = request; + + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; +#endif + return nullptr; + } + size_t remaining = block->memory.size - block->reserved; - if (remaining < actual_size) { + if (remaining < region_request.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "-- requested size (" << (int32_t)(region_request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } - BlockRegion *block_region = find_block_region(user_context, request); + BlockRegion *block_region = find_block_region(user_context, region_request); if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" @@ -169,12 +195,12 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return nullptr; } - if (can_split(block_region, request.size, request.alignment)) { + if (can_split(block_region, region_request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; + << "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)"; #endif - split_block_region(user_context, block_region, request.size, request.alignment); + split_block_region(user_context, block_region, region_request); } alloc_block_region(user_context, block_region); @@ -237,8 +263,17 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! Unable to reserve memory ...\n"; +#endif + return false; + } + // skip incompatible block regions for this request - if (!is_compatible_block_region(region, request.properties)) { + if (!is_compatible_block_region(region, region_request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... incompatible properties! (" << " block_region=" << (void *)region @@ -248,16 +283,13 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); - // is the adjusted size larger than the current region? - if (actual_size > region->memory.size) { + if (region_request.size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... not enough space for adjusted size! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -265,12 +297,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } // will the adjusted size fit within the remaining unallocated space? 
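 // (Illustrative sketch only; the numbers below are hypothetical and not taken from the patch.)
 // After conform(), the capacity test below compares the adjusted size against the space still
 // unreserved in the block. For example, with a 4 MB block that already has 3 MB reserved:
 //
 //   size_t block_size = 4 * 1024 * 1024;     // block->memory.size
 //   size_t reserved = 3 * 1024 * 1024;       // block->reserved
 //   size_t conformed_size = 1536 * 1024;     // region_request.size after conform()
 //   bool fits = (conformed_size + reserved) <= block_size;   // false -> region is skipped
 //
 //   conformed_size = 512 * 1024;             // a smaller conformed request
 //   fits = (conformed_size + reserved) <= block_size;        // true  -> region is suitable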
- if ((actual_size + block->reserved) <= block->memory.size) { + if ((region_request.size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " found suitable block region! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -411,13 +443,11 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const { + return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)); } -BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -434,33 +464,17 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t split_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); - size_t empty_size = block_region->memory.size - split_size; - -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment (" - << "requested_size=" << (uint32_t)size << " " - << "split_size=" << (uint32_t)split_size << " " - << "split_offset=" << (uint32_t)split_size << " " - << "empty_size=" << (uint32_t)empty_size << " " - << "requested_alignment=" << (uint32_t)alignment << " " - << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << "actual_alignment=" << (uint32_t)actual_alignment << ")"; -#endif + MemoryRequest split_request = request; + split_request.size = block_region->memory.size - request.size; + split_request.offset = block_region->memory.offset + request.size; #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; + << "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)"; #endif - BlockRegion *next_region = block_region->next_ptr; - BlockRegion *empty_region = create_block_region(user_context, - block_region->memory.properties, - split_offset, empty_size, - block_region->memory.dedicated); + 
BlockRegion *empty_region = create_block_region(user_context, split_request); halide_abort_if_false(user_context, empty_region != nullptr); empty_region->next_ptr = next_region; @@ -469,42 +483,52 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size -= empty_size; + block_region->memory.size -= empty_region->memory.size; return empty_region; } -BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; -#endif - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - - if (actual_size == 0) { - error(user_context) << "RegionAllocator: Failed to allocate new block region ... region size was zero!\n"; + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "alignment=" << (uint32_t)request.properties.alignment << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; +#endif + + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform request for new block region!\n"; +#endif + return nullptr; + } + + if (region_request.size == 0) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; +#endif return nullptr; } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { - error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#endif return nullptr; } block_region->memory.handle = nullptr; - block_region->memory.offset = actual_offset; - block_region->memory.size = actual_size; - block_region->memory.properties = properties; - block_region->memory.dedicated = dedicated; + block_region->memory.offset = region_request.offset; + block_region->memory.size = region_request.size; + block_region->memory.properties = region_request.properties; + block_region->memory.dedicated = region_request.dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; block_region->usage_count = 0; @@ -669,6 +693,8 @@ bool RegionAllocator::collect(void *user_context) { uint32_t collected_count = 0; uint32_t remaining_count = 0; + uint64_t available_bytes = 0; + uint64_t scanned_bytes = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " @@ -679,6 +705,8 @@ bool RegionAllocator::collect(void *user_context) { bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + scanned_bytes += block_region->memory.size; debug(user_context) << " checking region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -687,6 +715,7 @@ bool RegionAllocator::collect(void *user_context) { << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")"; +#endif if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -705,6 +734,9 @@ bool RegionAllocator::collect(void *user_context) { remaining_count++; #endif } +#ifdef DEBUG_RUNTIME_INTERNAL + available_bytes += is_available(block_region) ? block_region->memory.size : 0; +#endif if (is_last_block_region(user_context, block_region)) { break; } @@ -715,6 +747,8 @@ bool RegionAllocator::collect(void *user_context) { << "block_ptr=" << (void *)block << " " << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " << "block_reserved=" << (uint32_t)(block->reserved) << " " + << "scanned_bytes=" << (uint32_t)(scanned_bytes) << " " + << "available_bytes=" << (uint32_t)(available_bytes) << " " << ")"; #endif diff --git a/src/runtime/synchronization_common.h b/src/runtime/synchronization_common.h index cb244f360eeb..778c423e4046 100644 --- a/src/runtime/synchronization_common.h +++ b/src/runtime/synchronization_common.h @@ -908,7 +908,7 @@ struct halide_mutex_array { struct halide_mutex *array; }; -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { // TODO: If sz is huge, we should probably hash it down to something smaller // in the accessors below. Check for deadlocks before doing so. 
halide_mutex_array *array = (halide_mutex_array *)halide_malloc( diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 96535f3446ba..055fbef72277 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,11 +58,12 @@ class VulkanMemoryAllocator { static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods - MemoryRegion *reserve(void *user_context, MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + int conform(void *user_context, MemoryRequest *request); //< conforms the given memory request into one that can be allocated + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,9 +87,11 @@ class VulkanMemoryAllocator { static int allocate_block(void *instance_ptr, MemoryBlock *block); static int deallocate_block(void *instance_ptr, MemoryBlock *block); + static int conform_block_request(void *instance_ptr, MemoryRequest *request); static int allocate_region(void *instance_ptr, MemoryRegion *region); static int deallocate_region(void *instance_ptr, MemoryRegion *region); + static int conform_region_request(void *instance_ptr, MemoryRequest *request); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -113,6 +116,8 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; + int lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements); + size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -180,8 +185,8 @@ int VulkanMemoryAllocator::initialize(void *user_context, block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; - allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; - allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block, VulkanMemoryAllocator::conform_block_request}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region, VulkanMemoryAllocator::conform_region_request}; BlockAllocator::Config block_allocator_config = {0}; block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; @@ -202,7 +207,7 @@ int VulkanMemoryAllocator::initialize(void *user_context, return halide_error_code_success; } -MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, const MemoryRequest 
&request) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " @@ -272,6 +277,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid memory range !\n"; return nullptr; } +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: MapMemory (" << "user_context=" << user_context << "\n" << " region_size=" << (uint32_t)region->size << "\n" @@ -279,8 +285,8 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n" << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n" << " memory_offset=" << (uint32_t)memory_offset << "\n" - << " memory_size=" << (uint32_t)memory_size << ") ...\n"; - + << " memory_size=" << (uint32_t)memory_size << "\n)\n"; +#endif VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -528,6 +534,79 @@ VulkanMemoryAllocator::default_config() { } // -- +int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements (" + << "user_context=" << user_context << " " + << "size=" << (uint32_t)size << ", " + << "usage_flags=" << usage_flags << ") ... \n"; +#endif + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + // Create a buffer to determine alignment requirements + VkBuffer buffer = {0}; + VkResult result = vkCreateBuffer(this->device, &create_info, this->alloc_callbacks, &buffer); + if (result != VK_SUCCESS) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer to find requirements!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; +#endif + return halide_error_code_device_malloc_failed; + } + + vkGetBufferMemoryRequirements(this->device, buffer, memory_requirements); + vkDestroyBuffer(this->device, buffer, this->alloc_callbacks); + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_block_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming block request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(request) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform block request! 
Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = instance->select_memory_usage(user_context, request->properties); + int error_code = instance->lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)request->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (request->dedicated ? "true" : "false") << ")\n"; +#endif + + request->size = memory_requirements.size; + request->properties.alignment = memory_requirements.alignment; + return halide_error_code_success; +} int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -587,53 +666,6 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; #endif - uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); - - VkBufferCreateInfo create_info = { - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type - nullptr, // struct extending this - 0, // create flags - sizeof(uint32_t), // buffer size (in bytes) - usage_flags, // buffer usage flags - VK_SHARING_MODE_EXCLUSIVE, // sharing mode - 0, nullptr}; - - // Create a buffer to determine alignment requirements - VkBuffer buffer = {0}; - result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); - if (result != VK_SUCCESS) { - debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return halide_error_code_device_malloc_failed; - } - - VkMemoryRequirements memory_requirements = {0}; - vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); - vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); - -#if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "required_size=" << (uint32_t)memory_requirements.size << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; -#endif - - // Enforce any alignment constrainst reported by the device limits for each usage type - if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; - } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; - } - // Some drivers appear to report a buffer alignment constraint (regardless of usage) that can be larger than either of the above - if (memory_requirements.alignment > block->properties.alignment) { - block->properties.alignment = memory_requirements.alignment; - } - if (memory_requirements.alignment > block->properties.nearest_multiple) { - block->properties.nearest_multiple = memory_requirements.alignment; - } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -814,6 +846,98 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- +int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) { + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. + // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = select_memory_usage(user_context, request->properties); + int error_code = lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! 
Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + // Enforce any alignment constraints reported by the device limits for each usage type + if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minStorageBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minStorageBufferOffsetAlignment; + } + } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minUniformBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minUniformBufferOffsetAlignment; + } + } + + // Ensure the request ends on an aligned address + if (request->alignment > config.nearest_multiple) { + request->properties.nearest_multiple = request->alignment; + } + + size_t actual_alignment = conform_alignment(request->alignment, memory_requirements.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, memory_requirements.size, actual_alignment, request->properties.nearest_multiple); + +#if defined(HL_VK_DEBUG_MEM) + if ((request->size != actual_size) || (request->alignment != actual_alignment) || (request->offset != actual_offset)) { + debug(nullptr) << "VulkanMemoryAllocator: Adjusting request to match requirements (\n" + << " size = " << (uint64_t)request->size << " => " << (uint64_t)actual_size << ",\n" + << " alignment = " << (uint64_t)request->alignment << " => " << (uint64_t)actual_alignment << ",\n" + << " offset = " << (uint64_t)request->offset << " => " << (uint64_t)actual_offset << ",\n" + << " required.size = " << (uint64_t)memory_requirements.size << ",\n" + << " required.alignment = " << (uint64_t)memory_requirements.alignment << "\n)\n"; + } +#endif + request->size = actual_size; + request->alignment = actual_alignment; + request->offset = actual_offset; + + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming region request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(region) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform region request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanRegionAllocator: Conforming region request (" + << "size=" << (uint32_t)request->size << ", " + << "offset=" << (uint32_t)request->offset << ", " + << "dedicated=" << (request->dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(request->properties.usage) << " " + << "caching=" << halide_memory_caching_name(request->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request->properties.visibility) << ")\n"; +#endif + + return instance->conform(user_context, request); +} + int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -890,7 +1014,8 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg if (memory_requirements.size > region->size) { vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); #ifdef DEBUG_RUNTIME - debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" + << (uint64_t)region->size << " => " << (uint64_t)memory_requirements.size << " bytes) ...\n"; #endif create_info.size = memory_requirements.size; VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b2190f63b592..26ce8066e118 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -1,3 +1,7 @@ +// NOTE: Uncomment the following two defines to enable debug output +// #define DEBUG_RUNTIME +// #define DEBUG_RUNTIME_INTERNAL + #include "HalideRuntime.h" #include "common.h" @@ -39,6 +43,17 @@ int deallocate_block(void *user_context, MemoryBlock *block) { return halide_error_code_success; } +int conform_block(void *user_context, MemoryRequest *request) { + + debug(user_context) << "Test : conform_block (" + << "request_size=" << int32_t(request->size) << " " + << "request_offset=" << int32_t(request->offset) << " " + << "request_alignment=" << int32_t(request->alignment) << " " + << ") ..."; + + return halide_error_code_success; +} + int allocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)1; allocated_region_memory += region->size; @@ -65,17 +80,38 @@ int deallocate_region(void *user_context, MemoryRegion *region) { return halide_error_code_success; } +int conform_region(void *user_context, MemoryRequest *request) { + size_t actual_alignment = conform_alignment(request->alignment, 0); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, actual_alignment); + + debug(user_context) << "Test : conform_region (\n " + << "request_size=" << int32_t(request->size) << "\n " + << "request_offset=" << int32_t(request->offset) << "\n " + << "request_alignment=" << int32_t(request->alignment) << "\n " + << "actual_size=" << int32_t(actual_size) << "\n " + << "actual_offset=" << int32_t(actual_offset) << "\n " + << "actual_alignment=" << int32_t(actual_alignment) << "\n" + << ") ..."; + + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + return halide_error_code_success; +} + } // end namespace int main(int argc, char **argv) { void *user_context = (void *)1; SystemMemoryAllocatorFns system_allocator = {allocate_system, deallocate_system}; - MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; - MemoryRegionAllocatorFns region_allocator = {allocate_region, 
deallocate_region}; // test region allocator class interface { + // Use custom conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; BlockResource block_resource = {}; @@ -164,8 +200,104 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test region allocator conform request + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + // test zero size request + MemoryRequest request = {0}; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=" << int32_t(request.size) << " " + << "request.alignment=" << int32_t(request.alignment) << " " + << ") ..."; + + halide_abort_if_false(user_context, request.size == size_t(0)); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + size_t nm = padded_size; + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + // test round up size and offset to alignment + request.size = 1; + request.offset = 1; + request.alignment = 32; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size == 32); + halide_abort_if_false(user_context, request.offset == 32); + halide_abort_if_false(user_context, request.alignment == 32); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t os = 1; os < sz; ++os) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.offset = os; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.offset=(" << os << " => " << int32_t(request.offset) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.offset 
== aligned_offset(os, a)); + halide_abort_if_false(user_context, request.alignment == a); + } + } + } + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // test region allocator nearest_multiple padding { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; size_t padded_size = 32; @@ -245,6 +377,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -296,11 +431,58 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test conform request + { + uint32_t mbs = 1024; // min block size + BlockAllocator::Config config = {0}; + config.minimum_block_size = mbs; + + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 0); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : block_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(mbs, (((sz + a - 1) / a) * a))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + BlockAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; 
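 // (Illustrative note, not part of the patch.) The third entry in each allocator-fns struct is the
 // new optional conform callback: nullptr keeps the built-in conform path, while a custom hook such
 // as the conform_region() defined earlier in this file is wired in the same way, e.g.
 //
 //   MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region};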
BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -340,6 +522,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators);
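
As a worked example of how the new hooks compose, the following is a minimal sketch only: the helper pad_block_request and the wrapper conform_hook_example are hypothetical names not introduced by this diff, and the sketch assumes it is built alongside test/runtime/block_allocator.cpp so that the allocate_block/deallocate_block and allocate_region/deallocate_region callbacks defined there are visible. It shows a caller-supplied conform callback in the third slot of MemoryBlockAllocatorFns; BlockAllocator::conform() first applies nearest_multiple rounding and the configured minimum/maximum block sizes, then hands the adjusted request to the custom callback.

// Hypothetical conform hook: pad every block request up to a 256-byte multiple.
int pad_block_request(void *user_context, MemoryRequest *request) {
    const size_t multiple = 256;
    request->size = ((request->size + multiple - 1) / multiple) * multiple;
    return halide_error_code_success;
}

void conform_hook_example(void *user_context, SystemMemoryAllocatorFns system_allocator) {
    BlockAllocator::Config config = {0};
    config.minimum_block_size = 1024;

    // The third slot of each *AllocatorFns struct is the optional conform callback.
    MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, pad_block_request};
    MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr};
    BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator};
    BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators);

    MemoryRequest request = {0};
    request.size = 100;
    instance->conform(user_context, &request);
    // BlockAllocator::conform() raises the size to minimum_block_size (1024) and then calls
    // pad_block_request, which leaves it unchanged here since 1024 is already a multiple of 256.

    BlockAllocator::destroy(user_context, instance);
}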