diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp
index cf10f51c4663..f633409cce65 100644
--- a/src/AsyncProducers.cpp
+++ b/src/AsyncProducers.cpp
@@ -109,15 +109,55 @@ class NoOpCollapsingMutator : public IRMutator {
 class GenerateProducerBody : public NoOpCollapsingMutator {
     const string &func;
     vector<Expr> sema;
+    std::set<string> producers_dropped;
+    bool found_producer = false;
 
     using NoOpCollapsingMutator::visit;
 
+    void bad_producer_nesting_error(const string &producer, const string &async_consumer) {
+        user_error
+            << "The Func " << producer << " is consumed by async Func " << async_consumer
+            << " and has a compute_at location in between the store_at "
+            << "location and the compute_at location of " << async_consumer
+            << ". This is only legal when " << producer
+            << " is both async and has a store_at location outside the store_at location of the consumer.";
+    }
+
     // Preserve produce nodes and add synchronization
     Stmt visit(const ProducerConsumer *op) override {
         if (op->name == func && op->is_producer) {
+            found_producer = true;
+
             // Add post-synchronization
             internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n";
             Stmt body = op->body;
+
+            // We don't currently support waiting on producers to the producer
+            // half of the fork node. Or rather, if you want to do that you have
+            // to schedule those Funcs as async too. Check for any consume nodes
+            // where the producer has gone to the consumer side of the fork
+            // node.
+            class FindBadConsumeNodes : public IRVisitor {
+                const std::set<string> &producers_dropped;
+                using IRVisitor::visit;
+
+                void visit(const ProducerConsumer *op) override {
+                    if (!op->is_producer && producers_dropped.count(op->name)) {
+                        found = op->name;
+                    }
+                }
+
+            public:
+                string found;
+                FindBadConsumeNodes(const std::set<string> &p)
+                    : producers_dropped(p) {
+                }
+            } finder(producers_dropped);
+            body.accept(&finder);
+            if (!finder.found.empty()) {
+                bad_producer_nesting_error(finder.found, func);
+            }
+
             while (!sema.empty()) {
                 Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern);
                 body = Block::make(body, Evaluate::make(release));
@@ -125,7 +165,18 @@ class GenerateProducerBody : public NoOpCollapsingMutator {
             }
             return ProducerConsumer::make_produce(op->name, body);
         } else {
+            if (op->is_producer) {
+                producers_dropped.insert(op->name);
+            }
+            bool found_producer_before = found_producer;
             Stmt body = mutate(op->body);
+            if (!op->is_producer && producers_dropped.count(op->name) &&
+                found_producer && !found_producer_before) {
+                // We've found a consume node wrapping our async producer where
+                // the corresponding producer node was dropped from this half of
+                // the fork.
+                bad_producer_nesting_error(op->name, func);
+            }
             if (is_no_op(body) || op->is_producer) {
                 return body;
             } else {
diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp
index b4b13104b424..fd7a12d66995 100644
--- a/src/StorageFolding.cpp
+++ b/src/StorageFolding.cpp
@@ -825,11 +825,6 @@ class AttemptStorageFoldingOfFunction : public IRMutator {
                         to_release = max_required - max_required_next;  // This is the last time we use these entries
                     }
 
-                    if (provided.used.defined()) {
-                        to_acquire = select(provided.used, to_acquire, 0);
-                    }
-                    // We should always release the required region, even if we don't use it.
-
                     // On the first iteration, we need to acquire the extent of the region shared
                     // between the producer and consumer, and we need to release it on the last
                     // iteration.
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index 8fc403b298bb..9b72d5ceecb3 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -8,6 +8,7 @@ tests(GROUPS correctness
       align_bounds.cpp
       argmax.cpp
       async_device_copy.cpp
+      async_order.cpp
       autodiff.cpp
       bad_likely.cpp
       bit_counting.cpp
diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp
new file mode 100644
index 000000000000..f712d7e19c43
--- /dev/null
+++ b/test/correctness/async_order.cpp
@@ -0,0 +1,94 @@
+#include "Halide.h"
+#include <stdio.h>
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+    if (get_jit_target_from_environment().arch == Target::WebAssembly) {
+        printf("[SKIP] WebAssembly does not support async() yet.\n");
+        return 0;
+    }
+
+    {
+        Func producer1, producer2, consumer;
+        Var x, y;
+
+        producer1(x, y) = x + y;
+        producer2(x, y) = producer1(x, y);
+        consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1);
+
+        consumer.compute_root();
+
+        producer1.compute_at(consumer, y);
+        producer2.compute_at(consumer, y).async();
+
+        consumer.bound(x, 0, 16).bound(y, 0, 16);
+
+        Buffer<int> out = consumer.realize({16, 16});
+
+        out.for_each_element([&](int x, int y) {
+            int correct = 2 * (x + y);
+            if (out(x, y) != correct) {
+                printf("out(%d, %d) = %d instead of %d\n",
+                       x, y, out(x, y), correct);
+                exit(-1);
+            }
+        });
+    }
+    {
+        Func producer1, producer2, consumer;
+        Var x, y;
+
+        producer1(x, y) = x + y;
+        producer2(x, y) = producer1(x, y);
+        consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1);
+
+        consumer.compute_root();
+
+        producer1.compute_root();
+        producer2.store_root().compute_at(consumer, y).async();
+
+        consumer.bound(x, 0, 16).bound(y, 0, 16);
+
+        Buffer<int> out = consumer.realize({16, 16});
+
+        out.for_each_element([&](int x, int y) {
+            int correct = 2 * (x + y);
+            if (out(x, y) != correct) {
+                printf("out(%d, %d) = %d instead of %d\n",
+                       x, y, out(x, y), correct);
+                exit(-1);
+            }
+        });
+    }
+
+    {
+        Func producer1, producer2, consumer;
+        Var x, y;
+
+        producer1(x, y) = x + y;
+        producer2(x, y) = producer1(x, y);
+        consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1);
+
+        consumer.compute_root();
+
+        producer1.store_root().compute_at(consumer, y).async();
+        producer2.store_root().compute_at(consumer, y).async();
+
+        consumer.bound(x, 0, 16).bound(y, 0, 16);
+
+        Buffer<int> out = consumer.realize({16, 16});
+
+        out.for_each_element([&](int x, int y) {
+            int correct = 2 * (x + y);
+            if (out(x, y) != correct) {
+                printf("out(%d, %d) = %d instead of %d\n",
+                       x, y, out(x, y), correct);
+                exit(-1);
+            }
+        });
+    }
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt
index 440851b521cb..337bc667739e 100644
--- a/test/error/CMakeLists.txt
+++ b/test/error/CMakeLists.txt
@@ -9,6 +9,8 @@ tests(GROUPS error
       auto_schedule_no_parallel.cpp
       auto_schedule_no_reorder.cpp
       autodiff_unbounded.cpp
+      bad_async_producer.cpp
+      bad_async_producer_2.cpp
       bad_bound.cpp
       bad_bound_storage.cpp
       bad_compute_at.cpp
diff --git a/test/error/bad_async_producer.cpp b/test/error/bad_async_producer.cpp
new file mode 100644
index 000000000000..9e78e268958c
--- /dev/null
+++ b/test/error/bad_async_producer.cpp
@@ -0,0 +1,31 @@
+
+#include "Halide.h"
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+
+    Func f{"f"}, g{"g"}, h{"h"};
+    Var x;
+
+    f(x) = cast<uint8_t>(x + 7);
+    g(x) = f(x);
+    h(x) = g(x);
+
+    // The schedule below is an error. It should really be:
+    // f.store_root().compute_at(g, Var::outermost());
+    // So that it's nested inside the consumer h.
+    f.store_root().compute_at(h, x);
+    g.store_root().compute_at(h, x).async();
+
+    Buffer<uint8_t> buf = h.realize({32});
+    for (int i = 0; i < buf.dim(0).extent(); i++) {
+        uint8_t correct = i + 7;
+        if (buf(i) != correct) {
+            printf("buf(%d) = %d instead of %d\n", i, buf(i), correct);
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/test/error/bad_async_producer_2.cpp b/test/error/bad_async_producer_2.cpp
new file mode 100644
index 000000000000..d9929c56b3c1
--- /dev/null
+++ b/test/error/bad_async_producer_2.cpp
@@ -0,0 +1,23 @@
+#include "Halide.h"
+
+using namespace Halide;
+
+// From https://github.com/halide/Halide/issues/5201
+int main(int argc, char **argv) {
+    Func producer1, producer2, consumer;
+    Var x, y;
+
+    producer1(x, y) = x + y;
+    producer2(x, y) = producer1(x, y);
+    consumer(x, y) = producer2(x, y - 1) + producer2(x, y + 1);
+
+    consumer.compute_root();
+
+    producer1.compute_at(consumer, y).async();
+    producer2.store_root().compute_at(consumer, y).async();
+
+    consumer.bound(x, 0, 16).bound(y, 0, 16);
+
+    Buffer<int> out = consumer.realize({16, 16});
+    return 0;
+}
diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp
index 9d78efe4022e..55263e39546f 100644
--- a/test/performance/async_gpu.cpp
+++ b/test/performance/async_gpu.cpp
@@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) {
     if (c <= 0) {
         return x;
     } else {
-        return expensive(fast_pow(x, x + 1), c - 1);
+        return expensive(x * (x + 1), c - 1);
     }
 }
 
@@ -31,11 +31,12 @@ int main(int argc, char **argv) {
     }
 
     double times[2];
+    uint32_t correct = 0;
     for (int use_async = 0; use_async < 2; use_async++) {
         Var x, y, t, xi, yi;
 
-        ImageParam in(Float(32), 3);
-        Func cpu, gpu;
+        ImageParam in(UInt(32), 3);
+        Func cpu("cpu"), gpu("gpu");
 
         // We have a two-stage pipeline that processes frames. We want
         // to run the first stage on the GPU and the second stage on
@@ -50,19 +51,21 @@ int main(int argc, char **argv) {
 
         // Assume GPU memory is limited, and compute the GPU stage one
         // frame at a time. Hoist the allocation to the top level.
-        gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8);
+        gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8);
 
         // Stage the copy-back of the GPU result into a host-side
         // double-buffer.
         gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2);
 
         if (use_async) {
+            // gpu.async();
             gpu.in().async();
-            gpu.async();
         }
 
-        in.set(Buffer<float>(800, 800, 16));
-        Buffer<float> out(800, 800, 16);
+        Buffer<uint32_t> in_buf(800, 800, 16);
+        in_buf.fill(17);
+        in.set(in_buf);
+        Buffer<uint32_t> out(800, 800, 16);
 
         cpu.compile_jit();
 
@@ -70,6 +73,22 @@ int main(int argc, char **argv) {
             cpu.realize(out);
         });
 
+        if (!use_async) {
+            correct = out(0, 0, 0);
+        } else {
+            for (int t = 0; t < out.dim(2).extent(); t++) {
+                for (int y = 0; y < out.dim(1).extent(); y++) {
+                    for (int x = 0; x < out.dim(0).extent(); x++) {
+                        if (out(x, y, t) != correct) {
+                            printf("Async output at (%d, %d, %d) is %u instead of %u\n",
+                                   x, y, t, out(x, y, t), correct);
+                            return 1;
+                        }
+                    }
+                }
+            }
+        }
+
         printf("%s: %f\n",
                use_async ? "with async" : "without async",
                times[use_async]);