From 674e6cc491e2ea755cb85b61a0a6946ff923fbcc Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 1 Dec 2023 13:18:20 -0800 Subject: [PATCH] Disallow async nestings that violate read after write dependencies (#7868) * Disallow async nestings that violate read after write dependencies Fixes #7867 * Add test * Add another failure case, and improve error message * Add some more tests * Update test * Add new test to cmakelists * Fix for llvm trunk * Always acquire the folding semaphore, even if unused * Skip async_order test under wasm * trigger buildbots --------- Co-authored-by: Volodymyr Kysenko Co-authored-by: Steven Johnson --- src/AsyncProducers.cpp | 51 ++++++++++++++++ src/StorageFolding.cpp | 5 -- test/correctness/CMakeLists.txt | 1 + test/correctness/async_order.cpp | 94 +++++++++++++++++++++++++++++ test/error/CMakeLists.txt | 2 + test/error/bad_async_producer.cpp | 31 ++++++++++ test/error/bad_async_producer_2.cpp | 23 +++++++ test/performance/async_gpu.cpp | 33 +++++++--- 8 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 test/correctness/async_order.cpp create mode 100644 test/error/bad_async_producer.cpp create mode 100644 test/error/bad_async_producer_2.cpp diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index cf10f51c4663..f633409cce65 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -109,15 +109,55 @@ class NoOpCollapsingMutator : public IRMutator { class GenerateProducerBody : public NoOpCollapsingMutator { const string &func; vector sema; + std::set producers_dropped; + bool found_producer = false; using NoOpCollapsingMutator::visit; + void bad_producer_nesting_error(const string &producer, const string &async_consumer) { + user_error + << "The Func " << producer << " is consumed by async Func " << async_consumer + << " and has a compute_at location in between the store_at " + << "location and the compute_at location of " << async_consumer + << ". This is only legal when " << producer + << " is both async and has a store_at location outside the store_at location of the consumer."; + } + // Preserve produce nodes and add synchronization Stmt visit(const ProducerConsumer *op) override { if (op->name == func && op->is_producer) { + found_producer = true; + // Add post-synchronization internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n"; Stmt body = op->body; + + // We don't currently support waiting on producers to the producer + // half of the fork node. Or rather, if you want to do that you have + // to schedule those Funcs as async too. Check for any consume nodes + // where the producer has gone to the consumer side of the fork + // node. + class FindBadConsumeNodes : public IRVisitor { + const std::set &producers_dropped; + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (!op->is_producer && producers_dropped.count(op->name)) { + found = op->name; + } + } + + public: + string found; + FindBadConsumeNodes(const std::set &p) + : producers_dropped(p) { + } + } finder(producers_dropped); + body.accept(&finder); + if (!finder.found.empty()) { + bad_producer_nesting_error(finder.found, func); + } + while (!sema.empty()) { Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern); body = Block::make(body, Evaluate::make(release)); @@ -125,7 +165,18 @@ class GenerateProducerBody : public NoOpCollapsingMutator { } return ProducerConsumer::make_produce(op->name, body); } else { + if (op->is_producer) { + producers_dropped.insert(op->name); + } + bool found_producer_before = found_producer; Stmt body = mutate(op->body); + if (!op->is_producer && producers_dropped.count(op->name) && + found_producer && !found_producer_before) { + // We've found a consume node wrapping our async producer where + // the corresponding producer node was dropped from this half of + // the fork. + bad_producer_nesting_error(op->name, func); + } if (is_no_op(body) || op->is_producer) { return body; } else { diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index b4b13104b424..fd7a12d66995 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -825,11 +825,6 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - if (provided.used.defined()) { - to_acquire = select(provided.used, to_acquire, 0); - } - // We should always release the required region, even if we don't use it. - // On the first iteration, we need to acquire the extent of the region shared // between the producer and consumer, and we need to release it on the last // iteration. diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 8fc403b298bb..9b72d5ceecb3 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -8,6 +8,7 @@ tests(GROUPS correctness align_bounds.cpp argmax.cpp async_device_copy.cpp + async_order.cpp autodiff.cpp bad_likely.cpp bit_counting.cpp diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp new file mode 100644 index 000000000000..f712d7e19c43 --- /dev/null +++ b/test/correctness/async_order.cpp @@ -0,0 +1,94 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y); + producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_root(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.store_root().compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + printf("Success!\n"); + return 0; +} diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 440851b521cb..337bc667739e 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -9,6 +9,8 @@ tests(GROUPS error auto_schedule_no_parallel.cpp auto_schedule_no_reorder.cpp autodiff_unbounded.cpp + bad_async_producer.cpp + bad_async_producer_2.cpp bad_bound.cpp bad_bound_storage.cpp bad_compute_at.cpp diff --git a/test/error/bad_async_producer.cpp b/test/error/bad_async_producer.cpp new file mode 100644 index 000000000000..9e78e268958c --- /dev/null +++ b/test/error/bad_async_producer.cpp @@ -0,0 +1,31 @@ + +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f{"f"}, g{"g"}, h{"h"}; + Var x; + + f(x) = cast(x + 7); + g(x) = f(x); + h(x) = g(x); + + // The schedule below is an error. It should really be: + // f.store_root().compute_at(g, Var::outermost()); + // So that it's nested inside the consumer h. + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x).async(); + + Buffer buf = h.realize({32}); + for (int i = 0; i < buf.dim(0).extent(); i++) { + uint8_t correct = i + 7; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", i, buf(i), correct); + return 1; + } + } + + return 0; +} diff --git a/test/error/bad_async_producer_2.cpp b/test/error/bad_async_producer_2.cpp new file mode 100644 index 000000000000..d9929c56b3c1 --- /dev/null +++ b/test/error/bad_async_producer_2.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" + +using namespace Halide; + +// From https://github.com/halide/Halide/issues/5201 +int main(int argc, char **argv) { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer2(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + return 0; +} diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 9d78efe4022e..55263e39546f 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) { if (c <= 0) { return x; } else { - return expensive(fast_pow(x, x + 1), c - 1); + return expensive(x * (x + 1), c - 1); } } @@ -31,11 +31,12 @@ int main(int argc, char **argv) { } double times[2]; + uint32_t correct = 0; for (int use_async = 0; use_async < 2; use_async++) { Var x, y, t, xi, yi; - ImageParam in(Float(32), 3); - Func cpu, gpu; + ImageParam in(UInt(32), 3); + Func cpu("cpu"), gpu("gpu"); // We have a two-stage pipeline that processes frames. We want // to run the first stage on the GPU and the second stage on @@ -50,19 +51,21 @@ int main(int argc, char **argv) { // Assume GPU memory is limited, and compute the GPU stage one // frame at a time. Hoist the allocation to the top level. - gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8); + gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8); // Stage the copy-back of the GPU result into a host-side // double-buffer. gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2); if (use_async) { + // gpu.async(); gpu.in().async(); - gpu.async(); } - in.set(Buffer(800, 800, 16)); - Buffer out(800, 800, 16); + Buffer in_buf(800, 800, 16); + in_buf.fill(17); + in.set(in_buf); + Buffer out(800, 800, 16); cpu.compile_jit(); @@ -70,6 +73,22 @@ int main(int argc, char **argv) { cpu.realize(out); }); + if (!use_async) { + correct = out(0, 0, 0); + } else { + for (int t = 0; t < out.dim(2).extent(); t++) { + for (int y = 0; y < out.dim(1).extent(); y++) { + for (int x = 0; x < out.dim(0).extent(); x++) { + if (out(x, y, t) != correct) { + printf("Async output at (%d, %d, %d) is %u instead of %u\n", + x, y, t, out(x, y, t), correct); + return 1; + } + } + } + } + } + printf("%s: %f\n", use_async ? "with async" : "without async", times[use_async]);