From 31368194a88405741eaf361e7a793cbb1b8f8fd9 Mon Sep 17 00:00:00 2001 From: Xuanda Yang Date: Fri, 1 Dec 2023 01:59:30 +0800 Subject: [PATCH 001/186] [serialization] Add Halide version and serialization version in serialization format (#7905) * halide version * serialization version * format * Fix Makefile * trigger buildbots --------- Co-authored-by: Andrew Adams Co-authored-by: Steven Johnson --- Makefile | 8 ++++++++ src/CMakeLists.txt | 6 ++++++ src/Deserialization.cpp | 19 +++++++++++++++++++ src/Serialization.cpp | 12 +++++++++++- src/halide_ir.fbs | 6 ++++-- 5 files changed, 48 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d1ebace87bda..7364941941a2 100644 --- a/Makefile +++ b/Makefile @@ -247,6 +247,14 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # On ubuntu, this requires packages flatbuffers-compiler and libflatbuffers-dev ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') +# Note: if updating here, be sure to update in CMakeLists.txt as well +HALIDE_SERIALIZATION_VERSION_MAJOR ?= 0 +HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 +HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 +HALIDE_SERIALIZATION_VERSION=$(HALIDE_SERIALIZATION_VERSION_MAJOR).$(HALIDE_SERIALIZATION_VERSION_MINOR).$(HALIDE_SERIALIZATION_VERSION_PATCH) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MAJOR=$(HALIDE_SERIALIZATION_VERSION_MAJOR) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MINOR=$(HALIDE_SERIALIZATION_VERSION_MINOR) +CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_PATCH=$(HALIDE_SERIALIZATION_VERSION_PATCH) endif # This is required on some hosts like powerpc64le-linux-gnu because we may build diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9ef902c27be2..771944b10d42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -509,6 +509,12 @@ if (WITH_SERIALIZATION) target_include_directories(Halide PRIVATE "$") target_link_libraries(Halide PRIVATE Halide_flatbuffers) target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION) + # Note: if updating here, be sure to update in Makefile as well + target_compile_definitions(Halide PUBLIC + HALIDE_SERIALIZATION_VERSION_MAJOR=0 + HALIDE_SERIALIZATION_VERSION_MINOR=1 + HALIDE_SERIALIZATION_VERSION_PATCH=0 + ) endif () # Enable serialization testing by intercepting JIT compilation with a serialization roundtrip; diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index c0e9f39de7bf..b27918756886 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1403,6 +1403,25 @@ Pipeline Deserializer::deserialize(const std::vector &data) { user_warning << "deserialized pipeline is empty\n"; return Pipeline(); } + + std::string deserialized_halide_version = deserialize_string(pipeline_obj->halide_version()); + std::string halide_version = std::to_string(HALIDE_VERSION_MAJOR) + "." + + std::to_string(HALIDE_VERSION_MINOR) + "." + + std::to_string(HALIDE_VERSION_PATCH); + if (deserialized_halide_version != halide_version) { + user_warning << "deserialized pipeline is built with Halide version " << deserialized_halide_version + << ", but current Halide version is " << halide_version << "\n"; + } + + std::string deserialized_serialization_version = deserialize_string(pipeline_obj->serialization_version()); + std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." 
+ + std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + if (deserialized_serialization_version != serialization_version) { + user_error << "deserialized pipeline is built with Halide serialization version " << deserialized_serialization_version + << ", but current Halide serialization version is " << serialization_version << "\n"; + } + const std::vector func_names_in_order = deserialize_vector(pipeline_obj->func_names_in_order(), &Deserializer::deserialize_string); diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 2928e3b7ebbf..857c963cceab 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1501,6 +1501,14 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul buffers_serialized.push_back(serialize_buffer(builder, buffer.second)); } + std::string halide_version = std::to_string(HALIDE_VERSION_MAJOR) + "." + + std::to_string(HALIDE_VERSION_MINOR) + "." + + std::to_string(HALIDE_VERSION_PATCH); + + std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + + std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + auto pipeline_obj = Serialize::CreatePipeline(builder, builder.CreateVector(funcs_serialized), builder.CreateVector(output_names_serialized), @@ -1509,7 +1517,9 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul builder.CreateVector(func_names_in_order_serialized), builder.CreateVector(parameters_serialized), builder.CreateVector(external_parameters_serialized), - builder.CreateVector(buffers_serialized)); + builder.CreateVector(buffers_serialized), + serialize_string(builder, halide_version), + serialize_string(builder, serialization_version)); builder.Finish(pipeline_obj); uint8_t *buf = builder.GetBufferPointer(); diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 479e488b6739..8148aca639a9 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -1,7 +1,7 @@ namespace Halide.Serialize; -// This corresponds to the corresponding Halide version. -file_identifier "HL17"; +// This identifies the serialized data being a Halide pipeline. Should be exactly 4 bytes. +file_identifier "HLDE"; // File extension of any written files. "hlpipe" stands for Halide Pipeline. file_extension "hlpipe"; @@ -710,6 +710,8 @@ table Pipeline { parameters: [Parameter]; external_parameters: [ExternalParameter]; buffers: [Buffer]; + halide_version: string; + serialization_version: string; } root_type Pipeline; From 4fc2a7d860c08d03ee93d47f743f4f6878b5f8a9 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 30 Nov 2023 16:31:48 -0800 Subject: [PATCH 002/186] Handle many more intrinsics in Bounds.cpp (#7823) * Handle many more intrinsics in Bounds.cpp This addresses many (but not all) of the `signed integer overflow` issues we're seeing in Google due to https://github.com/halide/Halide/pull/7814 -- a lot of the issues seems to be in code that uses intrinsics that had no handling in value bounds checking, so the bounds were naively large and overflowed. - Most of the intrinsics from FindIntrinsics.h weren't handled; now they all are (most by lowering to other IR, though the halving_add variants were modeled directly because the bitwise ops don't mesh well) - strict_float() is just a pass-through - round() is a best guess (basically, if bounds exist, expand by one as a worst-case) There are definitely others we should handle here... trunc/floor/ceil probably? * Fix round() and strict_float() handling * Update Bounds.cpp * Fixes? 
* trigger buildbots * Revert saturating_cast handling * Update Bounds.cpp --------- Co-authored-by: Andrew Adams --- src/Bounds.cpp | 149 +++++++++++++++++++++++++++++++++++++++++-- src/FindIntrinsics.h | 1 + 2 files changed, 143 insertions(+), 7 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index d40922cb6db0..0ba1f5440056 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -41,6 +41,37 @@ using std::string; using std::vector; namespace { + +bool can_widen(const Expr &e) { + // We don't want to widen Xtensa 48-bit integers + return e.type().bits() <= 32; +} + +bool can_widen_all(const std::vector &args) { + for (const auto &e : args) { + if (!can_widen(e)) { + return false; + } + } + return true; +} + +Expr widen(Expr a) { + internal_assert(can_widen(a)); + Type result_type = a.type().widen(); + return Cast::make(result_type, std::move(a)); +} + +Expr narrow(Expr a) { + Type result_type = a.type().narrow(); + return Cast::make(result_type, std::move(a)); +} + +Expr saturating_narrow(const Expr &a) { + Type narrow = a.type().narrow(); + return saturating_cast(narrow, a); +} + int static_sign(const Expr &x) { if (is_positive_const(x)) { return 1; @@ -56,6 +87,7 @@ int static_sign(const Expr &x) { } return 0; } + } // anonymous namespace const FuncValueBounds &empty_func_value_bounds() { @@ -1195,6 +1227,15 @@ class Bounds : public IRVisitor { // else fall thru and continue } + const auto handle_expr_bounds = [this, t](const Expr &e) -> void { + if (e.defined()) { + e.accept(this); + } else { + // Just use the bounds of the type + this->bounds_of_type(t); + } + }; + if (op->is_intrinsic(Call::abs)) { Interval a = arg_bounds.get(0); interval.min = make_zero(t); @@ -1468,6 +1509,7 @@ class Bounds : public IRVisitor { } } else if (op->args.size() == 1 && (op->is_intrinsic(Call::round) || + op->is_intrinsic(Call::strict_float) || op->name == "ceil_f32" || op->name == "ceil_f64" || op->name == "floor_f32" || op->name == "floor_f64" || op->name == "exp_f32" || op->name == "exp_f64" || @@ -1518,14 +1560,107 @@ class Bounds : public IRVisitor { } interval = result; } else if (op->is_intrinsic(Call::widen_right_add)) { - Expr add = Add::make(op->args[0], cast(op->args[0].type(), op->args[1])); - add.accept(this); - } else if (op->is_intrinsic(Call::widen_right_sub)) { - Expr sub = Sub::make(op->args[0], cast(op->args[0].type(), op->args[1])); - sub.accept(this); + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_add(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); } else if (op->is_intrinsic(Call::widen_right_mul)) { - Expr mul = Mul::make(op->args[0], cast(op->args[0].type(), op->args[1])); - mul.accept(this); + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_mul(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widen_right_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[1]) ? + lower_widen_right_sub(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + lower_widening_add(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_mul)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? 
+ lower_widening_mul(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + lower_widening_sub(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::saturating_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow(clamp(widen(op->args[0]) + widen(op->args[1]), t.min(), t.max())) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::saturating_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow(clamp(widen(op->args[0]) - widen(op->args[1]), t.min(), t.max())) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_shift_left)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[0]) ? + lower_widening_shift_left(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::widening_shift_right)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen(op->args[0]) ? + lower_widening_shift_right(op->args[0], op->args[1]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_shift_right)) { + internal_assert(op->args.size() == 2); + // TODO: uses bitwise ops we may not handle well + handle_expr_bounds(lower_rounding_shift_right(op->args[0], op->args[1])); + } else if (op->is_intrinsic(Call::rounding_shift_left)) { + internal_assert(op->args.size() == 2); + // TODO: uses bitwise ops we may not handle well + handle_expr_bounds(lower_rounding_shift_left(op->args[0], op->args[1])); + } else if (op->is_intrinsic(Call::halving_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) + widen(op->args[1])) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::halving_sub)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) - widen(op->args[1])) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_halving_add)) { + internal_assert(op->args.size() == 2); + Expr e = can_widen_all(op->args) ? + narrow((widen(op->args[0]) + widen(op->args[1]) + 1) / 2) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::rounding_mul_shift_right)) { + internal_assert(op->args.size() == 3); + Expr e = can_widen_all(op->args) ? + saturating_narrow(rounding_shift_right(widening_mul(op->args[0], op->args[1]), op->args[2])) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::mul_shift_right)) { + internal_assert(op->args.size() == 3); + Expr e = can_widen_all(op->args) ? 
+ saturating_narrow(widening_mul(op->args[0], op->args[1]) >> op->args[2]) : + Expr(); + handle_expr_bounds(e); + } else if (op->is_intrinsic(Call::sorted_avg)) { + internal_assert(op->args.size() == 2); + Expr e = lower_sorted_avg(op->args[0], op->args[1]); + handle_expr_bounds(e); } else if (op->call_type == Call::Halide) { bounds_of_func(op->name, op->value_index, op->type); } else { diff --git a/src/FindIntrinsics.h b/src/FindIntrinsics.h index f8ddaf171bc3..fc4c2a8e90f5 100644 --- a/src/FindIntrinsics.h +++ b/src/FindIntrinsics.h @@ -30,6 +30,7 @@ Expr lower_saturating_cast(const Type &t, const Expr &a); Expr lower_halving_add(const Expr &a, const Expr &b); Expr lower_halving_sub(const Expr &a, const Expr &b); Expr lower_rounding_halving_add(const Expr &a, const Expr &b); +Expr lower_sorted_avg(const Expr &a, const Expr &b); Expr lower_mul_shift_right(const Expr &a, const Expr &b, const Expr &q); Expr lower_rounding_mul_shift_right(const Expr &a, const Expr &b, const Expr &q); From 674e6cc491e2ea755cb85b61a0a6946ff923fbcc Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 1 Dec 2023 13:18:20 -0800 Subject: [PATCH 003/186] Disallow async nestings that violate read after write dependencies (#7868) * Disallow async nestings that violate read after write dependencies Fixes #7867 * Add test * Add another failure case, and improve error message * Add some more tests * Update test * Add new test to cmakelists * Fix for llvm trunk * Always acquire the folding semaphore, even if unused * Skip async_order test under wasm * trigger buildbots --------- Co-authored-by: Volodymyr Kysenko Co-authored-by: Steven Johnson --- src/AsyncProducers.cpp | 51 ++++++++++++++++ src/StorageFolding.cpp | 5 -- test/correctness/CMakeLists.txt | 1 + test/correctness/async_order.cpp | 94 +++++++++++++++++++++++++++++ test/error/CMakeLists.txt | 2 + test/error/bad_async_producer.cpp | 31 ++++++++++ test/error/bad_async_producer_2.cpp | 23 +++++++ test/performance/async_gpu.cpp | 33 +++++++--- 8 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 test/correctness/async_order.cpp create mode 100644 test/error/bad_async_producer.cpp create mode 100644 test/error/bad_async_producer_2.cpp diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index cf10f51c4663..f633409cce65 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -109,15 +109,55 @@ class NoOpCollapsingMutator : public IRMutator { class GenerateProducerBody : public NoOpCollapsingMutator { const string &func; vector sema; + std::set producers_dropped; + bool found_producer = false; using NoOpCollapsingMutator::visit; + void bad_producer_nesting_error(const string &producer, const string &async_consumer) { + user_error + << "The Func " << producer << " is consumed by async Func " << async_consumer + << " and has a compute_at location in between the store_at " + << "location and the compute_at location of " << async_consumer + << ". This is only legal when " << producer + << " is both async and has a store_at location outside the store_at location of the consumer."; + } + // Preserve produce nodes and add synchronization Stmt visit(const ProducerConsumer *op) override { if (op->name == func && op->is_producer) { + found_producer = true; + // Add post-synchronization internal_assert(!sema.empty()) << "Duplicate produce node: " << op->name << "\n"; Stmt body = op->body; + + // We don't currently support waiting on producers to the producer + // half of the fork node. 
Or rather, if you want to do that you have + // to schedule those Funcs as async too. Check for any consume nodes + // where the producer has gone to the consumer side of the fork + // node. + class FindBadConsumeNodes : public IRVisitor { + const std::set &producers_dropped; + using IRVisitor::visit; + + void visit(const ProducerConsumer *op) override { + if (!op->is_producer && producers_dropped.count(op->name)) { + found = op->name; + } + } + + public: + string found; + FindBadConsumeNodes(const std::set &p) + : producers_dropped(p) { + } + } finder(producers_dropped); + body.accept(&finder); + if (!finder.found.empty()) { + bad_producer_nesting_error(finder.found, func); + } + while (!sema.empty()) { Expr release = Call::make(Int(32), "halide_semaphore_release", {sema.back(), 1}, Call::Extern); body = Block::make(body, Evaluate::make(release)); @@ -125,7 +165,18 @@ class GenerateProducerBody : public NoOpCollapsingMutator { } return ProducerConsumer::make_produce(op->name, body); } else { + if (op->is_producer) { + producers_dropped.insert(op->name); + } + bool found_producer_before = found_producer; Stmt body = mutate(op->body); + if (!op->is_producer && producers_dropped.count(op->name) && + found_producer && !found_producer_before) { + // We've found a consume node wrapping our async producer where + // the corresponding producer node was dropped from this half of + // the fork. + bad_producer_nesting_error(op->name, func); + } if (is_no_op(body) || op->is_producer) { return body; } else { diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index b4b13104b424..fd7a12d66995 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -825,11 +825,6 @@ class AttemptStorageFoldingOfFunction : public IRMutator { to_release = max_required - max_required_next; // This is the last time we use these entries } - if (provided.used.defined()) { - to_acquire = select(provided.used, to_acquire, 0); - } - // We should always release the required region, even if we don't use it. - // On the first iteration, we need to acquire the extent of the region shared // between the producer and consumer, and we need to release it on the last // iteration. 
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 8fc403b298bb..9b72d5ceecb3 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -8,6 +8,7 @@ tests(GROUPS correctness align_bounds.cpp argmax.cpp async_device_copy.cpp + async_order.cpp autodiff.cpp bad_likely.cpp bit_counting.cpp diff --git a/test/correctness/async_order.cpp b/test/correctness/async_order.cpp new file mode 100644 index 000000000000..f712d7e19c43 --- /dev/null +++ b/test/correctness/async_order.cpp @@ -0,0 +1,94 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y); + producer2.compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_root(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer1(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.store_root().compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(-1); + } + }); + } + + printf("Success!\n"); + return 0; +} diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 440851b521cb..337bc667739e 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -9,6 +9,8 @@ tests(GROUPS error auto_schedule_no_parallel.cpp auto_schedule_no_reorder.cpp autodiff_unbounded.cpp + bad_async_producer.cpp + bad_async_producer_2.cpp bad_bound.cpp bad_bound_storage.cpp bad_compute_at.cpp diff --git a/test/error/bad_async_producer.cpp b/test/error/bad_async_producer.cpp new file mode 100644 index 000000000000..9e78e268958c --- /dev/null +++ b/test/error/bad_async_producer.cpp @@ -0,0 +1,31 @@ + +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f{"f"}, g{"g"}, h{"h"}; + Var x; + + f(x) = cast(x + 7); + g(x) = f(x); + h(x) = g(x); + + // The schedule below is an error. 
It should really be: + // f.store_root().compute_at(g, Var::outermost()); + // So that it's nested inside the consumer h. + f.store_root().compute_at(h, x); + g.store_root().compute_at(h, x).async(); + + Buffer buf = h.realize({32}); + for (int i = 0; i < buf.dim(0).extent(); i++) { + uint8_t correct = i + 7; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", i, buf(i), correct); + return 1; + } + } + + return 0; +} diff --git a/test/error/bad_async_producer_2.cpp b/test/error/bad_async_producer_2.cpp new file mode 100644 index 000000000000..d9929c56b3c1 --- /dev/null +++ b/test/error/bad_async_producer_2.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" + +using namespace Halide; + +// From https://github.com/halide/Halide/issues/5201 +int main(int argc, char **argv) { + Func producer1, producer2, consumer; + Var x, y; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y); + consumer(x, y) = producer2(x, y - 1) + producer2(x, y + 1); + + consumer.compute_root(); + + producer1.compute_at(consumer, y).async(); + producer2.store_root().compute_at(consumer, y).async(); + + consumer.bound(x, 0, 16).bound(y, 0, 16); + + Buffer out = consumer.realize({16, 16}); + return 0; +} diff --git a/test/performance/async_gpu.cpp b/test/performance/async_gpu.cpp index 9d78efe4022e..55263e39546f 100644 --- a/test/performance/async_gpu.cpp +++ b/test/performance/async_gpu.cpp @@ -9,7 +9,7 @@ Expr expensive(Expr x, int c) { if (c <= 0) { return x; } else { - return expensive(fast_pow(x, x + 1), c - 1); + return expensive(x * (x + 1), c - 1); } } @@ -31,11 +31,12 @@ int main(int argc, char **argv) { } double times[2]; + uint32_t correct = 0; for (int use_async = 0; use_async < 2; use_async++) { Var x, y, t, xi, yi; - ImageParam in(Float(32), 3); - Func cpu, gpu; + ImageParam in(UInt(32), 3); + Func cpu("cpu"), gpu("gpu"); // We have a two-stage pipeline that processes frames. We want // to run the first stage on the GPU and the second stage on @@ -50,19 +51,21 @@ int main(int argc, char **argv) { // Assume GPU memory is limited, and compute the GPU stage one // frame at a time. Hoist the allocation to the top level. - gpu.compute_at(cpu, t).store_root().gpu_tile(x, y, xi, yi, 8, 8); + gpu.compute_at(gpu.in(), Var::outermost()).store_root().gpu_tile(x, y, xi, yi, 8, 8); // Stage the copy-back of the GPU result into a host-side // double-buffer. gpu.in().copy_to_host().compute_at(cpu, t).store_root().fold_storage(t, 2); if (use_async) { + // gpu.async(); gpu.in().async(); - gpu.async(); } - in.set(Buffer(800, 800, 16)); - Buffer out(800, 800, 16); + Buffer in_buf(800, 800, 16); + in_buf.fill(17); + in.set(in_buf); + Buffer out(800, 800, 16); cpu.compile_jit(); @@ -70,6 +73,22 @@ int main(int argc, char **argv) { cpu.realize(out); }); + if (!use_async) { + correct = out(0, 0, 0); + } else { + for (int t = 0; t < out.dim(2).extent(); t++) { + for (int y = 0; y < out.dim(1).extent(); y++) { + for (int x = 0; x < out.dim(0).extent(); x++) { + if (out(x, y, t) != correct) { + printf("Async output at (%d, %d, %d) is %u instead of %u\n", + x, y, t, out(x, y, t), correct); + return 1; + } + } + } + } + } + printf("%s: %f\n", use_async ? "with async" : "without async", times[use_async]); From dea2cf7e2228c7f5ce52fa8236c3d15fdb82b89f Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 3 Dec 2023 13:34:02 -0800 Subject: [PATCH 004/186] complete_x86_target() should enable F16C and FMA when AVX2 is present (#7971) All known AVX2-enabled architectures definitely have these features. 
--- src/CodeGen_X86.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index e34dd30870b4..ab099eef123c 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -47,6 +47,9 @@ Target complete_x86_target(Target t) { } if (t.has_feature(Target::AVX2)) { t.set_feature(Target::AVX); + // All AVX2-enabled architectures have F16C and FMA + t.set_feature(Target::F16C); + t.set_feature(Target::FMA); } if (t.has_feature(Target::AVX)) { t.set_feature(Target::SSE41); From 17578a104b0d9530fbb053a4eaa818580b91b2f7 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Dec 2023 10:08:08 -0800 Subject: [PATCH 005/186] Add two new tail strategies for update definitions (#7949) * Add two new tail strategies for update definitions * Stop printing asm * Update expected number of partitions for Partition::Always * Add a comment explaining why the blend safety check is per dimension * Add serialization support for the new tail strategies * trigger buildbots * Add comment --------- Co-authored-by: Steven Johnson --- src/ApplySplit.cpp | 15 ++++ src/ApplySplit.h | 6 +- src/Deserialization.cpp | 4 + src/Func.cpp | 82 ++++++++++++++++++ src/IRPrinter.cpp | 6 ++ src/Schedule.h | 26 ++++++ src/ScheduleFunctions.cpp | 30 ++++--- src/Serialization.cpp | 4 + src/halide_ir.fbs | 2 + test/correctness/nested_tail_strategies.cpp | 30 ++++++- test/error/CMakeLists.txt | 2 + test/error/round_up_and_blend_race.cpp | 23 +++++ test/error/shift_inwards_and_blend_race.cpp | 19 +++++ test/performance/CMakeLists.txt | 1 + test/performance/blend_tail_strategies.cpp | 93 +++++++++++++++++++++ 15 files changed, 326 insertions(+), 17 deletions(-) create mode 100644 test/error/round_up_and_blend_race.cpp create mode 100644 test/error/shift_inwards_and_blend_race.cpp create mode 100644 test/performance/blend_tail_strategies.cpp diff --git a/src/ApplySplit.cpp b/src/ApplySplit.cpp index 7bde69a38e94..48d27b1ffc7a 100644 --- a/src/ApplySplit.cpp +++ b/src/ApplySplit.cpp @@ -107,6 +107,21 @@ vector apply_split(const Split &split, bool is_update, const s // non-trivial loop. base = likely_if_innermost(base); base = Min::make(base, old_max + (1 - split.factor)); + } else if (tail == TailStrategy::ShiftInwardsAndBlend) { + Expr old_base = base; + base = likely(base); + base = Min::make(base, old_max + (1 - split.factor)); + // Make a mask which will be a loop invariant if inner gets + // vectorized, and apply it if we're in the tail. 
+ Expr unwanted_elems = (-old_extent) % split.factor; + Expr mask = inner >= unwanted_elems; + mask = select(base == old_base, likely(const_true()), mask); + result.emplace_back(mask, ApplySplitResult::BlendProvides); + } else if (tail == TailStrategy::RoundUpAndBlend) { + Expr unwanted_elems = (-old_extent) % split.factor; + Expr mask = inner < split.factor - unwanted_elems; + mask = select(outer < outer_max, likely(const_true()), mask); + result.emplace_back(mask, ApplySplitResult::BlendProvides); } else { internal_assert(tail == TailStrategy::RoundUp); } diff --git a/src/ApplySplit.h b/src/ApplySplit.h index 61774733b02b..5e646b22f08b 100644 --- a/src/ApplySplit.h +++ b/src/ApplySplit.h @@ -36,7 +36,8 @@ struct ApplySplitResult { LetStmt, PredicateCalls, PredicateProvides, - Predicate }; + Predicate, + BlendProvides }; Type type; ApplySplitResult(const std::string &n, Expr val, Type t) @@ -67,6 +68,9 @@ struct ApplySplitResult { bool is_predicate_provides() const { return (type == PredicateProvides); } + bool is_blend_provides() const { + return (type == BlendProvides); + } }; /** Given a Split schedule on a definition (init or update), return a list of diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index b27918756886..bea4ca0d9d92 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -350,6 +350,10 @@ TailStrategy Deserializer::deserialize_tail_strategy(Serialize::TailStrategy tai return TailStrategy::PredicateStores; case Serialize::TailStrategy::ShiftInwards: return TailStrategy::ShiftInwards; + case Serialize::TailStrategy::ShiftInwardsAndBlend: + return TailStrategy::ShiftInwardsAndBlend; + case Serialize::TailStrategy::RoundUpAndBlend: + return TailStrategy::RoundUpAndBlend; case Serialize::TailStrategy::Auto: return TailStrategy::Auto; default: diff --git a/src/Func.cpp b/src/Func.cpp index 37b64df5af5b..8f46e7316531 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -375,6 +375,79 @@ bool is_const_assignment(const string &func_name, const vector &args, cons rhs_checker.has_self_reference || rhs_checker.has_rvar); } + +void check_for_race_conditions_in_split_with_blend(const StageSchedule &sched) { + // Splits with a 'blend' tail strategy do a load and then a store of values + // outside of the region to be computed, so for each split using a 'blend' + // tail strategy, verify that there aren't any parallel vars that stem from + // the same original dimension, so that this load and store doesn't race + // with a true computation of that value happening in some other thread. + + // Note that we only need to check vars in the same dimension, because + // allocation bounds inference is done per-dimension and allocates padding + // based on the values actually accessed by the lowered code (i.e. it covers + // the blend region). So for example, an access beyond the end of a scanline + // can't overflow onto the next scanline. Halide will allocate padding, or + // throw a bounds error if it's an input or output. + + if (sched.allow_race_conditions()) { + return; + } + + std::set parallel; + for (const auto &dim : sched.dims()) { + if (is_unordered_parallel(dim.for_type)) { + parallel.insert(dim.var); + } + } + + // Process the splits in reverse order to figure out which root vars have a + // parallel child. 
+ for (auto it = sched.splits().rbegin(); it != sched.splits().rend(); it++) { + if (it->is_fuse()) { + if (parallel.count(it->old_var)) { + parallel.insert(it->inner); + parallel.insert(it->old_var); + } + } else if (it->is_rename() || it->is_purify()) { + if (parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } else { + if (parallel.count(it->inner) || parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } + } + + // Now propagate back to all children of the identified root vars, to assert + // that none of them use a blending tail strategy. + for (auto it = sched.splits().begin(); it != sched.splits().end(); it++) { + if (it->is_fuse()) { + if (parallel.count(it->inner) || parallel.count(it->outer)) { + parallel.insert(it->old_var); + } + } else if (it->is_rename() || it->is_purify()) { + if (parallel.count(it->old_var)) { + parallel.insert(it->outer); + } + } else { + if (parallel.count(it->old_var)) { + parallel.insert(it->inner); + parallel.insert(it->old_var); + if (it->tail == TailStrategy::ShiftInwardsAndBlend || + it->tail == TailStrategy::RoundUpAndBlend) { + user_error << "Tail strategy " << it->tail + << " may not be used to split " << it->old_var + << " because other vars stemming from the same original " + << "Var or RVar are marked as parallel." + << "This could cause a race condition.\n"; + } + } + } + } +} + } // namespace void Stage::set_dim_type(const VarOrRVar &var, ForType t) { @@ -439,6 +512,10 @@ void Stage::set_dim_type(const VarOrRVar &var, ForType t) { << " in vars for function\n" << dump_argument_list(); } + + if (is_unordered_parallel(t)) { + check_for_race_conditions_in_split_with_blend(definition.schedule()); + } } void Stage::set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api) { @@ -1171,6 +1248,11 @@ void Stage::split(const string &old, const string &outer, const string &inner, c } } + if (tail == TailStrategy::ShiftInwardsAndBlend || + tail == TailStrategy::RoundUpAndBlend) { + check_for_race_conditions_in_split_with_blend(definition.schedule()); + } + if (!definition.is_init()) { user_assert(tail != TailStrategy::ShiftInwards) << "When splitting Var " << old_name diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index cd89e76417c0..dc07d0e0f010 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -180,6 +180,12 @@ std::ostream &operator<<(std::ostream &out, const TailStrategy &t) { case TailStrategy::RoundUp: out << "RoundUp"; break; + case TailStrategy::ShiftInwardsAndBlend: + out << "ShiftInwardsAndBlend"; + break; + case TailStrategy::RoundUpAndBlend: + out << "RoundUpAndBlend"; + break; } return out; } diff --git a/src/Schedule.h b/src/Schedule.h index 22908a8425e4..32a654228673 100644 --- a/src/Schedule.h +++ b/src/Schedule.h @@ -100,6 +100,32 @@ enum class TailStrategy { * instead of a multiple of the split factor as with RoundUp. */ ShiftInwards, + /** Equivalent to ShiftInwards, but protects values that would be + * re-evaluated by loading the memory location that would be stored to, + * modifying only the elements not contained within the overlap, and then + * storing the blended result. + * + * This tail strategy is useful when you want to use ShiftInwards to + * vectorize without a scalar tail, but are scheduling a stage where that + * isn't legal (e.g. an update definition). + * + * Because this is a read - modify - write, this tail strategy cannot be + * used on any dimension the stage is parallelized over as it would cause a + * race condition. 
+ */ + ShiftInwardsAndBlend, + + /** Equivalent to RoundUp, but protected values that would be written beyond + * the end by loading the memory location that would be stored to, + * modifying only the elements within the region being computed, and then + * storing the blended result. + * + * This tail strategy is useful when vectorizing an update to some sub-region + * of a larger Func. As with ShiftInwardsAndBlend, it can't be combined with + * parallelism. + */ + RoundUpAndBlend, + /** For pure definitions use ShiftInwards. For pure vars in * update definitions use RoundUp. For RVars in update * definitions use GuardWithIf. */ diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 5c0b63edfe9e..9c5ca9095575 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -126,8 +126,8 @@ Stmt substitute_in(const string &name, const Expr &value, bool calls, bool provi class AddPredicates : public IRGraphMutator { const Expr &cond; - bool calls; - bool provides; + const Function &func; + ApplySplitResult::Type type; using IRMutator::visit; @@ -135,7 +135,13 @@ class AddPredicates : public IRGraphMutator { auto [args, changed_args] = mutate_with_changes(p->args); auto [values, changed_values] = mutate_with_changes(p->values); Expr predicate = mutate(p->predicate); - if (provides) { + if (type == ApplySplitResult::BlendProvides) { + int idx = 0; + for (Expr &v : values) { + v = select(cond, v, Call::make(func, args, idx++)); + } + return Provide::make(p->name, values, args, predicate); + } else if (type == ApplySplitResult::PredicateProvides) { return Provide::make(p->name, values, args, predicate && cond); } else if (changed_args || changed_values || !predicate.same_as(p->predicate)) { return Provide::make(p->name, values, args, predicate); @@ -146,20 +152,20 @@ class AddPredicates : public IRGraphMutator { Expr visit(const Call *op) override { Expr result = IRMutator::visit(op); - if (calls && op->call_type == Call::Halide) { + if (type == ApplySplitResult::PredicateCalls && op->call_type == Call::Halide) { result = Call::make(op->type, Call::if_then_else, {cond, result}, Call::PureIntrinsic); } return result; } public: - AddPredicates(const Expr &cond, bool calls, bool provides) - : cond(cond), calls(calls), provides(provides) { + AddPredicates(const Expr &cond, const Function &func, ApplySplitResult::Type type) + : cond(cond), func(func), type(type) { } }; -Stmt add_predicates(const Expr &cond, bool calls, bool provides, const Stmt &s) { - return AddPredicates(cond, calls, provides).mutate(s); +Stmt add_predicates(const Expr &cond, const Function &func, ApplySplitResult::Type type, const Stmt &s) { + return AddPredicates(cond, func, type).mutate(s); } // Build a loop nest about a provide node using a schedule @@ -227,10 +233,10 @@ Stmt build_loop_nest( stmt = substitute_in(res.name, res.value, true, false, stmt); } else if (res.is_substitution_in_provides()) { stmt = substitute_in(res.name, res.value, false, true, stmt); - } else if (res.is_predicate_calls()) { - stmt = add_predicates(res.value, true, false, stmt); - } else if (res.is_predicate_provides()) { - stmt = add_predicates(res.value, false, true, stmt); + } else if (res.is_blend_provides() || + res.is_predicate_calls() || + res.is_predicate_provides()) { + stmt = add_predicates(res.value, func, res.type, stmt); } else if (res.is_let()) { stmt = LetStmt::make(res.name, res.value, stmt); } else { diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 857c963cceab..0224bef35600 100644 --- 
a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -320,6 +320,10 @@ Serialize::TailStrategy Serializer::serialize_tail_strategy(const TailStrategy & return Serialize::TailStrategy::PredicateStores; case TailStrategy::ShiftInwards: return Serialize::TailStrategy::ShiftInwards; + case TailStrategy::ShiftInwardsAndBlend: + return Serialize::TailStrategy::ShiftInwardsAndBlend; + case TailStrategy::RoundUpAndBlend: + return Serialize::TailStrategy::RoundUpAndBlend; case TailStrategy::Auto: return Serialize::TailStrategy::Auto; default: diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 8148aca639a9..e4ac5ae49aed 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -527,6 +527,8 @@ enum TailStrategy: ubyte { PredicateLoads, PredicateStores, ShiftInwards, + ShiftInwardsAndBlend, + RoundUpAndBlend, Auto, } diff --git a/test/correctness/nested_tail_strategies.cpp b/test/correctness/nested_tail_strategies.cpp index 2a0ddc7a6bf8..a1f59d30c0bb 100644 --- a/test/correctness/nested_tail_strategies.cpp +++ b/test/correctness/nested_tail_strategies.cpp @@ -19,10 +19,12 @@ void my_free(JITUserContext *user_context, void *ptr) { void check(Func out, int line, std::vector tails) { bool has_round_up = std::find(tails.begin(), tails.end(), TailStrategy::RoundUp) != tails.end() || + std::find(tails.begin(), tails.end(), TailStrategy::RoundUpAndBlend) != tails.end() || std::find(tails.begin(), tails.end(), TailStrategy::PredicateLoads) != tails.end() || std::find(tails.begin(), tails.end(), TailStrategy::PredicateStores) != tails.end(); bool has_shift_inwards = - std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwards) != tails.end(); + std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwards) != tails.end() || + std::find(tails.begin(), tails.end(), TailStrategy::ShiftInwardsAndBlend) != tails.end(); std::vector sizes_to_try; @@ -68,6 +70,12 @@ int main(int argc, char **argv) { return 0; } + // We'll randomly subsample these tests, because otherwise there are too many of them. + std::mt19937 rng(0); + int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); + rng.seed(seed); + std::cout << "Nested tail strategies seed: " << seed << "\n"; + // Test random compositions of tail strategies in simple // producer-consumer pipelines. The bounds being tight sometimes // depends on the simplifier being able to cancel out things. @@ -76,7 +84,8 @@ int main(int argc, char **argv) { TailStrategy::RoundUp, TailStrategy::GuardWithIf, TailStrategy::ShiftInwards, - }; + TailStrategy::RoundUpAndBlend, + TailStrategy::ShiftInwardsAndBlend}; TailStrategy innermost_tails[] = { TailStrategy::RoundUp, @@ -84,7 +93,8 @@ int main(int argc, char **argv) { TailStrategy::PredicateLoads, TailStrategy::PredicateStores, TailStrategy::ShiftInwards, - }; + TailStrategy::RoundUpAndBlend, + TailStrategy::ShiftInwardsAndBlend}; // Two stages. First stage computed at tiles of second. for (auto t1 : innermost_tails) { @@ -110,6 +120,10 @@ int main(int argc, char **argv) { for (auto t1 : innermost_tails) { for (auto t2 : innermost_tails) { for (auto t3 : innermost_tails) { + if ((rng() & 7) != 0) { + continue; + } + Func in("in"), f("f"), g("g"), h("h"); Var x; @@ -134,6 +148,10 @@ int main(int argc, char **argv) { for (auto t1 : tails) { for (auto t2 : innermost_tails) { for (auto t3 : innermost_tails) { + if ((rng() & 7) != 0) { + continue; + } + Func in, f, g, h; Var x; @@ -158,8 +176,12 @@ int main(int argc, char **argv) { // (but can handle smaller outputs). 
for (auto t1 : innermost_tails) { for (auto t2 : tails) { - for (auto t3 : tails) { // Not innermost_tails because of n^4 complexity here. + for (auto t3 : innermost_tails) { for (auto t4 : tails) { + if ((rng() & 63) != 0) { + continue; + } + Func in("in"), f("f"), g("g"), h("h"); Var x; diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 337bc667739e..ef4f5ffea614 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -94,7 +94,9 @@ tests(GROUPS error reuse_var_in_schedule.cpp reused_args.cpp rfactor_inner_dim_non_commutative.cpp + round_up_and_blend_race.cpp run_with_large_stack_throws.cpp + shift_inwards_and_blend_race.cpp specialize_fail.cpp split_inner_wrong_tail_strategy.cpp split_non_innermost_predicated.cpp diff --git a/test/error/round_up_and_blend_race.cpp b/test/error/round_up_and_blend_race.cpp new file mode 100644 index 000000000000..72244c0a6e8b --- /dev/null +++ b/test/error/round_up_and_blend_race.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f; + Var x; + + f(x) = 0; + f(x) += 4; + + // This schedule should be forbidden, because it causes a race condition. + Var xo, xi; + f.update() + .split(x, xo, xi, 8, TailStrategy::RoundUp) + .vectorize(xi, 16, TailStrategy::RoundUpAndBlend) // Access beyond the end of each slice + .parallel(xo); + + printf("Success!\n"); + return 0; +} diff --git a/test/error/shift_inwards_and_blend_race.cpp b/test/error/shift_inwards_and_blend_race.cpp new file mode 100644 index 000000000000..67b4d9a6bcf1 --- /dev/null +++ b/test/error/shift_inwards_and_blend_race.cpp @@ -0,0 +1,19 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + Func f; + Var x; + + f(x) = 0; + f(x) += 4; + + // This schedule should be forbidden, because it causes a race condition. + f.update().vectorize(x, 8, TailStrategy::ShiftInwardsAndBlend).parallel(x); + + printf("Success!\n"); + return 0; +} diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index f47e92d6436b..1fecb06d0195 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -7,6 +7,7 @@ endif() tests(GROUPS performance SOURCES async_gpu.cpp + blend_tail_strategies.cpp block_transpose.cpp boundary_conditions.cpp clamped_vector_load.cpp diff --git a/test/performance/blend_tail_strategies.cpp b/test/performance/blend_tail_strategies.cpp new file mode 100644 index 000000000000..fa6a6f03d8c4 --- /dev/null +++ b/test/performance/blend_tail_strategies.cpp @@ -0,0 +1,93 @@ +#include "Halide.h" +#include "halide_benchmark.h" + +using namespace Halide; +using namespace Halide::Tools; + +int main(int argc, char **argv) { + Var x("x"), y("y"); + + Target t = get_jit_target_from_environment(); + + // Make sure we don't have predicated instructions available + if ((t.arch != Target::X86 && t.arch != Target::ARM) || + t.has_feature(Target::AVX512) || + t.has_feature(Target::SVE)) { + printf("[SKIP] This is a test for architectures without predication. 
" + "Currently we only test x86 before AVX-512 and ARM without SVE\n"); + return 0; + } + + const int N = t.natural_vector_size() * 2; + const int reps = 1024 * 128; + + Buffer output_buf(N - 1, N - 1); + Buffer correct_output; + + std::map times; + for (auto ts : {TailStrategy::GuardWithIf, + TailStrategy::RoundUp, + TailStrategy::ShiftInwardsAndBlend, + TailStrategy::RoundUpAndBlend}) { + Func f, g; + f(x, y) = cast(x + y); + RDom r(0, reps); + f(x, y) = f(x, y) * 3 + cast(0 * r); + g(x, y) = f(x, y); + + f.compute_root() + .update() + .reorder(x, y, r) + .vectorize(x, N / 2, ts); + + if (ts == TailStrategy::ShiftInwardsAndBlend) { + // Hide the stall from a load that overlaps the previous store by + // doing multiple scanlines at once. We expect the tail in y might + // be large, so force partitioning of x even in the loop tail in y. + f.update() + .reorder(y, x) + .unroll(y, 8, TailStrategy::GuardWithIf) + .reorder(x, y) + .partition(x, Partition::Always); + } + + g.compile_jit(); + // Uncomment to see the assembly + // g.compile_to_assembly("/dev/stdout", {}, "f", t); + double t = benchmark([&]() { + g.realize(output_buf); + }); + + // Check correctness + if (ts == TailStrategy::GuardWithIf) { + correct_output = output_buf.copy(); + } else { + for (int y = 0; y < output_buf.height(); y++) { + for (int x = 0; x < output_buf.width(); x++) { + if (output_buf(x, y) != correct_output(x, y)) { + printf("output_buf(%d, %d) = %d instead of %d\n", + x, y, output_buf(x, y), correct_output(x, y)); + } + } + } + } + times[ts] = t; + } + + for (auto p : times) { + std::cout << p.first << " " << p.second << "\n"; + } + + if (times[TailStrategy::GuardWithIf] < times[TailStrategy::ShiftInwardsAndBlend]) { + printf("ShiftInwardsAndBlend is slower than it should be\n"); + return 1; + } + + if (times[TailStrategy::GuardWithIf] < times[TailStrategy::RoundUpAndBlend]) { + printf("RoundUpAndBlend is slower than it should be\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From 209ec02b372e2f6bc0c7155c70ea2ffe94b15c47 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Dec 2023 14:15:23 -0800 Subject: [PATCH 006/186] Add appropriate mattrs for arm-32 extensions (#7978) * Add appropriate mattrs for arm-32 extensions Fixes #7976 * Pull clauses out of if --- src/CodeGen_ARM.cpp | 46 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 826f3723e4bf..03678e5ef605 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1647,19 +1647,34 @@ string CodeGen_ARM::mcpu_tune() const { } string CodeGen_ARM::mattrs() const { + string arch_flags; + string separator; + if (target.has_feature(Target::ARMFp16)) { + arch_flags += separator + "+fullfp16"; + separator = ","; + } + if (target.has_feature(Target::ARMv81a)) { + arch_flags += separator + "+v8.1a"; + separator = ","; + } + if (target.has_feature(Target::ARMDotProd)) { + arch_flags += separator + "+dotprod"; + separator = ","; + } if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { - return "+neon"; + arch_flags += separator + "+neon"; + separator = ","; } if (!target.has_feature(Target::NoNEON)) { - return "+neon"; + arch_flags += separator + "+neon"; + separator = ","; } else { - return "-neon"; + arch_flags += separator + "-neon"; + separator = ","; } } else { // TODO: Should Halide's SVE flags be 64-bit only? 
- string arch_flags; - string separator; if (target.has_feature(Target::SVE2)) { arch_flags = "+sve2"; separator = ","; @@ -1667,28 +1682,11 @@ string CodeGen_ARM::mattrs() const { arch_flags = "+sve"; separator = ","; } - - if (target.has_feature(Target::ARMv81a)) { - arch_flags += separator + "+v8.1a"; - separator = ","; - } - - if (target.has_feature(Target::ARMDotProd)) { - arch_flags += separator + "+dotprod"; - separator = ","; - } - - if (target.has_feature(Target::ARMFp16)) { - arch_flags += separator + "+fullfp16"; - separator = ","; - } - if (target.os == Target::IOS || target.os == Target::OSX) { - return arch_flags + separator + "+reserve-x18"; - } else { - return arch_flags; + arch_flags += separator + "+reserve-x18"; } } + return arch_flags; } bool CodeGen_ARM::use_soft_float_abi() const { From 17b7366ae50ddeea608c0af0fef2260937ace690 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 6 Dec 2023 15:03:14 -0800 Subject: [PATCH 007/186] Move canonical version numbers into source, not build system (#7980) (#7981) * Move canonical version numbers into source, not build system (#7980) * Fixes --- Makefile | 20 -------------------- src/CMakeLists.txt | 10 +++------- src/Deserialization.cpp | 7 ++++--- src/Serialization.cpp | 6 +++--- src/halide_ir.fbs | 10 ++++++++++ src/runtime/HalideRuntime.h | 9 +++++++++ test/runtime/CMakeLists.txt | 1 - 7 files changed, 29 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 7364941941a2..4140da5c8f30 100644 --- a/Makefile +++ b/Makefile @@ -9,12 +9,6 @@ # For correctness and performance tests this include halide build time and run time. For # the tests in test/generator/ this times only the halide build time. -# Halide project version -HALIDE_VERSION_MAJOR ?= 17 -HALIDE_VERSION_MINOR ?= 0 -HALIDE_VERSION_PATCH ?= 0 -HALIDE_VERSION=$(HALIDE_VERSION_MAJOR).$(HALIDE_VERSION_MINOR).$(HALIDE_VERSION_PATCH) - # Disable built-in makefile rules for all apps to avoid pointless file-system # scanning and general weirdness resulting from implicit rules. 
MAKEFLAGS += --no-builtin-rules @@ -146,12 +140,6 @@ WITH_LLVM_INSIDE_SHARED_LIBHALIDE ?= not-empty HL_TARGET ?= host HL_JIT_TARGET ?= host -HL_VERSION_FLAGS = \ - -DHALIDE_VERSION="$(HALIDE_VERSION)" \ - -DHALIDE_VERSION_MAJOR=$(HALIDE_VERSION_MAJOR) \ - -DHALIDE_VERSION_MINOR=$(HALIDE_VERSION_MINOR) \ - -DHALIDE_VERSION_PATCH=$(HALIDE_VERSION_PATCH) - X86_CXX_FLAGS=$(if $(WITH_X86), -DWITH_X86, ) X86_LLVM_CONFIG_LIB=$(if $(WITH_X86), x86, ) @@ -222,7 +210,6 @@ LLVM_CXX_FLAGS_LIBCPP := $(findstring -stdlib=libc++, $(LLVM_CXX_FLAGS)) endif CXX_FLAGS = $(CXXFLAGS) $(CXX_WARNING_FLAGS) $(RTTI_CXX_FLAGS) -Woverloaded-virtual $(FPIC) $(OPTIMIZE) -fno-omit-frame-pointer -DCOMPILING_HALIDE -CXX_FLAGS += $(HL_VERSION_FLAGS) CXX_FLAGS += $(LLVM_CXX_FLAGS) CXX_FLAGS += $(PTX_CXX_FLAGS) CXX_FLAGS += $(ARM_CXX_FLAGS) @@ -248,13 +235,8 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') # Note: if updating here, be sure to update in CMakeLists.txt as well -HALIDE_SERIALIZATION_VERSION_MAJOR ?= 0 HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 -HALIDE_SERIALIZATION_VERSION=$(HALIDE_SERIALIZATION_VERSION_MAJOR).$(HALIDE_SERIALIZATION_VERSION_MINOR).$(HALIDE_SERIALIZATION_VERSION_PATCH) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MAJOR=$(HALIDE_SERIALIZATION_VERSION_MAJOR) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_MINOR=$(HALIDE_SERIALIZATION_VERSION_MINOR) -CXX_FLAGS += -DHALIDE_SERIALIZATION_VERSION_PATCH=$(HALIDE_SERIALIZATION_VERSION_PATCH) endif # This is required on some hosts like powerpc64le-linux-gnu because we may build @@ -307,7 +289,6 @@ TEST_LD_FLAGS = -L$(BIN_DIR) -lHalide $(COMMON_LD_FLAGS) # In the tests, some of our expectations change depending on the llvm version TEST_CXX_FLAGS += -DLLVM_VERSION=$(LLVM_VERSION_TIMES_10) -TEST_CXX_FLAGS += $(HL_VERSION_FLAGS) # In the tests, default to exporting no symbols that aren't explicitly exported TEST_CXX_FLAGS += -fvisibility=hidden -fvisibility-inlines-hidden @@ -1118,7 +1099,6 @@ RUNTIME_CXX_FLAGS = \ -Wno-unused-function \ -Wvla \ -Wsign-compare -RUNTIME_CXX_FLAGS += $(HL_VERSION_FLAGS) $(BUILD_DIR)/initmod.windows_%_x86_32.ll: $(SRC_DIR)/runtime/windows_%_x86.cpp $(BUILD_DIR)/clang_ok @mkdir -p $(@D) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 771944b10d42..5d15d55f4416 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -509,12 +509,6 @@ if (WITH_SERIALIZATION) target_include_directories(Halide PRIVATE "$") target_link_libraries(Halide PRIVATE Halide_flatbuffers) target_compile_definitions(Halide PRIVATE WITH_SERIALIZATION) - # Note: if updating here, be sure to update in Makefile as well - target_compile_definitions(Halide PUBLIC - HALIDE_SERIALIZATION_VERSION_MAJOR=0 - HALIDE_SERIALIZATION_VERSION_MINOR=1 - HALIDE_SERIALIZATION_VERSION_PATCH=0 - ) endif () # Enable serialization testing by intercepting JIT compilation with a serialization roundtrip; @@ -549,8 +543,10 @@ set_target_properties(Halide PROPERTIES VERSION ${Halide_VERSION} SOVERSION ${Halide_SOVERSION_OVERRIDE}) +# Note that we (deliberately) redeclare these versions here, even though the macros +# with identical versions are expected to be defined in source; this allows us to +# ensure that the versions defined between all build systems are identical. 
target_compile_definitions(Halide PUBLIC - HALIDE_VERSION=${Halide_VERSION} HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index bea4ca0d9d92..90590d6f15af 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1418,9 +1418,10 @@ Pipeline Deserializer::deserialize(const std::vector &data) { } std::string deserialized_serialization_version = deserialize_string(pipeline_obj->serialization_version()); - std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + std::string serialization_version = std::to_string((int)Serialize::SerializationVersionMajor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionMinor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionPatch::Value); + if (deserialized_serialization_version != serialization_version) { user_error << "deserialized pipeline is built with Halide serialization version " << deserialized_serialization_version << ", but current Halide serialization version is " << serialization_version << "\n"; diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 0224bef35600..a9342d95ba6d 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1509,9 +1509,9 @@ void Serializer::serialize(const Pipeline &pipeline, std::vector &resul std::to_string(HALIDE_VERSION_MINOR) + "." + std::to_string(HALIDE_VERSION_PATCH); - std::string serialization_version = std::to_string(HALIDE_SERIALIZATION_VERSION_MAJOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_MINOR) + "." + - std::to_string(HALIDE_SERIALIZATION_VERSION_PATCH); + std::string serialization_version = std::to_string((int)Serialize::SerializationVersionMajor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionMinor::Value) + "." + + std::to_string((int)Serialize::SerializationVersionPatch::Value); auto pipeline_obj = Serialize::CreatePipeline(builder, builder.CreateVector(funcs_serialized), diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index e4ac5ae49aed..fe52231ffc49 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -6,6 +6,16 @@ file_identifier "HLDE"; // File extension of any written files. "hlpipe" stands for Halide Pipeline. file_extension "hlpipe"; +enum SerializationVersionMajor: int { + Value = 0 +} +enum SerializationVersionMinor: int { + Value = 1 +} +enum SerializationVersionPatch: int { + Value = 0 +} + // from src/IR.cpp union Stmt { LetStmt, diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 81088971418c..445811009abd 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -18,6 +18,15 @@ #include "runtime_internal.h" #endif +// Note that the canonical Halide version is considered to be defined here +// (rather than in the build system); we redundantly define the value in +// our CMake build, so that we ensure that the in-build metadata (eg soversion) +// matches, but keeping the canonical version here makes it easier to keep +// downstream build systems (eg Blaze/Bazel) properly in sync with the source. +#define HALIDE_VERSION_MAJOR 17 +#define HALIDE_VERSION_MINOR 0 +#define HALIDE_VERSION_PATCH 0 + #ifdef __cplusplus // Forward declare type to allow naming typed handles. // See Type.h for documentation. 
diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt index dbbdba540448..44ebf4c39d9d 100644 --- a/test/runtime/CMakeLists.txt +++ b/test/runtime/CMakeLists.txt @@ -8,7 +8,6 @@ function(_set_target_options NAME) target_compile_definitions( ${NAME} PRIVATE - HALIDE_VERSION=${Halide_VERSION} HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH} From 9f6ec17acafa59d1da959dd39ad4383d43bcd1ee Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 6 Dec 2023 16:59:53 -0800 Subject: [PATCH 008/186] Silence useless "Insufficient parallelism" autoscheduler warning (#7990) --- src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index a83bebc637bc..9ac542cdc38f 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -2804,9 +2804,12 @@ void Partitioner::generate_group_cpu_schedule( } } - if (can_prove(def_par < arch_params.parallelism)) { - user_warning << "Insufficient parallelism for " << f_handle.name() << "\n"; - } + // Silenced: the user can't really do anything about it, + // and it triggers on things like tiny lookup tables + // + // if (can_prove(def_par < arch_params.parallelism)) { + // user_warning << "Insufficient parallelism for " << f_handle.name() << "\n"; + // } // Find the level at which group members will be computed. int tile_inner_index = dims.size() - outer_dims.size() - 1; From 6e57d6cb871720b9af3af24fe2cc6eba8f188fc4 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Thu, 7 Dec 2023 08:06:31 -0800 Subject: [PATCH 009/186] Add a notebook with a visualization of the aprrox_* functions and their errors (#7974) * Add a notebook with a visualization of the aprrox_* functions and their errors * Fix spelling error --- apps/hannk/halide/common_halide.h | 2 + .../docs/approx_log2_and_applications.ipynb | 382 ++++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 apps/hannk/halide/docs/approx_log2_and_applications.ipynb diff --git a/apps/hannk/halide/common_halide.h b/apps/hannk/halide/common_halide.h index 82a9e22d408f..e499177a9410 100644 --- a/apps/hannk/halide/common_halide.h +++ b/apps/hannk/halide/common_halide.h @@ -39,6 +39,8 @@ Halide::Expr align(const Halide::Expr &x, const Halide::Expr &n); // where N is the number of bits of the narrowed result minus one. Halide::Expr multiply_2x_high(const Halide::Expr &a, const Halide::Expr &b); +// For a visualization of the approx_* functions and their errors, see: +// apps/hannk/halide/docs/approx_log2_and_applications.ipynb // Approximate log2(x/2^q_x)*2^q. // q must be less than 16. 
Halide::Expr approx_log2(int q, const Halide::Expr &x, int q_x, const Halide::Type &type = Halide::Int(32)); diff --git a/apps/hannk/halide/docs/approx_log2_and_applications.ipynb b/apps/hannk/halide/docs/approx_log2_and_applications.ipynb new file mode 100644 index 000000000000..d4771b5219b3 --- /dev/null +++ b/apps/hannk/halide/docs/approx_log2_and_applications.ipynb @@ -0,0 +1,382 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "r1XiiUQGUjpx" + }, + "source": [ + "import numpy as np\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Many architectures have shifts where the right-hand-side is signed. A negative\n", + "# RHS is the same as a positive shift in the other direction.\n", + "def shift_right(x, y):\n", + " return np.floor(x / 2**y)\n", + "def shift_left(x, y):\n", + " return np.floor(x * 2**y)\n", + "def rounding_shift_right(x, y):\n", + " return np.round(x / 2**y)\n", + "def rounding_shift_left(x, y):\n", + " return np.round(x * 2**y)\n", + "\n", + "def bitwise_and(x, y):\n", + " return np.mod(x, y + 1)\n", + "\n", + "# This is sqrdmulh on ARM\n", + "def multiply_2x_high(x, y):\n", + " return rounding_shift_right(x * y, 15)\n", + "\n", + "def relative_error(x, y):\n", + " return (x - y) / (np.maximum(x, y) + 1e-3)\n", + "\n", + "def plot_results(x, exact, approxs, title, logx = False, logy = False, relative = False, log2_xscale = 0, log2_yscale = 0):\n", + " fig, [p1, p2] = plt.subplots(2, 1)\n", + "\n", + " p1.set_xlabel('x')\n", + " if logx:\n", + " p1.set_xscale('log')\n", + " p1.set_ylabel(title)\n", + " if logy:\n", + " p1.set_yscale('log')\n", + "\n", + " xscale = 2**log2_xscale\n", + " yscale = 2**log2_yscale\n", + "\n", + " exact = np.round(exact*yscale)/yscale\n", + "\n", + " p1.plot(x/xscale, exact)\n", + " for approx in approxs:\n", + " p1.plot(x/xscale, approx/yscale)\n", + "\n", + " p2.set_xlabel('x')\n", + " if logx:\n", + " p2.set_xscale('log')\n", + "\n", + " p2.set_ylabel('relative error' if relative else 'error')\n", + " for approx in approxs:\n", + " p2.plot(x/xscale, relative_error(approx/yscale, exact) if relative else approx/yscale - exact)\n", + "\n", + "def eval_poly(x, p, q):\n", + " x1 = rounding_shift_left(x, 15 - q)\n", + " y = p[0]\n", + " xi = x1\n", + " for i in p[1:]:\n", + " y = y + multiply_2x_high(i, xi)\n", + " xi = multiply_2x_high(xi, x1)\n", + " return rounding_shift_right(y, 15 - q)\n", + "\n", + "points = 6\n", + "degree = 3\n", + "log2_poly_x = np.arange(points, 2 * points + 1) / points\n", + "log2_poly_y = np.log2(log2_poly_x)\n", + "log2_poly = np.polyfit(log2_poly_x - 1, log2_poly_y, degree)\n", + "\n", + "exp2_poly_x = np.arange(points, 2 * points + 1) / points\n", + "exp2_poly_y = np.exp2(exp2_poly_x - 1) - 1\n", + "exp2_poly = np.polyfit(exp2_poly_x - 1, exp2_poly_y, degree)\n", + "\n", + "log2_poly = log2_poly[::-1]\n", + "exp2_poly = exp2_poly[::-1]\n", + "\n", + "print(log2_poly)\n", + "print(exp2_poly)\n", + "\n", + "log2_poly = np.round(log2_poly * 2**15)\n", + "exp2_poly = np.round(exp2_poly * 2**15)\n", + "exp2_poly[0] = 0\n", + "\n", + "print(log2_poly)\n", + "print(exp2_poly)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1xjo4hIEo_z5" + }, + "source": [ + "# Approximate N*log2(x*2^q_x), where N = 2^q, and the intermediate computations 
are\n", + "# restricted to be integers.\n", + "def approx_log2(x, q, q_x = 0):\n", + " # This can be computed with count_leading_zeros\n", + " floor_log2_x = np.select([x > 0], [np.floor(np.log2(x))], [-1])\n", + "\n", + " # We've computed log2(x*2^q_x) = log2(x) + q_x. Subtract that offset now\n", + " # before multiplying by the result quantization.\n", + " result = shift_left(floor_log2_x - q_x, q)\n", + "\n", + " frac = bitwise_and(shift_right(x, floor_log2_x - q), 2**q - 1)\n", + "\n", + " return result + eval_poly(frac, log2_poly, q)\n", + "\n", + "x = np.arange(1, 10000)\n", + "q = 15\n", + "q_x = 2\n", + "log2_x = np.log2(x / 2**q_x)\n", + "approx_log2_x = approx_log2(x, q, q_x)\n", + "\n", + "plot_results(x, log2_x, [approx_log2_x], 'log2(x)', logx=True, log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6uJN5muLsLdE" + }, + "source": [ + "\n", + "# Approximate 2^(x/2^q_x)*2^q\n", + "def approx_exp2(x, q_x, q):\n", + " int_part = shift_right(x, q_x)\n", + " frac_part = x - shift_left(int_part, q_x)\n", + "\n", + " frac_part = eval_poly(frac_part, exp2_poly, q_x)\n", + "\n", + " exp_int_part = shift_left(1, int_part + q)\n", + " return exp_int_part + rounding_shift_right(exp_int_part * frac_part, q_x)\n", + "\n", + "q_x = 10\n", + "q = 15\n", + "x = np.arange(-4000, 2000)\n", + "approx_exp2_x = approx_exp2(x, q_x, q)\n", + "exact = np.exp2(x / 2**q_x)\n", + "\n", + "plot_results(x, exact, [approx_exp2_x], '2^x', False, True, relative=True, log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "5BP-edzCmNBi" + }, + "source": [ + "q = 15\n", + "x = np.arange(10, 10000) * 10\n", + "round_trip_x = approx_exp2(approx_log2(x, q), q, 0)\n", + "\n", + "plot_results(x, x, [round_trip_x], '2^log2(x)', logx=True, logy=True, relative=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nyrzI90uNH1s" + }, + "source": [ + "# Approximate 2^q*sqrt(2^(x/2^q_x))\n", + "def sqrt_approx_exp2(x, q_x, q):\n", + " return approx_exp2(x, q_x + 1, q)\n", + "\n", + "q = 11\n", + "q_x = 8\n", + "x = np.arange(-1000, 2000)\n", + "approx_exp2_x = sqrt_approx_exp2(x, q_x, q)\n", + "exact = np.sqrt(np.exp2(x / 2**q_x))\n", + "\n", + "plot_results(x, exact, [approx_exp2_x], 'sqrt(2^x)', relative=True, log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Kno5t4VihCTL" + }, + "source": [ + "# Approximate sqrt(x) = 2^((1/2)*log2(x))\n", + "def approx_sqrt(x, q):\n", + " # log2(x) will never be larger than 32, for 32-bit x. 
So to make the result\n", + " # fit in a 16-bit integer, we can make the precision 2^16/32 = 2048.\n", + " q_x = 11;\n", + "\n", + " log2_sqrt_x = approx_log2(x, q_x - 1)\n", + " return approx_exp2(log2_sqrt_x, q_x, q)\n", + "\n", + "q = 15\n", + "x = np.arange(1, 10000)**2\n", + "sqrt_x = np.sqrt(x)\n", + "approx_sqrt_x = approx_sqrt(x, q)\n", + "\n", + "plot_results(x, sqrt_x, [approx_sqrt_x], 'sqrt(x)', log2_yscale=q, relative=True)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0dMecIGr92WY" + }, + "source": [ + "# Approximate 2^31/sqrt(x) = 2^(-(1/2)*log2(x))\n", + "def approx_reciprocal_sqrt(x):\n", + " q = 15\n", + " log2_sqrt_x = approx_log2(x, q - 1)\n", + " return approx_exp2(-log2_sqrt_x, q, 31)\n", + "\n", + "x = np.arange(1, 10000)**2\n", + "inv_sqrt_x = 1 / np.sqrt(x)\n", + "approx_reciprocal_sqrt_x = approx_reciprocal_sqrt(x)\n", + "\n", + "plot_results(x, inv_sqrt_x, [approx_reciprocal_sqrt_x], '1/sqrt(x)', True, True, True, log2_yscale=31)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VFC9aUFcc8d7" + }, + "source": [ + "# Approximate 2^32/x = 2^32*2^(-log2(x))\n", + "def approx_reciprocal(x):\n", + " q = 15;\n", + " log2_x = approx_log2(x, q)\n", + " return approx_exp2(-log2_x, q, 31)\n", + "\n", + "x = 1.01**np.arange(0, 2000)\n", + "inv_x = 1 / x\n", + "approx_inv_x = approx_reciprocal(x)\n", + "# This is ~sqrt(2) times more accurate, but maybe not practical for large x.\n", + "approx_inv_sqrt_x2 = approx_reciprocal_sqrt(x*x)\n", + "\n", + "plot_results(x, inv_x, [approx_inv_x], '1/x', True, True, log2_yscale=31, relative=True)\n", + "plot_results(x, inv_x, [approx_inv_sqrt_x2], '1/x', True, True, log2_yscale=31, relative=True)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6BhQzLIZCcKC" + }, + "source": [ + "# Approximate log2(exp2(x) + c)\n", + "def approx_log2_exp2_plus_constant(x, c, q_x, q):\n", + " # When x/2^q_x is large, approx_exp2 below will overflow. 
But when it is large\n", + " # we don't need it to be very precise\n", + " q_exp = 16 #np.minimum(16, 16 - np.floor(np.log2(np.maximum(x, 1))))\n", + " one = 2**q_exp\n", + "\n", + " one_plus_exp2_x = one * c + approx_exp2(x, q_x, q_exp)\n", + " # Mimic overflow of int32\n", + " one_plus_exp2_x = np.mod(one_plus_exp2_x, 2**31)\n", + "\n", + " raw = approx_log2(one_plus_exp2_x, q, q_exp)\n", + "\n", + " line = rounding_shift_right(x, q_x - q)\n", + "\n", + " threshold = 30 - q_exp\n", + " result = np.select([shift_right(x, q_x) < threshold], [raw], line)\n", + " return result\n", + "\n", + "def approx_log2p1_exp2(x, q_x, q):\n", + " return approx_log2_exp2_plus_constant(x, 1, q_x, q)\n", + "\n", + "def approx_log2m1_exp2(x, q_x, q):\n", + " return approx_log2_exp2_plus_constant(x, -1, q_x, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 11\n", + "q = 15\n", + "\n", + "exact = np.log2(np.exp2(x / 2**q_x) + 1)\n", + "approx = approx_log2p1_exp2(x, q_x, q)\n", + "plot_results(x, exact, [approx], 'log2(2^x + 1)', log2_xscale=q_x, log2_yscale=q)\n", + "\n", + "x = np.arange(1, 4000)*8\n", + "exact = np.log2(np.exp2(x / 2**q_x) - 1)\n", + "approx = approx_log2m1_exp2(x, q_x, q)\n", + "plot_results(x, exact, [approx], 'log2(2^x - 1)', log2_xscale=q_x, log2_yscale=q)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "G6n1u8fcUf-3" + }, + "source": [ + "# Approximate logistic(x) = 1/(e^-x + 1)\n", + "# = 2^log2(1/(e^-x + 1))\n", + "# = 2^-log2(e^-x + 1)\n", + "def approx_logistic(x, q_x, q):\n", + " x2 = multiply_2x_high(x, np.round(-np.log2(np.exp(1)) * 2**14))\n", + " q_exp = 11\n", + " log2_d = approx_log2p1_exp2(x2, q_x - 1, q_exp)\n", + " return approx_exp2(-log2_d, q_exp, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 11\n", + "q = 15\n", + "exact = 1 / (1 + np.exp(-x / 2**q_x))\n", + "approx = approx_logistic(x, q_x, q)\n", + "plot_results(x, exact, [approx], '1/(1 + e^-x)', log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LBXXNc_8twQD" + }, + "source": [ + "# Approximate tanh(x) = (e^2x - 1)/(e^2x + 1)\n", + "# = 2^log2((e^2x - 1)/(e^2x + 1))\n", + "# = 2^(log2(e^2x - 1) - log2(e^2x + 1))\n", + "def approx_tanh(x, q_x, q):\n", + " abs_x_base2 = multiply_2x_high(np.abs(x), np.round(np.log2(np.exp(1)) * 2**14))\n", + " q_exp = 11\n", + " log2_n = approx_log2m1_exp2(abs_x_base2, q_x - 2, q_exp)\n", + " log2_d = approx_log2p1_exp2(abs_x_base2, q_x - 2, q_exp)\n", + " # Saturate at int16\n", + " log2_n = np.clip(log2_n, -(2**15), 2**15)\n", + " log2_d = np.clip(log2_d, -(2**15), 2**15)\n", + " return np.sign(x) * approx_exp2(log2_n - log2_d, q_exp, q)\n", + "\n", + "x = np.arange(-4000, 4000)*8\n", + "q_x = 12\n", + "q = 15\n", + "exact = np.tanh(x / 2**q_x)\n", + "approx = approx_tanh(x, q_x, q)\n", + "\n", + "points = 20\n", + "poly_x = np.arange(0, points * 3) / points\n", + "poly_y = np.tanh(poly_x)\n", + "poly = np.polyfit(poly_x, poly_y, 6)\n", + "approx2 = np.polyval(poly, x / 2**q_x) * 2**q\n", + "\n", + "\n", + "plot_results(x, exact, [approx], 'tanh(x)', log2_xscale=q_x, log2_yscale=q)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From d1ecc1fb65fcf9dafb573c7f781bb1ab6f41d264 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 7 Dec 2023 08:06:57 -0800 Subject: [PATCH 010/186] Make narrowing float->int casts on wasm go via wider ints (#7973) Fixes #7972 --- 
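To make the rewrite concrete, here is a small illustrative sketch (not code from this patch; the narrowing_cast_example, f, g, and x names are invented): a narrowing cast such as int16 of a float32 value is now emitted on wasm as a same-width float-to-int cast followed by an integer narrowing, i.e. i16(f) becomes i16(i32(f)).

#include "Halide.h"
using namespace Halide;

// Illustrative only: on wasm, the cast in g is now lowered as
// cast<int16_t>(cast<int32_t>(f(x))) rather than a direct f32 -> i16 cast.
Func narrowing_cast_example() {
    Var x("x");
    Func f("f"), g("g");
    f(x) = cast<float>(x) * 0.5f;
    g(x) = cast<int16_t>(f(x));  // narrowing float -> int cast
    return g;
}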
src/CodeGen_WebAssembly.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 69d696ce9f8a..948346ad7c2a 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -175,6 +175,17 @@ void CodeGen_WebAssembly::visit(const Cast *op) { } } } + + // Narrowing float -> int casts should go via an integer type of the + // matching width (see https://github.com/halide/Halide/issues/7972) + if (op->value.type().is_float() && + (op->type.is_int() || op->type.is_uint()) && + op->type.bits() < op->value.type().bits()) { + Expr equiv = Cast::make(op->type.with_bits(op->value.type().bits()), op->value); + equiv = Cast::make(op->type, equiv); + codegen(equiv); + return; + } } CodeGen_Posix::visit(op); From 83febb0ad0919e85e5832371907feaa81e342b26 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 7 Dec 2023 09:46:27 -0800 Subject: [PATCH 011/186] Fix handling of assert statements whose conditions get vectorized (#7989) * Fix handling of assert statements whose conditions get vectorized * Fix test name --- src/IRPrinter.cpp | 1 - src/VectorizeLoops.cpp | 2 +- test/correctness/CMakeLists.txt | 1 + test/correctness/vectorized_assert.cpp | 46 ++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 test/correctness/vectorized_assert.cpp diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index dc07d0e0f010..52cb3714268c 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1109,7 +1109,6 @@ void IRPrinter::visit(const VectorReduce *op) { stream << "(" << op->type << ")vector_reduce_" << op->op << "(" - << ", " << op->value << ")"; } diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index df116c841217..7ced1dab0d92 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -816,7 +816,7 @@ class VectorSubs : public IRMutator { } Stmt visit(const AssertStmt *op) override { - return (op->condition.type().lanes() > 1) ? scalarize(op) : op; + return (mutate(op->condition).type().lanes() > 1) ? scalarize(op) : op; } Stmt visit(const IfThenElse *op) override { diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9b72d5ceecb3..da968c419593 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -335,6 +335,7 @@ tests(GROUPS correctness vectorize_mixed_widths.cpp vectorize_nested.cpp vectorize_varying_allocation_size.cpp + vectorized_assert.cpp vectorized_gpu_allocation.cpp vectorized_initialization.cpp vectorized_load_from_vectorized_allocation.cpp diff --git a/test/correctness/vectorized_assert.cpp b/test/correctness/vectorized_assert.cpp new file mode 100644 index 000000000000..3a71174c347b --- /dev/null +++ b/test/correctness/vectorized_assert.cpp @@ -0,0 +1,46 @@ +#include "Halide.h" + +using namespace Halide; + +int error_count = 0; +void my_error(JITUserContext *ucon, const char *msg) { + error_count++; +} + +int main(int argc, char **argv) { + Func f("f"), g("g"); + Var x("x"); + Param p; + + f(x) = x; + f(x) += 1; + g(x) = f(x) + f(2 * x + p); + + g.vectorize(x, 8); + f.bound_storage(x, 32); + // No way to check this at compile time. The size of f depends on both x and + // p. An assert is injected, but the assert is inside g's vectorized loop. 
+ + g.jit_handlers().custom_error = my_error; + + g.compile_jit(); + + // Will trigger the assert + p.set(256); + g.realize({128}); + if (error_count != 1) { + printf("There should have been an error\n"); + return 1; + } + + // Will not trigger the assert + p.set(0); + g.realize({8}); + if (error_count != 1) { + printf("There should not have been an error\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From df36139fee3a1b751b2878403dfd120d9a18fb9c Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 7 Dec 2023 10:02:42 -0800 Subject: [PATCH 012/186] Fix all "unscheduled update()" warnings in our code (#7991) * Fix all "unscheduled update()" warnings in our code And also fix the Mullapudi scheduler to explicitly touch all update stages. This allows us to mark this warning as an error if we so choose. * fixes * fixes * Update recursive_box_filters.cpp --- apps/hist/hist_generator.cpp | 3 +- apps/iir_blur/iir_blur_generator.cpp | 2 ++ .../anderson2021/cost_model_generator.cpp | 2 +- .../mullapudi2016/AutoSchedule.cpp | 33 ++++++++++++++----- test/correctness/recursive_box_filters.cpp | 4 +++ test/error/tuple_output_bounds_check.cpp | 1 + 6 files changed, 35 insertions(+), 10 deletions(-) diff --git a/apps/hist/hist_generator.cpp b/apps/hist/hist_generator.cpp index 32d86d3d0186..3401088e3672 100644 --- a/apps/hist/hist_generator.cpp +++ b/apps/hist/hist_generator.cpp @@ -181,6 +181,7 @@ class Hist : public Halide::Generator { .compute_at(hist_rows.in(), y) .vectorize(x, vec); + hist_rows.update(0).unscheduled(); hist_rows.in() .compute_root() .vectorize(x, vec) @@ -199,7 +200,7 @@ class Hist : public Halide::Generator { .parallel(x) .reorder(ry, x); - cdf.compute_root(); + cdf.compute_root().update().unscheduled(); output.reorder(c, x, y) .bound(c, 0, 3) .unroll(c) diff --git a/apps/iir_blur/iir_blur_generator.cpp b/apps/iir_blur/iir_blur_generator.cpp index 1aeb3e0d1a5f..cfb967390f8c 100644 --- a/apps/iir_blur/iir_blur_generator.cpp +++ b/apps/iir_blur/iir_blur_generator.cpp @@ -51,6 +51,8 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule blur.compute_at(transpose, yo); // Vectorize computations within the strips. + blur.update(0) + .unscheduled(); blur.update(1) .reorder(x, ry) .vectorize(x); diff --git a/src/autoschedulers/anderson2021/cost_model_generator.cpp b/src/autoschedulers/anderson2021/cost_model_generator.cpp index 6dfeb0dc62b5..e40971c5729a 100644 --- a/src/autoschedulers/anderson2021/cost_model_generator.cpp +++ b/src/autoschedulers/anderson2021/cost_model_generator.cpp @@ -661,7 +661,7 @@ class CostModel : public Generator> { }; // Pipeline features processing - conv1_stage1.compute_root().vectorize(c); + conv1_stage1.compute_root().vectorize(c).update().vectorize(c); squashed_head1_filter.compute_root().vectorize(c); // Schedule features processing. 
The number of schedule diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 9ac542cdc38f..be2ede0748b0 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -837,20 +837,27 @@ struct AutoSchedule { } } - for (const auto &m : f.second) { - const int stage = m.first; - const vector &schedules = m.second; - internal_assert(!schedules.empty()); + const int num_stages = func.updates().size() + 1; + for (int stage = 0; stage < num_stages; stage++) { schedule_ss << " " << fname; if (stage > 0) { - schedule_ss << ".update(" << std::to_string(stage - 1) << ")"; + schedule_ss << ".update(" << (stage - 1) << ")"; } - for (const std::string &s : schedules) { - schedule_ss << "\n ." << s; + auto it = f.second.find(stage); + if (it != f.second.end()) { + const vector &schedules = it->second; + internal_assert(!schedules.empty()); + for (const std::string &s : schedules) { + internal_assert(!s.empty()); + schedule_ss << "\n ." << s; + } + } else { + if (stage > 0) { + schedule_ss << ".unscheduled()"; + } } schedule_ss << ";\n"; } - schedule_ss << "}\n"; } @@ -3386,6 +3393,16 @@ string generate_schedules(const vector &outputs, const Target &target, debug(2) << "Generating CPU schedule...\n"; part.generate_cpu_schedule(target, sched); + // Ensure that all update stages are "touched" so we get no warnings/errors + for (const auto &f : sched.func_schedules) { + const Function &func = get_element(sched.env, f.first); + const int num_update_stages = func.updates().size(); + for (int stage = 0; stage < num_update_stages; stage++) { + Definition def = get_stage_definition(func, stage + 1); + def.schedule().touched() = true; + } + } + std::ostringstream oss; oss << sched; string sched_string = oss.str(); diff --git a/test/correctness/recursive_box_filters.cpp b/test/correctness/recursive_box_filters.cpp index 443542ed38bc..58012cbb50cc 100644 --- a/test/correctness/recursive_box_filters.cpp +++ b/test/correctness/recursive_box_filters.cpp @@ -26,6 +26,10 @@ int main(int argc, char **argv) { // have to pass 'true' to the atomic call to tell it to skip the check. 
h.update(2).atomic(true).vectorize(r, 16); + // These stages don't need scheduling + h.update(0).unscheduled(); + h.update(1).unscheduled(); + Buffer r0(size); Buffer r1(size); h.realize({r0, r1}); diff --git a/test/error/tuple_output_bounds_check.cpp b/test/error/tuple_output_bounds_check.cpp index 53b3a26a8337..74df02134182 100644 --- a/test/error/tuple_output_bounds_check.cpp +++ b/test/error/tuple_output_bounds_check.cpp @@ -17,6 +17,7 @@ int main(int argc, char **argv) { Var xo, xi; h.split(x, xo, xi, 16, TailStrategy::RoundUp); + h.update(0).unscheduled(); Buffer r0(size); Buffer r1(size); From 5aa891a78ac2aa970aff1d3128756f7884b5dab5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 7 Dec 2023 10:03:06 -0800 Subject: [PATCH 013/186] =?UTF-8?q?Silence=20useless=20'Outer=20dim=20vect?= =?UTF-8?q?orization=20of=20var'=20warning=20in=20Mullapudi=E2=80=A6=20(#7?= =?UTF-8?q?992)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silence useless 'Outer dim vectorization of var' warning in Mullapudi scheduler --- src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index be2ede0748b0..2ce325538a86 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -2479,10 +2479,13 @@ void Partitioner::vectorize_stage(const Group &g, Stage f_handle, int stage_num, // storage dimension of the func. // // TODO: Check if the warning is necessary. - if (vec_dim_index > 0) { - user_warning << "Outer dim vectorization of var \"" << vec_dim_name - << "\" in function \"" << f_handle.name() << "\"\n"; - } + // + // Disabled: this isn't really user actionable, and is just noise. + // + // if (vec_dim_index > 0) { + // user_warning << "Outer dim vectorization of var \"" << vec_dim_name + // << "\" in function \"" << f_handle.name() << "\"\n"; + // } } } From 19c1c81e8946a6d4471b65be7fb609f055b5ae68 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 08:50:01 -0800 Subject: [PATCH 014/186] Make wasm +sign-ext and +nontrapping-fptoint the default (#7995) * Make wasm +sign-ext and +nontrapping-fptoint the default These have been supported in ~all wasm runtimes for a while now, and +nontrapping-fptoint in particular can make a big performance difference. We should enable these by default, and add a new backdoor (wasm_mvponly) for code paths that need to use the original wasm Minimum Viable Product spec only. * Update simd_op_check_wasm.cpp --- README_webassembly.md | 17 ++++++++--------- python_bindings/src/halide/halide_/PyEnums.cpp | 3 +-- src/CodeGen_WebAssembly.cpp | 8 ++------ src/Target.cpp | 3 +-- src/Target.h | 3 +-- src/WasmExecutor.cpp | 6 ++---- src/runtime/HalideRuntime.h | 3 +-- test/correctness/simd_op_check_wasm.cpp | 7 ++++--- 8 files changed, 20 insertions(+), 30 deletions(-) diff --git a/README_webassembly.md b/README_webassembly.md index f5fad7d79995..0fdcf80f15f1 100644 --- a/README_webassembly.md +++ b/README_webassembly.md @@ -6,10 +6,11 @@ backend. As WebAssembly itself is still under active development, Halide's support has some limitations. Some of the most important: +- Sign-extension operations are enabled by default (but can be avoided via + Target::WasmMvpOnly). +- Non-trapping float-to-int conversions are enabled by default (but can be + avoided via Target::WasmMvpOnly). 
- Fixed-width SIMD (128 bit) can be enabled via Target::WasmSimd128. -- Sign-extension operations can be enabled via Target::WasmSignExt. -- Non-trapping float-to-int conversions can be enabled via - Target::WasmSatFloatToInt. - Threads have very limited support via Target::WasmThreads; see [below](#using-threads) for more details. - Halide's JIT for Wasm is extremely limited and really useful only for @@ -152,9 +153,8 @@ cmake -DLLVM_ENABLE_PROJECTS="clang;lld" ... ``` - To run the JIT tests, set `HL_JIT_TARGET=wasm-32-wasmrt` (possibly adding - `wasm_simd128`, `wasm_signext`, and/or `wasm_sat_float_to_int`) and run - CMake/CTest normally. Note that wasm testing is only support under CMake - (not via Make). + `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is + only supported under CMake (not via Make). ## Enabling wasm AOT @@ -165,9 +165,8 @@ will), you need to install Emscripten locally. (https://emscripten.org/docs/getting_started/downloads.html). - To run the AOT tests, set `HL_TARGET=wasm-32-wasmrt` (possibly adding - `wasm_simd128`, `wasm_signext`, and/or `wasm_sat_float_to_int`) and run - CMake/CTest normally. Note that wasm testing is only support under CMake - (not via Make). + `wasm_simd128`) and run CMake/CTest normally. Note that wasm testing is + only supported under CMake (not via Make). # Running benchmarks diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index 1913b204fbd4..f86e7072edd5 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -165,9 +165,8 @@ void define_enums(py::module &m) { .value("HexagonDma", Target::Feature::HexagonDma) .value("EmbedBitcode", Target::Feature::EmbedBitcode) .value("EnableLLVMLoopOpt", Target::Feature::EnableLLVMLoopOpt) + .value("WasmMvpOnly", Target::Feature::WasmMvpOnly) .value("WasmSimd128", Target::Feature::WasmSimd128) - .value("WasmSignExt", Target::Feature::WasmSignExt) - .value("WasmSatFloatToInt", Target::Feature::WasmSatFloatToInt) .value("WasmThreads", Target::Feature::WasmThreads) .value("WasmBulkMemory", Target::Feature::WasmBulkMemory) .value("SVE", Target::Feature::SVE) diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 948346ad7c2a..3e9aedca3fbf 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -333,9 +333,10 @@ string CodeGen_WebAssembly::mattrs() const { std::ostringstream s; string sep; - if (target.has_feature(Target::WasmSignExt)) { + if (!target.has_feature(Target::WasmMvpOnly)) { s << sep << "+sign-ext"; sep = ","; + s << sep << "+nontrapping-fptoint"; } if (target.has_feature(Target::WasmSimd128)) { @@ -343,11 +344,6 @@ string CodeGen_WebAssembly::mattrs() const { sep = ","; } - if (target.has_feature(Target::WasmSatFloatToInt)) { - s << sep << "+nontrapping-fptoint"; - sep = ","; - } - if (target.has_feature(Target::WasmThreads)) { // "WasmThreads" doesn't directly affect LLVM codegen, // but it does end up requiring atomics, so be sure to enable them. 
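For reference, a brief sketch of how the new default reads at the target-string level; the wasm_target_examples function name is invented, but the target strings are the ones exercised by the test changes later in this patch.

#include "Halide.h"
#include <vector>

// Sketch: plain wasm targets now assume sign-ext and nontrapping-fptoint;
// wasm_mvponly opts back out to the original MVP spec.
std::vector<Halide::Target> wasm_target_examples() {
    return {
        Halide::Target("wasm-32-wasmrt"),               // sign-ext + nontrapping-fptoint on by default
        Halide::Target("wasm-32-wasmrt-wasm_simd128"),  // additionally enables 128-bit SIMD
        Halide::Target("wasm-32-wasmrt-wasm_mvponly"),  // restrict codegen to the original MVP spec
    };
}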
diff --git a/src/Target.cpp b/src/Target.cpp index 597d5bf5367d..e222e97d5282 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -533,8 +533,7 @@ const std::map feature_name_map = { {"embed_bitcode", Target::EmbedBitcode}, {"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt}, {"wasm_simd128", Target::WasmSimd128}, - {"wasm_signext", Target::WasmSignExt}, - {"wasm_sat_float_to_int", Target::WasmSatFloatToInt}, + {"wasm_mvponly", Target::WasmMvpOnly}, {"wasm_threads", Target::WasmThreads}, {"wasm_bulk_memory", Target::WasmBulkMemory}, {"webgpu", Target::WebGPU}, diff --git a/src/Target.h b/src/Target.h index 76b06aed6b8e..331694e34c3a 100644 --- a/src/Target.h +++ b/src/Target.h @@ -143,9 +143,8 @@ struct Target { CheckUnsafePromises = halide_target_feature_check_unsafe_promises, EmbedBitcode = halide_target_feature_embed_bitcode, EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt, + WasmMvpOnly = halide_target_feature_wasm_mvponly, WasmSimd128 = halide_target_feature_wasm_simd128, - WasmSignExt = halide_target_feature_wasm_signext, - WasmSatFloatToInt = halide_target_feature_wasm_sat_float_to_int, WasmThreads = halide_target_feature_wasm_threads, WasmBulkMemory = halide_target_feature_wasm_bulk_memory, WebGPU = halide_target_feature_webgpu, diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index d82932bd3ea0..b99efdc6d67e 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -1308,15 +1308,13 @@ wabt::interp::HostFunc::Ptr make_extern_callback(wabt::interp::Store &store, wabt::Features calc_features(const Target &target) { wabt::Features f; - if (target.has_feature(Target::WasmSignExt)) { + if (!target.has_feature(Target::WasmMvpOnly)) { f.enable_sign_extension(); + f.enable_sat_float_to_int(); } if (target.has_feature(Target::WasmSimd128)) { f.enable_simd(); } - if (target.has_feature(Target::WasmSatFloatToInt)) { - f.enable_sat_float_to_int(); - } return f; } #endif // WITH_WABT diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 445811009abd..f50e498ce88e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1386,9 +1386,8 @@ typedef enum halide_target_feature_t { halide_target_feature_hexagon_dma, ///< Enable Hexagon DMA buffers. halide_target_feature_embed_bitcode, ///< Emulate clang -fembed-bitcode flag. halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.) + halide_target_feature_wasm_mvponly, ///< Disable all extensions to WebAssembly codegen (including +sign-ext and +nontrapping-fptoint, which are on by default). halide_target_feature_wasm_simd128, ///< Enable +simd128 instructions for WebAssembly codegen. - halide_target_feature_wasm_signext, ///< Enable +sign-ext instructions for WebAssembly codegen. - halide_target_feature_wasm_sat_float_to_int, ///< Enable saturating (nontrapping) float-to-int instructions for WebAssembly codegen. halide_target_feature_wasm_threads, ///< Enable use of threads in WebAssembly codegen. Requires the use of a wasm runtime that provides pthread-compatible wrappers (typically, Emscripten with the -pthreads flag). Unsupported under WASI. halide_target_feature_wasm_bulk_memory, ///< Enable +bulk-memory instructions for WebAssembly codegen. halide_target_feature_webgpu, ///< Enable the WebGPU runtime. 
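With WasmSignExt and WasmSatFloatToInt gone, code that used to query those features should now test for the absence of WasmMvpOnly instead, which is what the backend above and the simd_op_check test below do. A minimal sketch; the wasm_has_sign_ext and wasm_has_nontrapping_fptoint helpers are invented for illustration.

#include "Halide.h"

// Sketch of the migration: the two old feature queries collapse into a
// single "not MVP-only" check.
bool wasm_has_sign_ext(const Halide::Target &t) {
    return !t.has_feature(Halide::Target::WasmMvpOnly);
}
bool wasm_has_nontrapping_fptoint(const Halide::Target &t) {
    return !t.has_feature(Halide::Target::WasmMvpOnly);
}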
diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 87f3a0263047..6b6898c82b85 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -16,8 +16,8 @@ class SimdOpCheckWASM : public SimdOpCheckTest { SimdOpCheckWASM(Target t, int w = 768, int h = 128) : SimdOpCheckTest(t, w, h) { use_wasm_simd128 = target.has_feature(Target::WasmSimd128); - use_wasm_sat_float_to_int = target.has_feature(Target::WasmSatFloatToInt); - use_wasm_sign_ext = target.has_feature(Target::WasmSignExt); + use_wasm_sign_ext = !target.has_feature(Target::WasmMvpOnly); + use_wasm_sat_float_to_int = !target.has_feature(Target::WasmMvpOnly); } void add_tests() override { @@ -544,6 +544,7 @@ int main(int argc, char **argv) { argc, argv, { Target("wasm-32-wasmrt"), - Target("wasm-32-wasmrt-wasm_simd128-wasm_sat_float_to_int"), + Target("wasm-32-wasmrt-wasm_simd128"), + Target("wasm-32-wasmrt-wasm_mvponly"), }); } From 96435186fe4aef03b075476eabd3618849be35eb Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 09:50:32 -0800 Subject: [PATCH 015/186] Add join_strings() call and use it from mattrs() (#7997) * Add join_strings() call and use it from mattrs() This is a super-nit kind of fix, but the fact that we had rerolled a join-strings algo in a half-dozen places made my teeth hurt, so I decided to fix it: - Add join_strings() to Util.h - revise the mattrs() calls to use it instead of the janky mess they used This doesn't move the needle on code size or speed but it is less weird. Probably other places we could/should use this too. (Does C++20 have join/split strings in the std library yet? If not, why not?) * Update Util.h * Update Util.h * clang-tidy --- src/CodeGen_ARM.cpp | 31 +++++++++++------------------ src/CodeGen_Hexagon.cpp | 11 ++++++----- src/CodeGen_PowerPC.cpp | 32 ++++++++++-------------------- src/CodeGen_RISCV.cpp | 14 +++++++++---- src/CodeGen_WebAssembly.cpp | 32 +++++++++++------------------- src/CodeGen_X86.cpp | 39 +++++++++++++++++++++---------------- src/Util.h | 24 +++++++++++++++++++++++ 7 files changed, 94 insertions(+), 89 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 03678e5ef605..4cf1dc597ab4 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1647,46 +1647,37 @@ string CodeGen_ARM::mcpu_tune() const { } string CodeGen_ARM::mattrs() const { - string arch_flags; - string separator; + std::vector attrs; if (target.has_feature(Target::ARMFp16)) { - arch_flags += separator + "+fullfp16"; - separator = ","; + attrs.emplace_back("+fullfp16"); } if (target.has_feature(Target::ARMv81a)) { - arch_flags += separator + "+v8.1a"; - separator = ","; + attrs.emplace_back("+v8.1a"); } if (target.has_feature(Target::ARMDotProd)) { - arch_flags += separator + "+dotprod"; - separator = ","; + attrs.emplace_back("+dotprod"); } if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { - arch_flags += separator + "+neon"; - separator = ","; + attrs.emplace_back("+neon"); } if (!target.has_feature(Target::NoNEON)) { - arch_flags += separator + "+neon"; - separator = ","; + attrs.emplace_back("+neon"); } else { - arch_flags += separator + "-neon"; - separator = ","; + attrs.emplace_back("-neon"); } } else { // TODO: Should Halide's SVE flags be 64-bit only? 
if (target.has_feature(Target::SVE2)) { - arch_flags = "+sve2"; - separator = ","; + attrs.emplace_back("+sve2"); } else if (target.has_feature(Target::SVE)) { - arch_flags = "+sve"; - separator = ","; + attrs.emplace_back("+sve"); } if (target.os == Target::IOS || target.os == Target::OSX) { - arch_flags += separator + "+reserve-x18"; + attrs.emplace_back("+reserve-x18"); } } - return arch_flags; + return join_strings(attrs, ","); } bool CodeGen_ARM::use_soft_float_abi() const { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 54f084b9c271..9463a4c921aa 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1801,13 +1801,14 @@ string CodeGen_Hexagon::mcpu_tune() const { } string CodeGen_Hexagon::mattrs() const { - std::stringstream attrs; - attrs << "+hvx-length128b"; - attrs << ",+long-calls"; + std::vector attrs = { + "+hvx-length128b", + "+long-calls", + }; if (target.has_feature(Target::HVX)) { - attrs << ",+hvxv" << isa_version; + attrs.push_back("+hvxv" + std::to_string(isa_version)); } - return attrs.str(); + return join_strings(attrs, ","); } bool CodeGen_Hexagon::use_soft_float_abi() const { diff --git a/src/CodeGen_PowerPC.cpp b/src/CodeGen_PowerPC.cpp index 1f9c96c24d3d..6d7303de3b52 100644 --- a/src/CodeGen_PowerPC.cpp +++ b/src/CodeGen_PowerPC.cpp @@ -161,28 +161,16 @@ string CodeGen_PowerPC::mcpu_tune() const { } string CodeGen_PowerPC::mattrs() const { - string features; - string separator; - string enable; - - features += "+altivec"; - separator = ","; - - enable = target.has_feature(Target::VSX) ? "+" : "-"; - features += separator + enable + "vsx"; - separator = ","; - - enable = target.has_feature(Target::POWER_ARCH_2_07) ? "+" : "-"; - features += separator + enable + "power8-altivec"; - separator = ","; - - // These move instructions are defined in POWER ISA 2.06 but we do - // not check for 2.06 currently. So disable this for anything - // lower than ISA 2.07 - features += separator + enable + "direct-move"; - separator = ","; - - return features; + std::vector attrs = { + "+altivec", + target.has_feature(Target::VSX) ? "+vsx" : "-vsx", + target.has_feature(Target::POWER_ARCH_2_07) ? "+power8-altivec" : "-power8-altivec", + // These move instructions are defined in POWER ISA 2.06 but we do + // not check for 2.06 currently. So disable this for anything + // lower than ISA 2.07 + target.has_feature(Target::POWER_ARCH_2_07) ? 
"+direct-move" : "-direct-move", + }; + return join_strings(attrs, ","); } bool CodeGen_PowerPC::use_soft_float_abi() const { diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index 234dae37e6ec..a702baff78a2 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -164,17 +164,23 @@ string CodeGen_RISCV::mattrs() const { // +f Single-Precision Floating-Point, // +d Double-Precision Floating-Point, // +c Compressed Instructions, - string arch_flags = "+m,+a,+f,+d,+c"; + std::vector attrs = { + "+m", + "+a", + "+f", + "+d", + "+c", + }; if (target.has_feature(Target::RVV)) { - arch_flags += ",+v"; + attrs.emplace_back("+v"); #if LLVM_VERSION >= 160 if (target.vector_bits != 0) { - arch_flags += ",+zvl" + std::to_string(target.vector_bits) + "b"; + attrs.push_back("+zvl" + std::to_string(target.vector_bits) + "b"); } #endif } - return arch_flags; + return join_strings(attrs, ","); } string CodeGen_RISCV::mabi() const { diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp index 3e9aedca3fbf..6f37f1447df1 100644 --- a/src/CodeGen_WebAssembly.cpp +++ b/src/CodeGen_WebAssembly.cpp @@ -330,46 +330,36 @@ string CodeGen_WebAssembly::mcpu_tune() const { } string CodeGen_WebAssembly::mattrs() const { - std::ostringstream s; - string sep; + user_assert(target.os == Target::WebAssemblyRuntime) + << "wasmrt is the only supported 'os' for WebAssembly at this time."; + + std::vector attrs; if (!target.has_feature(Target::WasmMvpOnly)) { - s << sep << "+sign-ext"; - sep = ","; - s << sep << "+nontrapping-fptoint"; + attrs.emplace_back("+sign-ext"); + attrs.emplace_back("+nontrapping-fptoint"); } - if (target.has_feature(Target::WasmSimd128)) { - s << sep << "+simd128"; - sep = ","; + attrs.emplace_back("+simd128"); } - if (target.has_feature(Target::WasmThreads)) { // "WasmThreads" doesn't directly affect LLVM codegen, // but it does end up requiring atomics, so be sure to enable them. - s << sep << ",+atomics"; - sep = ","; + attrs.emplace_back("+atomics"); } - // PIC implies +mutable-globals because the PIC ABI used by the linker // depends on importing and exporting mutable globals. Also -pthread implies // mutable-globals too, so quitely enable it if either of these are specified. if (use_pic() || target.has_feature(Target::WasmThreads)) { - s << sep << "+mutable-globals"; - sep = ","; + attrs.emplace_back("+mutable-globals"); } - // Recent Emscripten builds assume that specifying `-pthread` implies bulk-memory too, // so quietly enable it if either of these are specified. if (target.has_feature(Target::WasmBulkMemory) || target.has_feature(Target::WasmThreads)) { - s << sep << "+bulk-memory"; - sep = ","; + attrs.emplace_back("+bulk-memory"); } - user_assert(target.os == Target::WebAssemblyRuntime) - << "wasmrt is the only supported 'os' for WebAssembly at this time."; - - return s.str(); + return join_strings(attrs, ","); } bool CodeGen_WebAssembly::use_soft_float_abi() const { diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index ab099eef123c..8d87f4c1937e 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -987,49 +987,54 @@ string CodeGen_X86::mcpu_tune() const { // FIXME: we should lower everything here, instead of relying // that -mcpu= (`mcpu_target()`) implies/sets features for us. 
string CodeGen_X86::mattrs() const { - string features; - string separator; + std::vector attrs; if (target.has_feature(Target::FMA)) { - features += "+fma"; - separator = ","; + attrs.emplace_back("+fma"); } if (target.has_feature(Target::FMA4)) { - features += separator + "+fma4"; - separator = ","; + attrs.emplace_back("+fma4"); } if (target.has_feature(Target::F16C)) { - features += separator + "+f16c"; - separator = ","; + attrs.emplace_back("+f16c"); } if (target.has_feature(Target::AVX512) || target.has_feature(Target::AVX512_KNL) || target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { - features += separator + "+avx512f,+avx512cd"; - separator = ","; + attrs.emplace_back("+avx512f"); + attrs.emplace_back("+avx512cd"); if (target.has_feature(Target::AVX512_KNL)) { - features += ",+avx512pf,+avx512er"; + attrs.emplace_back("+avx512pf"); + attrs.emplace_back("+avx512er"); } if (target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { - features += ",+avx512vl,+avx512bw,+avx512dq"; + attrs.emplace_back("+avx512vl"); + attrs.emplace_back("+avx512bw"); + attrs.emplace_back("+avx512dq"); } if (target.has_feature(Target::AVX512_Cannonlake)) { - features += ",+avx512ifma,+avx512vbmi"; + attrs.emplace_back("+avx512ifma"); + attrs.emplace_back("+avx512vbmi"); } if (target.has_feature(Target::AVX512_Zen4)) { - features += ",+avx512bf16,+avx512vnni,+avx512bitalg,+avx512vbmi2"; + attrs.emplace_back("+avx512bf16"); + attrs.emplace_back("+avx512vnni"); + attrs.emplace_back("+avx512bitalg"); + attrs.emplace_back("+avx512vbmi2"); } if (target.has_feature(Target::AVX512_SapphireRapids)) { - features += ",+avxvnni,+amx-int8,+amx-bf16"; + attrs.emplace_back("+avxvnni"); + attrs.emplace_back("+amx-int8"); + attrs.emplace_back("+amx-bf16"); } } #if LLVM_VERSION >= 180 if (gather_might_be_slow(target)) { - features += ",+prefer-no-gather"; + attrs.push_back("+prefer-no-gather"); } #endif - return features; + return join_strings(attrs, ","); } bool CodeGen_X86::use_soft_float_abi() const { diff --git a/src/Util.h b/src/Util.h index 1bc53d5b3691..15c297796911 100644 --- a/src/Util.h +++ b/src/Util.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -185,6 +186,29 @@ std::string replace_all(const std::string &str, const std::string &find, const s /** Split the source string using 'delim' as the divider. */ std::vector split_string(const std::string &source, const std::string &delim); +/** Join the source vector using 'delim' as the divider. */ +template +std::string join_strings(const std::vector &sources, const std::string &delim) { + size_t sz = 0; + if (!sources.empty()) { + sz += delim.size() * (sources.size() - 1); + } + for (const auto &s : sources) { + sz += s.size(); + } + std::string result; + result.reserve(sz); + bool need_delim = false; + for (const auto &s : sources) { + if (need_delim) { + result += delim; + } + result += s; + need_delim = true; + } + return result; +} + /** Perform a left fold of a vector. Returns a default-constructed * vector element if the vector is empty. Similar to std::accumulate * but with a less clunky syntax. 
*/ From 9c099c29379ea379309109dce9c23b731da2d8a1 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 8 Dec 2023 09:53:04 -0800 Subject: [PATCH 016/186] Teach unrolling to exploit conditions in enclosing ifs (#7969) * Teach unrolling to exploit conditions in enclosing ifs Fixes #7968 * Handle vectorization as well * Remove unused usings * Add missing print --- Makefile | 2 + src/BoundConstantExtentLoops.cpp | 136 ++++++++++++++++++ src/BoundConstantExtentLoops.h | 24 ++++ src/BoundsInference.cpp | 4 +- src/CMakeLists.txt | 4 +- src/Lower.cpp | 5 + src/Simplify.cpp | 14 +- src/Simplify.h | 17 ++- src/UnrollLoops.cpp | 79 +--------- src/VectorizeLoops.cpp | 4 +- test/correctness/CMakeLists.txt | 2 + ...roll_loop_with_implied_constant_bounds.cpp | 54 +++++++ .../vectorized_guard_with_if_tail.cpp | 42 ++++++ 13 files changed, 298 insertions(+), 89 deletions(-) create mode 100644 src/BoundConstantExtentLoops.cpp create mode 100644 src/BoundConstantExtentLoops.h create mode 100644 test/correctness/unroll_loop_with_implied_constant_bounds.cpp create mode 100644 test/correctness/vectorized_guard_with_if_tail.cpp diff --git a/Makefile b/Makefile index 4140da5c8f30..b24dfdc2d80d 100644 --- a/Makefile +++ b/Makefile @@ -459,6 +459,7 @@ SOURCE_FILES = \ BoundaryConditions.cpp \ Bounds.cpp \ BoundsInference.cpp \ + BoundConstantExtentLoops.cpp \ BoundSmallAllocations.cpp \ Buffer.cpp \ Callable.cpp \ @@ -654,6 +655,7 @@ HEADER_FILES = \ BoundaryConditions.h \ Bounds.h \ BoundsInference.h \ + BoundConstantExtentLoops.h \ BoundSmallAllocations.h \ Buffer.h \ Callable.h \ diff --git a/src/BoundConstantExtentLoops.cpp b/src/BoundConstantExtentLoops.cpp new file mode 100644 index 000000000000..d2901854f6eb --- /dev/null +++ b/src/BoundConstantExtentLoops.cpp @@ -0,0 +1,136 @@ +#include "BoundConstantExtentLoops.h" +#include "Bounds.h" +#include "CSE.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "SimplifyCorrelatedDifferences.h" +#include "Substitute.h" + +namespace Halide { +namespace Internal { + +namespace { +class BoundLoops : public IRMutator { + using IRMutator::visit; + + std::vector> lets; + + Stmt visit(const LetStmt *op) override { + if (is_pure(op->value)) { + lets.emplace_back(op->name, op->value); + Stmt s = IRMutator::visit(op); + lets.pop_back(); + return s; + } else { + return IRMutator::visit(op); + } + } + + std::vector facts; + Stmt visit(const IfThenElse *op) override { + facts.push_back(op->condition); + Stmt then_case = mutate(op->then_case); + Stmt else_case; + if (op->else_case.defined()) { + facts.back() = simplify(!op->condition); + else_case = mutate(op->else_case); + } + facts.pop_back(); + if (then_case.same_as(op->then_case) && + else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } + } + + Stmt visit(const For *op) override { + if (is_const(op->extent)) { + // Nothing needs to be done + return IRMutator::visit(op); + } + + if (op->for_type == ForType::Unrolled || + op->for_type == ForType::Vectorized) { + // Give it one last chance to simplify to an int + Expr extent = simplify(op->extent); + Stmt body = op->body; + const IntImm *e = extent.as(); + + if (e == nullptr) { + // We're about to hard fail. Get really aggressive + // with the simplifier. 
+ for (auto it = lets.rbegin(); it != lets.rend(); it++) { + extent = Let::make(it->first, it->second, extent); + } + extent = remove_likelies(extent); + extent = substitute_in_all_lets(extent); + extent = simplify(extent, + true, + Scope::empty_scope(), + Scope::empty_scope(), + facts); + e = extent.as(); + } + + Expr extent_upper; + if (e == nullptr) { + // Still no luck. Try taking an upper bound and + // injecting an if statement around the body. + extent_upper = find_constant_bound(extent, Direction::Upper, Scope()); + if (extent_upper.defined()) { + e = extent_upper.as(); + body = + IfThenElse::make(likely_if_innermost(Variable::make(Int(32), op->name) < + op->min + op->extent), + body); + } + } + + if (e == nullptr && permit_failed_unroll && op->for_type == ForType::Unrolled) { + // Still no luck, but we're allowed to fail. Rewrite + // to a serial loop. + user_warning << "HL_PERMIT_FAILED_UNROLL is allowing us to unroll a non-constant loop into a serial loop. Did you mean to do this?\n"; + body = mutate(body); + return For::make(op->name, op->min, op->extent, + ForType::Serial, op->partition_policy, op->device_api, std::move(body)); + } + + user_assert(e) + << "Can only " << (op->for_type == ForType::Unrolled ? "unroll" : "vectorize") + << " for loops over a constant extent.\n" + << "Loop over " << op->name << " has extent " << extent << ".\n"; + body = mutate(body); + + return For::make(op->name, op->min, e, + op->for_type, op->partition_policy, op->device_api, std::move(body)); + } else { + return IRMutator::visit(op); + } + } + bool permit_failed_unroll = false; + +public: + BoundLoops() { + // Experimental autoschedulers may want to unroll without + // being totally confident the loop will indeed turn out + // to be constant-sized. If this feature continues to be + // important, we need to expose it in the scheduling + // language somewhere, but how? For now we do something + // ugly and expedient. + + // For the tracking issue to fix this, see + // https://github.com/halide/Halide/issues/3479 + permit_failed_unroll = get_env_variable("HL_PERMIT_FAILED_UNROLL") == "1"; + } +}; + +} // namespace + +Stmt bound_constant_extent_loops(const Stmt &s) { + return BoundLoops().mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/BoundConstantExtentLoops.h b/src/BoundConstantExtentLoops.h new file mode 100644 index 000000000000..061064f795f9 --- /dev/null +++ b/src/BoundConstantExtentLoops.h @@ -0,0 +1,24 @@ +#ifndef HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H +#define HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H + +/** \file + * Defines the lowering pass that enforces a constant extent on all + * vectorized or unrolled loops. + */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +/** Replace all loop extents of unrolled or vectorized loops with constants, by + * substituting and simplifying as needed. If we can't determine a constant + * extent, but can determine a constant upper bound, inject an if statement into + * the body. If we can't even determine a constant upper bound, throw a user + * error. */ +Stmt bound_constant_extent_loops(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index d8a1ff53cc37..31b441ea4251 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1013,11 +1013,11 @@ class BoundsInference : public IRMutator { } // Dump out the region required of each stage for debugging. 
- /* debug(0) << "Box required of " << producer.name << " by " << consumer.name - << " stage " << consumer.stage << ":\n"; + << " stage " << consumer.stage << ":\n" + << " used: " << b.used << "\n"; for (size_t k = 0; k < b.size(); k++) { debug(0) << " " << b[k].min << " ... " << b[k].max << "\n"; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5d15d55f4416..390fee9a64e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,8 @@ set(HEADER_FILES BoundaryConditions.h Bounds.h BoundsInference.h - BoundSmallAllocations.h + BoundConstantExtentLoops.h + BoundSmallAllocations.h Buffer.h Callable.h CanonicalizeGPUVars.h @@ -189,6 +190,7 @@ set(SOURCE_FILES BoundaryConditions.cpp Bounds.cpp BoundsInference.cpp + BoundConstantExtentLoops.cpp BoundSmallAllocations.cpp Buffer.cpp Callable.cpp diff --git a/src/Lower.cpp b/src/Lower.cpp index 67aedde288d0..37c4bac07efb 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -11,6 +11,7 @@ #include "AddParameterChecks.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" +#include "BoundConstantExtentLoops.h" #include "BoundSmallAllocations.h" #include "Bounds.h" #include "BoundsInference.h" @@ -312,6 +313,10 @@ void lower_impl(const vector &output_funcs, s = simplify_correlated_differences(s); log("Lowering after simplifying correlated differences:", s); + debug(1) << "Bounding constant extent loops...\n"; + s = bound_constant_extent_loops(s); + log("Lowering after bounding constant extent loops:", s); + debug(1) << "Unrolling...\n"; s = unroll_loops(s); log("Lowering after unrolling:", s); diff --git a/src/Simplify.cpp b/src/Simplify.cpp index 7a2cbac5a047..339ef2917c83 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -355,8 +355,13 @@ Simplify::ScopedFact::~ScopedFact() { Expr simplify(const Expr &e, bool remove_dead_let_stmts, const Scope &bounds, - const Scope &alignment) { + const Scope &alignment, + const std::vector &assumptions) { Simplify m(remove_dead_let_stmts, &bounds, &alignment); + std::vector facts; + for (const Expr &a : assumptions) { + facts.push_back(m.scoped_truth(a)); + } Expr result = m.mutate(e, nullptr); if (m.in_unreachable) { return unreachable(e.type()); @@ -366,8 +371,13 @@ Expr simplify(const Expr &e, bool remove_dead_let_stmts, Stmt simplify(const Stmt &s, bool remove_dead_let_stmts, const Scope &bounds, - const Scope &alignment) { + const Scope &alignment, + const std::vector &assumptions) { Simplify m(remove_dead_let_stmts, &bounds, &alignment); + std::vector facts; + for (const Expr &a : assumptions) { + facts.push_back(m.scoped_truth(a)); + } Stmt result = m.mutate(s); if (m.in_unreachable) { return Evaluate::make(unreachable()); diff --git a/src/Simplify.h b/src/Simplify.h index 14dec65fc025..b9335c0c3de9 100644 --- a/src/Simplify.h +++ b/src/Simplify.h @@ -13,19 +13,22 @@ namespace Halide { namespace Internal { -/** Perform a a wide range of simplifications to expressions and - * statements, including constant folding, substituting in trivial - * values, arithmetic rearranging, etc. Simplifies across let - * statements, so must not be called on stmts with dangling or - * repeated variable names. +/** Perform a wide range of simplifications to expressions and statements, + * including constant folding, substituting in trivial values, arithmetic + * rearranging, etc. Simplifies across let statements, so must not be called on + * stmts with dangling or repeated variable names. 
Can optionally be passed + * known bounds of any variables, known alignment properties, and any other + * Exprs that should be assumed to be true. */ // @{ Stmt simplify(const Stmt &, bool remove_dead_code = true, const Scope &bounds = Scope::empty_scope(), - const Scope &alignment = Scope::empty_scope()); + const Scope &alignment = Scope::empty_scope(), + const std::vector &assumptions = std::vector()); Expr simplify(const Expr &, bool remove_dead_code = true, const Scope &bounds = Scope::empty_scope(), - const Scope &alignment = Scope::empty_scope()); + const Scope &alignment = Scope::empty_scope(), + const std::vector &assumptions = std::vector()); // @} /** Attempt to statically prove an expression is true using the simplifier. */ diff --git a/src/UnrollLoops.cpp b/src/UnrollLoops.cpp index e1726aa28ceb..2823c8b9ac9f 100644 --- a/src/UnrollLoops.cpp +++ b/src/UnrollLoops.cpp @@ -1,16 +1,10 @@ #include "UnrollLoops.h" -#include "Bounds.h" -#include "CSE.h" #include "IRMutator.h" #include "IROperator.h" #include "Simplify.h" -#include "SimplifyCorrelatedDifferences.h" #include "Substitute.h" #include "UniquifyVariableNames.h" -using std::pair; -using std::vector; - namespace Halide { namespace Internal { @@ -19,62 +13,13 @@ namespace { class UnrollLoops : public IRMutator { using IRMutator::visit; - vector> lets; - - Stmt visit(const LetStmt *op) override { - if (is_pure(op->value)) { - lets.emplace_back(op->name, op->value); - Stmt s = IRMutator::visit(op); - lets.pop_back(); - return s; - } else { - return IRMutator::visit(op); - } - } - Stmt visit(const For *for_loop) override { if (for_loop->for_type == ForType::Unrolled) { - // Give it one last chance to simplify to an int - Expr extent = simplify(for_loop->extent); Stmt body = for_loop->body; - const IntImm *e = extent.as(); - - if (e == nullptr) { - // We're about to hard fail. Get really aggressive - // with the simplifier. - for (auto it = lets.rbegin(); it != lets.rend(); it++) { - extent = Let::make(it->first, it->second, extent); - } - extent = remove_likelies(extent); - extent = substitute_in_all_lets(extent); - extent = simplify(extent); - e = extent.as(); - } + const IntImm *e = for_loop->extent.as(); - Expr extent_upper; - bool use_guard = false; - if (e == nullptr) { - // Still no luck. Try taking an upper bound and - // injecting an if statement around the body. - extent_upper = find_constant_bound(extent, Direction::Upper, Scope()); - if (extent_upper.defined()) { - e = extent_upper.as(); - use_guard = true; - } - } - - if (e == nullptr && permit_failed_unroll) { - // Still no luck, but we're allowed to fail. Rewrite - // to a serial loop. - user_warning << "HL_PERMIT_FAILED_UNROLL is allowing us to unroll a non-constant loop into a serial loop. 
Did you mean to do this?\n"; - body = mutate(body); - return For::make(for_loop->name, for_loop->min, for_loop->extent, - ForType::Serial, for_loop->partition_policy, for_loop->device_api, std::move(body)); - } - - user_assert(e) - << "Can only unroll for loops over a constant extent.\n" - << "Loop over " << for_loop->name << " has extent " << extent << ".\n"; + internal_assert(e) + << "Loop over " << for_loop->name << " should have had a constant extent\n"; body = mutate(body); if (e->value == 1) { @@ -94,9 +39,6 @@ class UnrollLoops : public IRMutator { } else { iters = Block::make(iter, iters); } - if (use_guard) { - iters = IfThenElse::make(likely_if_innermost(i < for_loop->extent), iters); - } } return iters; @@ -105,21 +47,6 @@ class UnrollLoops : public IRMutator { return IRMutator::visit(for_loop); } } - bool permit_failed_unroll = false; - -public: - UnrollLoops() { - // Experimental autoschedulers may want to unroll without - // being totally confident the loop will indeed turn out - // to be constant-sized. If this feature continues to be - // important, we need to expose it in the scheduling - // language somewhere, but how? For now we do something - // ugly and expedient. - - // For the tracking issue to fix this, see - // https://github.com/halide/Halide/issues/3479 - permit_failed_unroll = get_env_variable("HL_PERMIT_FAILED_UNROLL") == "1"; - } }; } // namespace diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 7ced1dab0d92..89c4f020af51 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -951,7 +951,9 @@ class VectorSubs : public IRMutator { if (op->for_type == ForType::Vectorized) { const IntImm *extent_int = extent.as(); - if (!extent_int || extent_int->value <= 1) { + internal_assert(extent_int) + << "Vectorized for loop extent should have been rewritten to a constant\n"; + if (extent_int->value <= 1) { user_error << "Loop over " << op->name << " has extent " << extent << ". 
Can only vectorize loops over a " diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index da968c419593..6b4529be6be5 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -319,6 +319,7 @@ tests(GROUPS correctness uninitialized_read.cpp unique_func_image.cpp unroll_dynamic_loop.cpp + unroll_loop_with_implied_constant_bounds.cpp unrolled_reduction.cpp unsafe_dedup_lets.cpp unsafe_promises.cpp @@ -337,6 +338,7 @@ tests(GROUPS correctness vectorize_varying_allocation_size.cpp vectorized_assert.cpp vectorized_gpu_allocation.cpp + vectorized_guard_with_if_tail.cpp vectorized_initialization.cpp vectorized_load_from_vectorized_allocation.cpp vectorized_reduction_bug.cpp diff --git a/test/correctness/unroll_loop_with_implied_constant_bounds.cpp b/test/correctness/unroll_loop_with_implied_constant_bounds.cpp new file mode 100644 index 000000000000..c38d59c5214a --- /dev/null +++ b/test/correctness/unroll_loop_with_implied_constant_bounds.cpp @@ -0,0 +1,54 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + // This test verifies that unrolling/vectorizing is capable of inferring + // constant bounds of loops that are implied by containing if statement + // conditions, e.g the following structure should work: + + /* + let extent = foo + if (foo == 7) { + unrolled for (x from 0 to extent) {...} + } + */ + + for (int i = 0; i < 2; i++) { + Func intermediate("intermediate"); + + Func output1("output1"), output2("output2"); + + Var x("x"), y("y"), c("c"); + + intermediate(x, y, c) = x + y + c; + + output1(x, y, c) = intermediate(x, y, c); + output2(x, y, c) = intermediate(x, y, c); + + Expr three_channels = + (output1.output_buffer().dim(2).extent() == 3 && + output1.output_buffer().dim(2).min() == 0 && + output2.output_buffer().dim(2).extent() == 3 && + output2.output_buffer().dim(2).min() == 0); + + if (i == 0) { + intermediate.compute_root() + .specialize(three_channels) + .unroll(c); + } else { + intermediate.compute_root() + .specialize(three_channels) + .vectorize(c); + } + + Pipeline p{{output1, output2}}; + + // Should not throw an error in loop unrolling or vectorization. + p.compile_jit(); + } + + printf("Success!\n"); + + return 0; +} diff --git a/test/correctness/vectorized_guard_with_if_tail.cpp b/test/correctness/vectorized_guard_with_if_tail.cpp new file mode 100644 index 000000000000..62bf975d93f1 --- /dev/null +++ b/test/correctness/vectorized_guard_with_if_tail.cpp @@ -0,0 +1,42 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + Var x; + + for (int i = 0; i < 2; i++) { + Func f, g; + f(x) = x; + g(x) = f(x) * 2; + + g.vectorize(x, 8, TailStrategy::GuardWithIf); + + f.compute_at(g, x); + + // A varying amount of f is required depending on if we're in the steady + // state of g or the tail. Nonetheless, the amount required has a constant + // upper bound of 8. Vectorization, unrolling, and variants of store_in that + // require constant extent should all be able to handle this. 
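        // (Note: iteration i == 0 exercises vectorize() on f and i == 1
        // exercises unroll(); both iterations also use
        // store_in(MemoryType::Register), which needs a constant-size
        // allocation, so each path relies on the upper bound of 8 being
        // inferred.)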
+ if (i == 0) { + f.vectorize(x); + } else { + f.unroll(x); + } + f.store_in(MemoryType::Register); + + Buffer buf = g.realize({37}); + + for (int i = 0; i < buf.width(); i++) { + int correct = i * 2; + if (buf(i) != correct) { + printf("buf(%d) = %d instead of %d\n", + i, buf(i), correct); + return 1; + } + } + } + + printf("Success!\n"); + return 0; +} From 357e64685619ab0aaee03f2efa5a4e38d4fb5372 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Dec 2023 11:17:30 -0800 Subject: [PATCH 017/186] Do some basic validation of Target Features (#7986) (#7987) * Do some basic validation of Target Features (#7986) * Update Target.cpp * Update Target.cpp * Fixes * Update Target.cpp * Improve error messaging. * format * Update Target.cpp --- python_bindings/test/correctness/target.py | 5 +- src/Target.cpp | 83 ++++++++++++++++++++++ src/Target.h | 6 ++ test/correctness/target.cpp | 7 +- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/python_bindings/test/correctness/target.py b/python_bindings/test/correctness/target.py index 18eee2651301..7876bc97ecef 100644 --- a/python_bindings/test/correctness/target.py +++ b/python_bindings/test/correctness/target.py @@ -50,9 +50,6 @@ def test_target(): 32, [ hl.TargetFeature.JIT, - hl.TargetFeature.SSE41, - hl.TargetFeature.AVX, - hl.TargetFeature.AVX2, hl.TargetFeature.CUDA, hl.TargetFeature.OpenCL, hl.TargetFeature.OpenGLCompute, @@ -60,7 +57,7 @@ def test_target(): ], ) ts = t1.to_string() - assert ts == "arm-32-android-avx-avx2-cuda-debug-jit-opencl-openglcompute-sse41" + assert ts == "arm-32-android-cuda-debug-jit-opencl-openglcompute" assert hl.Target.validate_target_string(ts) # Expected failures: diff --git a/src/Target.cpp b/src/Target.cpp index e222e97d5282..49011348544f 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -785,8 +785,90 @@ void bad_target_string(const std::string &target) { << "On this platform, the host target is: " << get_host_target().to_string() << "\n"; } +void do_check_bad(const Target &t, const std::initializer_list &v) { + for (Target::Feature f : v) { + user_assert(!t.has_feature(f)) + << "Target feature " << Target::feature_to_name(f) + << " is incompatible with the Target's architecture. (" << t << ")\n"; + } +} + } // namespace +void Target::validate_features() const { + // Note that the features don't have to be exhaustive, but enough to avoid obvious mistakes is good. 
+ if (arch == X86) { + do_check_bad(*this, { + ARMDotProd, + ARMFp16, + ARMv7s, + ARMv81a, + NoNEON, + POWER_ARCH_2_07, + RVV, + SVE, + SVE2, + VSX, + WasmBulkMemory, + WasmMvpOnly, + WasmSimd128, + WasmThreads, + }); + } else if (arch == ARM) { + do_check_bad(*this, { + AVX, + AVX2, + AVX512, + AVX512_Cannonlake, + AVX512_KNL, + AVX512_SapphireRapids, + AVX512_Skylake, + AVX512_Zen4, + F16C, + FMA, + FMA4, + POWER_ARCH_2_07, + RVV, + SSE41, + VSX, + WasmBulkMemory, + WasmMvpOnly, + WasmSimd128, + WasmThreads, + }); + } else if (arch == WebAssembly) { + do_check_bad(*this, { + ARMDotProd, + ARMFp16, + ARMv7s, + ARMv81a, + AVX, + AVX2, + AVX512, + AVX512_Cannonlake, + AVX512_KNL, + AVX512_SapphireRapids, + AVX512_Skylake, + AVX512_Zen4, + F16C, + FMA, + FMA4, + HVX_128, + HVX_128, + HVX_v62, + HVX_v65, + HVX_v66, + NoNEON, + POWER_ARCH_2_07, + RVV, + SSE41, + SVE, + SVE2, + VSX, + }); + } +} + Target::Target(const std::string &target) { Target host = get_host_target(); @@ -798,6 +880,7 @@ Target::Target(const std::string &target) { bad_target_string(target); } } + validate_features(); } Target::Target(const char *s) diff --git a/src/Target.h b/src/Target.h index 331694e34c3a..97c141f308e5 100644 --- a/src/Target.h +++ b/src/Target.h @@ -177,6 +177,7 @@ struct Target { for (const auto &f : initial_features) { set_feature(f); } + validate_features(); } Target(OS o, Arch a, int b, const std::vector &initial_features = std::vector()) @@ -357,6 +358,11 @@ struct Target { private: /** A bitmask that stores the active features. */ std::bitset features; + + /** Attempt to validate that all features set are sensible for the base Target. + * This is *not* guaranteed to get all invalid combinations, but is intended + * to catch at least the most common (e.g., setting arm-specific features on x86). */ + void validate_features() const; }; /** Return the target corresponding to the host machine. */ diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 160d870ac09a..8fc03b589a73 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -51,11 +51,10 @@ int main(int argc, char **argv) { // Full specification round-trip, crazy features t1 = Target(Target::Android, Target::ARM, 32, - {Target::JIT, Target::SSE41, Target::AVX, Target::AVX2, - Target::CUDA, Target::OpenCL, Target::OpenGLCompute, - Target::Debug}); + {Target::JIT, Target::CUDA, Target::OpenCL, + Target::OpenGLCompute, Target::Debug}); ts = t1.to_string(); - if (ts != "arm-32-android-avx-avx2-cuda-debug-jit-opencl-openglcompute-sse41") { + if (ts != "arm-32-android-cuda-debug-jit-opencl-openglcompute") { printf("to_string failure: %s\n", ts.c_str()); return 1; } From 3d5cf40cd64b32dfecf7a584cf4790c9c3237b4d Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Tue, 12 Dec 2023 18:50:56 +0100 Subject: [PATCH 018/186] Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. (#7913) * Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. * WIP: I get segfaults. The device_interface pointer is bogus. * Figured it out... * Allow global sync on d3d12. * Cleanly time all buffer copies as well. * Cleanup old comment. * Following Andrews suggestion for suffixing buffer copies in the profiler. * Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host. * Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'. * WIP: I get segfaults. 
The device_interface pointer is bogus. * Figured it out... * Allow global sync on d3d12. * Cleanly time all buffer copies as well. * Cleanup old comment. * Following Andrews suggestion for suffixing buffer copies in the profiler. * Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host. * Attempt to fix output parsing. * Fix crash for copy_to_device * halide_device_sync_global(NULL) -> success * Fixed the buffer copy bug. Added a new test that will cause buffer copies in two directions within the compiled pipeline. This will catch this better in the future. Tweaked the profile report section header printing. * Clang-format, my dear friend... --- src/CodeGen_Internal.cpp | 1 + src/OffloadGPULoops.cpp | 8 +- src/Profiling.cpp | 72 ++++++++++ src/runtime/HalideRuntime.h | 9 ++ src/runtime/d3d12compute.cpp | 8 +- src/runtime/device_interface.cpp | 15 ++ src/runtime/profiler_common.cpp | 128 +++++++++++++++--- src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + .../device_buffer_copies_with_profile.cpp | 71 ++++++++++ test/performance/memory_profiler.cpp | 12 +- test/performance/profiler.cpp | 12 +- 12 files changed, 306 insertions(+), 32 deletions(-) create mode 100644 test/correctness/device_buffer_copies_with_profile.cpp diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 5c36ea58aae3..2fc5b5cae0df 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -40,6 +40,7 @@ bool function_takes_user_context(const std::string &name) { "halide_device_malloc", "halide_device_and_host_malloc", "halide_device_sync", + "halide_device_sync_global", "halide_do_par_for", "halide_do_loop_task", "halide_do_task", diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 7b8464211994..46e6544036b7 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -249,7 +249,13 @@ class InjectGpuOffload : public IRMutator { Call::make(Handle(), Call::make_struct, args, Call::Intrinsic), Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic), }; - return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + Stmt run_and_assert = call_extern_and_assert("halide_" + api_unique_name + "_run", run_args); + if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) { + Expr device_interface = make_device_interface_call(loop->device_api, MemoryType::Auto); + Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); + return Block::make(run_and_assert, sync_and_assert); + } + return run_and_assert; } public: diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 7bd9a9fe4db7..2be058b3c8a6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -6,6 +6,7 @@ #include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" +#include "InjectHostDevBufferCopies.h" #include "Profiling.h" #include "Scope.h" #include "Simplify.h" @@ -422,6 +423,77 @@ class InjectProfiling : public IRMutator { } return IfThenElse::make(std::move(condition), std::move(then_case), std::move(else_case)); } + + Stmt visit(const LetStmt *op) override { + if (const Call *call = op->value.as()) { + Stmt start_profiler; + if (call->name == "halide_copy_to_host" || call->name == "halide_copy_to_device") { + std::string buffer_name; + if (const Variable *var = call->args.front().as()) { + buffer_name = var->name; + if (ends_with(buffer_name, ".buffer")) { + buffer_name = buffer_name.substr(0, buffer_name.size() - 7); + } 
else { + internal_error << "Expected to find a variable ending in .buffer as first argument to function call " << call->name << "\n"; + } + } else { + internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n"; + } + bool requires_sync = false; + if (call->name == "halide_copy_to_host") { + int copy_to_host_id = get_func_id(buffer_name + " (copy to host)"); + start_profiler = set_current_func(copy_to_host_id); + requires_sync = false; + } else if (call->name == "halide_copy_to_device") { + int copy_to_device_id = get_func_id(buffer_name + " (copy to device)"); + start_profiler = set_current_func(copy_to_device_id); + requires_sync = true; + } else { + internal_error << "Unexpected function name.\n"; + } + if (start_profiler.defined()) { + // The copy functions are followed by an assert, which we will wrap in the timed body. + const AssertStmt *copy_assert = nullptr; + Stmt other; + if (const Block *block = op->body.as()) { + if (const AssertStmt *assert = block->first.as()) { + copy_assert = assert; + other = block->rest; + } + } else if (const AssertStmt *assert = op->body.as()) { + copy_assert = assert; + } + if (copy_assert) { + std::vector steps; + steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message)); + if (requires_sync) { + internal_assert(call->name == "halide_copy_to_device"); + Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface. + Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface}); + steps.push_back(sync_and_assert); + } + steps.push_back(set_current_func(stack.back())); + + if (other.defined()) { + steps.push_back(mutate(other)); + } + return Block::make(start_profiler, + LetStmt::make(op->name, mutate(op->value), + Block::make(steps))); + } else { + internal_error << "No assert found after buffer copy.\n"; + } + } + } + } + + Stmt body = mutate(op->body); + Expr value = mutate(op->value); + if (body.same_as(op->body) && value.same_as(op->value)) { + return op; + } + return LetStmt::make(op->name, value, body); + } }; } // namespace diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index f50e498ce88e..eea4faf7b073 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -885,6 +885,15 @@ extern int halide_device_release_crop(void *user_context, * should rarely be necessary, except maybe for profiling. */ extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf); +/** + * Wait for current GPU operations to complete. Calling this explicitly + * should rarely be necessary, except maybe for profiling. + * This variation of the synchronizing is useful when a synchronization is desirable + * without specifying any buffer to synchronize on. + * Calling this with a null device_interface is always illegal. + */ +extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface); + /** Allocate device memory to back a halide_buffer_t. 
*/
 extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
                                 const struct halide_device_interface_t *device_interface);
diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp
index 9d652423ff9a..adae690800cc 100644
--- a/src/runtime/d3d12compute.cpp
+++ b/src/runtime/d3d12compute.cpp
@@ -2786,8 +2786,12 @@ WEAK int halide_d3d12compute_device_sync(void *user_context, struct halide_buffe
         return d3d12_context.error();
     }
 
-    d3d12_buffer *dbuffer = peel_buffer(buffer);
-    d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
+    if (buffer != nullptr) {
+        d3d12_buffer *dbuffer = peel_buffer(buffer);
+        d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
+    } else {
+        d3d12compute_device_sync_internal(d3d12_context.device, nullptr);
+    }
 
     return halide_error_code_success;
 }
diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp
index 692a28e5fa9f..710d1259678d 100644
--- a/src/runtime/device_interface.cpp
+++ b/src/runtime/device_interface.cpp
@@ -231,6 +231,21 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
     }
 }
 
+/**
+ * Wait for current GPU operations to complete. Calling this explicitly
+ * should rarely be necessary, except maybe for profiling.
+ * This variation of the synchronizing is useful when a synchronization is desirable
+ * without specifying any buffer to synchronize on.
+ */
+WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
+    if (device_interface == nullptr) {
+        return halide_error_code_no_device_interface;
+    }
+    // This function immediately calls the device_interface implementation to synchronize on
+    // "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
+    return device_interface->impl->device_sync(user_context, nullptr);
+}
+
 /** Allocate device memory to back a halide_buffer_t. 
*/ WEAK int halide_device_malloc(void *user_context, struct halide_buffer_t *buf, const halide_device_interface_t *device_interface) { diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp index aed1376b6087..ccbe0bf11ecb 100644 --- a/src/runtime/profiler_common.cpp +++ b/src/runtime/profiler_common.cpp @@ -349,6 +349,14 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st }; } } + bool support_colors = false; + const char *term = getenv("TERM"); + if (term) { + // Check if the terminal supports colors + if (strstr(term, "color") || strstr(term, "xterm")) { + support_colors = true; + } + } for (halide_profiler_pipeline_stats *p = s->pipelines; p; p = (halide_profiler_pipeline_stats *)(p->next)) { @@ -385,14 +393,31 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st if (print_f_states) { int f_stats_count = 0; halide_profiler_func_stats **f_stats = (halide_profiler_func_stats **)__builtin_alloca(p->num_funcs * sizeof(halide_profiler_func_stats *)); + const char *substr_copy_to_device = " (copy to device)"; + const char *substr_copy_to_host = " (copy to host)"; + + int max_func_name_length = 23; // length of the section header + int num_copy_to_device = 0; + int num_copy_to_host = 0; - int max_func_name_length = 0; + uint64_t total_func_time = 0; + uint64_t total_copy_to_device_time = 0; + uint64_t total_copy_to_host_time = 0; for (int i = 0; i < p->num_funcs; i++) { halide_profiler_func_stats *fs = p->funcs + i; int name_len = strlen(fs->name); if (name_len > max_func_name_length) { max_func_name_length = name_len; } + if (strstr(fs->name, substr_copy_to_device)) { + num_copy_to_device++; + total_copy_to_device_time += fs->time; + } else if (strstr(fs->name, substr_copy_to_host)) { + num_copy_to_host++; + total_copy_to_host_time += fs->time; + } else { + total_func_time += fs->time; + } } for (int i = 0; i < p->num_funcs; i++) { @@ -418,18 +443,8 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st } } - for (int i = 0; i < f_stats_count; i++) { - size_t cursor = 0; - sstr.clear(); - halide_profiler_func_stats *fs = f_stats[i]; - - sstr << " " << fs->name << ": "; - cursor += max_func_name_length + 5; - while (sstr.size() < cursor) { - sstr << " "; - } - - float ft = fs->time / (p->runs * 1000000.0f); + const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) { + float ft = time / (p->runs * 1000000.0f); if (ft < 10000) { sstr << " "; } @@ -451,16 +466,40 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st sstr << " "; } - int percent = 0; + int perthousand = 0; if (p->time != 0) { - percent = (100 * fs->time) / p->time; + perthousand = (1000 * time) / p->time; + } + sstr << "("; + if (perthousand < 100) { + sstr << " "; } - sstr << "(" << percent << "%)"; - cursor += 8; + int percent = perthousand / 10; + sstr << percent << "." 
<< (perthousand - percent * 10) << "%)"; + if (!light) { + cursor += 10; + while (sstr.size() < cursor) { + sstr << " "; + } + } + }; + + auto print_report_entry = [&](halide_profiler_func_stats *fs, const char *suffix_cut) { + size_t cursor = 0; + sstr.clear(); + + sstr << " " << fs->name; + if (suffix_cut) { + sstr.erase(strlen(suffix_cut)); + } + sstr << ": "; + cursor += max_func_name_length + 7; while (sstr.size() < cursor) { sstr << " "; } + print_time_and_percentage(fs->time, cursor, false); + if (!serial) { float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10); sstr << "threads: " << threads; @@ -494,6 +533,61 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st sstr << "\n"; halide_print(user_context, sstr.str()); + }; + + if (num_copy_to_host == 0 && num_copy_to_device == 0) { + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + print_report_entry(fs, nullptr); + } + } else { + const auto print_section_header = [&](const char *name, uint64_t total_time) { + size_t cursor = 0; + sstr.clear(); + sstr << " "; + if (support_colors) { + sstr << "\033[90m\033[3m"; + cursor += 9; + } + sstr << "[" << name << " "; + cursor += max_func_name_length + 7; + while (sstr.size() < cursor) { + sstr << ":"; + } + print_time_and_percentage(total_time, cursor, true); + sstr << " ::::]"; + if (support_colors) { + sstr << "\033[0m"; + } + sstr << "\n"; + halide_print(user_context, sstr.str()); + }; + + print_section_header("funcs", total_func_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (!strstr(fs->name, substr_copy_to_device) && !strstr(fs->name, substr_copy_to_host)) { + print_report_entry(fs, nullptr); + } + } + if (num_copy_to_device) { + print_section_header("buffer copies to device", total_copy_to_device_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (strstr(fs->name, substr_copy_to_device)) { + print_report_entry(fs, substr_copy_to_device); + } + } + } + if (num_copy_to_host) { + print_section_header("buffer copies to host", total_copy_to_host_time); + for (int i = 0; i < f_stats_count; i++) { + halide_profiler_func_stats *fs = f_stats[i]; + if (strstr(fs->name, substr_copy_to_host)) { + print_report_entry(fs, substr_copy_to_host); + } + } + } } } } diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index 51f5b7245343..5c64391b6259 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -50,6 +50,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_device_malloc, (void *)&halide_device_release, (void *)&halide_device_sync, + (void *)&halide_device_sync_global, (void *)&halide_disable_timer_interrupt, (void *)&halide_do_par_for, (void *)&halide_do_parallel_tasks, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 6b4529be6be5..4ee9f57480dc 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -76,6 +76,7 @@ tests(GROUPS correctness debug_to_file_reorder.cpp deferred_loop_level.cpp deinterleave4.cpp + device_buffer_copies_with_profile.cpp device_buffer_copy.cpp device_copy_at_inner_loop.cpp device_crop.cpp diff --git a/test/correctness/device_buffer_copies_with_profile.cpp b/test/correctness/device_buffer_copies_with_profile.cpp new file mode 100644 index 000000000000..7398334fc06a --- /dev/null +++ 
b/test/correctness/device_buffer_copies_with_profile.cpp @@ -0,0 +1,71 @@ +#include "Halide.h" + +using namespace Halide; + +int run_test(Target t) { + // Sliding window with the producer on the GPU and the consumer on + // the CPU. This requires a copy inside the loop over which we are + // sliding. Currently this copies the entire buffer back and + // forth, which is suboptimal in the general case. In this + // specific case we're folded over y, so copying the entire buffer + // is not much more than just copying the part that was modified. + + Func f0{"f0_on_cpu"}, f1{"f1_on_gpu"}, f2{"f2_on_cpu"}; + Var x, y, tx, ty; + + // Produce something on CPU + f0(x, y) = x + y; + f0.compute_root(); + + // Which we use to produce something on GPU, causing a copy_to_device. + f1(x, y) = f0(x, y) + f0(x, y + 1); + f1.compute_root().gpu_tile(x, y, tx, ty, 8, 8); + + // Which in turn we use to produce something on CPU, causing a copy_to_host. + f2(x, y) = f1(x, y) * 2; + f2.compute_root(); + + // Make the buffer a little bigger so we actually can see the copy time. + Buffer out = f2.realize({2000, 2000}, t); + // Let's only verify a part of it... + for (int y = 0; y < 100; y++) { + for (int x = 0; x < 100; x++) { + int correct = 4 * (x + y) + 2; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + return 1; + } + } + } + return 0; +} + +int main(int argc, char **argv) { + Target t = get_jit_target_from_environment(); + if (!t.has_gpu_feature()) { + printf("[SKIP] no gpu feature enabled\n"); + return 0; + } + printf("Testing without profiler.\n"); + int result = run_test(t); + if (result != 0) { + return 1; + } + + printf("Testing thread based profiler.\n"); + result = run_test(t.with_feature(Target::Profile)); + if (result != 0) { + return 1; + } + if (t.os == Target::Linux) { + printf("Testing timer based profiler.\n"); + result = run_test(t.with_feature(Target::ProfileByTimer)); + if (result != 0) { + return 1; + } + } + + printf("Success!\n"); + return 0; +} diff --git a/test/performance/memory_profiler.cpp b/test/performance/memory_profiler.cpp index 3fb511979265..8ca5cf3c2295 100644 --- a/test/performance/memory_profiler.cpp +++ b/test/performance/memory_profiler.cpp @@ -16,13 +16,13 @@ void reset_stats() { } void my_print(JITUserContext *, const char *msg) { - float this_ms, this_threads; - int idx, this_percentage, this_heap_peak; + float this_ms, this_threads, this_percentage; + int idx, this_heap_peak; int this_num_mallocs, this_malloc_avg, this_stack_peak; int val; // printf("%s", msg); - val = sscanf(msg, " g_%d: %fms (%d%%) threads: %f peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 7) { @@ -31,7 +31,7 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%d%%) peak: %d num: %d avg: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) peak: %d num: %d avg: %d", &idx, &this_ms, &this_percentage, &this_heap_peak, &this_num_mallocs, &this_malloc_avg); if (val == 6) { @@ -40,13 +40,13 @@ void my_print(JITUserContext *, const char *msg) { malloc_avg = this_malloc_avg; } - val = sscanf(msg, " g_%d: %fms (%d%%) threads: %f stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) threads: %f stack: %d", &idx, &this_ms, &this_percentage, &this_threads, &this_stack_peak); if (val == 5) { stack_peak = this_stack_peak; } - val 
= sscanf(msg, " g_%d: %fms (%d%%) stack: %d", + val = sscanf(msg, " g_%d: %fms (%f%%) stack: %d", &idx, &this_ms, &this_percentage, &this_stack_peak); if (val == 4) { stack_peak = this_stack_peak; diff --git a/test/performance/profiler.cpp b/test/performance/profiler.cpp index 3912a16c4211..bf5d166c0e81 100644 --- a/test/performance/profiler.cpp +++ b/test/performance/profiler.cpp @@ -3,14 +3,14 @@ using namespace Halide; -int percentage = 0; +float percentage = 0; float ms = 0; void my_print(JITUserContext *, const char *msg) { float this_ms; - int this_percentage; - int val = sscanf(msg, " fn13: %fms (%d", &this_ms, &this_percentage); + float this_percentage; + int val = sscanf(msg, " fn13: %fms (%f", &this_ms, &this_percentage); if (val != 2) { - val = sscanf(msg, " fn13$1: %fms (%d", &this_ms, &this_percentage); + val = sscanf(msg, " fn13$1: %fms (%f", &this_ms, &this_percentage); } if (val == 2) { ms = this_ms; @@ -59,8 +59,8 @@ int run_test(bool use_timer_profiler) { printf("Time spent in fn13: %fms\n", ms); - if (percentage < 40) { - printf("Percentage of runtime spent in f13: %d\n" + if (percentage < 40.0f) { + printf("Percentage of runtime spent in f13: %.1f%%\n" "This is suspiciously low. It should be more like 66%%\n", percentage); return 1; From 6d29ad5a0b5afd650e3e3d6f977a3b03b23b3655 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 13 Dec 2023 09:02:37 -0800 Subject: [PATCH 019/186] Add missing Python bindings for various recent additions to Func and Stage (#8002) * Add missing Python bindings for various recent additions to Func and Stage We have been sloppy about maintaining these. Also added a bit of testing. * Update PyEnums.cpp --- .../src/halide/halide_/PyEnums.cpp | 10 ++++++ python_bindings/src/halide/halide_/PyFunc.cpp | 10 ++++-- .../src/halide/halide_/PyScheduleMethods.h | 9 +++++ .../src/halide/halide_/PyStage.cpp | 4 ++- .../test/correctness/boundary_conditions.py | 33 ++++++++++++------- .../test/correctness/realize_warnings.py | 24 ++++++++++++++ 6 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index f86e7072edd5..d723d66461d8 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -68,7 +68,12 @@ void define_enums(py::module &m) { py::enum_(m, "TailStrategy") .value("RoundUp", TailStrategy::RoundUp) .value("GuardWithIf", TailStrategy::GuardWithIf) + .value("Predicate", TailStrategy::Predicate) + .value("PredicateLoads", TailStrategy::PredicateLoads) + .value("PredicateStores", TailStrategy::PredicateStores) .value("ShiftInwards", TailStrategy::ShiftInwards) + .value("ShiftInwardsAndBlend", TailStrategy::ShiftInwardsAndBlend) + .value("RoundUpAndBlend", TailStrategy::RoundUpAndBlend) .value("Auto", TailStrategy::Auto); py::enum_(m, "TargetOS") @@ -216,6 +221,11 @@ void define_enums(py::module &m) { .value("stmt", OutputFileType::stmt) .value("stmt_html", OutputFileType::stmt_html) .value("compiler_log", OutputFileType::compiler_log); + + py::enum_(m, "Partition") + .value("Auto", Partition::Auto) + .value("Never", Partition::Never) + .value("Always", Partition::Always); } } // namespace PythonBindings diff --git a/python_bindings/src/halide/halide_/PyFunc.cpp b/python_bindings/src/halide/halide_/PyFunc.cpp index dcbd122c6228..b7e82900a6cf 100644 --- a/python_bindings/src/halide/halide_/PyFunc.cpp +++ b/python_bindings/src/halide/halide_/PyFunc.cpp @@ -205,19 +205,25 @@ void 
define_func(py::module &m) { }) .def("compute_at", (Func & (Func::*)(const Func &, const Var &)) & Func::compute_at, py::arg("f"), py::arg("var")) - .def("compute_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::compute_at, py::arg("f"), py::arg("var")) + .def("compute_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::compute_at, py::arg("f"), py::arg("rvar")) .def("compute_at", (Func & (Func::*)(LoopLevel)) & Func::compute_at, py::arg("loop_level")) .def("store_at", (Func & (Func::*)(const Func &, const Var &)) & Func::store_at, py::arg("f"), py::arg("var")) - .def("store_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::store_at, py::arg("f"), py::arg("var")) + .def("store_at", (Func & (Func::*)(const Func &, const RVar &)) & Func::store_at, py::arg("f"), py::arg("rvar")) .def("store_at", (Func & (Func::*)(LoopLevel)) & Func::store_at, py::arg("loop_level")) .def("async_", &Func::async) + .def("bound_storage", &Func::bound_storage) .def("memoize", &Func::memoize) .def("compute_inline", &Func::compute_inline) .def("compute_root", &Func::compute_root) .def("store_root", &Func::store_root) + .def("hoist_storage", (Func & (Func::*)(const Func &f, const Var &var)) & Func::hoist_storage, py::arg("f"), py::arg("var")) + .def("hoist_storage", (Func & (Func::*)(const Func &f, const RVar &rvar)) & Func::hoist_storage, py::arg("f"), py::arg("rvar")) + .def("hoist_storage", (Func & (Func::*)(LoopLevel)) & Func::hoist_storage, py::arg("loop_level")) + .def("hoist_storage_root", &Func::hoist_storage_root) + .def("store_in", &Func::store_in, py::arg("memory_type")) .def( diff --git a/python_bindings/src/halide/halide_/PyScheduleMethods.h b/python_bindings/src/halide/halide_/PyScheduleMethods.h index 9086bbafc5c0..2c8c00a98f4e 100644 --- a/python_bindings/src/halide/halide_/PyScheduleMethods.h +++ b/python_bindings/src/halide/halide_/PyScheduleMethods.h @@ -33,6 +33,15 @@ HALIDE_NEVER_INLINE void add_schedule_methods(PythonClass &class_instance) { .def("fuse", &T::fuse, py::arg("inner"), py::arg("outer"), py::arg("fused")) + .def("partition", (T & (T::*)(const VarOrRVar &var, Partition partition_policy)) & T::partition, + py::arg("var"), py::arg("partition_policy")) + .def("never_partition_all", &T::never_partition_all) + .def("never_partition", (T & (T::*)(const std::vector &vars)) & T::never_partition, + py::arg("vars")) + .def("always_partition_all", &T::always_partition_all) + .def("always_partition", (T & (T::*)(const std::vector &vars)) & T::always_partition, + py::arg("vars")) + .def("serial", &T::serial, py::arg("var")) diff --git a/python_bindings/src/halide/halide_/PyStage.cpp b/python_bindings/src/halide/halide_/PyStage.cpp index e84c6fcc7189..b412a6f2b39e 100644 --- a/python_bindings/src/halide/halide_/PyStage.cpp +++ b/python_bindings/src/halide/halide_/PyStage.cpp @@ -17,7 +17,9 @@ void define_stage(py::module &m) { .def("rfactor", (Func(Stage::*)(std::vector>)) & Stage::rfactor, py::arg("preserved")) .def("rfactor", (Func(Stage::*)(const RVar &, const Var &)) & Stage::rfactor, - py::arg("r"), py::arg("v")); + py::arg("r"), py::arg("v")) + + .def("unscheduled", &Stage::unscheduled); py::implicitly_convertible(); diff --git a/python_bindings/test/correctness/boundary_conditions.py b/python_bindings/test/correctness/boundary_conditions.py index 716b0cdfd6a1..32abd12ff0e6 100644 --- a/python_bindings/test/correctness/boundary_conditions.py +++ b/python_bindings/test/correctness/boundary_conditions.py @@ -4,19 +4,22 @@ test_min = -25 test_extent = 100 -x, y = 
hl.Var(), hl.Var() +x, y = hl.vars("x y") def expect_eq(actual, expected): assert expected == actual, "Failed: expected %d, actual %d" % (expected, actual) -def schedule_test(f, vector_width, target): +def schedule_test(f, vector_width, target, partition_policy): if vector_width != 1: f.vectorize(x, vector_width) + f.partition(x, partition_policy); + f.partition(y, partition_policy); + if target.has_gpu_feature() and vector_width <= 16: - xo, yo, xi, yi = hl.Var(), hl.Var(), hl.Var(), hl.Var() + xo, yo, xi, yi = hl.vars("xo yo xi yi") f.gpu_tile(x, y, xo, yo, xi, yi, 2, 2) @@ -30,11 +33,12 @@ def realize_and_check( test_extent_y, vector_width, target, + partition_policy, ): result = hl.Buffer(hl.UInt(8), [test_extent_x, test_extent_y]) result.set_min([test_min_x, test_min_y]) f2 = hl.lambda_func(x, y, f[x, y]) - schedule_test(f2, vector_width, target) + schedule_test(f2, vector_width, target, partition_policy) f2.realize(result, target) result.copy_to_host() for r in range(test_min_y, test_min_y + test_extent_y): @@ -91,8 +95,8 @@ def check_mirror_interior(input, result, c, r): expect_eq(result[c, r], input[mapped_x, mapped_y]) -def test_all(vector_width, target): - # print("target is %s " % str(target)) +def test_all(vector_width, target, partition_policy): + # print("target is %s, partition_policy is %s " % (str(target), str(partition_policy))) W = 32 H = 32 @@ -137,6 +141,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**image_input_args), @@ -148,6 +153,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**undef_min_args), @@ -159,6 +165,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) realize_and_check( bc(**undef_max_args), @@ -170,6 +177,7 @@ def test_all(vector_width, target): H, vector_width, target, + partition_policy, ) realize_and_check( bc(**implicit_bounds_args), @@ -181,6 +189,7 @@ def test_all(vector_width, target): test_extent, vector_width, target, + partition_policy, ) @@ -189,11 +198,13 @@ def test_all(vector_width, target): vector_width_power_max = 6 # https://github.com/halide/Halide/issues/2148 - if target.has_feature(hl.TargetFeature.Metal) or target.has_feature( - hl.TargetFeature.D3D12Compute - ): - vector_width_power_max = 3 + if target.has_feature(hl.TargetFeature.Metal) or \ + target.has_feature(hl.TargetFeature.Vulkan) or \ + target.has_feature(hl.TargetFeature.OpenGLCompute) or \ + target.has_feature(hl.TargetFeature.D3D12Compute): + vector_width_power_max = 2 for i in range(0, vector_width_power_max): vector_width = 1 << i - test_all(vector_width, target) + test_all(vector_width, target, hl.Partition.Auto) + test_all(vector_width, target, hl.Partition.Never) diff --git a/python_bindings/test/correctness/realize_warnings.py b/python_bindings/test/correctness/realize_warnings.py index a76e5727c93e..f182538c3c2b 100644 --- a/python_bindings/test/correctness/realize_warnings.py +++ b/python_bindings/test/correctness/realize_warnings.py @@ -27,6 +27,30 @@ def test_warnings(): for line in stdout_lines: assert line == expected_warning +def test_unscheduled(suppress): + x = hl.Var() + f = hl.Func("f_%s" % str(suppress)) + f[x] = 0 + f[x] += 5 + f.vectorize(x, 8) + if suppress: + f.update(0).unscheduled() + + buffer = io.StringIO() + with contextlib.redirect_stdout(buffer): + f.realize([1024]) + + buffer.seek(0) + stdout_lines = buffer.readlines() + if suppress: + assert 
len(stdout_lines) == 0 + else: + expected_warning = "Warning: Update definition 0 of function f_False has not been scheduled" + assert len(stdout_lines) > 0 + for line in stdout_lines: + assert line.startswith(expected_warning), "\n%s\n%s" % (line, expected_warning) if __name__ == "__main__": test_warnings() + test_unscheduled(True) + test_unscheduled(False) From 6bcb6955a9b24ad34f63e0749acced8609e90741 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 14 Dec 2023 16:27:56 -0800 Subject: [PATCH 020/186] Update Halide version in setup.py to 17.0.0 (#8010) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fa88f382a122..4939b88a3151 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name="halide", - version='16.0.0', + version='17.0.0', author="The Halide team", author_email="halide-dev@lists.csail.mit.edu", description="Halide is a programming language designed to make it easier " From 61b8d384b2b799cd47634e4a3b67aa7c7f580a46 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Tue, 19 Dec 2023 14:14:05 -0800 Subject: [PATCH 021/186] Scheduling directive to support ring buffering (#7967) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Half-plumbed * Revert "Half-plumbed" This reverts commit eb9dd02c6c607f0b49c95258ae67f58fe583ff44. * Interface for double buffer * Update Provides, Calls and Realizes for double buffering * Proper sync for double buffering * Use proper name for the semaphor and use correct initial value * Rename the class * Pass expression for index * Adds storage for double buffering index * Use a separate index to go through the double buffer * Failing test * Better handling of hoisted storage in all of the async-related passes * New test and clean-up the generated IR * More tests * Allow double buffering without async and add corresponding test * Filter out incorrect double_buffer schedules * Add tests to the cmake files * Clean up * Update the comment * Clean up * Clean up * Update serialization * complete_x86_target() should enable F16C and FMA when AVX2 is present (#7971) All known AVX2-enabled architectures definitely have these features. 
* Add two new tail strategies for update definitions (#7949) * Add two new tail strategies for update definitions * Stop printing asm * Update expected number of partitions for Partition::Always * Add a comment explaining why the blend safety check is per dimension * Add serialization support for the new tail strategies * trigger buildbots * Add comment --------- Co-authored-by: Steven Johnson * Add appropriate mattrs for arm-32 extensions (#7978) * Add appropriate mattrs for arm-32 extensions Fixes #7976 * Pull clauses out of if * Move canonical version numbers into source, not build system (#7980) (#7981) * Move canonical version numbers into source, not build system (#7980) * Fixes * Silence useless "Insufficient parallelism" autoscheduler warning (#7990) * Add a notebook with a visualization of the aprrox_* functions and their errors (#7974) * Add a notebook with a visualization of the aprrox_* functions and their errors * Fix spelling error * Make narrowing float->int casts on wasm go via wider ints (#7973) Fixes #7972 * Fix handling of assert statements whose conditions get vectorized (#7989) * Fix handling of assert statements whose conditions get vectorized * Fix test name * Fix all "unscheduled update()" warnings in our code (#7991) * Fix all "unscheduled update()" warnings in our code And also fix the Mullapudi scheduler to explicitly touch all update stages. This allows us to mark this warning as an error if we so choose. * fixes * fixes * Update recursive_box_filters.cpp * Silence useless 'Outer dim vectorization of var' warning in Mullapudi… (#7992) Silence useless 'Outer dim vectorization of var' warning in Mullapudi scheduler * Add a tutorial for async and double_buffer * Renamed double_buffer to ring_buffer * ring_buffer() now expects an extent Expr * Actually use extent for ring_buffer() * Address some of the comments * Provide an example of the code structure for producer-consumer async example * Comments updates * Fix clang-format and clang-tidy * Add Python binding for Func::ring_buffer() * Don't use a separate index for ring buffer + add a new test * Rename the tests * Clean up the old name * Add & * Move test to the right folder * Move expr * Add comments for InjectRingBuffering * Improve ring_buffer doc * Fix comments * Comments * A better error message * Mention that extent is expected to be a positive integer * Add another code structure and explain how the indices for ring buffer are computed * Expand test comments * Fix spelling --------- Co-authored-by: Steven Johnson Co-authored-by: Andrew Adams --- python_bindings/src/halide/halide_/PyFunc.cpp | 1 + src/AsyncProducers.cpp | 297 +++++++++++-- src/Deserialization.cpp | 2 + src/Func.cpp | 6 + src/Func.h | 15 + src/Schedule.cpp | 11 + src/Schedule.h | 3 + src/ScheduleFunctions.cpp | 4 + src/Serialization.cpp | 3 +- src/StorageFlattening.cpp | 9 +- src/halide_ir.fbs | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/ring_buffer.cpp | 414 ++++++++++++++++++ test/error/CMakeLists.txt | 1 + test/error/bad_ring_buffer.cpp | 23 + tutorial/CMakeLists.txt | 3 +- tutorial/lesson_24_async.cpp | 299 +++++++++++++ 17 files changed, 1045 insertions(+), 48 deletions(-) create mode 100644 test/correctness/ring_buffer.cpp create mode 100644 test/error/bad_ring_buffer.cpp create mode 100644 tutorial/lesson_24_async.cpp diff --git a/python_bindings/src/halide/halide_/PyFunc.cpp b/python_bindings/src/halide/halide_/PyFunc.cpp index b7e82900a6cf..bcc889b6d9ce 100644 --- a/python_bindings/src/halide/halide_/PyFunc.cpp +++ 
b/python_bindings/src/halide/halide_/PyFunc.cpp @@ -213,6 +213,7 @@ void define_func(py::module &m) { .def("store_at", (Func & (Func::*)(LoopLevel)) & Func::store_at, py::arg("loop_level")) .def("async_", &Func::async) + .def("ring_buffer", &Func::ring_buffer) .def("bound_storage", &Func::bound_storage) .def("memoize", &Func::memoize) .def("compute_inline", &Func::compute_inline) diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index f633409cce65..783f00dd35b1 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -73,6 +73,15 @@ class NoOpCollapsingMutator : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const Allocate *op) override { Stmt body = mutate(op->body); if (is_no_op(body)) { @@ -198,6 +207,9 @@ class GenerateProducerBody : public NoOpCollapsingMutator { if (starts_with(op->name, func + ".folding_semaphore.") && ends_with(op->name, ".head")) { // This is a counter associated with the producer side of a storage-folding semaphore. Keep it. return op; + } else if (starts_with(op->name, func + ".ring_buffer.")) { + // This is a counter associated with the producer side of a ring buffering. + return op; } else { return Evaluate::make(0); } @@ -243,8 +255,42 @@ class GenerateProducerBody : public NoOpCollapsingMutator { return op; } + Stmt visit(const Allocate *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + return Allocate::make(op->name, op->type, op->memory_type, + op->extents, op->condition, body, + op->new_expr, op->free_function, op->padding); + } + } + + Stmt visit(const Realize *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else { + inner_realizes.insert(op->name); + return Realize::make(op->name, op->types, op->memory_type, + op->bounds, op->condition, body); + } + } + + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + return body; + } else if (inner_realizes.count(op->name) == 0) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + map> &cloned_acquires; set inner_semaphores; + set inner_realizes; public: GenerateProducerBody(const string &f, const vector &s, map> &a) @@ -363,57 +409,78 @@ class ForkAsyncProducers : public IRMutator { const map &env; map> cloned_acquires; - - Stmt visit(const Realize *op) override { - auto it = env.find(op->name); - internal_assert(it != env.end()); - Function f = it->second; - if (f.schedule().async()) { - Stmt body = op->body; - - // Make two copies of the body, one which only does the - // producer, and one which only does the consumer. Inject - // synchronization to preserve dependencies. Put them in a - // task-parallel block. - - // Make a semaphore per consume node - CountConsumeNodes consumes(op->name); - body.accept(&consumes); - - vector sema_names; - vector sema_vars; - for (int i = 0; i < consumes.count; i++) { - sema_names.push_back(op->name + ".semaphore_" + std::to_string(i)); - sema_vars.push_back(Variable::make(type_of(), sema_names.back())); + std::set hoisted_storages; + + Stmt process_body(const string &name, Stmt body) { + // Make two copies of the body, one which only does the + // producer, and one which only does the consumer. Inject + // synchronization to preserve dependencies. Put them in a + // task-parallel block. 
+ + // Make a semaphore per consume node + CountConsumeNodes consumes(name); + body.accept(&consumes); + + vector sema_names; + vector sema_vars; + for (int i = 0; i < consumes.count; i++) { + sema_names.push_back(name + ".semaphore_" + std::to_string(i)); + sema_vars.push_back(Variable::make(type_of(), sema_names.back())); + } + + Stmt producer = GenerateProducerBody(name, sema_vars, cloned_acquires).mutate(body); + Stmt consumer = GenerateConsumerBody(name, sema_vars).mutate(body); + + // Recurse on both sides + producer = mutate(producer); + consumer = mutate(consumer); + + // Run them concurrently + body = Fork::make(producer, consumer); + + for (const string &sema_name : sema_names) { + // Make a semaphore on the stack + Expr sema_space = Call::make(type_of(), "halide_make_semaphore", + {0}, Call::Extern); + + // If there's a nested async producer, we may have + // recursively cloned this semaphore inside the mutation + // of the producer and consumer. + const vector &clones = cloned_acquires[sema_name]; + for (const auto &i : clones) { + body = CloneAcquire(sema_name, i).mutate(body); + body = LetStmt::make(i, sema_space, body); } - Stmt producer = GenerateProducerBody(op->name, sema_vars, cloned_acquires).mutate(body); - Stmt consumer = GenerateConsumerBody(op->name, sema_vars).mutate(body); - - // Recurse on both sides - producer = mutate(producer); - consumer = mutate(consumer); - - // Run them concurrently - body = Fork::make(producer, consumer); + body = LetStmt::make(sema_name, sema_space, body); + } - for (const string &sema_name : sema_names) { - // Make a semaphore on the stack - Expr sema_space = Call::make(type_of(), "halide_make_semaphore", - {0}, Call::Extern); + return body; + } - // If there's a nested async producer, we may have - // recursively cloned this semaphore inside the mutation - // of the producer and consumer. - const vector &clones = cloned_acquires[sema_name]; - for (const auto &i : clones) { - body = CloneAcquire(sema_name, i).mutate(body); - body = LetStmt::make(i, sema_space, body); - } + Stmt visit(const HoistedStorage *op) override { + hoisted_storages.insert(op->name); + Stmt body = op->body; - body = LetStmt::make(sema_name, sema_space, body); - } + auto it = env.find(op->name); + internal_assert(it != env.end()); + Function f = it->second; + if (f.schedule().async() && f.schedule().ring_buffer().defined()) { + body = process_body(op->name, body); + } else { + body = mutate(body); + } + hoisted_storages.erase(op->name); + return HoistedStorage::make(op->name, body); + } + Stmt visit(const Realize *op) override { + auto it = env.find(op->name); + internal_assert(it != env.end()); + Function f = it->second; + if (f.schedule().async() && hoisted_storages.count(op->name) == 0) { + Stmt body = op->body; + body = process_body(op->name, body); return Realize::make(op->name, op->types, op->memory_type, op->bounds, op->condition, body); } else { @@ -592,6 +659,117 @@ class TightenProducerConsumerNodes : public IRMutator { } }; +// Update indices to add ring buffer. 
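// (More precisely: UpdateIndices appends the computed ring-buffer slice index
// as an extra trailing coordinate on every Provide to, and every Halide Call
// of, the given Func, matching the extra storage dimension that
// InjectRingBuffering adds to the Realize bounds below.)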
+class UpdateIndices : public IRMutator { + using IRMutator::visit; + + Stmt visit(const Provide *op) override { + if (op->name == func_name) { + std::vector args = op->args; + args.push_back(ring_buffer_index); + return Provide::make(op->name, op->values, args, op->predicate); + } + return IRMutator::visit(op); + } + + Expr visit(const Call *op) override { + if (op->call_type == Call::Halide && op->name == func_name) { + std::vector args = op->args; + args.push_back(ring_buffer_index); + return Call::make(op->type, op->name, args, op->call_type, op->func, op->value_index, op->image, op->param); + } + return IRMutator::visit(op); + } + + std::string func_name; + Expr ring_buffer_index; + +public: + UpdateIndices(const string &fn, Expr di) + : func_name(fn), ring_buffer_index(std::move(di)) { + } +}; + +// Inject ring buffering. +class InjectRingBuffering : public IRMutator { + using IRMutator::visit; + + struct Loop { + std::string name; + Expr min; + Expr extent; + + Loop(std::string n, Expr m, Expr e) + : name(std::move(n)), min(std::move(m)), extent(std::move(e)) { + } + }; + + const map &env; + std::vector loops; + std::map hoist_storage_loop_index; + + Stmt visit(const Realize *op) override { + Stmt body = mutate(op->body); + Function f = env.find(op->name)->second; + Region bounds = op->bounds; + if (f.schedule().ring_buffer().defined()) { + // For the ring buffering we expand the storage by adding another dimension of + // the range of [0, ring_buffer.extent]. + bounds.emplace_back(0, f.schedule().ring_buffer()); + // Build an index for accessing ring buffer as a linear combination of all + // loop variables between the storage location (defined by the HoistStorage loop level) + // and corresponding Realize node. + int loop_index = hoist_storage_loop_index[op->name] + 1; + Expr current_index = Variable::make(Int(32), loops[loop_index].name); + while (++loop_index < (int)loops.size()) { + current_index = current_index * + (loops[loop_index].extent - loops[loop_index].min) + + Variable::make(Int(32), loops[loop_index].name); + } + current_index = current_index % f.schedule().ring_buffer(); + // Adds an extra index for to the all of the references of f. + body = UpdateIndices(op->name, current_index).mutate(body); + Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); + Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); + Stmt release = Evaluate::make(release_producer); + body = Block::make(body, release); + body = Acquire::make(sema_var, 1, body); + } + + return Realize::make(op->name, op->types, op->memory_type, bounds, op->condition, body); + } + + Stmt visit(const HoistedStorage *op) override { + // Store the index of the last loop we encountered. 
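        // (Illustrative sketch, not taken from the original patch: with a
        // schedule along the lines of
        //     producer.compute_at(consumer, x)
        //             .hoist_storage(consumer, xo)
        //             .ring_buffer(2)
        //             .async();
        // where the consumer's loop has been split into xo and x, the only loop
        // between this HoistedStorage node and the producer's Realize node is x,
        // so the index built in visit(Realize) above reduces to x % 2, and
        // successive iterations alternate between the two slices of the expanded
        // storage. With more loops in between it becomes a linear combination,
        // e.g. (outer * inner_extent + inner) % 2.)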
+ hoist_storage_loop_index[op->name] = loops.size() - 1; + Function f = env.find(op->name)->second; + + Stmt mutated = mutate(op->body); + mutated = HoistedStorage::make(op->name, mutated); + + if (f.schedule().ring_buffer().defined()) { + // Make a semaphore on the stack + Expr sema_space = Call::make(type_of(), "halide_make_semaphore", + {2}, Call::Extern); + mutated = LetStmt::make(f.name() + std::string(".folding_semaphore.ring_buffer"), sema_space, mutated); + } + hoist_storage_loop_index.erase(op->name); + return mutated; + } + + Stmt visit(const For *op) override { + loops.emplace_back(op->name, op->min, op->extent); + Stmt mutated = IRMutator::visit(op); + loops.pop_back(); + return mutated; + } + +public: + InjectRingBuffering(const map &e) + : env(e) { + } +}; + // Broaden the scope of acquire nodes to pack trailing work into the // same task and to potentially reduce the nesting depth of tasks. class ExpandAcquireNodes : public IRMutator { @@ -639,6 +817,18 @@ class ExpandAcquireNodes : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (const Acquire *a = body.as()) { + // Don't do the allocation until we have the + // semaphore. Reduces peak memory use. + return Acquire::make(a->semaphore, a->count, + mutate(HoistedStorage::make(op->name, a->body))); + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const LetStmt *op) override { Stmt orig = op; Stmt body; @@ -693,6 +883,9 @@ class TightenForkNodes : public IRMutator { const LetStmt *lr = rest.as(); const Realize *rf = first.as(); const Realize *rr = rest.as(); + const HoistedStorage *hf = first.as(); + const HoistedStorage *hr = rest.as(); + if (lf && lr && lf->name == lr->name && equal(lf->value, lr->value)) { @@ -707,6 +900,10 @@ class TightenForkNodes : public IRMutator { } else if (rr && !stmt_uses_var(first, rr->name)) { return Realize::make(rr->name, rr->types, rr->memory_type, rr->bounds, rr->condition, make_fork(first, rr->body)); + } else if (hf && !stmt_uses_var(rest, hf->name)) { + return HoistedStorage::make(hf->name, make_fork(rf->body, rest)); + } else if (hr && !stmt_uses_var(first, hr->name)) { + return HoistedStorage::make(hr->name, make_fork(first, hr->body)); } else { return Fork::make(first, rest); } @@ -740,6 +937,15 @@ class TightenForkNodes : public IRMutator { } } + Stmt visit(const HoistedStorage *op) override { + Stmt body = mutate(op->body); + if (in_fork && !stmt_uses_var(body, op->name)) { + return body; + } else { + return HoistedStorage::make(op->name, body); + } + } + Stmt visit(const LetStmt *op) override { Stmt body = mutate(op->body); if (in_fork && !stmt_uses_var(body, op->name)) { @@ -758,6 +964,7 @@ class TightenForkNodes : public IRMutator { Stmt fork_async_producers(Stmt s, const map &env) { s = TightenProducerConsumerNodes(env).mutate(s); + s = InjectRingBuffering(env).mutate(s); s = ForkAsyncProducers(env).mutate(s); s = ExpandAcquireNodes().mutate(s); s = TightenForkNodes().mutate(s); diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 90590d6f15af..33fa3b36e78e 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -1017,6 +1017,7 @@ FuncSchedule Deserializer::deserialize_func_schedule(const Serialize::FuncSchedu const auto memory_type = deserialize_memory_type(func_schedule->memory_type()); const auto memoized = func_schedule->memoized(); const auto async = func_schedule->async(); + const auto ring_buffer = deserialize_expr(func_schedule->ring_buffer_type(), 
func_schedule->ring_buffer()); const auto memoize_eviction_key = deserialize_expr(func_schedule->memoize_eviction_key_type(), func_schedule->memoize_eviction_key()); auto hl_func_schedule = FuncSchedule(); hl_func_schedule.store_level() = store_level; @@ -1029,6 +1030,7 @@ FuncSchedule Deserializer::deserialize_func_schedule(const Serialize::FuncSchedu hl_func_schedule.memory_type() = memory_type; hl_func_schedule.memoized() = memoized; hl_func_schedule.async() = async; + hl_func_schedule.ring_buffer() = ring_buffer; hl_func_schedule.memoize_eviction_key() = memoize_eviction_key; return hl_func_schedule; } diff --git a/src/Func.cpp b/src/Func.cpp index 8f46e7316531..978d2b19a436 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -2398,6 +2398,12 @@ Func &Func::async() { return *this; } +Func &Func::ring_buffer(Expr extent) { + invalidate_cache(); + func.schedule().ring_buffer() = std::move(extent); + return *this; +} + Stage Func::specialize(const Expr &c) { invalidate_cache(); return Stage(func, func.definition(), 0).specialize(c); diff --git a/src/Func.h b/src/Func.h index ccadef338c29..d4074ee18cc6 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2281,6 +2281,21 @@ class Func { */ Func &async(); + /** Expands the storage of the function by an extra dimension + * to enable ring buffering. For this to be useful the storage + * of the function has to be hoisted to an upper loop level using + * \ref Func::hoist_storage. The index for the new ring buffer dimension + * is calculated implicitly based on a linear combination of the all of + * the loop variables between hoist_storage and compute_at/store_at + * loop levels. Scheduling a function with ring_buffer increases the + * amount of memory required for this function by an *extent* times. + * ring_buffer is especially useful in combination with \ref Func::async, + * but can be used without it. + * + * The extent is expected to be a positive integer. + */ + Func &ring_buffer(Expr extent); + /** Bound the extent of a Func's storage, but not extent of its * compute. This can be useful for forcing a function's allocation * to be a fixed size, which often means it can go on the stack. diff --git a/src/Schedule.cpp b/src/Schedule.cpp index 4ebcccd5e1d8..a2a34f34862e 100644 --- a/src/Schedule.cpp +++ b/src/Schedule.cpp @@ -241,6 +241,8 @@ struct FuncScheduleContents { MemoryType memory_type = MemoryType::Auto; bool memoized = false; bool async = false; + // This is an extent of the ring buffer and expected to be a positive integer. + Expr ring_buffer; Expr memoize_eviction_key; FuncScheduleContents() @@ -362,6 +364,7 @@ FuncSchedule FuncSchedule::deep_copy( copy.contents->memoized = contents->memoized; copy.contents->memoize_eviction_key = contents->memoize_eviction_key; copy.contents->async = contents->async; + copy.contents->ring_buffer = contents->ring_buffer; // Deep-copy wrapper functions. 
for (const auto &iter : contents->wrappers) { @@ -405,6 +408,14 @@ bool FuncSchedule::async() const { return contents->async; } +Expr &FuncSchedule::ring_buffer() { + return contents->ring_buffer; +} + +Expr &FuncSchedule::ring_buffer() const { + return contents->ring_buffer; +} + std::vector &FuncSchedule::storage_dims() { return contents->storage_dims; } diff --git a/src/Schedule.h b/src/Schedule.h index 32a654228673..f32ce2265a0f 100644 --- a/src/Schedule.h +++ b/src/Schedule.h @@ -624,6 +624,9 @@ class FuncSchedule { bool &async(); bool async() const; + Expr &ring_buffer(); + Expr &ring_buffer() const; + /** The list and order of dimensions used to store this * function. The first dimension in the vector corresponds to the * innermost dimension for storage (i.e. which dimension is diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 9c5ca9095575..9525c9a07308 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -2249,6 +2249,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_ return true; } + if (f.schedule().ring_buffer().defined() && store_at == hoist_storage_at) { + user_error << "Func \"" << f.name() << "\" is scheduled with ring_buffer(), but has matching store_at and hoist_storage levels. Add an explicit hoist_storage directive to the schedule to fix the issue.\n"; + } + vector &sites = legal.sites_allowed; int store_idx = -1, compute_idx = -1, hoist_storage_idx = -1; for (size_t i = 0; i < sites.size(); i++) { diff --git a/src/Serialization.cpp b/src/Serialization.cpp index a9342d95ba6d..f8be69271ff0 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1117,6 +1117,7 @@ Offset Serializer::serialize_func_schedule(FlatBufferBu const Serialize::MemoryType memory_type = serialize_memory_type(func_schedule.memory_type()); const auto memoized = func_schedule.memoized(); const auto async = func_schedule.async(); + const auto ring_buffer = serialize_expr(builder, func_schedule.ring_buffer()); const auto memoize_eviction_key_serialized = serialize_expr(builder, func_schedule.memoize_eviction_key()); return Serialize::CreateFuncSchedule(builder, store_level_serialized, compute_level_serialized, hoist_storage_level_serialized, @@ -1124,7 +1125,7 @@ Offset Serializer::serialize_func_schedule(FlatBufferBu builder.CreateVector(bounds_serialized), builder.CreateVector(estimates_serialized), builder.CreateVector(wrappers_serialized), - memory_type, memoized, async, + memory_type, memoized, async, ring_buffer.first, ring_buffer.second, memoize_eviction_key_serialized.first, memoize_eviction_key_serialized.second); } diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 5d16d02d7ab4..223a33837c7a 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -217,10 +217,12 @@ class FlattenDimensions : public IRMutator { vector allocation_extents(extents.size()); vector storage_permutation; vector bound_asserts; + bool is_ring_buffered = false; { auto iter = env.find(op->name); internal_assert(iter != env.end()) << "Realize node refers to function not in environment.\n"; Function f = iter->second.first; + is_ring_buffered = f.schedule().ring_buffer().defined(); const vector &storage_dims = f.schedule().storage_dims(); const vector &args = f.args(); for (size_t i = 0; i < storage_dims.size(); i++) { @@ -251,6 +253,10 @@ class FlattenDimensions : public IRMutator { } internal_assert(storage_permutation.size() == i + 1); } + if (is_ring_buffered) { + 
storage_permutation.push_back(storage_dims.size()); + allocation_extents[storage_dims.size()] = extents[storage_dims.size()]; + } } internal_assert(storage_permutation.size() == op->bounds.size()); @@ -279,13 +285,13 @@ class FlattenDimensions : public IRMutator { builder.host = Variable::make(Handle(), op->name); builder.type = op->types[0]; builder.dimensions = dims; + for (int i = 0; i < dims; i++) { builder.mins.push_back(min_var[i]); builder.extents.push_back(extent_var[i]); builder.strides.push_back(stride_var[i]); } stmt = LetStmt::make(op->name + ".buffer", builder.build(), stmt); - if (hoisted_storages_map.count(op->name) > 0) { HoistedStorageData &hoisted_storage_data = hoisted_storages[hoisted_storages_map[op->name]]; vector bounded_extents; @@ -336,6 +342,7 @@ class FlattenDimensions : public IRMutator { stmt = LetStmt::make(min_name[i - 1], op->bounds[i - 1].min, stmt); stmt = LetStmt::make(extent_name[i - 1], extents[i - 1], stmt); } + return stmt; } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index fe52231ffc49..e5855e301d1e 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -521,6 +521,7 @@ table FuncSchedule { memory_type: MemoryType = Auto; memoized: bool; async: bool; + ring_buffer: Expr; memoize_eviction_key: Expr; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4ee9f57480dc..07921a347425 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -84,6 +84,7 @@ tests(GROUPS correctness dilate3x3.cpp div_by_zero.cpp div_round_to_zero.cpp + ring_buffer.cpp dynamic_allocation_in_gpu_kernel.cpp dynamic_reduction_bounds.cpp early_out.cpp diff --git a/test/correctness/ring_buffer.cpp b/test/correctness/ring_buffer.cpp new file mode 100644 index 000000000000..4cb6eb9ac4e0 --- /dev/null +++ b/test/correctness/ring_buffer.cpp @@ -0,0 +1,414 @@ +#include "Halide.h" + +using namespace Halide; + +int main(int argc, char **argv) { + if (get_jit_target_from_environment().arch == Target::WebAssembly) { + printf("[SKIP] WebAssembly does not support async() yet.\n"); + return 0; + } + + // Double-buffer a tile of producer computed as async. + { + Func producer("producer"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async, but the storage moved to the outside. 
+ { + Func producer("producer"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage_root() + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async with multiple intermediate consumers. + { + Func producer("producer"), consumer("consumer"), interm1("interm1"), interm2("interm2"), interm3("interm3"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + interm1(x, y) = producer(x - 1, y - 1); + interm2(x, y) = producer(x, y); + interm3(x, y) = producer(x + 1, y + 1); + + consumer(x, y) = interm1(x, y) + interm2(x, y) + interm3(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + interm1 + .compute_at(consumer, xo); + interm2 + .compute_at(consumer, xo); + interm3 + .compute_at(consumer, xo); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Double-buffer a tile of producer computed as async with multiple intermediate consumers and output consumer. + { + Func producer("producer"), consumer("consumer"), interm1("interm1"), interm2("interm2"), interm3("interm3"); + Var x, y, xo, yo, xi, yi; + + producer(x, y) = x + y; + interm1(x, y) = producer(x - 1, y - 1); + interm2(x, y) = producer(x, y); + interm3(x, y) = producer(x + 1, y + 1); + + consumer(x, y) = interm1(x, y) + interm2(x, y) + interm3(x, y) + producer(x, y + 2); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + interm1 + .compute_at(consumer, xo); + interm2 + .compute_at(consumer, xo); + interm3 + .compute_at(consumer, xo); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 3 * (x + y) + x + y + 2; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with double buffering and one consumer. 
+ { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x * y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y) + x * y; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with double buffering at different storage levels and one consumer. + { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x * y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + producer1 + .compute_at(consumer, xo) + .hoist_storage_root() + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y) + x * y; + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two async producers with ring buffers and two consumers. + { + Func producer1("producer1"), producer2("producer2"), interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(5) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Three async producers with ring buffers and two consumers. 
+ { + Func producer1("producer1"), producer2("producer2"), producer3("producer3"); + Func interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + producer3(x, y) = x * y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y) + producer3(x - 1, y - 1); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1) + producer3(x + 1, y + 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo) + // Let's hoist storage of this consumer to make it more complicated. + .hoist_storage(consumer, yo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(3) + .async(); + + producer3 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(4) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y) + ((x - 1) * (y - 1)) + ((x + 1) * (y + 1)); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Two non-async ring-buffered producers and two consumers. + { + Func producer1("producer1"), producer2("producer2"), producer3("producer3"); + Func interm1("interm1"), interm2("interm2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + producer3(x, y) = x * y; + interm1(x, y) = producer1(x - 1, y + 1) + producer2(x, y) + producer3(x - 1, y - 1); + interm2(x, y) = producer1(x, y) + producer2(x + 1, y - 1) + producer3(x + 1, y + 1); + consumer(x, y) = interm1(x, y) + interm2(x, y); + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + interm1 + .compute_at(consumer, xo); + + interm2 + .compute_at(consumer, xo) + // Let's hoist storage of this consumer to make it more complicated. + .hoist_storage(consumer, yo); + + // Extents for ring_buffer() below are random to test various cases. + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(3); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2); + + producer3 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(4); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 4 * (x + y) + ((x - 1) * (y - 1)) + ((x + 1) * (y + 1)); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + // Chain of two async double-buffered producers and consumer. 
+ { + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + Var x, y, xo, yo, xi, yi; + + producer1(x, y) = x + y; + producer2(x, y) = producer1(x, y) + x * y; + consumer(x, y) = producer2(x, y) * 2; + + consumer + .compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + producer1 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + producer2 + .compute_at(consumer, xo) + .hoist_storage(consumer, yo) + .ring_buffer(2) + .async(); + + Buffer out = consumer.realize({128, 128}); + + out.for_each_element([&](int x, int y) { + int correct = 2 * (x + y + x * y); + if (out(x, y) != correct) { + printf("out(%d, %d) = %d instead of %d\n", + x, y, out(x, y), correct); + exit(1); + } + }); + } + + printf("Success!\n"); + return 0; +} \ No newline at end of file diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index ef4f5ffea614..52a2a01cd65e 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -20,6 +20,7 @@ tests(GROUPS error bad_const_cast.cpp bad_device_api.cpp bad_dimensions.cpp + bad_ring_buffer.cpp bad_extern_split.cpp bad_fold.cpp bad_host_alignment.cpp diff --git a/test/error/bad_ring_buffer.cpp b/test/error/bad_ring_buffer.cpp new file mode 100644 index 000000000000..ffd06ef9d075 --- /dev/null +++ b/test/error/bad_ring_buffer.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + Func f("f"), g("g"), h("h"); + Var x("x"), y("y"); + + f(x) = x; + g(x) = f(x); + h(x, y) = g(x); + + g.compute_at(h, y); + + // ring_buffer() requires an explicit hoist_storage(). + f.compute_root().ring_buffer(2); + + h.realize({10, 10}); + + printf("Success!\n"); + return 0; +} diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt index 862db3db6bd3..ee81fcb7a545 100644 --- a/tutorial/CMakeLists.txt +++ b/tutorial/CMakeLists.txt @@ -210,6 +210,7 @@ if (TARGET Halide::Mullapudi2016) set_tests_properties(tutorial_lesson_21_auto_scheduler_run PROPERTIES LABELS "tutorial;multithreaded") endif () -# Lessons 22-23 +# Lessons 22-24 add_tutorial(lesson_22_jit_performance.cpp) add_tutorial(lesson_23_serialization.cpp WITH_IMAGE_IO) +add_tutorial(lesson_24_async.cpp) diff --git a/tutorial/lesson_24_async.cpp b/tutorial/lesson_24_async.cpp new file mode 100644 index 000000000000..191350cf5012 --- /dev/null +++ b/tutorial/lesson_24_async.cpp @@ -0,0 +1,299 @@ +// Halide tutorial lesson 24: Async execution + +// This lesson demonstrates how to asynchronously execute a function +// using scheduling directives 'async' and 'ring_buffer'. + +// On linux, you can compile and run it like so: +// g++ lesson_24*.cpp -g -I -L -lHalide -lpthread -ldl -o lesson_24 -std=c++17 +// LD_LIBRARY_PATH= ./lesson_24 + +// On os x: +// g++ lesson_24*.cpp -g -I -L -lHalide -o lesson_24 -std=c++17 +// DYLD_LIBRARY_PATH= ./lesson_24 + +// If you have the entire Halide source tree, you can also build it by +// running: +// make tutorial_lesson_24_async +// in a shell with the current directory at the top of the halide +// source tree. + +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + // Declare some Vars to use below. + Var x("x"), y("y"), c("c"), xo("xo"), yo("yo"), xi("xi"), yi("yi"), tile("tile"); + + { + // In this example we simply tell Halide to run `producer` in a + // separate thread. This is not very useful on its own, but is a good start + // for the next examples. 
+ Func producer("producer"), consumer("consumer"); + + producer(x, y) = x + y; + consumer(x, y) = producer(x - 1, y - 1) + producer(x, y) + producer(x + 1, y + 1); + + consumer.compute_root(); + // Use async() to produce `producer` in a separate thread. + producer.compute_root().async(); + + // The high-level structure of the generated code will be: + // { + // allocate producer[...] + // thread #1 { + // produce producer { + // ... + // } + // signal that data is ready + // } + // thread #2 { + // consume producer { + // block until producer data is ready + // produce consumer { + // ... + // } + // } + // } + // } + consumer.realize({128, 128}); + } + + { + // Now let's use async() to execute two different producers simultaneously. + // This could be useful in various scenarios when you want to overlap + // computations of different functions in time. For example, you could execute + // producer1 and producer2 on different devices in parallel (e.g producer1 on CPU + // and producer2 on GPU). + Func producer1("producer1"), producer2("producer2"), consumer("consumer"); + + producer1(x, y) = x + y; + producer2(x, y) = x + y; + consumer(x, y) = producer1(x - 1, y - 1) + producer2(x, y) + producer1(x + 1, y + 1); + + // With the schedule below, `producer1` and `producer2` computations will be each + // launched in separate threads. Since `consumer` depends on both of them, and producers + // are scheduled as compute_root(), `consumer` will have to wait until `producer1` and + // `producer2` fully completed their work. The required synchronization primitives + // will be added between producers and `consumer` to ensure that it's safe for `consumer` + // to start its work and input data is fully ready. + consumer.compute_root(); + producer1.compute_root().async(); + producer2.compute_root().async(); + + // The high-level structure of the generated code will be: + // { + // allocate producer1[...] + // allocate producer2[...] + // thread #1 { + // produce producer1 { + // ... + // } + // signal that producer1 data is ready + // } + // thread #2 { + // produce producer2 { + // ... + // } + // signal that producer2 data is ready + // } + // thread #3 { + // consume producer1 { + // consume producer2 { + // block until producer1 data is ready + // block until producer2 data is ready + // produce consumer { + // ... + // } + // } + // } + // } + // } + consumer.realize({128, 128}); + } + + { + // In the previous example, we managed to run two producers in parallel, but `consumer` had + // to wait until the data is fully ready. Wouldn't it be great if we could overlap computations + // of `producer` and `consumer` too? This computational pattern is known as 'double buffering' and + // can be critical for achieving good performance in certain scenarios. The high-level idea is that + // producer is allowed to run ahead and do the next chunk of work without waiting while consumer + // is processing the current chunk. The obvious drawback of this method is that it requires twice + // as much memory for `producer`. + Func producer("producer"), consumer("consumer"); + + producer(x, y, c) = (x + y) * (c + 1); + consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c); + + consumer.compute_root(); + + // In this example the planes are processed separately, so producer can run ahead + // and start producing plane `c + 1`, while `consumer` consumes already produced plane `c`. 
+        // One way to express this with a Halide schedule is very similar to how sliding window
+        // schedules are expressed (see lesson_8 for details). There are indeed a lot of commonalities
+        // between the two, because both of them rely on a circular buffer as the underlying data structure.
+        producer
+            .async()
+            .compute_at(consumer, c)
+            // fold_storage requires store_at, which is separate from compute_at.
+            .store_at(consumer, Var::outermost())
+            // An explicit fold_storage is required here, because otherwise Halide will infer that only
+            // one plane of `producer` is necessary for `consumer`, but for the purposes of this
+            // example we want at least 2.
+            // Please note that adding fold_storage(c, 2) will double the amount of storage allocated
+            // for `producer`.
+            .fold_storage(c, 2);
+
+        // The high-level structure of the generated code will be:
+        // {
+        //   allocate producer1[extent.x, extent.y, 2]
+        //   // In this case there are two semaphores, because the producer can run ahead, so we need
+        //   // to track how much was consumed and produced separately.
+        //   // This semaphore indicates how much the producer has produced.
+        //   producer1.semaphore = 0
+        //   // This semaphore indicates how much `space` for the producer is available.
+        //   producer1.folding_semaphore = 2
+        //   thread #1 {
+        //     loop over c {
+        //       // Acquire a semaphore or block until the space to produce to is available.
+        //       // The semaphore is released by the consumer thread when the data has been fully
+        //       // consumed.
+        //       acquire(producer1.folding_semaphore, 1)
+        //       produce producer1 {
+        //         // Produce the next plane of producer1 and store it at index c % 2.
+        //         producer1[_, _, c % 2] = ...
+        //         // Release a semaphore to indicate that the plane was produced; the consumer will
+        //         // acquire this semaphore in the other thread.
+        //         release(producer1.semaphore)
+        //       }
+        //     }
+        //   }
+        //   thread #2 {
+        //     loop over c {
+        //       // Acquire a semaphore or block until the data from the producer is ready.
+        //       // The semaphore is released by the producer thread when the data has been fully
+        //       // produced.
+        //       acquire(producer1.semaphore, 1)
+        //       consume producer1 {
+        //         consumer[_, _, c] = ...
+        //         // Release a semaphore to indicate that the plane was consumed; the producer will
+        //         // acquire this semaphore in the other thread.
+        //         release(producer1.folding_semaphore)
+        //       }
+        //     }
+        //   }
+        // }
+        consumer.realize({128, 128, 4});
+    }
+
+    {
+        // In the previous example, we relied on storage folding to express the double-buffering
+        // technique, but there is another, more direct way to do that.
+        Func producer("producer"), consumer("consumer");
+
+        producer(x, y, c) = (x + y) * (c + 1);
+        consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
+
+        consumer.compute_root();
+
+        // As mentioned in the previous example, the planes are processed separately, so the producer can
+        // run ahead and start producing plane `c + 1`, while `consumer` consumes the already produced
+        // plane `c`. A more direct way to express this would be to hoist the storage of `producer`
+        // outside of the loop `c` over planes, double its size, and add the necessary indices to flip
+        // the planes. The first part can be achieved with the `hoist_storage` directive and the rest is
+        // done with `ring_buffer`.
+        // Please note that it's enough to provide only the extent of the ring buffer; there is no
+        // need to specify an explicit loop level to tie the ring buffer to, because the index for the
+        // ring buffer will be implicitly computed based on a linear combination of loop variables
+        // between the storage and compute_at/store_at levels.
+        producer
+            .async()
+            .compute_at(consumer, c)
+            .hoist_storage(consumer, Var::outermost())
+            // Similarly to the previous example, the amount of storage is doubled here.
+            .ring_buffer(2);
+
+        // The high-level structure of the generated code will be very similar to the previous example.
+        consumer.realize({128, 128, 4});
+    }
+
+    {
+        // The advantage of the `hoist_storage` + `ring_buffer` approach is that it can be applied to
+        // fairly arbitrary loop splits and tilings. For example, in the following schedule, instead of
+        // double buffering over whole planes, we double buffer over sub-regions or tiles of the planes.
+        // This is not possible to achieve with fold_storage, because it works over the *storage*
+        // dimensions of the function and not the loop splits.
+        Func producer("producer"), consumer("consumer");
+
+        producer(x, y, c) = (x + y) * (c + 1);
+        consumer(x, y, c) = producer(x - 1, y - 1, c) + producer(x, y, c) + producer(x + 1, y + 1, c);
+
+        consumer.compute_root()
+            .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::Auto);
+
+        producer
+            .async()
+            .compute_at(consumer, xo)
+            .hoist_storage(consumer, Var::outermost())
+            .ring_buffer(2);
+
+        // The high-level structure of the generated code will be:
+        // {
+        //   // The size of the tile (16, 16, 1) + extra to accommodate a 3x3 filter. The fourth
+        //   // dimension is added by the ring_buffer() directive.
+        //   allocate producer1[18, 18, 1, 2]
+        //   // In this case there are two semaphores, because the producer can run ahead, so we need
+        //   // to track how much was consumed and produced separately.
+        //   // This semaphore indicates how much the producer has produced.
+        //   producer1.semaphore = 0
+        //   // This semaphore indicates how much `space` for the producer is available.
+        //   producer1.folding_semaphore.ring_buffer = 2
+        //   thread #1 {
+        //     loop over c {
+        //       loop over yo {
+        //         loop over xo {
+        //           // Acquire a semaphore or block until the space to produce to is available.
+        //           // The semaphore is released by the consumer thread when the data has been fully
+        //           // consumed.
+        //           acquire(producer1.folding_semaphore.ring_buffer, 1)
+        //           produce producer1 {
+        //             // The index into the ring buffer is computed as a linear combination of all the
+        //             // loop variables up to the storage level.
+        //             ring_buffer_index = (linear combination of loop variables) % 2
+        //             // Produce the next tile of producer1 and store it at index ring_buffer_index.
+        //             producer1[x, y, 0, ring_buffer_index % 2] = ...
+        //             // Release a semaphore to indicate that the tile was produced; the consumer will
+        //             // acquire this semaphore in the other thread.
+        //             release(producer1.semaphore)
+        //           }
+        //         }
+        //       }
+        //     }
+        //   }
+        //   thread #2 {
+        //     loop over c {
+        //       loop over yo {
+        //         loop over xo {
+        //           // Acquire a semaphore or block until the data from the producer is ready.
+        //           // The semaphore is released by the producer thread when the data has been fully
+        //           // produced.
+        //           acquire(producer1.semaphore, 1)
+        //           consume producer1 {
+        //             ring_buffer_index = (linear combination of loop variables) % 2
+        //             consumer[_, _, c] = ...
+        //             // Release a semaphore to indicate that the tile was consumed; the producer will
+        //             // acquire this semaphore in the other thread.
+        //             release(producer1.folding_semaphore.ring_buffer)
+        //           }
+        //         }
+        //       }
+        //     }
+        //   }
+        // }
+        consumer.realize({128, 128, 4});
+    }
+
+    printf("Success!\n");
+
+    return 0;
+}

From 6f26b044276083f172d8319fb9876d2eb80d2acd Mon Sep 17 00:00:00 2001
From: Tyler Hou
Date: Tue, 2 Jan 2024 13:27:51 -0500
Subject: [PATCH 022/186] Change startswith -> starts_with (#8013)

startswith was deprecated in llvm/llvm-project#75491, which means that Halide
fails to compile using LLVM 18 (deprecation warning).
---
 src/CodeGen_LLVM.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 18e70dfb3d87..f319f204de9f 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -1231,13 +1231,13 @@ void CodeGen_LLVM::optimize_module() {
         // Do not annotate any of Halide's low-level synchronization code as it has
         // tsan interface calls to mark its behavior and is much faster if
         // it is not analyzed instruction by instruction.
-        if (!(function.getName().startswith("_ZN6Halide7Runtime8Internal15Synchronization") ||
+        if (!(function.getName().starts_with("_ZN6Halide7Runtime8Internal15Synchronization") ||
              // TODO: this is a benign data race that re-initializes the detected features;
             // we should really fix it properly inside the implementation, rather than disabling
              // it here as a band-aid.
-             function.getName().startswith("halide_default_can_use_target_features") ||
-             function.getName().startswith("halide_mutex_") ||
-             function.getName().startswith("halide_cond_"))) {
+             function.getName().starts_with("halide_default_can_use_target_features") ||
+             function.getName().starts_with("halide_mutex_") ||
+             function.getName().starts_with("halide_cond_"))) {
             function.addFnAttr(Attribute::SanitizeThread);
         }
     }

From 8024bdc9050c52b13e901355c6944fc26aa27874 Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko
Date: Tue, 2 Jan 2024 14:52:53 -0800
Subject: [PATCH 023/186] Don't add ring_buffer semaphores if the function is not scheduled as async (#8015)

Don't add ring_buffer semaphores if the function is not scheduled as async

Co-authored-by: Steven Johnson
---
 src/AsyncProducers.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp
index 783f00dd35b1..92012ccfe4c1 100644
--- a/src/AsyncProducers.cpp
+++ b/src/AsyncProducers.cpp
@@ -729,11 +729,14 @@ class InjectRingBuffering : public IRMutator {
             current_index = current_index % f.schedule().ring_buffer();
             // Adds an extra index for to the all of the references of f.
body = UpdateIndices(op->name, current_index).mutate(body); - Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); - Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); - Stmt release = Evaluate::make(release_producer); - body = Block::make(body, release); - body = Acquire::make(sema_var, 1, body); + + if (f.schedule().async()) { + Expr sema_var = Variable::make(type_of(), f.name() + ".folding_semaphore.ring_buffer"); + Expr release_producer = Call::make(Int(32), "halide_semaphore_release", {sema_var, 1}, Call::Extern); + Stmt release = Evaluate::make(release_producer); + body = Block::make(body, release); + body = Acquire::make(sema_var, 1, body); + } } return Realize::make(op->name, op->types, op->memory_type, bounds, op->condition, body); @@ -747,7 +750,7 @@ class InjectRingBuffering : public IRMutator { Stmt mutated = mutate(op->body); mutated = HoistedStorage::make(op->name, mutated); - if (f.schedule().ring_buffer().defined()) { + if (f.schedule().async() && f.schedule().ring_buffer().defined()) { // Make a semaphore on the stack Expr sema_space = Call::make(type_of(), "halide_make_semaphore", {2}, Call::Extern); From d2da00705ceb511fe69837cceab848f124d957ec Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 3 Jan 2024 20:05:37 +0000 Subject: [PATCH 024/186] Fix for top-of-tree LLVM (Fix #8017) (#8018) Fix for top-of-tree LLVM --- src/CodeGen_LLVM.cpp | 4 ++++ src/CodeGen_PTX_Dev.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index f319f204de9f..7b9eecd3d74e 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1244,7 +1244,11 @@ void CodeGen_LLVM::optimize_module() { } if (tm) { +#if LLVM_VERSION >= 180 + tm->registerPassBuilderCallbacks(pb, /*PopulateClassToPassNames=*/false); +#else tm->registerPassBuilderCallbacks(pb); +#endif } mpm = pb.buildPerModuleDefaultPipeline(level, debug_pass_manager); diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 2a47e591c503..6be2f1b7e988 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -700,7 +700,11 @@ vector CodeGen_PTX_Dev::compile_to_src() { using OptimizationLevel = llvm::OptimizationLevel; OptimizationLevel level = OptimizationLevel::O3; +#if LLVM_VERSION >= 180 + target_machine->registerPassBuilderCallbacks(pb, /*PopulateClassToPassNames=*/false); +#else target_machine->registerPassBuilderCallbacks(pb); +#endif mpm = pb.buildPerModuleDefaultPipeline(level, debug_pass_manager); mpm.run(*module, mam); From b661c8d79fa92c7deb99e4611dc4f536ea435102 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Wed, 3 Jan 2024 17:49:56 -0800 Subject: [PATCH 025/186] Quick fix for crash that is occurring in SVE2 tests. (#8020) Broken out into separate PR for ease of review and isolated test/tracking. 
--- test/correctness/simd_op_check.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index d97f2c72b90c..7b1057b7f3ea 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -202,7 +202,8 @@ class SimdOpCheckTest { void visit(const Internal::Call *op) override { if (op->call_type == Internal::Call::Halide) { Internal::Function f(op->func); - if (f.has_update_definition()) { + if (f.has_update_definition() && + f.update(0).schedule().rvars().size() > 0) { inline_reduction = f; result = true; } From daf011d9739d1318fd4b10250583cf15ffc611d4 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Jan 2024 17:04:18 +0000 Subject: [PATCH 026/186] Don't use variable-length arrays (#8021) There was a rogue use of VLAs (an extension we don't want to use) in one of the runtime tests. Fixed the test. I'll follow up with a separate PR to ensure this warning is enabled everywhere to flush out other usages. --- test/runtime/memory_arena.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtime/memory_arena.cpp b/test/runtime/memory_arena.cpp index f4d4b853eddf..3189b401c6b0 100644 --- a/test/runtime/memory_arena.cpp +++ b/test/runtime/memory_arena.cpp @@ -44,7 +44,7 @@ int main(int argc, char **argv) { MemoryArena::Config config = {sizeof(double), 32, 0}; MemoryArena *arena = MemoryArena::create(user_context, config, test_allocator); - size_t count = 4 * 1024; + constexpr size_t count = 4 * 1024; void *pointers[count]; for (size_t n = 0; n < count; ++n) { pointers[n] = arena->reserve(user_context, true); @@ -75,7 +75,7 @@ int main(int argc, char **argv) { arena.destroy(user_context); - size_t count = 4 * 1024; + constexpr size_t count = 4 * 1024; void *pointers[count]; for (size_t n = 0; n < count; ++n) { pointers[n] = arena.reserve(user_context, true); From 21accaddc5718830f77ec2ea1afa5a624edd08b0 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Jan 2024 17:04:34 +0000 Subject: [PATCH 027/186] Set warnings on tests as well as src (#8022) * Don't use variable-length arrays There was a rogue use of VLAs (an extension we don't want to use) in one of the runtime tests. Fixed the test. I'll follow up with a separate PR to ensure this warning is enabled everywhere to flush out other usages. 
* Set warnings on tests as well as src --- CMakeLists.txt | 74 +++++++++++++++++++++++++++++++++++ cmake/HalideTestHelpers.cmake | 1 + src/CMakeLists.txt | 68 +------------------------------- test/runtime/CMakeLists.txt | 1 + 4 files changed, 77 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a916bba26f3..6be8ece13282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,80 @@ if (TARGET_VULKAN) set(TARGET_SPIRV ON) # required endif() +# Helper function to set C++ compiler warnings in a sane way +function(set_halide_compiler_warnings NAME) + target_compile_options( + ${NAME} + PRIVATE + $<$:-Wall> + + # variable length arrays in C++ are a Clang extension, we don't want to use them + $<$:-Wvla-extension> + + $<$:-Wcast-qual> + $<$:-Wignored-qualifiers> + $<$:-Woverloaded-virtual> + + $<$:-Wsuggest-override> + + $<$:-Winconsistent-missing-destructor-override> + $<$:-Winconsistent-missing-override> + $<$:-Wdeprecated-declarations> + + $<$:-Wno-double-promotion> + $<$:-Wno-float-conversion> + $<$:-Wno-float-equal> + $<$:-Wno-missing-field-initializers> + $<$:-Wno-old-style-cast> + $<$:-Wno-shadow> + $<$:-Wno-sign-conversion> + $<$:-Wno-switch-enum> + $<$:-Wno-undef> + $<$:-Wno-unused-function> + $<$:-Wno-unused-macros> + $<$:-Wno-unused-parameter> + + $<$:-Wno-c++98-compat-pedantic> + $<$:-Wno-c++98-compat> + $<$:-Wno-cast-align> + $<$:-Wno-comma> + $<$:-Wno-covered-switch-default> + $<$:-Wno-documentation-unknown-command> + $<$:-Wno-documentation> + $<$:-Wno-exit-time-destructors> + $<$:-Wno-global-constructors> + $<$:-Wno-implicit-float-conversion> + $<$:-Wno-implicit-int-conversion> + $<$:-Wno-implicit-int-float-conversion> + $<$:-Wno-missing-prototypes> + $<$:-Wno-nonportable-system-include-path> + $<$:-Wno-reserved-id-macro> + $<$:-Wno-shadow-field-in-constructor> + $<$:-Wno-shadow-field> + $<$:-Wno-shorten-64-to-32> + $<$:-Wno-undefined-func-template> + $<$:-Wno-unused-member-function> + $<$:-Wno-unused-template> + + # This warning was removed in Clang 13 + $<$,$,13.0>>:-Wno-return-std-move-in-c++11> + + $<$:/W3> + $<$:/wd4018> # 4018: disable "signed/unsigned mismatch" + $<$:/wd4141> # 4141: 'inline' used more than once + $<$:/wd4146> # 4146: unary minus applied to unsigned type + $<$:/wd4244> # 4244: conversion, possible loss of data + $<$:/wd4267> # 4267: conversion from 'size_t' to 'int', possible loss of data + $<$:/wd4291> # 4291: No matching operator delete found + $<$:/wd4503> # 4503: disable "decorated name length exceeded, name was truncated" + $<$:/wd4800> # 4800: forcing value to bool 'true' or 'false' (performance warning) + + # No: enable deprecation warnings + # $<$:/wd4996> # 4996: compiler encountered deprecated declaration + ) +endfunction() + + ## # Import dependencies ## diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index e938d11d53ec..b6b9b70551ff 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -54,6 +54,7 @@ function(add_halide_test TARGET) add_test(NAME ${TARGET} COMMAND ${args_COMMAND} ${args_ARGS} WORKING_DIRECTORY "${args_WORKING_DIRECTORY}") + set_halide_compiler_warnings(${TARGET}) # We can't add Halide::TerminateHandler here, because it requires Halide::Error # and friends to be present in the final linkage, but some callers of add_halide_test() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 390fee9a64e5..cfb092d29bf0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -575,73 +575,7 @@ endif () ## # Set compiler options for libHalide 
## - -target_compile_options( - Halide - PRIVATE - $<$:-Wall> - - $<$:-Wcast-qual> - $<$:-Wignored-qualifiers> - $<$:-Woverloaded-virtual> - - $<$:-Wsuggest-override> - - $<$:-Winconsistent-missing-destructor-override> - $<$:-Winconsistent-missing-override> - $<$:-Wdeprecated-declarations> - - $<$:-Wno-double-promotion> - $<$:-Wno-float-conversion> - $<$:-Wno-float-equal> - $<$:-Wno-missing-field-initializers> - $<$:-Wno-old-style-cast> - $<$:-Wno-shadow> - $<$:-Wno-sign-conversion> - $<$:-Wno-switch-enum> - $<$:-Wno-undef> - $<$:-Wno-unused-function> - $<$:-Wno-unused-macros> - $<$:-Wno-unused-parameter> - - $<$:-Wno-c++98-compat-pedantic> - $<$:-Wno-c++98-compat> - $<$:-Wno-cast-align> - $<$:-Wno-comma> - $<$:-Wno-covered-switch-default> - $<$:-Wno-documentation-unknown-command> - $<$:-Wno-documentation> - $<$:-Wno-exit-time-destructors> - $<$:-Wno-global-constructors> - $<$:-Wno-implicit-float-conversion> - $<$:-Wno-implicit-int-conversion> - $<$:-Wno-implicit-int-float-conversion> - $<$:-Wno-missing-prototypes> - $<$:-Wno-nonportable-system-include-path> - $<$:-Wno-reserved-id-macro> - $<$:-Wno-shadow-field-in-constructor> - $<$:-Wno-shadow-field> - $<$:-Wno-shorten-64-to-32> - $<$:-Wno-undefined-func-template> - $<$:-Wno-unused-member-function> - $<$:-Wno-unused-template> - - # This warning was removed in Clang 13 - $<$,$,13.0>>:-Wno-return-std-move-in-c++11> - - $<$:/W3> - $<$:/wd4018> # 4018: disable "signed/unsigned mismatch" - $<$:/wd4141> # 4141: 'inline' used more than once - $<$:/wd4146> # 4146: unary minus applied to unsigned type - $<$:/wd4244> # 4244: conversion, possible loss of data - $<$:/wd4267> # 4267: conversion from 'size_t' to 'int', possible loss of data - $<$:/wd4291> # 4291: No matching operator delete found - $<$:/wd4503> # 4503: disable "decorated name length exceeded, name was truncated" - $<$:/wd4800> # 4800: forcing value to bool 'true' or 'false' (performance warning) - - # No: enable deprecation warnings - # $<$:/wd4996> # 4996: compiler encountered deprecated declaration -) +set_halide_compiler_warnings(Halide) if (CMAKE_GENERATOR MATCHES "Visual Studio") # We could expose the /MP flag to all targets, but that might end up saturating the build diff --git a/test/runtime/CMakeLists.txt b/test/runtime/CMakeLists.txt index 44ebf4c39d9d..b432b4299804 100644 --- a/test/runtime/CMakeLists.txt +++ b/test/runtime/CMakeLists.txt @@ -15,6 +15,7 @@ function(_set_target_options NAME) COMPILING_HALIDE_RUNTIME COMPILING_HALIDE_RUNTIME_TESTS ) + set_halide_compiler_warnings(${NAME}) endfunction() function(halide_define_runtime_internal_test NAME) From cdebeb8ce81f82be022bae1ecda50a09d6d8fa9e Mon Sep 17 00:00:00 2001 From: Tom Westerhout <14264576+twesterhout@users.noreply.github.com> Date: Tue, 9 Jan 2024 02:33:08 +0100 Subject: [PATCH 028/186] Fix -Wstrict-prototype warnings in HalideRuntime.h (#8027) When HalideRuntime.h is included in a C file, funtions that are declared with `()` instead of `(void)` for their arguments change meaning. These may cause issues downstream because different code is generated. 
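For readers less familiar with this corner of C, a minimal sketch of the distinction (hypothetical declarations, not taken from the header): in C, an empty parameter list leaves the parameter types unspecified, whereas `(void)` declares a function that takes no arguments; C++ treats both forms as taking no arguments, which is why the mismatch only surfaces when the header is compiled as C.

    /* In a C translation unit: */
    extern int shutdown_pool();      /* not a prototype: parameter types unspecified */
    extern int shutdown_pool2(void); /* prototype: takes no arguments */
    /* shutdown_pool(42) compiles without a diagnostic; shutdown_pool2(42) is an error. */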
--- src/runtime/HalideRuntime.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index eea4faf7b073..b61b13041b8e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -214,7 +214,7 @@ typedef int (*halide_task_t)(void *user_context, int task_number, uint8_t *closu extern int halide_do_par_for(void *user_context, halide_task_t task, int min, int size, uint8_t *closure); -extern void halide_shutdown_thread_pool(); +extern void halide_shutdown_thread_pool(void); //@} /** Set a custom method for performing a parallel for loop. Returns @@ -751,7 +751,7 @@ extern int halide_get_trace_file(void *user_context); /** If tracing is writing to a file. This call closes that file * (flushing the trace). Returns zero on success. */ -extern int halide_shutdown_trace(); +extern int halide_shutdown_trace(void); /** All Halide GPU or device backend implementations provide an * interface to be used with halide_device_malloc, etc. This is @@ -1005,7 +1005,7 @@ extern void halide_memoization_cache_release(void *user_context, void *host); /** Free all memory and resources associated with the memoization cache. * Must be called at a time when no other threads are accessing the cache. */ -extern void halide_memoization_cache_cleanup(); +extern void halide_memoization_cache_cleanup(void); /** Verify that a given range of memory has been initialized; only used when Target::MSAN is enabled. * @@ -1911,7 +1911,7 @@ enum { /** Get a pointer to the global profiler state for programmatic * inspection. Lock it before using to pause the profiler. */ -extern struct halide_profiler_state *halide_profiler_get_state(); +extern struct halide_profiler_state *halide_profiler_get_state(void); /** Get a pointer to the pipeline state associated with pipeline_name. * This function grabs the global profiler state's lock on entry. */ @@ -1930,14 +1930,14 @@ extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *pre * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock. */ -extern void halide_profiler_reset(); +extern void halide_profiler_reset(void); /** Reset all profiler state. * WARNING: Do NOT call this method while any halide pipeline is * running; halide_profiler_memory_allocate/free and * halide_profiler_stack_peak_update update the profiler pipeline's * state without grabbing the global profiler state's lock. */ -void halide_profiler_shutdown(); +void halide_profiler_shutdown(void); /** Print out timing statistics for everything run since the last * reset. Also happens at process exit. */ @@ -1946,12 +1946,12 @@ extern void halide_profiler_report(void *user_context); /** For timer based profiling, this routine starts the timer chain running. * halide_get_profiler_state can be called to get the current timer interval. 
*/ -extern void halide_start_timer_chain(); +extern void halide_start_timer_chain(void); /** These routines are called to temporarily disable and then reenable * timer interuppts for profiling */ //@{ -extern void halide_disable_timer_interrupt(); -extern void halide_enable_timer_interrupt(); +extern void halide_disable_timer_interrupt(void); +extern void halide_enable_timer_interrupt(void); //@} /// \name "Float16" functions From 91b063dfb30d531bccd03a2e0958951c2c394436 Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Mon, 8 Jan 2024 20:57:15 -0800 Subject: [PATCH 029/186] Stronger chain detection in LoopCarry pass (#8016) * Stronger chain detection in LoopCarry * Make sure that types are the same * Add a comment * Run CSE before calling can_prove * Test for loop carry * clang-tidy * Add missing override * Update comments --- src/LoopCarry.cpp | 32 +++++++++++++++-- test/correctness/CMakeLists.txt | 1 + test/correctness/loop_carry.cpp | 64 +++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 test/correctness/loop_carry.cpp diff --git a/src/LoopCarry.cpp b/src/LoopCarry.cpp index 5f4d7bb519d3..050cdfbfc8d9 100644 --- a/src/LoopCarry.cpp +++ b/src/LoopCarry.cpp @@ -283,11 +283,34 @@ class LoopCarryOverLoop : public IRMutator { // For each load, move the load index forwards by one loop iteration vector indices, next_indices, predicates, next_predicates; + // CSE-d versions of the above, so can_prove can be safely used on them. + vector indices_csed, next_indices_csed, predicates_csed, next_predicates_csed; for (const vector &v : loads) { indices.push_back(v[0]->index); next_indices.push_back(step_forwards(v[0]->index, linear)); predicates.push_back(v[0]->predicate); next_predicates.push_back(step_forwards(v[0]->predicate, linear)); + + if (indices.back().defined()) { + indices_csed.push_back(common_subexpression_elimination(indices.back())); + } else { + indices_csed.emplace_back(); + } + if (next_indices.back().defined()) { + next_indices_csed.push_back(common_subexpression_elimination(next_indices.back())); + } else { + next_indices_csed.emplace_back(); + } + if (predicates.back().defined()) { + predicates_csed.push_back(common_subexpression_elimination(predicates.back())); + } else { + predicates_csed.emplace_back(); + } + if (next_predicates.back().defined()) { + next_predicates_csed.push_back(common_subexpression_elimination(next_predicates.back())); + } else { + next_predicates_csed.emplace_back(); + } } // Find loads done on this loop iteration that will be @@ -299,11 +322,16 @@ class LoopCarryOverLoop : public IRMutator { if (i == j) { continue; } + // can_prove is stronger than graph_equal, because it doesn't require index expressions to be + // exactly the same, but evaluate to the same value. We keep the graph_equal check, because + // it's faster and should be executed before the more expensive check. 
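// (For intuition, an illustrative pair of indices, not taken from a real pipeline: expressions such as
// (x + 1) + y * 16 and x + (y * 16 + 1) are not graph_equal because their expression trees differ,
// but after CSE the simplifier can_prove them equal, so the carried-load chain is still detected.)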
if (loads[i][0]->name == loads[j][0]->name && next_indices[j].defined() && - graph_equal(indices[i], next_indices[j]) && + (graph_equal(indices[i], next_indices[j]) || + ((indices[i].type() == next_indices[j].type()) && can_prove(indices_csed[i] == next_indices_csed[j]))) && next_predicates[j].defined() && - graph_equal(predicates[i], next_predicates[j])) { + (graph_equal(predicates[i], next_predicates[j]) || + ((predicates[i].type() == next_predicates[j].type()) && can_prove(predicates_csed[i] == next_predicates_csed[j])))) { chains.push_back({j, i}); debug(3) << "Found carried value:\n" << i << ": -> " << Expr(loads[i][0]) << "\n" diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 07921a347425..cd66f21a346e 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -199,6 +199,7 @@ tests(GROUPS correctness likely.cpp load_library.cpp logical.cpp + loop_carry.cpp loop_invariant_extern_calls.cpp loop_level_generator_param.cpp lossless_cast.cpp diff --git a/test/correctness/loop_carry.cpp b/test/correctness/loop_carry.cpp new file mode 100644 index 000000000000..4cfba7d25f3f --- /dev/null +++ b/test/correctness/loop_carry.cpp @@ -0,0 +1,64 @@ +#include "Halide.h" +#include + +using namespace Halide; +using namespace Halide::Internal; + +// Wrapper class to call loop_carry on a given statement. +class LoopCarryWrapper : public IRMutator { + using IRMutator::visit; + + int register_count_; + Stmt mutate(const Stmt &stmt) override { + return simplify(loop_carry(stmt, register_count_)); + } + +public: + LoopCarryWrapper(int register_count) + : register_count_(register_count) { + } +}; + +int main(int argc, char **argv) { + Func input; + Func g; + Func h; + Func f; + Var x, y, xo, yo, xi, yi; + + input(x, y) = x + y; + + Expr sum_expr = 0; + for (int ix = -100; ix <= 100; ix++) { + // Generate two chains of sums, but only one of them will be carried. + sum_expr += input(x, y + ix); + sum_expr += input(x + 13, y + 2 * ix); + } + g(x, y) = sum_expr; + h(x, y) = g(x, y) + 12; + f(x, y) = h(x, y); + + // Make a maximum number of the carried values very large for the purpose + // of this test. 
+ constexpr int kMaxRegisterCount = 1024; + f.add_custom_lowering_pass(new LoopCarryWrapper(kMaxRegisterCount)); + + const int size = 128; + f.compute_root() + .bound(x, 0, size) + .bound(y, 0, size); + + h.compute_root() + .tile(x, y, xo, yo, xi, yi, 16, 16, TailStrategy::RoundUp); + + g.compute_at(h, xo) + .reorder(y, x) + .vectorize(x, 4); + + input.compute_root(); + + f.realize({size, size}); + + printf("Success!\n"); + return 0; +} From 8d3c12e632d0e85687feec37084cca71ab32753a Mon Sep 17 00:00:00 2001 From: Mike Woodworth Date: Tue, 16 Jan 2024 10:55:53 -0800 Subject: [PATCH 030/186] adds mappings for f16 variants of halide float math (#8029) * adds mappings for f16 variants of halide float math * fix clang format errors * trigger buildbots --------- Co-authored-by: Steven Johnson --- src/CodeGen_Metal_Dev.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 471b76b925ee..69d47279e9ae 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -795,6 +795,31 @@ void CodeGen_Metal_Dev::init_module() { << "#define tanh_f32 tanh\n" << "#define atanh_f32 atanh\n" << "#define fast_inverse_sqrt_f32 rsqrt\n" + << "#define is_nan_f16 isnan\n" + << "#define is_inf_f16 isinf\n" + << "#define is_finite_f16 isfinite\n" + << "#define sqrt_f16 sqrt\n" + << "#define sin_f16 sin\n" + << "#define cos_f16 cos\n" + << "#define exp_f16 exp\n" + << "#define log_f16 log\n" + << "#define abs_f16 fabs\n" + << "#define floor_f16 floor\n" + << "#define ceil_f16 ceil\n" + << "#define trunc_f16 trunc\n" + << "#define pow_f16 pow\n" + << "#define asin_f16 asin\n" + << "#define acos_f16 acos\n" + << "#define tan_f16 tan\n" + << "#define atan_f16 atan\n" + << "#define atan2_f16 atan2\n" + << "#define sinh_f16 sinh\n" + << "#define asinh_f16 asinh\n" + << "#define cosh_f16 cosh\n" + << "#define acosh_f16 acosh\n" + << "#define tanh_f16 tanh\n" + << "#define atanh_f16 atanh\n" + << "#define fast_inverse_sqrt_f16 rsqrt\n" // This is quite annoying: even though the MSL docs claim // all versions of Metal support the same memory fence // names, the truth is that 1.0 does not. From d2eed57d224b2de7d7b4349025eb06606bccf773 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 16 Jan 2024 20:00:36 +0000 Subject: [PATCH 031/186] Fix build breakage for wasm targets (#8031) Update HalideTestHelpers.cmake --- cmake/HalideTestHelpers.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/HalideTestHelpers.cmake b/cmake/HalideTestHelpers.cmake index b6b9b70551ff..8f39cec026a4 100644 --- a/cmake/HalideTestHelpers.cmake +++ b/cmake/HalideTestHelpers.cmake @@ -54,7 +54,9 @@ function(add_halide_test TARGET) add_test(NAME ${TARGET} COMMAND ${args_COMMAND} ${args_ARGS} WORKING_DIRECTORY "${args_WORKING_DIRECTORY}") - set_halide_compiler_warnings(${TARGET}) + if (NOT Halide_TARGET MATCHES "wasm") + set_halide_compiler_warnings(${TARGET}) + endif () # We can't add Halide::TerminateHandler here, because it requires Halide::Error # and friends to be present in the final linkage, but some callers of add_halide_test() From 3a7720492e777b7509f1be60d0cb93389d6fe44e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 17 Jan 2024 15:35:07 +0000 Subject: [PATCH 032/186] Require LLVM >= 16.0 (#8003) * Require LLVM >= 16.0 Per policy, we only support top-of-tree LLVM, plus two versions back; let's update to require LLVM >= 16, and drop workarounds for older versions. 
* LLVM_VERSION < 170 --- dependencies/llvm/CMakeLists.txt | 2 +- src/CodeGen_ARM.cpp | 4 +--- src/CodeGen_LLVM.cpp | 20 -------------------- src/CodeGen_RISCV.cpp | 2 -- src/JITModule.cpp | 12 ------------ src/LLVM_Headers.h | 4 ++-- src/LLVM_Runtime_Linker.cpp | 4 ---- test/correctness/simd_op_check_arm.cpp | 3 +-- test/correctness/simd_op_check_wasm.cpp | 15 ++++----------- 9 files changed, 9 insertions(+), 57 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 8ab9fa3d2506..48b8642494dd 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -20,7 +20,7 @@ message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") message(STATUS "Using ClangConfig.cmake in: ${Clang_DIR}") -if (LLVM_PACKAGE_VERSION VERSION_LESS 15.0) +if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) message(FATAL_ERROR "LLVM version must be 15.0 or newer") endif () diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 4cf1dc597ab4..9c6525703f16 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1144,10 +1144,8 @@ void CodeGen_ARM::visit(const Store *op) { llvm::Type *intrin_llvm_type = llvm_type_of(intrin_type); #if LLVM_VERSION >= 170 const bool is_opaque = true; -#elif LLVM_VERSION >= 150 - const bool is_opaque = llvm::PointerType::get(intrin_llvm_type, 0)->isOpaque(); #else - const bool is_opaque = false; + const bool is_opaque = llvm::PointerType::get(intrin_llvm_type, 0)->isOpaque(); #endif if (target.bits == 32) { instr << "llvm.arm.neon.vst" diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 7b9eecd3d74e..a5c32cf83cc7 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1177,35 +1177,19 @@ void CodeGen_LLVM::optimize_module() { if (get_target().os == Target::OS::Linux) { sanitizercoverage_options.StackDepth = true; } -#if LLVM_VERSION >= 160 mpm.addPass(SanitizerCoveragePass(sanitizercoverage_options)); -#else - mpm.addPass(ModuleSanitizerCoveragePass(sanitizercoverage_options)); -#endif }); } if (get_target().has_feature(Target::ASAN)) { -#if LLVM_VERSION >= 150 - // Nothing, ASanGlobalsMetadataAnalysis no longer exists -#else - pb.registerPipelineStartEPCallback([&](ModulePassManager &mpm, OptimizationLevel) { - mpm.addPass(RequireAnalysisPass()); - }); -#endif pb.registerPipelineStartEPCallback([](ModulePassManager &mpm, OptimizationLevel) { AddressSanitizerOptions asan_options; // default values are good... 
asan_options.UseAfterScope = true; // ...except this one constexpr bool use_global_gc = false; constexpr bool use_odr_indicator = true; constexpr auto destructor_kind = AsanDtorKind::Global; -#if LLVM_VERSION >= 160 mpm.addPass(AddressSanitizerPass( asan_options, use_global_gc, use_odr_indicator, destructor_kind)); -#else - mpm.addPass(ModuleAddressSanitizerPass( - asan_options, use_global_gc, use_odr_indicator, destructor_kind)); -#endif }); } @@ -2046,11 +2030,7 @@ void CodeGen_LLVM::add_tbaa_metadata(llvm::Instruction *inst, string buffer, con } void CodeGen_LLVM::function_does_not_access_memory(llvm::Function *fn) { -#if LLVM_VERSION >= 160 fn->addFnAttr("memory(none)"); -#else - fn->addFnAttr(llvm::Attribute::ReadNone); -#endif } void CodeGen_LLVM::visit(const Load *op) { diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp index a702baff78a2..6bbc38532ecf 100644 --- a/src/CodeGen_RISCV.cpp +++ b/src/CodeGen_RISCV.cpp @@ -174,11 +174,9 @@ string CodeGen_RISCV::mattrs() const { if (target.has_feature(Target::RVV)) { attrs.emplace_back("+v"); -#if LLVM_VERSION >= 160 if (target.vector_bits != 0) { attrs.push_back("+zvl" + std::to_string(target.vector_bits) + "b"); } -#endif } return join_strings(attrs, ","); } diff --git a/src/JITModule.cpp b/src/JITModule.cpp index eb274bc6c59d..0d37c07284c3 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -225,11 +225,7 @@ JITModule::Symbol compile_and_get_function(llvm::orc::LLJIT &JIT, const string & auto addr = JIT.lookup(name); internal_assert(addr) << llvm::toString(addr.takeError()) << "\n"; -#if LLVM_VERSION >= 150 void *f = (void *)addr->getValue(); -#else - void *f = (void *)addr->getAddress(); -#endif if (!f) { internal_error << "Compiling " << name << " returned nullptr\n"; } @@ -1014,20 +1010,12 @@ JITModule &make_module(llvm::Module *for_module, Target target, } uint64_t arg_addr = llvm::cantFail(runtime.jit_module->JIT->lookup("halide_jit_module_argument")) -#if LLVM_VERSION >= 150 .getValue(); -#else - .getAddress(); -#endif internal_assert(arg_addr != 0); *((void **)arg_addr) = runtime.jit_module.get(); uint64_t fun_addr = llvm::cantFail(runtime.jit_module->JIT->lookup("halide_jit_module_adjust_ref_count")) -#if LLVM_VERSION >= 150 .getValue(); -#else - .getAddress(); -#endif internal_assert(fun_addr != 0); *(void (**)(void *arg, int32_t count))fun_addr = &adjust_module_ref_count; } diff --git a/src/LLVM_Headers.h b/src/LLVM_Headers.h index ad3f25365577..6b5013b72cf0 100644 --- a/src/LLVM_Headers.h +++ b/src/LLVM_Headers.h @@ -1,10 +1,10 @@ #ifndef HALIDE_LLVM_HEADERS_H #define HALIDE_LLVM_HEADERS_H -#if LLVM_VERSION >= 140 +#if LLVM_VERSION >= 160 // We're good to go #else -#error "Compiling Halide requires LLVM 14.0 or newer" +#error "Compiling Halide requires LLVM 16.0 or newer" #endif // No msvc warnings from llvm headers please diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index c946faad4850..0531f7bc3365 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -402,11 +402,7 @@ llvm::DataLayout get_data_layout_for_target(Target target) { if (target.bits == 32) { return llvm::DataLayout("e-m:e-p:32:32-i64:64-n32-S128"); } else { -#if LLVM_VERSION >= 160 return llvm::DataLayout("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); -#else - return llvm::DataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128"); -#endif } } else { // Return empty data layout. Must be set later. 
diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 68fbf91a0081..acc3edcc4a8a 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -948,8 +948,7 @@ class SimdOpCheckARM : public SimdOpCheckTest { // LLVM15 emits UZP2 if the shift amount is half the width of the vector element. const auto shrn_or_uzp2 = [&](int element_width, int shift_amt, int vector_width) { constexpr int simd_vector_bits = 128; - if (Halide::Internal::get_llvm_version() >= 150 && - ((vector_width * element_width) % (simd_vector_bits * 2)) == 0 && + if (((vector_width * element_width) % (simd_vector_bits * 2)) == 0 && shift_amt == element_width / 2) { return "uzp2"; } diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 6b6898c82b85..89aad9e5c389 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -388,17 +388,10 @@ class SimdOpCheckWASM : public SimdOpCheckTest { // check("v128.load64_zero", 2 * w, in_u64(0)); // Load vector with identical lanes generates *.splat. - if (Halide::Internal::get_llvm_version() >= 160) { - check("i8x16.splat", 16 * w, in_u8(0)); - check("i16x8.splat", 8 * w, in_u16(0)); - check("i32x4.splat", 4 * w, in_u32(0)); - check("i64x2.splat", 2 * w, in_u64(0)); - } else { - check("v128.load8_splat", 16 * w, in_u8(0)); - check("v128.load16_splat", 8 * w, in_u16(0)); - check("v128.load32_splat", 4 * w, in_u32(0)); - check("v128.load64_splat", 2 * w, in_u64(0)); - } + check("i8x16.splat", 16 * w, in_u8(0)); + check("i16x8.splat", 8 * w, in_u16(0)); + check("i32x4.splat", 4 * w, in_u32(0)); + check("i64x2.splat", 2 * w, in_u64(0)); // Load Lane // TODO: does Halide have any idiom that obviously generates these? From 22f9bb9247b3e384bbd9d8e7ff96501a29b49265 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 17 Jan 2024 16:26:43 +0000 Subject: [PATCH 033/186] Add test for #8029 (#8032) Tweak correctness_float16_t so that it uses one of the transcendal functions (sqrt) that were missing in Metal. --- test/correctness/float16_t.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp index 2c10f3e81ae6..d135e8108fa7 100644 --- a/test/correctness/float16_t.cpp +++ b/test/correctness/float16_t.cpp @@ -236,14 +236,14 @@ int run_test() { Param mul("mul"); Func output; - output(x, y) = x * y * (input(x, y) * mul); + output(x, y) = x * y * (sqrt(input(x, y)) * mul); Var xi, yi; output.gpu_tile(x, y, xi, yi, 8, 8); mul.set(float16_t(2.0f)); Buffer in(8, 8); - in.fill(float16_t(0.25f)); + in.fill(float16_t(0.0625f)); input.set(in); Buffer buf = output.realize({8, 8}); for (int y = 0; y < 8; y++) { From e0e9f637635c29f92f40890c4ba0c539b32141cf Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 22 Jan 2024 21:43:00 +0000 Subject: [PATCH 034/186] Tweak the Printer code in runtime for smaller code (#8023) * Tweak the Printer code in runtime for smaller code TL;DR: template expansion meant that we had more replicated code than expected from the inline expansion of code in Printer and friends. Restructured and added NEVER_INLINE to try to make the call sites as small as possible. It's a modest code-size savings but nonzero... e.g., the linux-x86-64 .o output from correct_cross_compilation drops from 164280 bytes to 162936 bytes. 
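As an illustrative aside, the gist of the restructuring described above is: move the formatting operators out of the class template into a plain (non-template) base class and mark them NEVER_INLINE, so every template instantiation shares one copy of the formatting code and each call site compiles down to a call. A minimal, hypothetical sketch of that pattern follows; the names (SketchPrinterBase, SketchPrinter) are invented for illustration and this is not the actual runtime code, which also manages its own buffer and error/print dispatch.

#include <cstdio>

#define NEVER_INLINE __attribute__((noinline))

// Non-template base: the streaming operators live out of line here, so every
// instantiation of the template wrapper below shares a single copy of them,
// and NEVER_INLINE keeps each call site down to argument setup plus one call.
class SketchPrinterBase {
public:
    NEVER_INLINE SketchPrinterBase &operator<<(const char *s) {
        std::fputs(s, stderr);
        return *this;
    }
    NEVER_INLINE SketchPrinterBase &operator<<(long long v) {
        std::fprintf(stderr, "%lld", v);
        return *this;
    }
};

// The template layer only carries what genuinely varies (e.g. a buffer size),
// so template expansion no longer replicates the formatting code.
template<int buffer_length>
class SketchPrinter : public SketchPrinterBase {};

int main() {
    SketchPrinter<256> p;
    p << "value = " << 42LL << "\n";  // two out-of-line calls, no inlined formatting
    return 0;
}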
* Update printer.h * debug * Update HalideTestHelpers.cmake * Update printer.h * fixes --- src/runtime/d3d12compute.cpp | 4 +- src/runtime/posix_error_handler.cpp | 20 ++- src/runtime/printer.h | 226 ++++++++++++++-------------- src/runtime/runtime_internal.h | 2 + src/runtime/to_string.cpp | 15 ++ src/runtime/tracing.cpp | 2 +- 6 files changed, 140 insertions(+), 129 deletions(-) diff --git a/src/runtime/d3d12compute.cpp b/src/runtime/d3d12compute.cpp index adae690800cc..f4f85180a56e 100644 --- a/src/runtime/d3d12compute.cpp +++ b/src/runtime/d3d12compute.cpp @@ -98,11 +98,11 @@ static constexpr uint64_t trace_buf_size = 4096; WEAK char trace_buf[trace_buf_size] = {}; WEAK int trace_indent = 0; -struct trace : public BasicPrinter { +struct trace : public PrinterBase { ScopedMutexLock lock; explicit trace(void *user_context = nullptr) - : BasicPrinter(user_context, trace_buf), + : PrinterBase(user_context, trace_buf, trace_buf_size), lock(&trace_lock) { for (int i = 0; i < trace_indent; i++) { *this << " "; diff --git a/src/runtime/posix_error_handler.cpp b/src/runtime/posix_error_handler.cpp index d40790fad15d..27bcc1f5b28f 100644 --- a/src/runtime/posix_error_handler.cpp +++ b/src/runtime/posix_error_handler.cpp @@ -7,18 +7,16 @@ extern "C" { extern void abort(); WEAK void halide_default_error(void *user_context, const char *msg) { - char buf[4096]; - char *dst = halide_string_to_string(buf, buf + 4094, "Error: "); - dst = halide_string_to_string(dst, dst + 4094, msg); - // We still have one character free. Add a newline if there - // isn't one already. - if (dst[-1] != '\n') { - dst[0] = '\n'; - dst[1] = 0; - dst += 1; + // Can't use StackBasicPrinter here because it limits size to 256 + constexpr int buf_size = 4096; + char buf[buf_size]; + PrinterBase dst(user_context, buf, buf_size); + dst << "Error: " << msg; + const char *d = dst.str(); + if (d && *d && d[strlen(d) - 1] != '\n') { + dst << "\n"; } - (void)halide_msan_annotate_memory_is_initialized(user_context, buf, dst - buf + 1); - halide_print(user_context, buf); + halide_print(user_context, dst.str()); abort(); } } diff --git a/src/runtime/printer.h b/src/runtime/printer.h index be3620020824..6a379561dbe5 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -41,179 +41,174 @@ constexpr uint64_t default_printer_buffer_length = 1024; // Then remember the print only happens when the debug object leaves // scope, which may print at a confusing time. -namespace { -template -class Printer { - char *buf, *dst, *end; - void *user_context; - bool own_mem; +class PrinterBase { +protected: + char *dst; + char *const end; + char *const start; + void *const user_context; + + NEVER_INLINE void allocation_error() const { + halide_error(user_context, "Printer buffer allocation failed.\n"); + } public: - explicit Printer(void *ctx, char *mem = nullptr) - : user_context(ctx), own_mem(mem == nullptr) { - if (mem != nullptr) { - buf = mem; - } else { - buf = (char *)malloc(buffer_length); + // This class will stream text into the range [start, start + size - 1]. + // It does *not* assume any ownership of the memory; it assumes + // the memory will remain valid for its lifespan, and doesn't + // attempt to free any allocations. It also doesn't do any sanity + // checking of the pointers, so if you pass in a null or bogus value, + // it will attempt to use it. + NEVER_INLINE PrinterBase(void *user_context_, char *start_, uint64_t size_) + : dst(start_), + // (If start is null, set end = start to ensure no writes are done) + end(start_ ? 
start_ + size_ - 1 : start_), + start(start_), + user_context(user_context_) { + if (end > start) { + // null-terminate the final byte to ensure string isn't $ENDLESS + *end = 0; } + } + + NEVER_INLINE const char *str() { + (void)halide_msan_annotate_memory_is_initialized(user_context, start, dst - start + 1); + return start; + } + + uint64_t size() const { + halide_debug_assert(user_context, dst >= start); + return (uint64_t)(dst - start); + } + + uint64_t capacity() const { + halide_debug_assert(user_context, end >= start); + return (uint64_t)(end - start); + } - dst = buf; + NEVER_INLINE void clear() { + dst = start; if (dst) { - end = buf + (buffer_length - 1); - *end = 0; - } else { - // Pointers equal ensures no writes to buffer via formatting code - end = dst; + dst[0] = 0; } + } -#if HALIDE_RUNTIME_PRINTER_LOG_THREADID - uint64_t tid; - pthread_threadid_np(0, &tid); - *this << "(TID:" << tid << ")"; -#endif + NEVER_INLINE void erase(int n) { + if (dst) { + dst -= n; + if (dst < start) { + dst = start; + } + dst[0] = 0; + } } - // Not movable, not copyable - Printer(const Printer ©) = delete; - Printer &operator=(const Printer &) = delete; - Printer(Printer &&) = delete; - Printer &operator=(Printer &&) = delete; + struct Float16Bits { + uint16_t bits; + }; - Printer &operator<<(const char *arg) { + // These are NEVER_INLINE because Clang will aggressively inline + // all of them, but the code size of calling out-of-line here is slightly + // smaller, and we ~always prefer smaller code size when using Printer + // in the runtime (it's a modest but nonzero difference). + NEVER_INLINE PrinterBase &operator<<(const char *arg) { dst = halide_string_to_string(dst, end, arg); return *this; } - Printer &operator<<(int64_t arg) { + NEVER_INLINE PrinterBase &operator<<(int64_t arg) { dst = halide_int64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(int32_t arg) { + NEVER_INLINE PrinterBase &operator<<(int32_t arg) { dst = halide_int64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(uint64_t arg) { + NEVER_INLINE PrinterBase &operator<<(uint64_t arg) { dst = halide_uint64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(uint32_t arg) { + NEVER_INLINE PrinterBase &operator<<(uint32_t arg) { dst = halide_uint64_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(double arg) { + NEVER_INLINE PrinterBase &operator<<(double arg) { dst = halide_double_to_string(dst, end, arg, 1); return *this; } - Printer &operator<<(float arg) { + NEVER_INLINE PrinterBase &operator<<(float arg) { dst = halide_double_to_string(dst, end, arg, 0); return *this; } - Printer &operator<<(const void *arg) { - dst = halide_pointer_to_string(dst, end, arg); + NEVER_INLINE PrinterBase &operator<<(Float16Bits arg) { + double value = halide_float16_bits_to_double(arg.bits); + dst = halide_double_to_string(dst, end, value, 1); return *this; } - Printer &write_float16_from_bits(const uint16_t arg) { - double value = halide_float16_bits_to_double(arg); - dst = halide_double_to_string(dst, end, value, 1); + NEVER_INLINE PrinterBase &operator<<(const void *arg) { + dst = halide_pointer_to_string(dst, end, arg); return *this; } - Printer &operator<<(const halide_type_t &t) { + NEVER_INLINE PrinterBase &operator<<(const halide_type_t &t) { dst = halide_type_to_string(dst, end, &t); return *this; } - Printer &operator<<(const halide_buffer_t &buf) { + NEVER_INLINE PrinterBase &operator<<(const halide_buffer_t &buf) { dst = halide_buffer_to_string(dst, end, &buf); 
return *this; } - template - void append(const T &value) { - *this << value; - } - - template - void append(const First &first, const Second &second, const Rest &...rest) { - append(first); - append(second, rest...); - } - - // Use it like a stringstream. - const char *str() { - if (buf) { - if (printer_type == StringStreamPrinterType) { - msan_annotate_is_initialized(); - } - return buf; - } else { - return allocation_error(); - } - } - - // Clear it. Useful for reusing a stringstream. - void clear() { - dst = buf; - if (dst) { - dst[0] = 0; - } + template + void append(const Args &...args) { + ((*this << args), ...); } - // Returns the number of characters in the buffer - uint64_t size() const { - return (uint64_t)(dst - buf); - } + // Not movable, not copyable + PrinterBase(const PrinterBase ©) = delete; + PrinterBase &operator=(const PrinterBase &) = delete; + PrinterBase(PrinterBase &&) = delete; + PrinterBase &operator=(PrinterBase &&) = delete; +}; - uint64_t capacity() const { - return buffer_length; - } +namespace { - // Delete the last N characters - void erase(int n) { - if (dst) { - dst -= n; - if (dst < buf) { - dst = buf; - } - dst[0] = 0; +template +class HeapPrinter : public PrinterBase { +public: + NEVER_INLINE explicit HeapPrinter(void *user_context) + : PrinterBase(user_context, (char *)malloc(buffer_length), buffer_length) { + if (!start) { + allocation_error(); } - } - const char *allocation_error() { - return "Printer buffer allocation failed.\n"; - } - - void msan_annotate_is_initialized() { - (void)halide_msan_annotate_memory_is_initialized(user_context, buf, dst - buf + 1); +#if HALIDE_RUNTIME_PRINTER_LOG_THREADID + uint64_t tid; + pthread_threadid_np(0, &tid); + *this << "(TID:" << tid << ")"; +#endif } - ~Printer() { - if (!buf) { - halide_error(user_context, allocation_error()); + NEVER_INLINE ~HeapPrinter() { + if (printer_type == ErrorPrinterType) { + halide_error(user_context, str()); + } else if (printer_type == BasicPrinterType) { + halide_print(user_context, str()); } else { - msan_annotate_is_initialized(); - if (printer_type == ErrorPrinterType) { - halide_error(user_context, buf); - } else if (printer_type == BasicPrinterType) { - halide_print(user_context, buf); - } else { - // It's a stringstream. Do nothing. - } + // It's a stringstream. Do nothing. } - if (own_mem) { - free(buf); - } + free(start); } }; - // A class that supports << with all the same types as Printer, but // does nothing and should compile to a no-op. class SinkPrinter { @@ -227,13 +222,13 @@ ALWAYS_INLINE SinkPrinter operator<<(const SinkPrinter &s, T) { } template -using BasicPrinter = Printer; +using BasicPrinter = HeapPrinter; template -using ErrorPrinter = Printer; +using ErrorPrinter = HeapPrinter; template -using StringStreamPrinter = Printer; +using StringStreamPrinter = HeapPrinter; using print = BasicPrinter<>; using error = ErrorPrinter<>; @@ -244,17 +239,16 @@ using debug = BasicPrinter<>; #else using debug = SinkPrinter; #endif -} // namespace // A Printer that automatically reserves stack space for the printer buffer, rather than malloc. // Note that this requires an explicit buffer_length, and it (generally) should be <= 256. 
template -class StackPrinter : public Printer { +class StackPrinter : public PrinterBase { char scratch[buffer_length]; public: - explicit StackPrinter(void *ctx) - : Printer(ctx, scratch) { + explicit StackPrinter(void *user_context) + : PrinterBase(user_context, scratch, buffer_length) { static_assert(buffer_length <= 256, "StackPrinter is meant only for small buffer sizes; you are probably making a mistake."); } }; @@ -268,6 +262,8 @@ using StackErrorPrinter = StackPrinter; template using StackStringStreamPrinter = StackPrinter; +} // namespace + } // namespace Internal } // namespace Runtime } // namespace Halide diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index 57dfe0b1087a..027ae5c4f500 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -51,6 +51,8 @@ typedef ptrdiff_t ssize_t; #define WEAK __attribute__((weak)) +#define NEVER_INLINE __attribute__((noinline)) + // Note that ALWAYS_INLINE should *always* also be `inline`. #define ALWAYS_INLINE inline __attribute__((always_inline)) diff --git a/src/runtime/to_string.cpp b/src/runtime/to_string.cpp index 71d537609e83..1200ca5c07d9 100644 --- a/src/runtime/to_string.cpp +++ b/src/runtime/to_string.cpp @@ -1,8 +1,11 @@ #include "HalideRuntime.h" +#include "runtime_internal.h" extern "C" { WEAK char *halide_string_to_string(char *dst, char *end, const char *arg) { + halide_debug_assert(nullptr, dst <= end); + if (dst >= end) { return dst; } @@ -25,6 +28,8 @@ WEAK char *halide_string_to_string(char *dst, char *end, const char *arg) { } WEAK char *halide_uint64_to_string(char *dst, char *end, uint64_t arg, int min_digits) { + halide_debug_assert(nullptr, dst <= end); + // 32 is more than enough chars to contain any 64-bit int. char buf[32]; buf[31] = 0; @@ -43,6 +48,8 @@ WEAK char *halide_uint64_to_string(char *dst, char *end, uint64_t arg, int min_d } WEAK char *halide_int64_to_string(char *dst, char *end, int64_t arg, int min_digits) { + halide_debug_assert(nullptr, dst <= end); + if (arg < 0 && dst < end) { *dst++ = '-'; arg = -arg; @@ -51,6 +58,8 @@ WEAK char *halide_int64_to_string(char *dst, char *end, int64_t arg, int min_dig } WEAK char *halide_double_to_string(char *dst, char *end, double arg, int scientific) { + halide_debug_assert(nullptr, dst <= end); + uint64_t bits = 0; memcpy(&bits, &arg, sizeof(double)); @@ -234,6 +243,8 @@ WEAK char *halide_double_to_string(char *dst, char *end, double arg, int scienti } WEAK char *halide_pointer_to_string(char *dst, char *end, const void *arg) { + halide_debug_assert(nullptr, dst <= end); + const char *hex_digits = "0123456789abcdef"; char buf[20] = {0}; char *buf_ptr = buf + 18; @@ -251,6 +262,8 @@ WEAK char *halide_pointer_to_string(char *dst, char *end, const void *arg) { } WEAK char *halide_type_to_string(char *dst, char *end, const halide_type_t *t) { + halide_debug_assert(nullptr, dst <= end); + const char *code_name = nullptr; switch (t->code) { case halide_type_int: @@ -282,6 +295,8 @@ WEAK char *halide_type_to_string(char *dst, char *end, const halide_type_t *t) { } WEAK char *halide_buffer_to_string(char *dst, char *end, const halide_buffer_t *buf) { + halide_debug_assert(nullptr, dst <= end); + if (buf == nullptr) { return halide_string_to_string(dst, end, "nullptr"); } diff --git a/src/runtime/tracing.cpp b/src/runtime/tracing.cpp index 8e8769e2ad12..93a12c7d90a4 100644 --- a/src/runtime/tracing.cpp +++ b/src/runtime/tracing.cpp @@ -308,7 +308,7 @@ WEAK int32_t halide_default_trace(void *user_context, const 
halide_trace_event_t if (print_bits == 32) { ss << ((float *)(e->value))[i]; } else if (print_bits == 16) { - ss.write_float16_from_bits(((uint16_t *)(e->value))[i]); + ss << PrinterBase::Float16Bits{((uint16_t *)(e->value))[i]}; } else { ss << ((double *)(e->value))[i]; } From 90e909d8e56e2894d5b63e9efab2e97e058887ee Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 18:44:47 +0000 Subject: [PATCH 035/186] Allow LLVM 19 in CMake (#8041) --- dependencies/llvm/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index 48b8642494dd..a4aef94b08de 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -24,8 +24,8 @@ if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) message(FATAL_ERROR "LLVM version must be 15.0 or newer") endif () -if (LLVM_PACKAGE_VERSION VERSION_GREATER 18.0) - message(WARNING "Halide is not tested on LLVM versions beyond 18.0") +if (LLVM_PACKAGE_VERSION VERSION_GREATER 19.0) + message(WARNING "Halide is not tested on LLVM versions beyond 19.0") endif () # LLVM_DEFINITIONS is a space-separated list instead of a more typical From 9b9dfaff070653954dda3c4a872a02644e2464e3 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 25 Jan 2024 06:12:17 +1100 Subject: [PATCH 036/186] Update Makefile for llvm 19 (#8040) --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b24dfdc2d80d..39358e03ef18 100644 --- a/Makefile +++ b/Makefile @@ -2280,6 +2280,10 @@ ifneq (,$(findstring clang version 18.0,$(CLANG_VERSION))) CLANG_OK=yes endif +ifneq (,$(findstring clang version 19.0,$(CLANG_VERSION))) +CLANG_OK=yes +endif + ifneq (,$(findstring Apple LLVM version 5.0,$(CLANG_VERSION))) CLANG_OK=yes endif @@ -2300,7 +2304,7 @@ $(BUILD_DIR)/clang_ok: @exit 1 endif -ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 160 170 180)) +ifneq (,$(findstring $(LLVM_VERSION_TIMES_10), 160 170 180 190)) LLVM_OK=yes endif From 6177e519b49d4e674ddb33c8d3ae0a1a4e839b9e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 20:04:19 +0000 Subject: [PATCH 037/186] Update Halide version to 18 (#8043) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be8ece13282..6b6fb85841c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.22...3.23) project(Halide - VERSION 17.0.0 + VERSION 18.0.0 DESCRIPTION "Halide compiler and libraries" HOMEPAGE_URL "https://halide-lang.org") From c1923f3691ff1ac2964a33dc599b47a88eada5b5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 24 Jan 2024 23:53:28 +0000 Subject: [PATCH 038/186] HALIDE_VERSION_MAJOR -> 18 (#8044) --- src/runtime/HalideRuntime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index b61b13041b8e..7b84e44f6928 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -23,7 +23,7 @@ // our CMake build, so that we ensure that the in-build metadata (eg soversion) // matches, but keeping the canonical version here makes it easier to keep // downstream build systems (eg Blaze/Bazel) properly in sync with the source. 
-#define HALIDE_VERSION_MAJOR 17 +#define HALIDE_VERSION_MAJOR 18 #define HALIDE_VERSION_MINOR 0 #define HALIDE_VERSION_PATCH 0 From 4590a095a857d07232b2407b1b5a3fdeaa327cc2 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 26 Jan 2024 12:07:40 +1100 Subject: [PATCH 039/186] Fix for llvm trunk: Force-include more runtime types (#8045) * Fix for llvm trunk: Force-include more runtime types * Include the force-include-types module first * Fix comment * Expand comment --- src/LLVM_Runtime_Linker.cpp | 15 ++++++++++++--- src/runtime/force_include_types.cpp | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index 0531f7bc3365..ad65bdc2ebc2 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -782,6 +782,7 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c // things that are 'alwaysinline' can be included here but are unnecessary. vector> modules; modules.push_back(std::move(extra_module)); + modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); modules.push_back(get_initmod_fake_thread_pool(c, bits_64, debug)); modules.push_back(get_initmod_posix_aligned_alloc(c, bits_64, debug)); modules.push_back(get_initmod_posix_allocator(c, bits_64, debug)); @@ -796,7 +797,6 @@ std::unique_ptr link_with_wasm_jit_runtime(llvm::LLVMContext *c, c modules.push_back(get_initmod_alignment_32(c, bits_64, debug)); modules.push_back(get_initmod_fopen(c, bits_64, debug)); modules.push_back(get_initmod_device_interface(c, bits_64, debug)); - modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); modules.push_back(get_initmod_float16_t(c, bits_64, debug)); modules.push_back(get_initmod_errors(c, bits_64, debug)); modules.push_back(get_initmod_msan_stubs(c, bits_64, debug)); @@ -843,6 +843,17 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM vector> modules; + // Start with the module that defines our struct types. This must be + // included first, because when parsing modules, if two structs are + // encountered with the same fields, they are deduped, and the first name + // wins. + // + // If in the future these names become unpredictable, an alternative + // strategy is to make this module include a global variable of each type we + // care about, recover the struct types from those named globals, and then + // delete the globals in link_modules. 
+ modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); + const auto add_allocator = [&]() { modules.push_back(get_initmod_posix_aligned_alloc(c, bits_64, debug)); modules.push_back(get_initmod_posix_allocator(c, bits_64, debug)); @@ -1277,8 +1288,6 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM modules.push_back(get_initmod_runtime_api(c, bits_64, debug)); } - modules.push_back(get_initmod_force_include_types(c, bits_64, debug)); - link_modules(modules, t); if (t.os == Target::Windows && diff --git a/src/runtime/force_include_types.cpp b/src/runtime/force_include_types.cpp index f5eeda611180..99a2dea821fc 100644 --- a/src/runtime/force_include_types.cpp +++ b/src/runtime/force_include_types.cpp @@ -6,10 +6,19 @@ namespace Runtime { namespace Internal { struct AllTheTypes { - halide_filter_metadata_t a; - halide_filter_argument_t b; - halide_scalar_value_t c; - halide_semaphore_t d; + halide_buffer_t a; + halide_device_interface_t b; + halide_dimension_t c; + halide_filter_argument_t d; + halide_filter_metadata_t e; + halide_parallel_task_t f; + halide_pseudostack_slot_t g; + halide_scalar_value_t h; + halide_semaphore_acquire_t i; + halide_semaphore_t j; + halide_trace_event_t k; + halide_trace_packet_t l; + halide_type_t m; }; WEAK void halide_unused_force_include_types() { From 3657cf5f363fd64aeaf06432e62e3960800927b0 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 27 Jan 2024 04:26:12 +1100 Subject: [PATCH 040/186] Fix bounds_of_nested_lanes (#8039) * Fix bounds_of_nested_lanes bounds_of_nested_lanes assumed that one layer of nested vectorization could be removed at a time. When faced with the expression: min(ramp(x8(a), x8(b), 5), x40(27)) It panicked, because on the left hand side it reduced the bounds to x8(a) ... x8(a) + x8(b) * 4, and on the right hand side it reduced the bounds to 27. It then attempted to take a min of mismatched types. In general we can't assume that binary operators on nested vectors have the same nesting structure on both sides, so I just rewrote it to reduce directly to a scalar. Fixes #8038 --- src/VectorizeLoops.cpp | 140 ++++++++++++++++------------- test/correctness/fuzz_schedule.cpp | 19 ++++ 2 files changed, 95 insertions(+), 64 deletions(-) diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 89c4f020af51..1c3ec57f3fb7 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -29,103 +29,128 @@ Expr get_lane(const Expr &e, int l) { return Shuffle::make_slice(e, l, 0, 1); } -/** Find the exact max and min lanes of a vector expression. Not - * conservative like bounds_of_expr, but uses similar rules for some - * common node types where it can be exact. If e is a nested vector, - * the result will be the bounds of the vectors in each lane. */ -Interval bounds_of_nested_lanes(const Expr &e) { +/** A helper like .as(), but unwraps arbitrarily many layers of + * nested broadcasts. Guaranteed to return either a broadcast of a scalar or + * nullptr. */ +const Broadcast *as_scalar_broadcast(const Expr &e) { + const Broadcast *b = e.as(); + if (b && b->value.type().is_scalar()) { + return b; + } else if (b) { + return as_scalar_broadcast(b->value); + } else { + return nullptr; + } +}; + +/** Find the exact scalar max and min lanes of a vector expression. Not + * conservative like bounds_of_expr, but uses similar rules for some common node + * types where it can be exact. Always returns a scalar, even in the case of + * nested vectorization. 
*/ +Interval bounds_of_lanes(const Expr &e) { + if (e.type().is_scalar()) { + return {e, e}; + } + if (const Add *add = e.as()) { - if (const Broadcast *b = add->b.as()) { - Interval ia = bounds_of_nested_lanes(add->a); + if (const Broadcast *b = as_scalar_broadcast(add->b)) { + Interval ia = bounds_of_lanes(add->a); return {ia.min + b->value, ia.max + b->value}; - } else if (const Broadcast *b = add->a.as()) { - Interval ia = bounds_of_nested_lanes(add->b); + } else if (const Broadcast *b = as_scalar_broadcast(add->a)) { + Interval ia = bounds_of_lanes(add->b); return {b->value + ia.min, b->value + ia.max}; } } else if (const Sub *sub = e.as()) { - if (const Broadcast *b = sub->b.as()) { - Interval ia = bounds_of_nested_lanes(sub->a); + if (const Broadcast *b = as_scalar_broadcast(sub->b)) { + Interval ia = bounds_of_lanes(sub->a); return {ia.min - b->value, ia.max - b->value}; - } else if (const Broadcast *b = sub->a.as()) { - Interval ia = bounds_of_nested_lanes(sub->b); - return {b->value - ia.max, b->value - ia.max}; + } else if (const Broadcast *b = as_scalar_broadcast(sub->a)) { + Interval ia = bounds_of_lanes(sub->b); + return {b->value - ia.max, b->value - ia.min}; } } else if (const Mul *mul = e.as()) { - if (const Broadcast *b = mul->b.as()) { + if (const Broadcast *b = as_scalar_broadcast(mul->b)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->a); + Interval ia = bounds_of_lanes(mul->a); return {ia.min * b->value, ia.max * b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->a); + Interval ia = bounds_of_lanes(mul->a); return {ia.max * b->value, ia.min * b->value}; } - } else if (const Broadcast *b = mul->a.as()) { + } else if (const Broadcast *b = as_scalar_broadcast(mul->a)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->b); + Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.min, b->value * ia.max}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(mul->b); + Interval ia = bounds_of_lanes(mul->b); return {b->value * ia.max, b->value * ia.min}; } } } else if (const Div *div = e.as
()) { - if (const Broadcast *b = div->b.as()) { + if (const Broadcast *b = as_scalar_broadcast(div->b)) { if (is_positive_const(b->value)) { - Interval ia = bounds_of_nested_lanes(div->a); + Interval ia = bounds_of_lanes(div->a); return {ia.min / b->value, ia.max / b->value}; } else if (is_negative_const(b->value)) { - Interval ia = bounds_of_nested_lanes(div->a); + Interval ia = bounds_of_lanes(div->a); return {ia.max / b->value, ia.min / b->value}; } } } else if (const And *and_ = e.as()) { - if (const Broadcast *b = and_->b.as()) { - Interval ia = bounds_of_nested_lanes(and_->a); + if (const Broadcast *b = as_scalar_broadcast(and_->b)) { + Interval ia = bounds_of_lanes(and_->a); return {ia.min && b->value, ia.max && b->value}; - } else if (const Broadcast *b = and_->a.as()) { - Interval ia = bounds_of_nested_lanes(and_->b); + } else if (const Broadcast *b = as_scalar_broadcast(and_->a)) { + Interval ia = bounds_of_lanes(and_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Or *or_ = e.as()) { - if (const Broadcast *b = or_->b.as()) { - Interval ia = bounds_of_nested_lanes(or_->a); + if (const Broadcast *b = as_scalar_broadcast(or_->b)) { + Interval ia = bounds_of_lanes(or_->a); return {ia.min && b->value, ia.max && b->value}; - } else if (const Broadcast *b = or_->a.as()) { - Interval ia = bounds_of_nested_lanes(or_->b); + } else if (const Broadcast *b = as_scalar_broadcast(or_->a)) { + Interval ia = bounds_of_lanes(or_->b); return {ia.min && b->value, ia.max && b->value}; } } else if (const Min *min = e.as()) { - if (const Broadcast *b = min->b.as()) { - Interval ia = bounds_of_nested_lanes(min->a); + if (const Broadcast *b = as_scalar_broadcast(min->b)) { + Interval ia = bounds_of_lanes(min->a); + // ia and b->value have both had one nesting layer of vectorization + // peeled off, but that doesn't make them the same type. return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; - } else if (const Broadcast *b = min->a.as()) { - Interval ia = bounds_of_nested_lanes(min->b); + } else if (const Broadcast *b = as_scalar_broadcast(min->a)) { + Interval ia = bounds_of_lanes(min->b); return {Min::make(ia.min, b->value), Min::make(ia.max, b->value)}; } } else if (const Max *max = e.as()) { - if (const Broadcast *b = max->b.as()) { - Interval ia = bounds_of_nested_lanes(max->a); + if (const Broadcast *b = as_scalar_broadcast(max->b)) { + Interval ia = bounds_of_lanes(max->a); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; - } else if (const Broadcast *b = max->a.as()) { - Interval ia = bounds_of_nested_lanes(max->b); + } else if (const Broadcast *b = as_scalar_broadcast(max->a)) { + Interval ia = bounds_of_lanes(max->b); return {Max::make(ia.min, b->value), Max::make(ia.max, b->value)}; } } else if (const Not *not_ = e.as()) { - Interval ia = bounds_of_nested_lanes(not_->a); + Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); - if (is_positive_const(r->stride)) { - return {r->base, r->base + last_lane_idx * r->stride}; - } else if (is_negative_const(r->stride)) { - return {r->base + last_lane_idx * r->stride, r->base}; + Interval ib = bounds_of_lanes(r->base); + const Broadcast *b = as_scalar_broadcast(r->stride); + Expr stride = b ? 
b->value : r->stride; + if (stride.type().is_scalar()) { + if (is_positive_const(stride)) { + return {ib.min, ib.max + last_lane_idx * stride}; + } else if (is_negative_const(stride)) { + return {ib.min + last_lane_idx * stride, ib.max}; + } } } else if (const LE *le = e.as()) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. - Interval ia = bounds_of_nested_lanes(le->a); - Interval ib = bounds_of_nested_lanes(le->b); + Interval ia = bounds_of_lanes(le->a); + Interval ib = bounds_of_lanes(le->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max <= ib.min, ia.min <= ib.max}; } @@ -133,17 +158,17 @@ Interval bounds_of_nested_lanes(const Expr &e) { // The least true this can be is if we maximize the LHS and minimize the RHS. // The most true this can be is if we minimize the LHS and maximize the RHS. // This is only exact if one of the two sides is a Broadcast. - Interval ia = bounds_of_nested_lanes(lt->a); - Interval ib = bounds_of_nested_lanes(lt->b); + Interval ia = bounds_of_lanes(lt->a); + Interval ib = bounds_of_lanes(lt->b); if (ia.is_single_point() || ib.is_single_point()) { return {ia.max < ib.min, ia.min < ib.max}; } - } else if (const Broadcast *b = e.as()) { + } else if (const Broadcast *b = as_scalar_broadcast(e)) { return {b->value, b->value}; } else if (const Let *let = e.as()) { - Interval ia = bounds_of_nested_lanes(let->value); - Interval ib = bounds_of_nested_lanes(let->body); + Interval ia = bounds_of_lanes(let->value); + Interval ib = bounds_of_lanes(let->body); if (expr_uses_var(ib.min, let->name)) { ib.min = Let::make(let->name, let->value, ib.min); } @@ -166,19 +191,6 @@ Interval bounds_of_nested_lanes(const Expr &e) { } }; -/** Similar to bounds_of_nested_lanes, but it recursively reduces - * the bounds of nested vectors to scalars. */ -Interval bounds_of_lanes(const Expr &e) { - Interval bounds = bounds_of_nested_lanes(e); - if (!bounds.min.type().is_scalar()) { - bounds.min = bounds_of_lanes(bounds.min).min; - } - if (!bounds.max.type().is_scalar()) { - bounds.max = bounds_of_lanes(bounds.max).max; - } - return bounds; -} - // A ramp with the lanes repeated inner_repetitions times, and then // the whole vector repeated outer_repetitions times. // E.g: <0 0 2 2 4 4 6 6 0 0 2 2 4 4 6 6>. 
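As an illustrative aside, here is how the reworked bounds_of_lanes above behaves on an expression shaped like the one in the commit message, using a hypothetical constant stride so the result is easy to state (a is assumed to be a scalar Int(32) variable; the expression is chosen only to exercise the nested-broadcast paths):

    e = min(ramp(x8(a), x8(2), 5), x40(27))
    bounds_of_lanes(x8(a))                  = {a, a}                      // as_scalar_broadcast peels the broadcast
    bounds_of_lanes(ramp(x8(a), x8(2), 5))  = {a, a + 4*2} = {a, a + 8}   // base bounds plus (lanes - 1) * scalar stride
    bounds_of_lanes(e)                      = {min(a, 27), min(a + 8, 27)}

Both bounds come out as scalars, so no min over mismatched vector types is ever constructed, which is the failure mode the commit message describes for the old bounds_of_nested_lanes.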
diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index 07f940ed82e3..9f0f86e3854b 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -183,6 +183,25 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/8038 + { + Func input("input"); + Func local_sum("local_sum"); + Func blurry("blurry"); + Var x("x"), y("y"), yi("yi"), yo("yo"), xi("xi"), xo("xo"), yofxi("yofxi"), yofxio("yofxio"), yofxii("yofxii"), yofxiifyi("yofxiifyi"), yofxioo("yofxioo"), yofxioi("yofxioi"); + input(x, y) = 2 * x + 5 * y; + RDom r(-2, 5, -2, 5, "rdom_r"); + local_sum(x, y) = 0; + local_sum(x, y) += input(x + r.x, y + r.y); + blurry(x, y) = cast(local_sum(x, y) / 25); + local_sum.split(y, yi, yo, 2, TailStrategy::GuardWithIf).split(x, xi, xo, 5, TailStrategy::Predicate).fuse(yo, xi, yofxi).split(yofxi, yofxio, yofxii, 8, TailStrategy::ShiftInwards).fuse(yofxii, yi, yofxiifyi).split(yofxio, yofxioo, yofxioi, 5, TailStrategy::ShiftInwards).vectorize(yofxiifyi).vectorize(yofxioi); + local_sum.update(0).unscheduled(); + blurry.split(x, xo, xi, 5, TailStrategy::Auto); + Pipeline p({blurry}); + auto buf = p.realize({32, 32}); + check_blur_output(buf, correct); + } + printf("Success!\n"); return 0; } From 45d78509df9c69ebb3d805d547cf6e54859379c4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sat, 27 Jan 2024 07:01:41 +1100 Subject: [PATCH 041/186] Track whether or not let expressions failed to solve in solver (#7982) * Track whether or not let expressions failed to solve in solver After mutating an expression, the solver needs to know two things: 1) Did the expression contain the variable we're solving for 2) Was the expression successfully "solved" for the variable. I.e. the variable only appears once in the leftmost position. We need to know this to know property 1 of any subexpressions (i.e. does the right child of the expression contain the variable). This drives what transformations we do in ways that are guaranteed to terminate and not take exponential time. We were tracking property 1 through lets but not property 2, and this meant we were doing unhelpful transformations in some cases. I found a case in the wild where this made a pipeline take > 1 hour to compile (I killed it after an hour). It may have been in an infinite transformation loop, or it might have just been exponential. Not sure. * Remove surplus comma * Fix use of uninitialized value that could cause bad transformation --- src/ModulusRemainder.h | 6 ++++-- src/Solve.cpp | 35 ++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/src/ModulusRemainder.h b/src/ModulusRemainder.h index c0341b75abf6..cbcdce10b98c 100644 --- a/src/ModulusRemainder.h +++ b/src/ModulusRemainder.h @@ -7,6 +7,8 @@ #include +#include "Util.h" + namespace Halide { struct Expr; @@ -83,8 +85,8 @@ ModulusRemainder modulus_remainder(const Expr &e, const Scope /** Reduce an expression modulo some integer. Returns true and assigns * to remainder if an answer could be found. 
*/ ///@{ -bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder); -bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope &scope); +HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder); +HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope &scope); ///@} void modulus_remainder_test(); diff --git a/src/Solve.cpp b/src/Solve.cpp index a08eedadbd27..22bd14e44412 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -44,18 +44,22 @@ class SolveExpression : public IRMutator { map::iterator iter = cache.find(e); if (iter == cache.end()) { // Not in the cache, call the base class version. - debug(4) << "Mutating " << e << " (" << uses_var << ")\n"; + debug(4) << "Mutating " << e << " (" << uses_var << ", " << failed << ")\n"; bool old_uses_var = uses_var; uses_var = false; + bool old_failed = failed; + failed = false; Expr new_e = IRMutator::mutate(e); - CacheEntry entry = {new_e, uses_var}; + CacheEntry entry = {new_e, uses_var, failed}; uses_var = old_uses_var || uses_var; + failed = old_failed || failed; cache[e] = entry; - debug(4) << "(Miss) Rewrote " << e << " -> " << new_e << " (" << uses_var << ")\n"; + debug(4) << "(Miss) Rewrote " << e << " -> " << new_e << " (" << uses_var << ", " << failed << ")\n"; return new_e; } else { // Cache hit. uses_var = uses_var || iter->second.uses_var; + failed = failed || iter->second.failed; debug(4) << "(Hit) Rewrote " << e << " -> " << iter->second.expr << " (" << uses_var << ")\n"; return iter->second.expr; } @@ -75,7 +79,7 @@ class SolveExpression : public IRMutator { // stateless, so we can cache everything. struct CacheEntry { Expr expr; - bool uses_var; + bool uses_var, failed; }; map cache; @@ -388,16 +392,25 @@ class SolveExpression : public IRMutator { const Mul *mul_a = a.as(); Expr expr; if (a_uses_var && !b_uses_var) { + const int64_t *ib = as_const_int(b); + auto is_multiple_of_b = [&](const Expr &e) { + if (ib) { + int64_t r = 0; + return reduce_expr_modulo(e, *ib, &r) && r == 0; + } else { + return can_prove(e / b * b == e); + } + }; if (add_a && !a_failed && - can_prove(add_a->a / b * b == add_a->a)) { + is_multiple_of_b(add_a->a)) { // (f(x) + a) / b -> f(x) / b + a / b expr = mutate(simplify(add_a->a / b) + add_a->b / b); } else if (sub_a && !a_failed && - can_prove(sub_a->a / b * b == sub_a->a)) { + is_multiple_of_b(sub_a->a)) { // (f(x) - a) / b -> f(x) / b - a / b expr = mutate(simplify(sub_a->a / b) - sub_a->b / b); } else if (mul_a && !a_failed && no_overflow_int(op->type) && - can_prove(mul_a->b / b * b == mul_a->b)) { + is_multiple_of_b(mul_a->b)) { // (f(x) * a) / b -> f(x) * (a / b) expr = mutate(mul_a->a * (mul_a->b / b)); } @@ -776,6 +789,7 @@ class SolveExpression : public IRMutator { } else if (scope.contains(op->name)) { CacheEntry e = scope.get(op->name); uses_var = uses_var || e.uses_var; + failed = failed || e.failed; return e.expr; } else if (external_scope.contains(op->name)) { Expr e = external_scope.get(op->name); @@ -790,11 +804,14 @@ class SolveExpression : public IRMutator { Expr visit(const Let *op) override { bool old_uses_var = uses_var; + bool old_failed = failed; uses_var = false; + failed = false; Expr value = mutate(op->value); - CacheEntry e = {value, uses_var}; - + CacheEntry e = {value, uses_var, failed}; uses_var = old_uses_var; + failed = old_failed; + ScopedBinding bind(scope, op->name, e); return mutate(op->body); } From 
4b2d21154c5eda4e3ece657e4886d45fa78069f1 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sat, 27 Jan 2024 00:33:24 +0000 Subject: [PATCH 042/186] Upgrade clang-format and clang-tidy to use LLVM 17 (#8042) * Upgrade clang-format and clang-tidy to use LLVM 17 * trigger buildbots * trigger buildbots * trigger buildbots * trigger buildbots --- .clang-tidy | 24 ++++++++++++++-- .github/workflows/presubmit.yml | 14 +++++----- apps/hannk/interpreter/allocation_planner.cpp | 2 +- apps/hannk/interpreter/interpreter.cpp | 2 +- apps/hannk/util/error_util.cpp | 6 ++-- apps/hannk/util/model_runner.cpp | 4 +-- run-clang-format.sh | 14 +++++----- run-clang-tidy.sh | 14 +++++----- src/Associativity.cpp | 2 +- src/AutoScheduleUtils.cpp | 2 +- src/Bounds.cpp | 2 +- src/Buffer.h | 28 +++++++++---------- src/Deinterleave.cpp | 2 +- src/Function.cpp | 11 ++++---- src/Generator.cpp | 2 +- src/Generator.h | 10 +++---- src/IRMatch.cpp | 2 +- src/LLVM_Output.cpp | 6 ++-- src/Monotonic.cpp | 2 +- src/Reduction.cpp | 2 +- src/Scope.h | 2 +- src/SpirvIR.cpp | 2 +- src/StmtToHTML.cpp | 2 +- src/Target.cpp | 2 +- src/UniquifyVariableNames.cpp | 2 +- src/autoschedulers/adams2019/AutoSchedule.cpp | 2 +- .../anderson2021/AutoSchedule.cpp | 20 ++++++------- .../anderson2021/SearchSpace.cpp | 2 +- src/autoschedulers/anderson2021/State.h | 2 +- src/autoschedulers/common/cmdline.h | 14 +++++----- .../li2018/GradientAutoscheduler.cpp | 2 +- src/runtime/cuda.cpp | 2 +- src/runtime/mini_d3d12.h | 22 +++++++++------ src/runtime/mini_vulkan.h | 2 +- src/runtime/opencl.cpp | 2 +- src/runtime/runtime_internal.h | 2 +- test/correctness/unroll_dynamic_loop.cpp | 2 +- tools/regexp_replace.cpp | 2 +- 38 files changed, 132 insertions(+), 105 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 04cf50c915ec..815ccd3339a2 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -19,6 +19,7 @@ Checks: > bugprone-dangling-handle, bugprone-dynamic-static-initializers, -bugprone-easily-swappable-parameters, + -bugprone-empty-catch, # TODO: consider enabling -bugprone-exception-escape, bugprone-fold-init-type, bugprone-forward-declaration-namespace, @@ -35,8 +36,10 @@ Checks: > bugprone-misplaced-pointer-arithmetic-in-alloc, bugprone-misplaced-widening-cast, bugprone-move-forwarding-reference, + bugprone-multiple-new-in-one-expression, bugprone-multiple-statement-macro, - -bugprone-narrowing-conversions,, + -bugprone-narrowing-conversions, + bugprone-non-zero-enum-to-bool-conversion, bugprone-no-escape, bugprone-not-null-terminated-result, bugprone-parent-virtual-call, @@ -63,6 +66,7 @@ Checks: > bugprone-suspicious-semicolon, bugprone-suspicious-string-compare, bugprone-swapped-arguments, + -bugprone-switch-missing-default-case, # TODO: consider enabling bugprone-terminating-continue, bugprone-throw-keyword-missing, bugprone-too-small-loop-variable, @@ -71,6 +75,8 @@ Checks: > bugprone-undelegated-constructor, bugprone-unhandled-exception-at-new, bugprone-unhandled-self-assignment, + bugprone-unique-ptr-array-mismatch, + bugprone-unsafe-functions, bugprone-unused-raii, bugprone-unused-return-value, bugprone-use-after-move, @@ -78,9 +84,16 @@ Checks: > clang-diagnostic-shadow-field, + cppcoreguidelines-avoid-capturing-lambda-coroutines, + cppcoreguidelines-misleading-capture-default-by-value, + -cppcoreguidelines-missing-std-forward, # TODO: consider enabling + cppcoreguidelines-rvalue-reference-param-not-moved, + misc-confusable-identifiers, -misc-const-correctness, misc-definitions-in-headers, + misc-header-include-cycle, + 
-misc-include-cleaner, # TODO: consider enabling misc-misleading-bidirectional, misc-misleading-identifier, misc-misplaced-const, @@ -115,6 +128,7 @@ Checks: > -modernize-replace-random-shuffle, -modernize-return-braced-init-list, -modernize-shrink-to-fit, + -modernize-type-traits, # TODO: consider enabling -modernize-unary-static-assert, -modernize-use-auto, modernize-use-bool-literals, @@ -126,11 +140,13 @@ Checks: > -modernize-use-noexcept, modernize-use-nullptr, modernize-use-override, + -modernize-use-std-print, -modernize-use-trailing-return-type, -modernize-use-transparent-functors, -modernize-use-uncaught-exceptions, - -modernize-use-using + -modernize-use-using, + performance-avoid-endl, performance-faster-string-find, performance-for-range-copy, performance-implicit-conversion-in-loop, @@ -141,13 +157,16 @@ Checks: > performance-move-constructor-init, performance-no-automatic-move, -performance-no-int-to-ptr, + performance-noexcept-destructor, performance-noexcept-move-constructor, + performance-noexcept-swap, performance-trivially-destructible, performance-type-promotion-in-math-fn, performance-unnecessary-copy-initialization, performance-unnecessary-value-param, readability-avoid-const-params-in-decls, + -readability-avoid-unconditional-preprocessor-if, readability-braces-around-statements, readability-const-return-type, -readability-container-contains, @@ -170,6 +189,7 @@ Checks: > readability-misplaced-array-index, -readability-named-parameter, -readability-non-const-parameter, + -readability-operators-representation, readability-qualified-auto, readability-redundant-access-specifiers, readability-redundant-control-flow, diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 045a313cb23c..e30a606bd8d0 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -18,11 +18,11 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - - uses: DoozyX/clang-format-lint-action@v0.16.2 + - uses: DoozyX/clang-format-lint-action@v0.17 with: source: '.' 
extensions: 'h,c,cpp' - clangFormatVersion: 16 + clangFormatVersion: 17 # As of Aug 2023, the macOS runners have more RAM (14GB vs 7GB) and CPU (3 cores vs 2) # than the Linux and Windows runners, so let's use those instead, since clang-tidy is # a bit of a sluggard @@ -36,14 +36,14 @@ jobs: # from apt.llvm.org # wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 15CF4D18AF4F7421 - sudo apt-add-repository "deb https://apt.llvm.org/$(lsb_release -sc)/ llvm-toolchain-$(lsb_release -sc)-16 main" + sudo apt-add-repository "deb https://apt.llvm.org/$(lsb_release -sc)/ llvm-toolchain-$(lsb_release -sc)-17 main" sudo apt-get update - sudo apt-get install llvm-16 clang-16 liblld-16-dev libclang-16-dev clang-tidy-16 ninja-build + sudo apt-get install llvm-17 clang-17 liblld-17-dev libclang-17-dev clang-tidy-17 ninja-build - name: Run clang-tidy run: | - export CC=clang-16 - export CXX=clang++-16 - export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-16 + export CC=clang-17 + export CXX=clang++-17 + export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-17 export CMAKE_GENERATOR=Ninja ./run-clang-tidy.sh check_cmake_file_lists: diff --git a/apps/hannk/interpreter/allocation_planner.cpp b/apps/hannk/interpreter/allocation_planner.cpp index caa9bd4b2664..a037846bcb2b 100644 --- a/apps/hannk/interpreter/allocation_planner.cpp +++ b/apps/hannk/interpreter/allocation_planner.cpp @@ -250,7 +250,7 @@ void AllocationPlanner::dump(std::ostream &o) { } } line[kLineWidth] = 0; - o << "t=" << std::setfill('0') << std::setw(3) << t << ": " << line << '\n'; + o << "t=" << std::setfill('0') << std::setw(3) << t << ": " << line << "\n"; } } diff --git a/apps/hannk/interpreter/interpreter.cpp b/apps/hannk/interpreter/interpreter.cpp index a6ee64514efa..902a4a0db807 100644 --- a/apps/hannk/interpreter/interpreter.cpp +++ b/apps/hannk/interpreter/interpreter.cpp @@ -120,7 +120,7 @@ std::unique_ptr allocate_tensors(const Op *root, const InterpreterOption if (options.verbosity >= 1) { std::ostringstream oss; - oss << "Arena memory needed: " << planner.memory_needed() << '\n'; + oss << "Arena memory needed: " << planner.memory_needed() << "\n"; oss << " Offsets:"; for (int i = 0; i < planner.block_count(); i++) { oss << ' ' << planner.get_block_offset(i); diff --git a/apps/hannk/util/error_util.cpp b/apps/hannk/util/error_util.cpp index 3348faeafc0a..e9e21186b18c 100644 --- a/apps/hannk/util/error_util.cpp +++ b/apps/hannk/util/error_util.cpp @@ -45,7 +45,7 @@ Logger::Logger(LogSeverity severity) void Logger::finish() noexcept(false) { if (!msg.str().empty() && msg.str().back() != '\n') { - msg << '\n'; + msg << "\n"; } hannk_log(severity, msg.str().c_str()); @@ -62,12 +62,12 @@ Logger::~Logger() noexcept(false) { Checker::Checker(const char *condition_string) : logger(FATAL) { - logger.msg << " Condition Failed: " << condition_string << '\n'; + logger.msg << " Condition Failed: " << condition_string << "\n"; } Checker::Checker(const char *file, int line, const char *condition_string) : logger(FATAL, file, line) { - logger.msg << " Condition Failed: " << condition_string << '\n'; + logger.msg << " Condition Failed: " << condition_string << "\n"; } Checker::~Checker() noexcept(false) { diff --git a/apps/hannk/util/model_runner.cpp b/apps/hannk/util/model_runner.cpp index 0e0bf2e4e72b..76bbbeeaa37e 100644 --- a/apps/hannk/util/model_runner.cpp +++ b/apps/hannk/util/model_runner.cpp @@ -636,7 +636,7 @@ void ModelRunner::run(const std::string &filename) { 
std::cout << ',' << RunNames[i] << "_matches_tflite"; } } - std::cout << '\n'; + std::cout << "\n"; } } @@ -724,7 +724,7 @@ void ModelRunner::run(const std::string &filename) { } if (csv_output) { - std::cout << '\n'; + std::cout << "\n"; } } diff --git a/run-clang-format.sh b/run-clang-format.sh index 7f852b5c419d..9b5712c5e56a 100755 --- a/run-clang-format.sh +++ b/run-clang-format.sh @@ -4,23 +4,23 @@ set -e ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# We are currently standardized on using LLVM/Clang16 for this script. +# We are currently standardized on using LLVM/Clang17 for this script. # Note that this is totally independent of the version of LLVM that you -# are using to build Halide itself. If you don't have LLVM16 installed, +# are using to build Halide itself. If you don't have LLVM17 installed, # you can usually install what you need easily via: # -# sudo apt-get install llvm-16 clang-16 libclang-16-dev clang-tidy-16 -# export CLANG_FORMAT_LLVM_INSTALL_DIR=/usr/lib/llvm-16 +# sudo apt-get install llvm-17 clang-17 libclang-17-dev clang-tidy-17 +# export CLANG_FORMAT_LLVM_INSTALL_DIR=/usr/lib/llvm-17 [ -z "$CLANG_FORMAT_LLVM_INSTALL_DIR" ] && echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM installation dir for this script." && exit echo CLANG_FORMAT_LLVM_INSTALL_DIR = ${CLANG_FORMAT_LLVM_INSTALL_DIR} VERSION=$(${CLANG_FORMAT_LLVM_INSTALL_DIR}/bin/clang-format --version) -if [[ ${VERSION} =~ .*version\ 16.* ]] +if [[ ${VERSION} =~ .*version\ 17.* ]] then - echo "clang-format version 16 found." + echo "clang-format version 17 found." else - echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM 16 install!" + echo "CLANG_FORMAT_LLVM_INSTALL_DIR must point to an LLVM 17 install!" exit 1 fi diff --git a/run-clang-tidy.sh b/run-clang-tidy.sh index d876c2da7292..1b4fc808a0a9 100755 --- a/run-clang-tidy.sh +++ b/run-clang-tidy.sh @@ -30,23 +30,23 @@ if [ -n "${FIX}" ]; then echo "Operating in -fix mode!" fi -# We are currently standardized on using LLVM/Clang16 for this script. +# We are currently standardized on using LLVM/Clang17 for this script. # Note that this is totally independent of the version of LLVM that you -# are using to build Halide itself. If you don't have LLVM16 installed, +# are using to build Halide itself. If you don't have LLVM17 installed, # you can usually install what you need easily via: # -# sudo apt-get install llvm-16 clang-16 libclang-16-dev clang-tidy-16 -# export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-16 +# sudo apt-get install llvm-17 clang-17 libclang-17-dev clang-tidy-17 +# export CLANG_TIDY_LLVM_INSTALL_DIR=/usr/lib/llvm-17 [ -z "$CLANG_TIDY_LLVM_INSTALL_DIR" ] && echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM installation dir for this script." && exit echo CLANG_TIDY_LLVM_INSTALL_DIR = ${CLANG_TIDY_LLVM_INSTALL_DIR} VERSION=$(${CLANG_TIDY_LLVM_INSTALL_DIR}/bin/clang-tidy --version) -if [[ ${VERSION} =~ .*version\ 16.* ]] +if [[ ${VERSION} =~ .*version\ 17.* ]] then - echo "clang-tidy version 16 found." + echo "clang-tidy version 17 found." else - echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM 16 install!" + echo "CLANG_TIDY_LLVM_INSTALL_DIR must point to an LLVM 17 install!" 
exit 1 fi diff --git a/src/Associativity.cpp b/src/Associativity.cpp index 794113413451..39a0011391a6 100644 --- a/src/Associativity.cpp +++ b/src/Associativity.cpp @@ -781,7 +781,7 @@ void associativity_test() { true)); } - std::cout << "Associativity test passed" << std::endl; + std::cout << "Associativity test passed\n"; } } // namespace Internal diff --git a/src/AutoScheduleUtils.cpp b/src/AutoScheduleUtils.cpp index 85a0b7e17979..5dcd9183db57 100644 --- a/src/AutoScheduleUtils.cpp +++ b/src/AutoScheduleUtils.cpp @@ -405,7 +405,7 @@ void propagate_estimate_test() { check(img.dim(0).min() + img.dim(1).min() + x, x + 2); check(img.dim(0).extent() + img.dim(1).min() + img.dim(1).extent() * x, 55 * x + 38); - std::cout << "Propagate estimate test passed" << std::endl; + std::cout << "Propagate estimate test passed\n"; } } // namespace Internal diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 0ba1f5440056..a08bb0b9ad61 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -3919,7 +3919,7 @@ void bounds_test() { internal_assert(in.is_single_point()); } - std::cout << "Bounds test passed" << std::endl; + std::cout << "Bounds test passed\n"; } } // namespace Internal diff --git a/src/Buffer.h b/src/Buffer.h index 637ca2900f65..304a1bd197ab 100644 --- a/src/Buffer.h +++ b/src/Buffer.h @@ -394,18 +394,18 @@ class Buffer { // @} // We forward numerous methods from the underlying Buffer -#define HALIDE_BUFFER_FORWARD_CONST(method) \ - template \ - auto method(Args &&...args) const->decltype(std::declval>().method(std::forward(args)...)) { \ - user_assert(defined()) << "Undefined buffer calling const method " #method "\n"; \ - return get()->method(std::forward(args)...); \ +#define HALIDE_BUFFER_FORWARD_CONST(method) \ + template \ + auto method(Args &&...args) const -> decltype(std::declval>().method(std::forward(args)...)) { \ + user_assert(defined()) << "Undefined buffer calling const method " #method "\n"; \ + return get()->method(std::forward(args)...); \ } -#define HALIDE_BUFFER_FORWARD(method) \ - template \ - auto method(Args &&...args)->decltype(std::declval>().method(std::forward(args)...)) { \ - user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ - return get()->method(std::forward(args)...); \ +#define HALIDE_BUFFER_FORWARD(method) \ + template \ + auto method(Args &&...args) -> decltype(std::declval>().method(std::forward(args)...)) { \ + user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ + return get()->method(std::forward(args)...); \ } // This is a weird-looking but effective workaround for a deficiency in "perfect forwarding": @@ -418,10 +418,10 @@ class Buffer { // and forward it as is, we can just use ... to allow an arbitrary number of commas, // then use __VA_ARGS__ to forward the mess as-is, and while it looks horrible, it // works. -#define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...) \ - inline auto method(const __VA_ARGS__ &a)->decltype(std::declval>().method(a)) { \ - user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ - return get()->method(a); \ +#define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...) 
\ + inline auto method(const __VA_ARGS__ &a) -> decltype(std::declval>().method(a)) { \ + user_assert(defined()) << "Undefined buffer calling method " #method "\n"; \ + return get()->method(a); \ } /** Does the same thing as the equivalent Halide::Runtime::Buffer method */ diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index 0b30cefaa292..c43159893838 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -836,7 +836,7 @@ void deinterleave_vector_test() { Shuffle::make({vec_x, vec_y}, {0, 2, 4, 3, 1, 3}), Shuffle::make({vec_x, vec_y}, {4, 6, 2, 7, 2, 4})); - std::cout << "deinterleave_vector test passed" << std::endl; + std::cout << "deinterleave_vector test passed\n"; } } // namespace Internal diff --git a/src/Function.cpp b/src/Function.cpp index 3000817ecb2c..795d18136843 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -922,13 +922,14 @@ void Function::define_extern(const std::string &function_name, contents->func_schedule.storage_dims().clear(); contents->init_def.schedule().dims().clear(); for (size_t i = 0; i < args.size(); i++) { - contents->func_schedule.storage_dims().push_back(StorageDim{arg_names[i]}); - contents->init_def.schedule().dims().push_back( - Dim{arg_names[i], ForType::Extern, DeviceAPI::None, DimType::PureVar}); + StorageDim sd = {arg_names[i]}; + contents->func_schedule.storage_dims().push_back(sd); + Dim d = {arg_names[i], ForType::Extern, DeviceAPI::None, DimType::PureVar}; + contents->init_def.schedule().dims().push_back(d); } // Add the dummy outermost dim - contents->init_def.schedule().dims().push_back( - Dim{Var::outermost().name(), ForType::Serial, DeviceAPI::None, DimType::PureVar}); + Dim d = {Var::outermost().name(), ForType::Serial, DeviceAPI::None, DimType::PureVar}; + contents->init_def.schedule().dims().push_back(d); } void Function::accept(IRVisitor *visitor) const { diff --git a/src/Generator.cpp b/src/Generator.cpp index 8b633b777dd0..8719b2f2adae 100644 --- a/src/Generator.cpp +++ b/src/Generator.cpp @@ -2247,7 +2247,7 @@ void generator_test() { // Verify that Tuple parameter-pack variants can convert GeneratorParam to Expr Tuple t(gp, gp, gp); - std::cout << "Generator test passed" << std::endl; + std::cout << "Generator test passed\n"; } } // namespace Internal diff --git a/src/Generator.h b/src/Generator.h index 4d00a0fec574..99d106056842 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -1648,15 +1648,15 @@ class GeneratorInputImpl : public GeneratorInputBase { // types in question satisfy the property of copies referring to the same underlying // structure (returning references is just an optimization). 
Since this is verbose // and used in several places, we'll use a helper macro: -#define HALIDE_FORWARD_METHOD(Class, Method) \ - template \ - inline auto Method(Args &&...args)->typename std::remove_reference().Method(std::forward(args)...))>::type { \ - return this->template as().Method(std::forward(args)...); \ +#define HALIDE_FORWARD_METHOD(Class, Method) \ + template \ + inline auto Method(Args &&...args) -> typename std::remove_reference().Method(std::forward(args)...))>::type { \ + return this->template as().Method(std::forward(args)...); \ } #define HALIDE_FORWARD_METHOD_CONST(Class, Method) \ template \ - inline auto Method(Args &&...args) const-> \ + inline auto Method(Args &&...args) const -> \ typename std::remove_reference().Method(std::forward(args)...))>::type { \ this->check_gio_access(); \ return this->template as().Method(std::forward(args)...); \ diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 55dc02dcd553..3e5d95d787e6 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -48,7 +48,7 @@ void expr_match_test() { internal_assert(expr_match(vec_wild * 3, Ramp::make(x, y, 4) * 3, matches)); - std::cout << "expr_match test passed" << std::endl; + std::cout << "expr_match test passed\n"; } namespace { diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp index 4952fc981877..6b54aeef0e97 100644 --- a/src/LLVM_Output.cpp +++ b/src/LLVM_Output.cpp @@ -60,18 +60,18 @@ void emit_big_endian_u32(std::ostream &out, uint32_t value) { out << static_cast((value >> 24) & 0xff) << static_cast((value >> 16) & 0xff) << static_cast((value >> 8) & 0xff) - << static_cast((value)&0xff); + << static_cast((value) & 0xff); } void emit_little_endian_u32(std::ostream &out, uint32_t value) { - out << static_cast((value)&0xff) + out << static_cast((value) & 0xff) << static_cast((value >> 8) & 0xff) << static_cast((value >> 16) & 0xff) << static_cast((value >> 24) & 0xff); } void emit_little_endian_u16(std::ostream &out, uint16_t value) { - out << static_cast((value)&0xff) + out << static_cast((value) & 0xff) << static_cast((value >> 8) & 0xff); } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index 2e2aa554e31f..dd8e17d5b177 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -768,7 +768,7 @@ void is_monotonic_test() { check_unknown(select(0 < x, max(min(x, 4), 3), 4)); - std::cout << "is_monotonic test passed" << std::endl; + std::cout << "is_monotonic test passed\n"; } } // namespace Internal diff --git a/src/Reduction.cpp b/src/Reduction.cpp index c04d11dfed7b..bacd79ac4869 100644 --- a/src/Reduction.cpp +++ b/src/Reduction.cpp @@ -88,7 +88,7 @@ void split_predicate_test() { check((x < y) && ((w == 1) || ((x == 10) && (y == z))), expected); } - std::cout << "Split predicate test passed" << std::endl; + std::cout << "Split predicate test passed\n"; } struct ReductionDomainContents { diff --git a/src/Scope.h b/src/Scope.h index 1838d14c7799..9d1cc43e1164 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -243,7 +243,7 @@ class Scope { return const_iterator(table.end()); } - void swap(Scope &other) { + void swap(Scope &other) noexcept { table.swap(other.table); std::swap(containing_scope, other.containing_scope); } diff --git a/src/SpirvIR.cpp b/src/SpirvIR.cpp index 21bf5a1e696f..761865d76b14 100644 --- a/src/SpirvIR.cpp +++ b/src/SpirvIR.cpp @@ -3951,7 +3951,7 @@ void spirv_ir_test() { binary.clear(); builder.encode(binary); - std::cout << "SpirV IR test passed" << std::endl; + std::cout << "SpirV IR test passed\n"; #else std::cout << "SpirV IR test *disabled*" << std::endl; #endif diff 
--git a/src/StmtToHTML.cpp b/src/StmtToHTML.cpp index 7c8c9f9c03c7..9c317ba35525 100644 --- a/src/StmtToHTML.cpp +++ b/src/StmtToHTML.cpp @@ -1124,7 +1124,7 @@ class HTMLCodePrinter : public IRVisitor { // Prints newline to stream void print_ln() { - stream << '\n'; + stream << "\n"; } // Prints a variable to stream diff --git a/src/Target.cpp b/src/Target.cpp index 49011348544f..c824fea1c928 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -1550,7 +1550,7 @@ void target_test() { internal_assert(with_vector_bits.vector_bits == 512) << "Vector bits not populated in constructor.\n"; internal_assert(Target(with_vector_bits.to_string()).vector_bits == 512) << "Vector bits not round tripped properly.\n"; - std::cout << "Target test passed" << std::endl; + std::cout << "Target test passed\n"; } } // namespace Internal diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 781ba9256257..26689ec34633 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -248,7 +248,7 @@ void uniquify_variable_names_test() { {{x, Let::make(y.name(), 3, y)}, {x_1, Let::make(y.name(), 4, y)}}); - std::cout << "uniquify_variable_names test passed" << std::endl; + std::cout << "uniquify_variable_names test passed\n"; } } // namespace Internal diff --git a/src/autoschedulers/adams2019/AutoSchedule.cpp b/src/autoschedulers/adams2019/AutoSchedule.cpp index dd147465d791..083626a82423 100644 --- a/src/autoschedulers/adams2019/AutoSchedule.cpp +++ b/src/autoschedulers/adams2019/AutoSchedule.cpp @@ -193,7 +193,7 @@ class StateQueue { return sz; } - void swap(StateQueue &other) { + void swap(StateQueue &other) noexcept { storage.swap(other.storage); std::swap(sz, other.sz); } diff --git a/src/autoschedulers/anderson2021/AutoSchedule.cpp b/src/autoschedulers/anderson2021/AutoSchedule.cpp index 8165979f90fb..e670fe7d8734 100644 --- a/src/autoschedulers/anderson2021/AutoSchedule.cpp +++ b/src/autoschedulers/anderson2021/AutoSchedule.cpp @@ -651,12 +651,12 @@ void generate_schedule(const std::vector &outputs, } } - aslog(1) << "Number of states added: " << stats.num_states_added << '\n'; - aslog(1) << "Number of featurizations computed: " << stats.num_featurizations << '\n'; - aslog(1) << "Number of memoization hits: " << stats.num_memoization_hits << '\n'; - aslog(1) << "Number of memoization misses: " << stats.num_memoization_misses << '\n'; - aslog(1) << "Number of block memoization hits: " << stats.num_block_memoization_hits << '\n'; - aslog(1) << "Number of block memoization misses: " << stats.num_block_memoization_misses << '\n'; + aslog(1) << "Number of states added: " << stats.num_states_added << "\n"; + aslog(1) << "Number of featurizations computed: " << stats.num_featurizations << "\n"; + aslog(1) << "Number of memoization hits: " << stats.num_memoization_hits << "\n"; + aslog(1) << "Number of memoization misses: " << stats.num_memoization_misses << "\n"; + aslog(1) << "Number of block memoization hits: " << stats.num_block_memoization_hits << "\n"; + aslog(1) << "Number of block memoization misses: " << stats.num_block_memoization_misses << "\n"; aslog(1) << "Total featurization time (ms): " << stats.total_featurization_time() << "\n"; aslog(1) << "Average featurization time (ms): " << stats.average_featurization_time() << "\n"; aslog(1) << "Total enqueue time (ms): " << stats.total_enqueue_time() << "\n"; @@ -667,14 +667,14 @@ void generate_schedule(const std::vector &outputs, aslog(1) << "Total filter thread tiles time (ms): " << 
stats.total_filter_thread_tiles_time() << "\n"; aslog(1) << "Total filter parallel tiles time (ms): " << stats.total_filter_parallel_tiles_time() << "\n"; - aslog(1) << "Number of schedules evaluated by cost model: " << stats.num_schedules_enqueued << '\n'; - aslog(1) << "Number of tilings generated: " << stats.num_tilings_generated << '\n'; - aslog(1) << "Number of tilings accepted: " << stats.num_tilings_accepted << '\n'; + aslog(1) << "Number of schedules evaluated by cost model: " << stats.num_schedules_enqueued << "\n"; + aslog(1) << "Number of tilings generated: " << stats.num_tilings_generated << "\n"; + aslog(1) << "Number of tilings accepted: " << stats.num_tilings_accepted << "\n"; aslog(1) << "Total cost model evaluation time (ms): " << stats.total_cost_model_evaluation_time() << "\n"; aslog(1) << "Average cost model evaluation time (ms): " << stats.average_cost_model_evaluation_time() << "\n"; std::chrono::duration total_time = timer.elapsed(); aslog(1) << "Time taken for autoscheduler (s): " - << std::chrono::duration_cast(total_time).count() / 1000.0 << '\n'; + << std::chrono::duration_cast(total_time).count() / 1000.0 << "\n"; } struct Anderson2021 { diff --git a/src/autoschedulers/anderson2021/SearchSpace.cpp b/src/autoschedulers/anderson2021/SearchSpace.cpp index bad8972435ce..938a039a29ec 100644 --- a/src/autoschedulers/anderson2021/SearchSpace.cpp +++ b/src/autoschedulers/anderson2021/SearchSpace.cpp @@ -303,7 +303,7 @@ void SearchSpace::generate_children(const IntrusivePtr &state, aslog(1) << " " << e2->producer->func.name() << "\n"; } } - internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << '\n'; + internal_error << "Pipeline so far doesn't use next Func: " << node->func.name() << "\n"; } int num_children = 0; diff --git a/src/autoschedulers/anderson2021/State.h b/src/autoschedulers/anderson2021/State.h index c2b0371dce3f..53ef12a33eb4 100644 --- a/src/autoschedulers/anderson2021/State.h +++ b/src/autoschedulers/anderson2021/State.h @@ -270,7 +270,7 @@ class StateQueue { return sz; } - void swap(StateQueue &other) { + void swap(StateQueue &other) noexcept { storage.swap(other.storage); std::swap(sz, other.sz); } diff --git a/src/autoschedulers/common/cmdline.h b/src/autoschedulers/common/cmdline.h index 29783dbbd2cb..1158eb151c01 100644 --- a/src/autoschedulers/common/cmdline.h +++ b/src/autoschedulers/common/cmdline.h @@ -489,7 +489,7 @@ class parser { } for (auto &arg : args) { - std::cout << "\"" << arg << "\"" << std::endl; + std::cout << "\"" << arg << "\"\n"; } return parse(args); @@ -635,7 +635,7 @@ class parser { std::string error_full() const { std::ostringstream oss; for (const auto &error : errors) { - oss << error << std::endl; + oss << error << "\n"; } return oss.str(); } @@ -649,8 +649,8 @@ class parser { } } - oss << "[options] ... " << ftr << std::endl; - oss << "options:" << std::endl; + oss << "[options] ... 
" << ftr << "\n"; + oss << "options:\n"; size_t max_width = 0; for (const auto &o : ordered) { @@ -667,7 +667,7 @@ class parser { for (size_t j = o->name().length(); j < max_width + 4; j++) { oss << ' '; } - oss << o->description() << std::endl; + oss << o->description() << "\n"; } return oss.str(); } @@ -680,7 +680,7 @@ class parser { } if (!ok) { - std::cerr << error() << std::endl + std::cerr << error() << "\n" << usage(); exit(1); } @@ -813,7 +813,7 @@ class parser { actual = read(value); has = true; } catch (const std::exception &e) { - std::cout << "Exception was caught: " << e.what() << std::endl; + std::cout << "Exception was caught: " << e.what() << "\n"; return false; } return true; diff --git a/src/autoschedulers/li2018/GradientAutoscheduler.cpp b/src/autoschedulers/li2018/GradientAutoscheduler.cpp index db8a81a634ab..709e13b2ead5 100644 --- a/src/autoschedulers/li2018/GradientAutoscheduler.cpp +++ b/src/autoschedulers/li2018/GradientAutoscheduler.cpp @@ -37,7 +37,7 @@ std::vector get_int_bounds(const Box &bounds) { std::vector int_bounds; int_bounds.reserve(bounds.size()); for (int i = 0; i < (int)bounds.size(); i++) { - Interval interval = bounds[i]; + const Interval &interval = bounds[i]; Expr extent = simplify(interval.max - interval.min + 1); extent = simplify(substitute_var_estimates(extent)); const int64_t *extent_int = as_const_int(extent); diff --git a/src/runtime/cuda.cpp b/src/runtime/cuda.cpp index 6e5f0e82eff2..a5170c55d256 100644 --- a/src/runtime/cuda.cpp +++ b/src/runtime/cuda.cpp @@ -76,7 +76,7 @@ WEAK int load_libcuda(void *user_context) { halide_abort_if_false(user_context, cuInit == nullptr); halide_error_code_t result; -// clang-format off + // clang-format off #define CUDA_FN(ret, fn, args) result = get_cuda_symbol(user_context, #fn, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CUDA_FN_OPTIONAL(ret, fn, args) result = get_cuda_symbol(user_context, #fn, true, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CUDA_FN_3020(ret, fn, fn_3020, args) result = get_cuda_symbol(user_context, #fn_3020, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) diff --git a/src/runtime/mini_d3d12.h b/src/runtime/mini_d3d12.h index 3fe30d1dddd1..7b179fe58760 100644 --- a/src/runtime/mini_d3d12.h +++ b/src/runtime/mini_d3d12.h @@ -635,8 +635,12 @@ typedef struct _RPC_MESSAGE { #define THIS void #define DECLARE_INTERFACE(iface) interface DECLSPEC_NOVTABLE iface #define DECLARE_INTERFACE_(iface, baseiface) interface DECLSPEC_NOVTABLE iface : public baseiface -#define DECLARE_INTERFACE_IID(iface, iid) interface DECLSPEC_UUID(iid) DECLSPEC_NOVTABLE iface -#define DECLARE_INTERFACE_IID_(iface, baseiface, iid) interface DECLSPEC_UUID(iid) DECLSPEC_NOVTABLE iface : public baseiface +#define DECLARE_INTERFACE_IID(iface, iid) \ + interface DECLSPEC_UUID(iid) \ + DECLSPEC_NOVTABLE iface +#define DECLARE_INTERFACE_IID_(iface, baseiface, iid) \ + interface DECLSPEC_UUID(iid) \ + DECLSPEC_NOVTABLE iface : public baseiface #define IFACEMETHOD(method) __override STDMETHOD(method) #define IFACEMETHOD_(type, method) __override STDMETHOD_(type, method) @@ -715,7 +719,8 @@ _Post_equal_to_(pp) _Post_satisfies_(return == pp) void **IID_PPV_ARGS_Helper(T #define DECLARE_INTERFACE(iface) \ typedef interface iface { \ const struct iface##Vtbl FAR *lpVtbl; \ - } iface; \ + } \ + iface; \ typedef const struct iface##Vtbl iface##Vtbl; \ const struct iface##Vtbl #else @@ -724,7 +729,8 @@ _Post_equal_to_(pp) 
_Post_satisfies_(return == pp) void **IID_PPV_ARGS_Helper(T #define DECLARE_INTERFACE(iface) \ typedef interface iface { \ struct iface##Vtbl FAR *lpVtbl; \ - } iface; \ + } \ + iface; \ typedef struct iface##Vtbl iface##Vtbl; \ struct iface##Vtbl #endif @@ -2299,10 +2305,10 @@ typedef enum D3D12_SHADER_COMPONENT_MAPPING { #define D3D12_SHADER_COMPONENT_MAPPING_MASK 0x7 #define D3D12_SHADER_COMPONENT_MAPPING_SHIFT 3 #define D3D12_SHADER_COMPONENT_MAPPING_ALWAYS_SET_BIT_AVOIDING_ZEROMEM_MISTAKES (1 << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 4)) -#define D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(Src0, Src1, Src2, Src3) ((((Src0)&D3D12_SHADER_COMPONENT_MAPPING_MASK) | \ - (((Src1)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << D3D12_SHADER_COMPONENT_MAPPING_SHIFT) | \ - (((Src2)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 2)) | \ - (((Src3)&D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 3)) | \ +#define D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(Src0, Src1, Src2, Src3) ((((Src0) & D3D12_SHADER_COMPONENT_MAPPING_MASK) | \ + (((Src1) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << D3D12_SHADER_COMPONENT_MAPPING_SHIFT) | \ + (((Src2) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 2)) | \ + (((Src3) & D3D12_SHADER_COMPONENT_MAPPING_MASK) << (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * 3)) | \ D3D12_SHADER_COMPONENT_MAPPING_ALWAYS_SET_BIT_AVOIDING_ZEROMEM_MISTAKES)) #define D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(ComponentToExtract, Mapping) \ ((D3D12_SHADER_COMPONENT_MAPPING)((Mapping) >> (D3D12_SHADER_COMPONENT_MAPPING_SHIFT * (ComponentToExtract)) & D3D12_SHADER_COMPONENT_MAPPING_MASK)) diff --git a/src/runtime/mini_vulkan.h b/src/runtime/mini_vulkan.h index 184282f9a878..1eff0ad7310b 100644 --- a/src/runtime/mini_vulkan.h +++ b/src/runtime/mini_vulkan.h @@ -74,7 +74,7 @@ typedef uint32_t VkSampleMask; // Provided by VK_VERSION_1_0 #define VK_API_VERSION_MAJOR(version) (((uint32_t)(version) >> 22) & 0x7FU) #define VK_API_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3FFU) -#define VK_API_VERSION_PATCH(version) ((uint32_t)(version)&0xFFFU) +#define VK_API_VERSION_PATCH(version) ((uint32_t)(version) & 0xFFFU) #define VK_MAKE_API_VERSION(variant, major, minor, patch) \ ((((uint32_t)(variant)) << 29) | (((uint32_t)(major)) << 22) | (((uint32_t)(minor)) << 12) | ((uint32_t)(patch))) #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0) diff --git a/src/runtime/opencl.cpp b/src/runtime/opencl.cpp index fac8ff41fbfc..8aaba7f6a707 100644 --- a/src/runtime/opencl.cpp +++ b/src/runtime/opencl.cpp @@ -68,7 +68,7 @@ WEAK int load_libopencl(void *user_context) { halide_abort_if_false(user_context, clCreateContext == nullptr); halide_error_code_t result; -// clang-format off + // clang-format off #define CL_FN(ret, fn, args) result = get_cl_symbol(user_context, #fn, true, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #define CL_12_FN(ret, fn, args) result = get_cl_symbol(user_context, #fn, false, fn); if (result) return result; // NOLINT(bugprone-macro-parentheses) #include "cl_functions.h" diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h index 027ae5c4f500..8df9dcb8eb2c 100644 --- a/src/runtime/runtime_internal.h +++ b/src/runtime/runtime_internal.h @@ -222,7 +222,7 @@ ALWAYS_INLINE T is_power_of_two(T value) { namespace { template -ALWAYS_INLINE void swap(T &a, T &b) { +ALWAYS_INLINE void swap(T &a, T &b) noexcept { T t = a; a = b; b = t; diff --git 
a/test/correctness/unroll_dynamic_loop.cpp b/test/correctness/unroll_dynamic_loop.cpp index e43412b0c6c1..a31ca78dcf3f 100644 --- a/test/correctness/unroll_dynamic_loop.cpp +++ b/test/correctness/unroll_dynamic_loop.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Buffer in(100); in.for_each_element([&](int x) { in(x) = x * 2.0f; }); - f(x) = in(x)*3; + f(x) = in(x) * 3; g(x) = f(x) * 2; Var xo, xi; diff --git a/tools/regexp_replace.cpp b/tools/regexp_replace.cpp index 956a67030a92..c0d8311db279 100644 --- a/tools/regexp_replace.cpp +++ b/tools/regexp_replace.cpp @@ -19,7 +19,7 @@ int main(int argc, const char **argv) { while (std::getline(std::cin, line)) { std::regex_replace(std::ostreambuf_iterator(std::cout), line.begin(), line.end(), re, argv[2]); - std::cout << std::endl; + std::cout << "\n"; } return 0; } From 47378ee5bd7cb304be9d61e0a636982c8a2623d0 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 29 Jan 2024 01:28:13 +0000 Subject: [PATCH 043/186] Enable `bugprone-switch-missing-default-case` (#8048) * Upgrade clang-format and clang-tidy to use LLVM 17 * trigger buildbots * trigger buildbots * trigger buildbots * trigger buildbots * Enable `bugprone-switch-missing-default-case` ...and fix existing warnings. * Update .clang-tidy * Update Parameter.cpp * Update .clang-tidy * Update .clang-tidy * Update .clang-tidy * Update .clang-tidy * Update CPlusPlusMangle.cpp --- .clang-tidy | 8 ++++---- src/CPlusPlusMangle.cpp | 20 ++++++++++++-------- src/CodeGen_OpenCL_Dev.cpp | 6 ++++++ src/HexagonOptimize.cpp | 4 ++++ src/Parameter.cpp | 8 ++++++++ src/runtime/openglcompute.cpp | 2 ++ tools/halide_image_io.h | 3 +++ 7 files changed, 39 insertions(+), 12 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 815ccd3339a2..283acd5f9bd3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -19,7 +19,7 @@ Checks: > bugprone-dangling-handle, bugprone-dynamic-static-initializers, -bugprone-easily-swappable-parameters, - -bugprone-empty-catch, # TODO: consider enabling + -bugprone-empty-catch, -bugprone-exception-escape, bugprone-fold-init-type, bugprone-forward-declaration-namespace, @@ -66,7 +66,7 @@ Checks: > bugprone-suspicious-semicolon, bugprone-suspicious-string-compare, bugprone-swapped-arguments, - -bugprone-switch-missing-default-case, # TODO: consider enabling + bugprone-switch-missing-default-case, bugprone-terminating-continue, bugprone-throw-keyword-missing, bugprone-too-small-loop-variable, @@ -93,7 +93,7 @@ Checks: > -misc-const-correctness, misc-definitions-in-headers, misc-header-include-cycle, - -misc-include-cleaner, # TODO: consider enabling + -misc-include-cleaner, misc-misleading-bidirectional, misc-misleading-identifier, misc-misplaced-const, @@ -128,7 +128,7 @@ Checks: > -modernize-replace-random-shuffle, -modernize-return-braced-init-list, -modernize-shrink-to-fit, - -modernize-type-traits, # TODO: consider enabling + -modernize-type-traits, -modernize-unary-static-assert, -modernize-use-auto, modernize-use-bool-literals, diff --git a/src/CPlusPlusMangle.cpp b/src/CPlusPlusMangle.cpp index 05c9d552e68f..b5c30b4fcb65 100644 --- a/src/CPlusPlusMangle.cpp +++ b/src/CPlusPlusMangle.cpp @@ -246,9 +246,10 @@ MangledNamePart mangle_type(const Type &type, const Target &target, PreviousDecl return "H"; case 64: return "_J"; + default: + internal_error << "Unexpected integer size: " << type.bits() << ".\n"; + return ""; } - internal_error << "Unexpected integer size: " << type.bits() << ".\n"; - return ""; } else if (type.is_uint()) { switch (type.bits()) { case 1: @@ 
-261,9 +262,10 @@ MangledNamePart mangle_type(const Type &type, const Target &target, PreviousDecl return "I"; case 64: return "_K"; + default: + internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; + return ""; } - internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; - return ""; } else if (type.is_float()) { if (type.bits() == 32) { return "M"; @@ -546,9 +548,10 @@ std::string mangle_type(const Type &type, const Target &target, PrevPrefixes &pr } else { return "l"; } + default: + internal_error << "Unexpected integer size: " << type.bits() << ".\n"; + return ""; } - internal_error << "Unexpected integer size: " << type.bits() << ".\n"; - return ""; } else if (type.is_uint()) { switch (type.bits()) { case 1: @@ -571,9 +574,10 @@ std::string mangle_type(const Type &type, const Target &target, PrevPrefixes &pr } else { return "m"; } + default: + internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; + return ""; } - internal_error << "Unexpected unsigned integer size: " << type.bits() << "\n"; - return ""; } else if (type.is_float()) { if (type.bits() == 32) { return "f"; diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 5712c1ea0fe9..52feed53f9e0 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -389,6 +389,9 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { rhs << "(int4)(" << coord[0] << idx << ", " << coord[1] << idx << ", " << coord[2] << idx << ", 0)).s0"; break; + default: + internal_error << "Unsupported dims"; + break; } print_assignment(op->type.with_bits(32).with_lanes(1), rhs.str()); results[i] = id; @@ -448,6 +451,9 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Call *op) { write_image << "(int4)(" << coord[0] << idx << ", " << coord[1] << idx << ", " << coord[2] << idx << ", 0)"; break; + default: + internal_error << "Unsupported dims"; + break; } write_image << ", (" << print_type(value_type.with_bits(32).with_lanes(4)) << ")(" << value << idx << ", 0, 0, 0));\n"; diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 3e19915e98cf..b76a9eb1cfef 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -91,6 +91,8 @@ string type_suffix(Type type, bool signed_variants) { return prefix + "h"; case 32: return prefix + "w"; + default: + break; } } else if (type.is_uint()) { switch (type.bits()) { @@ -100,6 +102,8 @@ string type_suffix(Type type, bool signed_variants) { return prefix + "uh"; case 32: return prefix + "uw"; + default: + break; } } internal_error << "Unsupported HVX type: " << type << "\n"; diff --git a/src/Parameter.cpp b/src/Parameter.cpp index d9616b5bebf8..41353871fd0d 100644 --- a/src/Parameter.cpp +++ b/src/Parameter.cpp @@ -142,6 +142,8 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.f32); case 64: return Expr(sv.u.f64); + default: + break; } } else if (t.is_int()) { switch (t.bits()) { @@ -153,6 +155,8 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.i32); case 64: return Expr(sv.u.i64); + default: + break; } } else if (t.is_uint()) { switch (t.bits()) { @@ -166,12 +170,16 @@ Expr Parameter::scalar_expr() const { return Expr(sv.u.u32); case 64: return Expr(sv.u.u64); + default: + break; } } else if (t.is_handle()) { // handles are always uint64 internally. 
switch (t.bits()) { case 64: return Expr(sv.u.u64); + default: + break; } } internal_error << "Unsupported type " << t << " in scalar_expr\n"; diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp index 27397e2c008e..edb1327d90a9 100644 --- a/src/runtime/openglcompute.cpp +++ b/src/runtime/openglcompute.cpp @@ -88,6 +88,8 @@ WEAK const char *gl_error_name(int32_t err) { case 0x8031: return "GL_TABLE_TOO_LARGE"; break; + default: + break; } return ""; } diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index a9f312168b92..e039f7c2e798 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -1455,6 +1455,9 @@ bool load_mat(const std::string &filename, ImageType *im) { case miDOUBLE: type = halide_type_of(); break; + default: + check(false, "Unknown header"); + return false; } *im = ImageType(type, extents); From e2448fe535db057b18f7ca16d1c878cd045902e9 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 1 Feb 2024 09:46:10 -0800 Subject: [PATCH 044/186] Fix type error in VectorizeLoops (#8055) --- src/VectorizeLoops.cpp | 3 +- test/correctness/fuzz_schedule.cpp | 68 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 1c3ec57f3fb7..6d10d2e9d5f3 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -134,7 +134,7 @@ Interval bounds_of_lanes(const Expr &e) { Interval ia = bounds_of_lanes(not_->a); return {!ia.max, !ia.min}; } else if (const Ramp *r = e.as()) { - Expr last_lane_idx = make_const(r->base.type(), r->lanes - 1); + Expr last_lane_idx = make_const(r->base.type().element_of(), r->lanes - 1); Interval ib = bounds_of_lanes(r->base); const Broadcast *b = as_scalar_broadcast(r->stride); Expr stride = b ? b->value : r->stride; @@ -875,6 +875,7 @@ class VectorSubs : public IRMutator { // generating a scalar condition that checks if // the least-true lane is true. 
Expr all_true = bounds_of_lanes(likely->args[0]).min; + internal_assert(all_true.type() == Bool()); // Wrap it in the same flavor of likely all_true = Call::make(Bool(), likely->name, {all_true}, Call::PureIntrinsic); diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index 9f0f86e3854b..a774335a07bf 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -202,6 +202,74 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/8054 + { + ImageParam input(Float(32), 2, "input"); + const float r_sigma = 0.1; + const int s_sigma = 8; + Func bilateral_grid{"bilateral_grid"}; + + Var x("x"), y("y"), z("z"), c("c"); + + // Add a boundary condition + Func clamped = Halide::BoundaryConditions::repeat_edge(input); + + // Construct the bilateral grid + RDom r(0, s_sigma, 0, s_sigma); + Expr val = clamped(x * s_sigma + r.x - s_sigma / 2, y * s_sigma + r.y - s_sigma / 2); + val = clamp(val, 0.0f, 1.0f); + + Expr zi = cast(val * (1.0f / r_sigma) + 0.5f); + + Func histogram("histogram"); + histogram(x, y, z, c) = 0.0f; + histogram(x, y, zi, c) += mux(c, {val, 1.0f}); + + // Blur the grid using a five-tap filter + Func blurx("blurx"), blury("blury"), blurz("blurz"); + blurz(x, y, z, c) = (histogram(x, y, z - 2, c) + + histogram(x, y, z - 1, c) * 4 + + histogram(x, y, z, c) * 6 + + histogram(x, y, z + 1, c) * 4 + + histogram(x, y, z + 2, c)); + blurx(x, y, z, c) = (blurz(x - 2, y, z, c) + + blurz(x - 1, y, z, c) * 4 + + blurz(x, y, z, c) * 6 + + blurz(x + 1, y, z, c) * 4 + + blurz(x + 2, y, z, c)); + blury(x, y, z, c) = (blurx(x, y - 2, z, c) + + blurx(x, y - 1, z, c) * 4 + + blurx(x, y, z, c) * 6 + + blurx(x, y + 1, z, c) * 4 + + blurx(x, y + 2, z, c)); + + // Take trilinear samples to compute the output + val = clamp(input(x, y), 0.0f, 1.0f); + Expr zv = val * (1.0f / r_sigma); + zi = cast(zv); + Expr zf = zv - zi; + Expr xf = cast(x % s_sigma) / s_sigma; + Expr yf = cast(y % s_sigma) / s_sigma; + Expr xi = x / s_sigma; + Expr yi = y / s_sigma; + Func interpolated("interpolated"); + interpolated(x, y, c) = + lerp(lerp(lerp(blury(xi, yi, zi, c), blury(xi + 1, yi, zi, c), xf), + lerp(blury(xi, yi + 1, zi, c), blury(xi + 1, yi + 1, zi, c), xf), yf), + lerp(lerp(blury(xi, yi, zi + 1, c), blury(xi + 1, yi, zi + 1, c), xf), + lerp(blury(xi, yi + 1, zi + 1, c), blury(xi + 1, yi + 1, zi + 1, c), xf), yf), + zf); + + // Normalize + bilateral_grid(x, y) = interpolated(x, y, 0) / interpolated(x, y, 1); + Pipeline p({bilateral_grid}); + + Var v6, zo, vzi; + + blury.compute_root().split(x, x, v6, 6, TailStrategy::GuardWithIf).split(z, zo, vzi, 8, TailStrategy::GuardWithIf).reorder(y, x, c, vzi, zo, v6).vectorize(vzi).vectorize(v6); + p.compile_to_module({input}, "bilateral_grid", {Target("host")}); + } + printf("Success!\n"); return 0; } From 80e2081153361a7e0d3671290c383b1ba891286c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 5 Feb 2024 14:25:05 -0800 Subject: [PATCH 045/186] Update makefile to use test/common/terminate_handler.cpp (#8066) This means we actually print error messages when using exceptions and the makefile --- Makefile | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 39358e03ef18..04fc41fa4167 100644 --- a/Makefile +++ b/Makefile @@ -277,7 +277,7 @@ LLVM_SHARED_LIBS = -Wl,-rpath=$(LLVM_LIBDIR) -L $(LLVM_LIBDIR) -lLLVM LLVM_LIBS_FOR_SHARED_LIBHALIDE=$(if 
$(WITH_LLVM_INSIDE_SHARED_LIBHALIDE),$(LLVM_STATIC_LIBS),$(LLVM_SHARED_LIBS)) -TUTORIAL_CXX_FLAGS ?= -std=c++17 -g -fno-omit-frame-pointer $(RTTI_CXX_FLAGS) -I $(ROOT_DIR)/tools $(SANITIZER_FLAGS) $(LLVM_CXX_FLAGS_LIBCPP) +TUTORIAL_CXX_FLAGS ?= -std=c++17 -g -fno-omit-frame-pointer $(RTTI_CXX_FLAGS) -I $(ROOT_DIR)/tools $(SANITIZER_FLAGS) $(LLVM_CXX_FLAGS_LIBCPP) $(EXCEPTIONS_CXX_FLAGS) # The tutorials contain example code with warnings that we don't want # to be flagged as errors, so the test flags are the tutorial flags # plus our warning flags. @@ -951,6 +951,14 @@ INITIAL_MODULES = $(RUNTIME_CPP_COMPONENTS:%=$(BUILD_DIR)/initmod.%_32.o) \ $(RUNTIME_LL_COMPONENTS:%=$(BUILD_DIR)/initmod.%_ll.o) \ $(PTX_DEVICE_INITIAL_MODULES:libdevice.%.bc=$(BUILD_DIR)/initmod_ptx.%_ll.o) +TEST_DEPS = $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +ifneq (,$(WITH_EXCEPTIONS)) +# The tests will link libHalide, but also the object file that +# installs a global exception handler. +TEST_DEPS += $(BUILD_DIR)/terminate_handler.o +TEST_LD_FLAGS += $(BUILD_DIR)/terminate_handler.o +endif + # Add the Hexagon simulator to the rpath on Linux. (Not supported elsewhere, so no else cases.) ifeq ($(UNAME), Linux) ifneq (,$(WITH_HEXAGON)) @@ -1220,6 +1228,10 @@ $(BUILD_DIR)/Simplify_%.o: $(SRC_DIR)/Simplify_%.cpp $(SRC_DIR)/Simplify_Interna @mkdir -p $(@D) $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/Simplify_$*.d -MT $@ +$(BUILD_DIR)/terminate_handler.o: $(ROOT_DIR)/test/common/terminate_handler.cpp + @mkdir -p $(@D) + $(CXX) $(CXX_FLAGS) -c $< -o $@ -MMD -MP -MF $(BUILD_DIR)/$*.d -MT $(BUILD_DIR)/$*.o + .PHONY: clean clean: rm -rf $(LIB_DIR) @@ -1380,7 +1392,7 @@ $(BIN_DIR)/%/runtime.a: $(BIN_DIR)/runtime.generator @mkdir -p $(@D) $(CURDIR)/$< -r runtime -o $(CURDIR)/$(BIN_DIR)/$* target=$* -$(BIN_DIR)/test_internal: $(ROOT_DIR)/test/internal.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) +$(BIN_DIR)/test_internal: $(ROOT_DIR)/test/internal.cpp $(TEST_DEPS) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) $< -I$(SRC_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1395,7 +1407,7 @@ $(BUILD_DIR)/halide_ir.fbs.h: $(SRC_DIR)/halide_ir.fbs flatc --cpp --cpp-std C++17 --no-union-value-namespacing --keep-prefix --filename-suffix ".fbs" -o $(BUILD_DIR) $^ # Correctness test that link against libHalide -$(BIN_DIR)/correctness_%: $(ROOT_DIR)/test/correctness/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +$(BIN_DIR)/correctness_%: $(ROOT_DIR)/test/correctness/%.cpp $(TEST_DEPS) @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ @@ -1410,7 +1422,7 @@ $(BIN_DIR)/correctness_halide_buffer: $(ROOT_DIR)/test/correctness/halide_buffer # The image_io test additionally needs to link to libpng and # libjpeg. -$(BIN_DIR)/correctness_image_io: $(ROOT_DIR)/test/correctness/image_io.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(RUNTIME_EXPORTED_INCLUDES) +$(BIN_DIR)/correctness_image_io: $(ROOT_DIR)/test/correctness/image_io.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ # OpenCL runtime correctness test requires runtime.a to be linked. 
@@ -1418,14 +1430,14 @@ $(BIN_DIR)/$(TARGET)/correctness_opencl_runtime: $(ROOT_DIR)/test/correctness/op @mkdir -p $(@D) $(CXX) $(BIN_DIR)/$(TARGET)/runtime.a $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/performance_%: $(ROOT_DIR)/test/performance/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/performance_%: $(ROOT_DIR)/test/performance/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE) $< -I$(INCLUDE_DIR) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(TEST_LD_FLAGS) -o $@ # Error tests that link against libHalide -$(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/error_%: $(ROOT_DIR)/test/error/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/src/runtime -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/warning_%: $(ROOT_DIR)/test/warning/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) -I$(ROOT_DIR)/test/common $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # Runtime tests that test internals @@ -1452,13 +1464,13 @@ $(BIN_DIR)/runtime_%: $(ROOT_DIR)/test/runtime/%.cpp $(BIN_DIR)/runtime_internal $(CXX) $(TEST_CXX_FLAGS) $(RUNTIME_TESTS_CXXFLAGS) -I$(ROOT_DIR)/test/runtime -I$(ROOT_DIR)/src/runtime $(OPTIMIZE_FOR_BUILD_TIME) $^ $(COMMON_LD_FLAGS) -o $@ # Auto schedule tests that link against libHalide -$(BIN_DIR)/mullapudi2016_%: $(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/mullapudi2016_%: $(ROOT_DIR)/test/autoschedulers/mullapudi2016/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/li2018_%: $(ROOT_DIR)/test/autoschedulers/li2018/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/li2018_%: $(ROOT_DIR)/test/autoschedulers/li2018/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ -$(BIN_DIR)/adams2019_%: $(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h +$(BIN_DIR)/adams2019_%: $(ROOT_DIR)/test/autoschedulers/adams2019/%.cpp $(TEST_DEPS) $(CXX) $(TEST_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) -o $@ # TODO(srj): this doesn't auto-delete, why not? @@ -1471,7 +1483,7 @@ $(BUILD_DIR)/%_generator.o: $(ROOT_DIR)/test/generator/%_generator.cpp $(INCLUDE @mkdir -p $(@D) $(CXX) $(TEST_CXX_FLAGS) -I$(INCLUDE_DIR) -I$(CURDIR)/$(FILTERS_DIR) -c $< -o $@ -$(BIN_DIR)/%.generator: $(BUILD_DIR)/GenGen.o $(BIN_DIR)/libHalide.$(SHARED_EXT) $(BUILD_DIR)/%_generator.o +$(BIN_DIR)/%.generator: $(BUILD_DIR)/GenGen.o $(TEST_DEPS) $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) $(filter %.cpp %.o %.a,$^) $(TEST_LD_FLAGS) -o $@ @@ -1787,7 +1799,7 @@ $(BIN_DIR)/$(TARGET)/generator_aotcpp_define_extern_opencl: $(ROOT_DIR)/test/gen $(CXX) $(GEN_AOT_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) $(GEN_AOT_INCLUDES) $(GEN_AOT_LD_FLAGS) $(OPENCL_LD_FLAGS) -o $@ # By default, %_jittest.cpp depends on libHalide, plus the stubs for the Generator. These are external tests that use the JIT. 
-$(BIN_DIR)/generator_jit_%: $(ROOT_DIR)/test/generator/%_jittest.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(FILTERS_DIR)/%.stub.h $(BUILD_DIR)/%_generator.o +$(BIN_DIR)/generator_jit_%: $(ROOT_DIR)/test/generator/%_jittest.cpp $(TEST_DEPS) $(FILTERS_DIR)/%.stub.h $(BUILD_DIR)/%_generator.o @mkdir -p $(@D) $(CXX) -g $(TEST_CXX_FLAGS) $(filter %.cpp %.o %.a,$^) -I$(INCLUDE_DIR) -I$(FILTERS_DIR) -I $(ROOT_DIR)/apps/support $(TEST_LD_FLAGS) -o $@ @@ -1922,7 +1934,7 @@ $(FILTERS_DIR)/multi_rungen2: $(BUILD_DIR)/RunGenMain.o $(BIN_DIR)/$(TARGET)/run @mkdir -p $(@D) $(CXX) -std=c++17 -I$(FILTERS_DIR) $^ $(GEN_AOT_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ -$(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(INCLUDE_DIR)/HalideRuntime.h +$(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(TEST_DEPS) @ if [[ $@ == *_run ]]; then \ export TUTORIAL=$* ;\ export LESSON=`echo $${TUTORIAL} | cut -b1-9`; \ @@ -1934,7 +1946,7 @@ $(BIN_DIR)/tutorial_%: $(ROOT_DIR)/tutorial/%.cpp $(BIN_DIR)/libHalide.$(SHARED_ -I$(INCLUDE_DIR) -I$(ROOT_DIR)/tools $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@;\ fi -$(BIN_DIR)/tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @@ -1945,7 +1957,7 @@ tutorial_lesson_15_generators: $(ROOT_DIR)/tutorial/lesson_15_generators_usage.s PATH="$${PATH}:$(CURDIR)/$(BIN_DIR)" source $(ROOT_DIR)/tutorial/lesson_15_generators_usage.sh @-echo -$(BIN_DIR)/tutorial_lesson_16_rgb_generate: $(ROOT_DIR)/tutorial/lesson_16_rgb_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_16_rgb_generate: $(ROOT_DIR)/tutorial/lesson_16_rgb_generate.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @@ -1962,7 +1974,7 @@ $(BIN_DIR)/tutorial_lesson_16_rgb_run: $(ROOT_DIR)/tutorial/lesson_16_rgb_run.cp -lHalide $(TEST_LD_FLAGS) $(COMMON_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ @-echo -$(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_generate.cpp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR)/Halide.h $(BUILD_DIR)/GenGen.o +$(BIN_DIR)/tutorial_lesson_21_auto_scheduler_generate: $(ROOT_DIR)/tutorial/lesson_21_auto_scheduler_generate.cpp $(TEST_DEPS) $(BUILD_DIR)/GenGen.o $(CXX) $(TUTORIAL_CXX_FLAGS) $(IMAGE_IO_CXX_FLAGS) $(OPTIMIZE_FOR_BUILD_TIME) $< $(BUILD_DIR)/GenGen.o \ -I$(INCLUDE_DIR) $(TEST_LD_FLAGS) $(IMAGE_IO_LIBS) -o $@ From 93bff95c52e6599f9f779c99604002ff955d276e Mon Sep 17 00:00:00 2001 From: Teo Date: Tue, 6 Feb 2024 18:34:02 -0500 Subject: [PATCH 046/186] add unsafe_promise_clamped (#8071) add unsafe_promise_clamp --- python_bindings/src/halide/halide_/PyIROperator.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/python_bindings/src/halide/halide_/PyIROperator.cpp b/python_bindings/src/halide/halide_/PyIROperator.cpp index ce9a0ef5fec1..81a51398bb51 100644 --- a/python_bindings/src/halide/halide_/PyIROperator.cpp +++ b/python_bindings/src/halide/halide_/PyIROperator.cpp @@ -44,6 
+44,7 @@ void define_operators(py::module &m) { }); m.def("clamp", &clamp); + m.def("unsafe_promise_clamped", &unsafe_promise_clamped); m.def("abs", &abs); m.def("absd", &absd); From 665804c752cba9e7b673d3778d83d58a19628948 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Tue, 6 Feb 2024 23:34:29 +0000 Subject: [PATCH 047/186] Don't require Halide_WebGPU when using wasm (#8063) (#8065) * Don't require Halide_WebGPU when using wasm (#8063) * trigger buildbots --- cmake/HalideGeneratorHelpers.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index f62da88b1f7b..d45341536422 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -739,7 +739,7 @@ function(_Halide_target_link_gpu_libs TARGET VISIBILITY) target_link_libraries(${TARGET} ${VISIBILITY} "${FOUNDATION_LIBRARY}" "${METAL_LIBRARY}") endif () - if ("${ARGN}" MATCHES "webgpu") + if ("${ARGN}" MATCHES "webgpu" AND NOT "${ARGN}" MATCHES "wasm") find_package(Halide_WebGPU REQUIRED) target_link_libraries(${TARGET} ${VISIBILITY} Halide::WebGPU) endif () From 84fe5655ee569680ce116497724e28e3c3575fe5 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 7 Feb 2024 17:41:21 +0000 Subject: [PATCH 048/186] Outsmart the LLVM optimizer (#8073) The old definitions of bool_1, bool_2, bool_3 in simd_op_check_x86 (etc) all referred to the same entry in in_f32; as of https://github.com/llvm/llvm-project/pull/76367, the LLVM optimizer is smart enough to realize that (eg) bool1 != bool2 by construction, and optimizes away the code that tests their conditions, such as the one for andps and orps. Initing them from different locations is enough to outsmart the compiler. (bug was only noticed in the x86 test, but I updated the other tests to guard against future improvements there too.) --- test/correctness/simd_op_check_arm.cpp | 2 +- test/correctness/simd_op_check_hvx.cpp | 2 +- test/correctness/simd_op_check_powerpc.cpp | 2 +- test/correctness/simd_op_check_wasm.cpp | 2 +- test/correctness/simd_op_check_x86.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index acc3edcc4a8a..e8762a6ea2d8 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -37,7 +37,7 @@ class SimdOpCheckARM : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // Table copied from the Cortex-A9 TRM. 
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 2832f1bc8ede..450ef3f06fe6 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -45,7 +45,7 @@ class SimdOpCheckHVX : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); constexpr int hvx_width = 128; diff --git a/test/correctness/simd_op_check_powerpc.cpp b/test/correctness/simd_op_check_powerpc.cpp index 2dccd72735f3..fdf28f3641a5 100644 --- a/test/correctness/simd_op_check_powerpc.cpp +++ b/test/correctness/simd_op_check_powerpc.cpp @@ -36,7 +36,7 @@ class SimdOpCheckPowerPC : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - // Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + // Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // Basic AltiVec SIMD instructions. for (int w = 1; w <= 4; w++) { diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 89aad9e5c389..56e2e4231876 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -37,7 +37,7 @@ class SimdOpCheckWASM : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); check("f32.sqrt", 1, sqrt(f32_1)); check("f32.min", 1, min(f32_1, f32_2)); diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index 51d4a0b18ccb..990e4e886307 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -57,7 +57,7 @@ class SimdOpCheckX86 : public SimdOpCheckTest { Expr u32_1 = in_u32(x), u32_2 = in_u32(x + 16), u32_3 = in_u32(x + 32); Expr i64_1 = in_i64(x), i64_2 = in_i64(x + 16), i64_3 = in_i64(x + 32); Expr u64_1 = in_u64(x), u64_2 = in_u64(x + 16), u64_3 = in_u64(x + 32); - Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f); + Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_2 < -0.3f), bool_3 = (f32_3 != -0.34f); // MMX and SSE1 (in 64 and 128 bits) for (int w = 1; w <= 4; w++) { From 78a076220a4aefdcef13d3ab7b3afa7faf8917f7 Mon Sep 17 00:00:00 2001 From: Prasoon Mishra <132343640+prasmish@users.noreply.github.com> Date: Wed, 7 Feb 2024 23:11:51 +0530 Subject: [PATCH 049/186] Add hexagon_benchmarks app for CMake builds (#8069) * Add hexagon_benchmarks app for CMake builds * Removed unnecessary -lc++abi flag from GCC build --- apps/CMakeLists.txt | 2 +- apps/hexagon_benchmarks/CMakeLists.txt | 44 ++++++++++++++++++++++++++ apps/hexagon_benchmarks/process.cpp | 7 ++-- 3 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 
apps/hexagon_benchmarks/CMakeLists.txt diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 149f6a610b5c..1f6abcdc6e64 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -45,7 +45,7 @@ add_app(depthwise_separable_conv) add_app(fft) add_app(hannk) add_app(harris) -# add_app(hexagon_benchmarks) # TODO(#5374): missing CMake build +add_app(hexagon_benchmarks) # add_app(hexagon_dma) # TODO(#5374): missing CMake build add_app(hist) add_app(iir_blur) diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt new file mode 100644 index 000000000000..9cbcc541b76a --- /dev/null +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.22) +project(hexagon_benchmarks) + +enable_testing() + +# Set up language settings +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS NO) +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +# Find Halide +find_package(Halide REQUIRED) + +macro(add_generator_and_library FILTER_NAME) + set(GENERATOR_EXE ${FILTER_NAME}.generator) + set(GENERATOR_SRC ${FILTER_NAME}_generator.cpp) + add_halide_generator(${GENERATOR_EXE} SOURCES ${GENERATOR_SRC}) + add_halide_library(${FILTER_NAME} FROM ${GENERATOR_EXE}) +endmacro() + +add_generator_and_library(dilate3x3) +add_generator_and_library(gaussian5x5) +add_generator_and_library(median3x3) + +# Main executable +add_executable(process process.cpp) +target_compile_options(process PRIVATE $<$:-O2>) +if (Halide_TARGET MATCHES "hvx") + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) +else() + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) +endif() +target_link_libraries(process + PRIVATE + Halide::Tools + dilate3x3 gaussian5x5 median3x3) + +# Test that the app actually works! +add_test(NAME hexagon_benchmarks COMMAND process -n 1) +set_tests_properties(hexagon_benchmarks PROPERTIES + LABELS hexagon_benchmarks + PASS_REGULAR_EXPRESSION "Success!" 
+ SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 975bf8aa2da4..87a492c577d1 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -3,6 +3,10 @@ #include #include +#ifdef TARGET_HAS_HVX +#include "HalideRuntimeHexagonHost.h" +#endif + #include "halide_benchmark.h" #include "process.h" @@ -39,11 +43,10 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); - SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { From 37153a95d0d2d0b7b8c51c92c4a94c8cc11f8f7b Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 7 Feb 2024 09:43:58 -0800 Subject: [PATCH 050/186] Fix bool conversion bug in Vulkan code generator (#8067) * Fix bug in Vulkan code generator that was incorrectly passing the address of a byte vector, instead of its contents to builder.declare_constant() * Add bool_predicate_cast correctness test to verify bool conversion for Vulkan codegen works as expected --------- Co-authored-by: Derek Gerstmann --- src/CodeGen_Vulkan_Dev.cpp | 7 +++-- test/correctness/CMakeLists.txt | 1 + test/correctness/bool_predicate_cast.cpp | 39 ++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 test/correctness/bool_predicate_cast.cpp diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 7e06447a27fc..b86c99f9269e 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -548,6 +548,9 @@ void fill_bytes_with_value(uint8_t *bytes, int count, int value) { } SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type value_type, SpvId value_id) { + debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(): casting from value type '" + << value_type << "' to target type '" << target_type << "' for value id '" << value_id << "' !\n"; + if (!value_type.is_bool()) { value_id = cast_type(Bool(), value_type, value_id); } @@ -590,8 +593,8 @@ SpvId CodeGen_Vulkan_Dev::SPIRV_Emitter::convert_to_bool(Type target_type, Type SpvId result_id = builder.reserve_id(SpvResultId); SpvId target_type_id = builder.declare_type(target_type); - SpvId true_value_id = builder.declare_constant(target_type, &true_data); - SpvId false_value_id = builder.declare_constant(target_type, &false_data); + SpvId true_value_id = builder.declare_constant(target_type, &true_data[0]); + SpvId false_value_id = builder.declare_constant(target_type, &false_data[0]); builder.append(SpvFactory::select(target_type_id, result_id, value_id, true_value_id, false_value_id)); return result_id; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index cd66f21a346e..5960e7922658 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -14,6 +14,7 @@ tests(GROUPS correctness bit_counting.cpp bitwise_ops.cpp bool_compute_root_vectorize.cpp + bool_predicate_cast.cpp bound.cpp bound_small_allocations.cpp bound_storage.cpp diff --git a/test/correctness/bool_predicate_cast.cpp b/test/correctness/bool_predicate_cast.cpp new file mode 100644 index 
000000000000..1043f329b76c --- /dev/null +++ b/test/correctness/bool_predicate_cast.cpp @@ -0,0 +1,39 @@ +#include "Halide.h" +#include + +using namespace Halide; + +int main(int argc, char **argv) { + + // Test explicit casting of a predicate to an integer as part of a reduction + // NOTE: triggers a convert_to_bool in Vulkan for a SelectOp + Target target = get_jit_target_from_environment(); + Var x("x"), y("y"); + + Func input("input"); + input(x, y) = cast(x + y); + + Func test("test"); + test(x, y) = cast(UInt(8), input(x, y) >= 32); + + if (target.has_gpu_feature()) { + Var xi("xi"), yi("yi"); + test.gpu_tile(x, y, xi, yi, 8, 8); + } + + Realization result = test.realize({96, 96}); + Buffer a = result[0]; + for (int y = 0; y < a.height(); y++) { + for (int x = 0; x < a.width(); x++) { + uint8_t correct_a = ((x + y) >= 32) ? 1 : 0; + if (a(x, y) != correct_a) { + printf("result(%d, %d) = (%d) instead of (%d)\n", + x, y, a(x, y), correct_a); + return 1; + } + } + } + + printf("Success!\n"); + return 0; +} From 39e5c08a88ac59ef1e848e7b7e40f2056c792b08 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 7 Feb 2024 09:49:06 -0800 Subject: [PATCH 051/186] Better validation of gpu schedules (#8068) * Update makefile to use test/common/terminate_handler.cpp This means we actually print error messages when using exceptions with the Makefile build * Better validation of GPU schedules GPU loop constraints were checked in two different places. Checking them in ScheduleFunctions was incorrect because it didn't consider update definitions and specializations. Checking them in FuseGPUThreadLoops was too late, because the Var names have gone (they've been renamed to things like __thread_id_x). Furthermore, some problems were internal errors or runtime errors when they should have been user errors. We allowed 4d thread and block dimensions, but then hit an internal error. This PR centralizes checking of GPU loop structure in CanonicalizeGPUVars and adds more helpful error messages that print the problematic loop structure. E.g.: ``` Error: GPU thread loop over f$8.s0.v0 is inside three other GPU thread loops. The maximum number of nested GPU thread loops is 3.
The loop nest is: compute_at for g$8: for g$8.s0.v7: for g$8.s0.v6: for g$8.s0.v5: for g$8.s0.v4: gpu_block g$8.s0.v3: gpu_block g$8.s0.v2: gpu_thread g$8.s0.v1: gpu_thread g$8.s0.v0: store_at for f$8: compute_at for f$8: gpu_thread f$8.s0.v1: gpu_thread f$8.s0.v0: ``` Fixes the bug found in #7946 * Delete dead code * Actually clear the ostringstream --- src/CanonicalizeGPUVars.cpp | 222 +++++++++++++++++++- src/FuseGPUThreadLoops.cpp | 40 ---- src/ScheduleFunctions.cpp | 39 ---- test/correctness/CMakeLists.txt | 3 +- test/correctness/gpu_error_1.cpp | 47 ----- test/correctness/gpu_error_2.cpp | 46 ---- test/correctness/invalid_gpu_loop_nests.cpp | 103 +++++++++ 7 files changed, 317 insertions(+), 183 deletions(-) delete mode 100644 test/correctness/gpu_error_1.cpp delete mode 100644 test/correctness/gpu_error_2.cpp create mode 100644 test/correctness/invalid_gpu_loop_nests.cpp diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp index f399a995ef50..7e993d7a72c1 100644 --- a/src/CanonicalizeGPUVars.cpp +++ b/src/CanonicalizeGPUVars.cpp @@ -15,16 +15,16 @@ using std::string; using std::vector; namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"}; +string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"}; +string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"}; string get_thread_name(int index) { - internal_assert(index >= 0 && index < 4); + internal_assert(index >= 0 && index < 3); return thread_names[index]; } string get_block_name(int index) { - internal_assert(index >= 0 && index < 4); + internal_assert(index >= 0 && index < 3); return block_names[index]; } @@ -111,10 +111,6 @@ class CanonicalizeGPUVars : public IRMutator { CountGPUBlocksThreads counter; op->body.accept(&counter); - internal_assert(counter.nblocks <= 4) - << op->name << " can only have maximum of 4 block dimensions\n"; - internal_assert(counter.nthreads <= 4) - << op->name << " can only have maximum of 4 thread dimensions\n"; if (op->for_type == ForType::GPUBlock) { name += "." + get_block_name(counter.nblocks); @@ -123,7 +119,6 @@ class CanonicalizeGPUVars : public IRMutator { name += "." + get_thread_name(counter.nthreads); debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n"; } else if (op->for_type == ForType::GPULane) { - user_assert(counter.nlanes == 0) << "Cannot nest multiple loops over gpu lanes: " << name << "\n"; name += "." 
+ get_thread_name(0); } @@ -190,9 +185,218 @@ class CanonicalizeGPUVars : public IRMutator { } }; +std::string loop_nest_summary_to_node(const IRNode *root, const IRNode *target) { + class Summary : public IRVisitor { + public: + std::vector stack; + Summary(const IRNode *target) + : target(target) { + } + + protected: + const IRNode *target; + bool done = false; + + using IRVisitor::visit; + + void visit(const For *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << op->for_type << " " << op->name; + if (op == target) { + done = true; + } else { + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + } + + void visit(const Realize *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << "store_at for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + + void visit(const HoistedStorage *op) override { + if (done) { + return; + } + stack.emplace_back(); + stack.back() << "hoisted storage for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } + + void visit(const ProducerConsumer *op) override { + if (done) { + return; + } + if (op->is_producer) { + stack.emplace_back(); + stack.back() << "compute_at for " << op->name; + IRVisitor::visit(op); + if (!done) { + stack.pop_back(); + } + } else { + IRVisitor::visit(op); + } + } + } summary{target}; + + root->accept(&summary); + + std::ostringstream result; + std::string prefix = ""; + result << "The loop nest is:\n"; + for (const auto &str : summary.stack) { + result << prefix << str.str() << ":\n"; + prefix += " "; + } + return result.str(); +}; + +// Check the user's GPU schedule is valid. Throws an error if it is not, so no +// return value required. +class ValidateGPUSchedule : public IRVisitor { + + using IRVisitor::visit; + + const IRNode *root = nullptr; + + int in_blocks = 0; + int in_threads = 0; + int in_lanes = 0; + + std::string innermost_blocks_loop, innermost_threads_loop; + std::ostringstream blocks_not_ok_reason; + + void clear_blocks_not_ok_reason() { + std::ostringstream empty; + blocks_not_ok_reason.swap(empty); + } + + void visit(const For *op) override { + if (!root) { + root = op; + } + bool should_clear = false; + if (in_blocks && op->for_type != ForType::GPUBlock && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << op->for_type << " loop over " << op->name; + should_clear = true; + } + if (op->for_type == ForType::GPUBlock) { + user_assert(blocks_not_ok_reason.tellp() == 0) + << blocks_not_ok_reason.str() << " is inside GPU block loop over " + << innermost_blocks_loop << " but outside GPU block loop over " << op->name + << ". Funcs cannot be scheduled in between GPU block loops. " + << loop_nest_summary_to_node(root, op); + user_assert(in_blocks < 3) + << "GPU block loop over " << op->name << " is inside three other GPU block loops. " + << "The maximum number of nested GPU block loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_threads == 0) + << "GPU block loop over " << op->name << " is inside GPU thread loop over " + << innermost_threads_loop << ". " + << loop_nest_summary_to_node(root, op); + in_blocks++; + ScopedValue s(innermost_blocks_loop, op->name); + IRVisitor::visit(op); + in_blocks--; + } else if (op->for_type == ForType::GPUThread) { + user_assert(in_lanes == 0) + << "GPU thread loop over " << op->name << " is inside a loop over GPU lanes. " + << "GPU thread loops must be outside any GPU lane loop. 
" + << loop_nest_summary_to_node(root, op); + user_assert(in_threads < 3) + << "GPU thread loop over " << op->name << " is inside three other GPU thread loops. " + << "The maximum number of nested GPU thread loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_blocks) + << "GPU thread loop over " << op->name << " must be inside a GPU block loop. " + << loop_nest_summary_to_node(root, op); + in_threads++; + ScopedValue s(innermost_threads_loop, op->name); + IRVisitor::visit(op); + in_threads--; + } else if (op->for_type == ForType::GPULane) { + user_assert(in_threads < 3) + << "GPU lane loop over " << op->name << " is inside three other GPU thread or lane loops. " + << "The maximum number of nested GPU thread or lane loops is 3. " + << loop_nest_summary_to_node(root, op); + user_assert(in_lanes == 0) + << "GPU lane loop over " << op->name << " is inside another GPU lane loop. GPU lane loops " + << "may not be nested. " + << loop_nest_summary_to_node(root, op); + in_lanes++; + ScopedValue s(innermost_threads_loop, op->name); + IRVisitor::visit(op); + in_lanes--; + } else { + IRVisitor::visit(op); + } + if (should_clear) { + clear_blocks_not_ok_reason(); + } + } + + void visit(const Realize *op) override { + if (!root) { + root = op; + } + if (in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "store_at location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } + + void visit(const ProducerConsumer *op) override { + if (!root) { + root = op; + } + if (op->is_producer && in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "compute_at location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } + + void visit(const HoistedStorage *op) override { + if (!root) { + root = op; + } + if (in_blocks && blocks_not_ok_reason.tellp() == 0) { + blocks_not_ok_reason << "hoist_storage location for " << op->name; + IRVisitor::visit(op); + clear_blocks_not_ok_reason(); + } else { + IRVisitor::visit(op); + } + } +}; + } // anonymous namespace Stmt canonicalize_gpu_vars(Stmt s) { + ValidateGPUSchedule validator; + s.accept(&validator); CanonicalizeGPUVars canonicalizer; s = canonicalizer.mutate(s); return s; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index 906963059cff..cd59fd470d38 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1515,44 +1515,6 @@ class ZeroGPULoopMins : public IRMutator { ZeroGPULoopMins() = default; }; -class ValidateGPULoopNesting : public IRVisitor { - int gpu_block_depth = 0, gpu_thread_depth = 0; - string innermost_block_var, innermost_thread_var; - - using IRVisitor::visit; - - void visit(const For *op) override { - ScopedValue old_innermost_block_var(innermost_block_var); - ScopedValue old_innermost_thread_var(innermost_thread_var); - ScopedValue old_gpu_block_depth(gpu_block_depth); - ScopedValue old_gpu_thread_depth(gpu_thread_depth); - - for (int i = 1; i <= 4; i++) { - if (ends_with(op->name, block_names[4 - i])) { - user_assert(i > gpu_block_depth) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_block_var << "\n"; - user_assert(gpu_thread_depth == 0) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_thread_var << "\n"; - innermost_block_var = op->name; - gpu_block_depth = i; - } - if (ends_with(op->name, thread_names[4 - i])) { 
- user_assert(i > gpu_thread_depth) - << "Invalid schedule: Loop over " << op->name - << " cannot be inside of loop over " << innermost_thread_var << "\n"; - user_assert(gpu_block_depth > 0) - << "Invalid schedule: Loop over " << op->name - << " must be inside a loop over gpu blocks\n"; - innermost_thread_var = op->name; - gpu_thread_depth = i; - } - } - IRVisitor::visit(op); - } -}; - } // namespace // Also used by InjectImageIntrinsics @@ -1632,8 +1594,6 @@ class NormalizeIfStatements : public IRMutator { } // namespace Stmt fuse_gpu_thread_loops(Stmt s) { - ValidateGPULoopNesting validate; - s.accept(&validate); // NormalizeIfStatements pushes the predicates between GPU blocks // into the innermost GPU block. FuseGPUThreadLoops would then // merge the predicate into the merged GPU thread. diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index 9525c9a07308..c575cd47477d 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -2269,49 +2269,10 @@ bool validate_schedule(Function f, const Stmt &s, const Target &target, bool is_ std::ostringstream err; - // If you're compute_at() inside a gpu blocks loop, you can't have a gpu blocks loop yourself - const auto has_gpu_blocks = [&]() { - for (const Dim &d : f.definition().schedule().dims()) { - if (d.for_type == ForType::GPUBlock) { - return true; - } - } - return false; - }; - const auto all_ok = [&]() { return store_idx >= 0 && compute_idx >= 0 && hoist_storage_idx >= 0; }; - if (all_ok() && has_gpu_blocks()) { - for (int i = 0; i <= compute_idx; i++) { - if (sites[i].is_gpu_block) { - string site_fname = sites[i].loop_level.func(); - user_error << "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops, " - << "but Func \"" << f.name() << "\" is compute_at() \"" << site_fname << "\"\n"; - } - } - } - - // If you're compute_at() a var marked as a gpu block var, it must be the innermost one - if (all_ok() && sites[compute_idx].is_gpu_block) { - string compute_at_fname = sites[compute_idx].loop_level.func(); - int possibly_invalid_idx = compute_idx; - for (int i = compute_idx + 1; i < (int)sites.size(); i++) { - if (!sites[i].is_gpu_block) { - continue; - } - string site_fname = sites[i].loop_level.func(); - if (site_fname == compute_at_fname) { - err << "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.\n"; - sites.erase(sites.begin() + possibly_invalid_idx); - // This one will also be invalid if we find a subsequent loop from the same func - possibly_invalid_idx = i; - store_idx = compute_idx = hoist_storage_idx = -1; - } - } - } - // Check there isn't a parallel loop between the compute_at and the store_at if (all_ok()) { for (int i = store_idx + 1; i <= compute_idx; i++) { diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 5960e7922658..3b946edda6d9 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -132,8 +132,6 @@ tests(GROUPS correctness gpu_data_flows.cpp gpu_different_blocks_threads_dimensions.cpp gpu_dynamic_shared.cpp - gpu_error_1.cpp - gpu_error_2.cpp gpu_free_sync.cpp gpu_give_input_buffers_device_allocations.cpp gpu_jit_explicit_copy_to_device.cpp @@ -187,6 +185,7 @@ tests(GROUPS correctness interval.cpp intrinsics.cpp introspection.cpp + invalid_gpu_loop_nests.cpp inverse.cpp isnan.cpp issue_3926.cpp diff --git a/test/correctness/gpu_error_1.cpp b/test/correctness/gpu_error_1.cpp deleted file mode 100644 index 
d3fafb72f8ba..000000000000 --- a/test/correctness/gpu_error_1.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -class MyCompileTimeErrorReporter : public CompileTimeErrorReporter { -public: - void warning(const char *msg) override { - std::cerr << "Should not see any warnings in this test, but saw: " << msg << "\n"; - exit(1); - } - - void error(const char *msg) override { - std::string m = msg; - if (!strstr(msg, "Functions that are compute_at() a gpu_block() loop cannot have their own gpu_block() loops")) { - std::cerr << "Did not see expected error, instead saw: (" << msg << ")\n"; - exit(1); - } - - std::cout << "Success!\n"; - exit(0); - } -}; - -int main(int argc, char **argv) { - static MyCompileTimeErrorReporter reporter; - set_custom_compile_time_error_reporter(&reporter); - - ImageParam im(Float(32), 2); - - Func a("a"), b("b"); - Var x("x"), y("y"); - - a(x, y) = im(x, y); - b(x, y) = a(x, y); - - // Verify that attempting to schedule such that we would have nested gpu-blocks for different - // functions produces a useful error message. - Var xi, yi; - b.gpu_tile(x, y, xi, yi, 4, 4); - a.compute_at(b, x).gpu_tile(x, xi, 4); - - b.realize({32, 32}, Target("host-metal")); - - std::cerr << "Failure, did not see error!\n"; - return 1; -} diff --git a/test/correctness/gpu_error_2.cpp b/test/correctness/gpu_error_2.cpp deleted file mode 100644 index 50a51330d145..000000000000 --- a/test/correctness/gpu_error_2.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "Halide.h" -#include - -using namespace Halide; - -class MyCompileTimeErrorReporter : public CompileTimeErrorReporter { -public: - void warning(const char *msg) override { - std::cerr << "Should not see any warnings in this test, but saw: " << msg << "\n"; - exit(1); - } - - void error(const char *msg) override { - if (!strstr(msg, "Functions that are compute_at() a gpu_block() loop must specify the innermost gpu_block() loop for that Func.")) { - std::cerr << "Did not see expected error, instead saw: (" << msg << ")\n"; - exit(1); - } - - std::cout << "Saw expected error message.\n"; - std::cout << "Success!\n"; - exit(0); - } -}; - -int main(int argc, char **argv) { - static MyCompileTimeErrorReporter reporter; - set_custom_compile_time_error_reporter(&reporter); - - ImageParam im(Float(32), 2); - - Func a("a"), b("b"); - Var x("x"), y("y"); - - a(x, y) = im(x, y); - a(x, y) += 1; - b(x, y) = a(x, y); - - Var xi, yi; - b.gpu_tile(x, y, xi, yi, 4, 4); - a.compute_at(b, y); - - b.realize({32, 32}, Target("host-metal")); - - std::cerr << "Failure, did not see error!\n"; - return 1; -} diff --git a/test/correctness/invalid_gpu_loop_nests.cpp b/test/correctness/invalid_gpu_loop_nests.cpp new file mode 100644 index 000000000000..551fe4a8eb54 --- /dev/null +++ b/test/correctness/invalid_gpu_loop_nests.cpp @@ -0,0 +1,103 @@ +#include "Halide.h" +#include + +using namespace Halide; + +void check_error(bool error) { + if (!error) { + std::cout << "There was supposed to be an error!\n"; + exit(1); + } +} + +int main(int argc, char **argv) { +#if HALIDE_WITH_EXCEPTIONS + if (!Halide::exceptions_enabled()) { + std::cout << "[SKIP] Halide was compiled without exceptions.\n"; + return 0; + } + + Target t = get_jit_target_from_environment(); + if (!t.has_gpu_feature()) { + std::cout << "[SKIP] No GPU target enabled.\n"; + return 0; + } + + Var v0, v1, v2, v3, v4, v5, v6, v7; + Param p; + for (int i = 0;; i++) { + Func f{"f"}, g{"g"}; + f(v0, v1, v2, v3, v4, v5, v6, v7) = v0; + g(v0, v1, v2, v3, v4, v5, 
v6, v7) = f(v0, v1, v2, v3, v4, v5, v6, v7); + switch (i) { + case 0: + // Threads but no blocks on an output Func + g.gpu_threads(v0); + break; + case 1: + // Threads but no blocks on a compute_root non-output Func + f.compute_root().gpu_threads(v0); + g.gpu_blocks(v1).gpu_threads(v0); + break; + case 2: + // Too many blocks loops + g.gpu_blocks(v0, v1).gpu_blocks(v2, v3); + break; + case 3: + // Too many threads loops + g.gpu_threads(v0, v1).gpu_threads(v2, v3).gpu_blocks(v4); + break; + case 4: + // Threads outside of blocks + g.gpu_blocks(v0).gpu_threads(v1); + break; + case 5: + // Something with a blocks loop compute_at inside something else with a blocks loop + g.gpu_blocks(v0); + f.compute_at(g, v0).gpu_blocks(v0); + break; + case 6: + // Something compute_at between two gpu_blocks loops + g.gpu_blocks(v0, v1); + f.compute_at(g, v1); + break; + case 7: + // Something with too many threads loops once nesting is taken into account + g.gpu_threads(v0, v1).gpu_blocks(v2, v3); + f.compute_at(g, v0).gpu_threads(v0, v1); + break; + case 8: + // The same, but only in a specialization + g.gpu_threads(v0, v1).gpu_blocks(v2, v3); + f.compute_at(g, v0).gpu_threads(v0).specialize(p).gpu_threads(v1); + break; + case 9: + // A serial loop in between two gpu blocks loops + g.gpu_blocks(v5, v7); + break; + default: + std::cout << "Success!\n"; + return 0; + } + + bool error = false; + try { + g.compile_jit(); + } catch (const Halide::CompileError &e) { + error = true; + std::cout << "Expected compile error:\n" + << e.what() << "\n"; + } + + if (!error) { + printf("There should have been an error\n"); + return 1; + } + } + + // unreachable +#else + std::cout << "[SKIP] Halide was compiled without exceptions.\n"; + return 0; +#endif +} From 55dfa397c2c6bac0c0394c4d3d802b79e21559be Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Wed, 7 Feb 2024 10:23:46 -0800 Subject: [PATCH 052/186] Add an easy way to print vectors in debug output. (#8072) * Add helper to print containers, or at least vectors, in debug info. * Add documentation comments. * Formatting. * Name change. --- src/Debug.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/Debug.h b/src/Debug.h index fadb5b4066ac..9f47a5aebeb6 100644 --- a/src/Debug.h +++ b/src/Debug.h @@ -65,6 +65,60 @@ class debug { static int debug_level(); }; +/** Allow easily printing the contents of containers, or std::vector-like containers, + * in debug output. Used like so: + * std::vector arg_types; + * debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n"; + * Which results in output like "arg_types: { uint8x8, uint8x8 }" on one line. */ +template +struct PrintSpan { + const T &span; + PrintSpan(const T &span) + : span(span) { + } +}; + +template +inline StreamT &operator<<(StreamT &stream, const PrintSpan &wrapper) { + stream << "{ "; + const char *sep = ""; + for (const auto &e : wrapper.span) { + stream << sep << e; + sep = ", "; + } + stream << " }"; + return stream; +} + +/** Allow easily printing the contents of spans, or std::vector-like spans, + * in debug output. Used like so: + * std::vector arg_types; + * debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n"; + * Which results in output like: + * arg_types: + * { + * uint8x8, + * uint8x8, + * } + * Indentation uses a tab character. 
*/ +template +struct PrintSpanLn { + const T &span; + PrintSpanLn(const T &span) + : span(span) { + } +}; + +template +inline StreamT &operator<<(StreamT &stream, const PrintSpanLn &wrapper) { + stream << "\n{\n"; + for (const auto &e : wrapper.span) { + stream << "\t" << e << ",\n"; + } + stream << "}\n"; + return stream; +} + } // namespace Internal } // namespace Halide From de8e39dbcd2d60a47e5465303bd5aa7f30d404d7 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 9 Feb 2024 16:55:00 +0000 Subject: [PATCH 053/186] Bump serialization version to 18.0.0 (#8080) * Bump serialization version to 18.0.0 As a matter of policy, we should probably bump the version of the serialization format for every version of Halide -- even if changes are minimal-to-nonexistent -- to reinforce the fact that this isn't intended in any way as a long-term archival format. This PR suggests that we bump the major version to match the main Halide version, but I'm open for other suggestions. * Update halide_ir.fbs --- src/halide_ir.fbs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index e5855e301d1e..d91222d62f65 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -7,10 +7,12 @@ file_identifier "HLDE"; file_extension "hlpipe"; enum SerializationVersionMajor: int { - Value = 0 + Value = 18 } enum SerializationVersionMinor: int { - Value = 1 + // 0 = Unstable + // 1 = First stable version + Value = 0 } enum SerializationVersionPatch: int { Value = 0 From a3baa5de2b1064fa2930b94d9a49b11676457cbb Mon Sep 17 00:00:00 2001 From: James Price Date: Fri, 9 Feb 2024 13:39:21 -0500 Subject: [PATCH 054/186] [WebGPU] Update to latest native headers (#8081) * [WebGPU] Update to latest native headers * Remove #ifdef for `requiredFeature[s]Count` * Pass nullptr to wgpuCreateInstance * Emscripten currently requires this * Dawn accepts it too * Use nullptr for another wgpuCreateInstance call --- src/runtime/mini_webgpu.h | 490 +++++++++++++++++++++++++------------- src/runtime/webgpu.cpp | 4 +- test/common/gpu_context.h | 9 +- 3 files changed, 323 insertions(+), 180 deletions(-) diff --git a/src/runtime/mini_webgpu.h b/src/runtime/mini_webgpu.h index 5a766d1a80c3..3d6bf862f0b7 100644 --- a/src/runtime/mini_webgpu.h +++ b/src/runtime/mini_webgpu.h @@ -75,7 +75,7 @@ #define WGPU_ARRAY_LAYER_COUNT_UNDEFINED UINT32_MAX #define WGPU_COPY_STRIDE_UNDEFINED UINT32_MAX -#define WGPU_DEPTH_SLICE_UNDEFINED (0xffffffffUL) +#define WGPU_DEPTH_SLICE_UNDEFINED UINT32_MAX #define WGPU_LIMIT_U32_UNDEFINED UINT32_MAX #define WGPU_LIMIT_U64_UNDEFINED UINT64_MAX #define WGPU_MIP_LEVEL_COUNT_UNDEFINED UINT32_MAX @@ -115,6 +115,7 @@ typedef struct WGPUTextureViewImpl* WGPUTextureView WGPU_OBJECT_ATTRIBUTE; // Structure forward declarations struct WGPUAdapterProperties; +struct WGPUAdapterPropertiesD3D; struct WGPUBindGroupEntry; struct WGPUBlendComponent; struct WGPUBufferBindingLayout; @@ -128,9 +129,13 @@ struct WGPUCompilationMessage; struct WGPUComputePassTimestampWrites; struct WGPUConstantEntry; struct WGPUCopyTextureForBrowserOptions; +struct WGPUCreateComputePipelineAsyncCallbackInfo; +struct WGPUCreateRenderPipelineAsyncCallbackInfo; +struct WGPUDawnWGSLBlocklist; struct WGPUDawnAdapterPropertiesPowerPreference; struct WGPUDawnBufferDescriptorErrorInfoFromWireClient; struct WGPUDawnCacheDeviceDescriptor; +struct WGPUDawnComputePipelineFullSubgroups; struct WGPUDawnEncoderInternalUsageDescriptor; struct WGPUDawnExperimentalSubgroupLimits; struct 
WGPUDawnMultisampleStateRenderToSingleSampled; @@ -138,6 +143,7 @@ struct WGPUDawnRenderPassColorAttachmentRenderToSingleSampled; struct WGPUDawnShaderModuleSPIRVOptionsDescriptor; struct WGPUDawnTextureInternalUsageDescriptor; struct WGPUDawnTogglesDescriptor; +struct WGPUDawnWireWGSLControl; struct WGPUDepthStencilStateDepthWriteDefinedDawn; struct WGPUExtent2D; struct WGPUExtent3D; @@ -146,6 +152,7 @@ struct WGPUExternalTextureBindingLayout; struct WGPUFuture; struct WGPUInstanceFeatures; struct WGPULimits; +struct WGPUMemoryHeapInfo; struct WGPUMultisampleState; struct WGPUOrigin2D; struct WGPUOrigin3D; @@ -163,31 +170,32 @@ struct WGPURenderPassDescriptorMaxDrawCount; struct WGPURenderPassTimestampWrites; struct WGPURequestAdapterCallbackInfo; struct WGPURequestAdapterOptions; +struct WGPURequestDeviceCallbackInfo; struct WGPUSamplerBindingLayout; struct WGPUSamplerDescriptor; -struct WGPUShaderModuleDescriptor; struct WGPUShaderModuleSPIRVDescriptor; struct WGPUShaderModuleWGSLDescriptor; -struct WGPUSharedFenceDescriptor; +struct WGPUShaderModuleDescriptor; struct WGPUSharedFenceDXGISharedHandleDescriptor; struct WGPUSharedFenceDXGISharedHandleExportInfo; -struct WGPUSharedFenceExportInfo; struct WGPUSharedFenceMTLSharedEventDescriptor; struct WGPUSharedFenceMTLSharedEventExportInfo; +struct WGPUSharedFenceDescriptor; +struct WGPUSharedFenceExportInfo; struct WGPUSharedFenceVkSemaphoreOpaqueFDDescriptor; struct WGPUSharedFenceVkSemaphoreOpaqueFDExportInfo; struct WGPUSharedFenceVkSemaphoreSyncFDDescriptor; struct WGPUSharedFenceVkSemaphoreSyncFDExportInfo; struct WGPUSharedFenceVkSemaphoreZirconHandleDescriptor; struct WGPUSharedFenceVkSemaphoreZirconHandleExportInfo; +struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor; +struct WGPUSharedTextureMemoryEGLImageDescriptor; +struct WGPUSharedTextureMemoryIOSurfaceDescriptor; struct WGPUSharedTextureMemoryAHardwareBufferDescriptor; struct WGPUSharedTextureMemoryBeginAccessDescriptor; struct WGPUSharedTextureMemoryDescriptor; -struct WGPUSharedTextureMemoryDmaBufDescriptor; -struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor; -struct WGPUSharedTextureMemoryEGLImageDescriptor; +struct WGPUSharedTextureMemoryDmaBufPlane; struct WGPUSharedTextureMemoryEndAccessState; -struct WGPUSharedTextureMemoryIOSurfaceDescriptor; struct WGPUSharedTextureMemoryOpaqueFDDescriptor; struct WGPUSharedTextureMemoryVkDedicatedAllocationDescriptor; struct WGPUSharedTextureMemoryVkImageLayoutBeginState; @@ -200,8 +208,8 @@ struct WGPUSurfaceDescriptorFromAndroidNativeWindow; struct WGPUSurfaceDescriptorFromCanvasHTMLSelector; struct WGPUSurfaceDescriptorFromMetalLayer; struct WGPUSurfaceDescriptorFromWaylandSurface; -struct WGPUSurfaceDescriptorFromWindowsCoreWindow; struct WGPUSurfaceDescriptorFromWindowsHWND; +struct WGPUSurfaceDescriptorFromWindowsCoreWindow; struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel; struct WGPUSurfaceDescriptorFromXlibWindow; struct WGPUSwapChainDescriptor; @@ -210,6 +218,7 @@ struct WGPUTextureBindingViewDimensionDescriptor; struct WGPUTextureDataLayout; struct WGPUTextureViewDescriptor; struct WGPUVertexAttribute; +struct WGPUAdapterPropertiesMemoryHeaps; struct WGPUBindGroupDescriptor; struct WGPUBindGroupLayoutEntry; struct WGPUBlendState; @@ -227,6 +236,7 @@ struct WGPUProgrammableStageDescriptor; struct WGPURenderPassColorAttachment; struct WGPURenderPassStorageAttachment; struct WGPURequiredLimits; +struct WGPUSharedTextureMemoryDmaBufDescriptor; struct WGPUSharedTextureMemoryProperties; struct 
WGPUSharedTextureMemoryVkImageDescriptor; struct WGPUSupportedLimits; @@ -242,25 +252,40 @@ struct WGPUVertexState; struct WGPUFragmentState; struct WGPURenderPipelineDescriptor; +typedef enum WGPUWGSLFeatureName { + WGPUWGSLFeatureName_Undefined = 0x00000000, + WGPUWGSLFeatureName_ReadonlyAndReadwriteStorageTextures = 0x00000001, + WGPUWGSLFeatureName_Packed4x8IntegerDotProduct = 0x00000002, + WGPUWGSLFeatureName_UnrestrictedPointerParameters = 0x00000003, + WGPUWGSLFeatureName_PointerCompositeAccess = 0x00000004, + WGPUWGSLFeatureName_ChromiumTestingUnimplemented = 0x000003E8, + WGPUWGSLFeatureName_ChromiumTestingUnsafeExperimental = 0x000003E9, + WGPUWGSLFeatureName_ChromiumTestingExperimental = 0x000003EA, + WGPUWGSLFeatureName_ChromiumTestingShippedWithKillswitch = 0x000003EB, + WGPUWGSLFeatureName_ChromiumTestingShipped = 0x000003EC, + WGPUWGSLFeatureName_Force32 = 0x7FFFFFFF +} WGPUWGSLFeatureName WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUAdapterType { - WGPUAdapterType_DiscreteGPU = 0x00000000, - WGPUAdapterType_IntegratedGPU = 0x00000001, - WGPUAdapterType_CPU = 0x00000002, - WGPUAdapterType_Unknown = 0x00000003, + WGPUAdapterType_DiscreteGPU = 0x00000001, + WGPUAdapterType_IntegratedGPU = 0x00000002, + WGPUAdapterType_CPU = 0x00000003, + WGPUAdapterType_Unknown = 0x00000004, WGPUAdapterType_Force32 = 0x7FFFFFFF } WGPUAdapterType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUAddressMode { - WGPUAddressMode_Repeat = 0x00000000, - WGPUAddressMode_MirrorRepeat = 0x00000001, - WGPUAddressMode_ClampToEdge = 0x00000002, + WGPUAddressMode_Undefined = 0x00000000, + WGPUAddressMode_ClampToEdge = 0x00000001, + WGPUAddressMode_Repeat = 0x00000002, + WGPUAddressMode_MirrorRepeat = 0x00000003, WGPUAddressMode_Force32 = 0x7FFFFFFF } WGPUAddressMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUAlphaMode { - WGPUAlphaMode_Premultiplied = 0x00000000, - WGPUAlphaMode_Unpremultiplied = 0x00000001, - WGPUAlphaMode_Opaque = 0x00000002, + WGPUAlphaMode_Opaque = 0x00000001, + WGPUAlphaMode_Premultiplied = 0x00000002, + WGPUAlphaMode_Unpremultiplied = 0x00000003, WGPUAlphaMode_Force32 = 0x7FFFFFFF } WGPUAlphaMode WGPU_ENUM_ATTRIBUTE; @@ -278,32 +303,34 @@ typedef enum WGPUBackendType { } WGPUBackendType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBlendFactor { - WGPUBlendFactor_Zero = 0x00000000, - WGPUBlendFactor_One = 0x00000001, - WGPUBlendFactor_Src = 0x00000002, - WGPUBlendFactor_OneMinusSrc = 0x00000003, - WGPUBlendFactor_SrcAlpha = 0x00000004, - WGPUBlendFactor_OneMinusSrcAlpha = 0x00000005, - WGPUBlendFactor_Dst = 0x00000006, - WGPUBlendFactor_OneMinusDst = 0x00000007, - WGPUBlendFactor_DstAlpha = 0x00000008, - WGPUBlendFactor_OneMinusDstAlpha = 0x00000009, - WGPUBlendFactor_SrcAlphaSaturated = 0x0000000A, - WGPUBlendFactor_Constant = 0x0000000B, - WGPUBlendFactor_OneMinusConstant = 0x0000000C, - WGPUBlendFactor_Src1 = 0x0000000D, - WGPUBlendFactor_OneMinusSrc1 = 0x0000000E, - WGPUBlendFactor_Src1Alpha = 0x0000000F, - WGPUBlendFactor_OneMinusSrc1Alpha = 0x00000010, + WGPUBlendFactor_Undefined = 0x00000000, + WGPUBlendFactor_Zero = 0x00000001, + WGPUBlendFactor_One = 0x00000002, + WGPUBlendFactor_Src = 0x00000003, + WGPUBlendFactor_OneMinusSrc = 0x00000004, + WGPUBlendFactor_SrcAlpha = 0x00000005, + WGPUBlendFactor_OneMinusSrcAlpha = 0x00000006, + WGPUBlendFactor_Dst = 0x00000007, + WGPUBlendFactor_OneMinusDst = 0x00000008, + WGPUBlendFactor_DstAlpha = 0x00000009, + WGPUBlendFactor_OneMinusDstAlpha = 0x0000000A, + WGPUBlendFactor_SrcAlphaSaturated = 0x0000000B, + WGPUBlendFactor_Constant = 0x0000000C, + 
WGPUBlendFactor_OneMinusConstant = 0x0000000D, + WGPUBlendFactor_Src1 = 0x0000000E, + WGPUBlendFactor_OneMinusSrc1 = 0x0000000F, + WGPUBlendFactor_Src1Alpha = 0x00000010, + WGPUBlendFactor_OneMinusSrc1Alpha = 0x00000011, WGPUBlendFactor_Force32 = 0x7FFFFFFF } WGPUBlendFactor WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBlendOperation { - WGPUBlendOperation_Add = 0x00000000, - WGPUBlendOperation_Subtract = 0x00000001, - WGPUBlendOperation_ReverseSubtract = 0x00000002, - WGPUBlendOperation_Min = 0x00000003, - WGPUBlendOperation_Max = 0x00000004, + WGPUBlendOperation_Undefined = 0x00000000, + WGPUBlendOperation_Add = 0x00000001, + WGPUBlendOperation_Subtract = 0x00000002, + WGPUBlendOperation_ReverseSubtract = 0x00000003, + WGPUBlendOperation_Min = 0x00000004, + WGPUBlendOperation_Max = 0x00000005, WGPUBlendOperation_Force32 = 0x7FFFFFFF } WGPUBlendOperation WGPU_ENUM_ATTRIBUTE; @@ -317,21 +344,22 @@ typedef enum WGPUBufferBindingType { typedef enum WGPUBufferMapAsyncStatus { WGPUBufferMapAsyncStatus_Success = 0x00000000, - WGPUBufferMapAsyncStatus_ValidationError = 0x00000001, - WGPUBufferMapAsyncStatus_Unknown = 0x00000002, - WGPUBufferMapAsyncStatus_DeviceLost = 0x00000003, - WGPUBufferMapAsyncStatus_DestroyedBeforeCallback = 0x00000004, - WGPUBufferMapAsyncStatus_UnmappedBeforeCallback = 0x00000005, - WGPUBufferMapAsyncStatus_MappingAlreadyPending = 0x00000006, - WGPUBufferMapAsyncStatus_OffsetOutOfRange = 0x00000007, - WGPUBufferMapAsyncStatus_SizeOutOfRange = 0x00000008, + WGPUBufferMapAsyncStatus_InstanceDropped = 0x00000001, + WGPUBufferMapAsyncStatus_ValidationError = 0x00000002, + WGPUBufferMapAsyncStatus_Unknown = 0x00000003, + WGPUBufferMapAsyncStatus_DeviceLost = 0x00000004, + WGPUBufferMapAsyncStatus_DestroyedBeforeCallback = 0x00000005, + WGPUBufferMapAsyncStatus_UnmappedBeforeCallback = 0x00000006, + WGPUBufferMapAsyncStatus_MappingAlreadyPending = 0x00000007, + WGPUBufferMapAsyncStatus_OffsetOutOfRange = 0x00000008, + WGPUBufferMapAsyncStatus_SizeOutOfRange = 0x00000009, WGPUBufferMapAsyncStatus_Force32 = 0x7FFFFFFF } WGPUBufferMapAsyncStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUBufferMapState { - WGPUBufferMapState_Unmapped = 0x00000000, - WGPUBufferMapState_Pending = 0x00000001, - WGPUBufferMapState_Mapped = 0x00000002, + WGPUBufferMapState_Unmapped = 0x00000001, + WGPUBufferMapState_Pending = 0x00000002, + WGPUBufferMapState_Mapped = 0x00000003, WGPUBufferMapState_Force32 = 0x7FFFFFFF } WGPUBufferMapState WGPU_ENUM_ATTRIBUTE; @@ -346,44 +374,47 @@ typedef enum WGPUCompareFunction { WGPUCompareFunction_Undefined = 0x00000000, WGPUCompareFunction_Never = 0x00000001, WGPUCompareFunction_Less = 0x00000002, - WGPUCompareFunction_LessEqual = 0x00000003, - WGPUCompareFunction_Greater = 0x00000004, - WGPUCompareFunction_GreaterEqual = 0x00000005, - WGPUCompareFunction_Equal = 0x00000006, - WGPUCompareFunction_NotEqual = 0x00000007, + WGPUCompareFunction_Equal = 0x00000003, + WGPUCompareFunction_LessEqual = 0x00000004, + WGPUCompareFunction_Greater = 0x00000005, + WGPUCompareFunction_NotEqual = 0x00000006, + WGPUCompareFunction_GreaterEqual = 0x00000007, WGPUCompareFunction_Always = 0x00000008, WGPUCompareFunction_Force32 = 0x7FFFFFFF } WGPUCompareFunction WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCompilationInfoRequestStatus { WGPUCompilationInfoRequestStatus_Success = 0x00000000, - WGPUCompilationInfoRequestStatus_Error = 0x00000001, - WGPUCompilationInfoRequestStatus_DeviceLost = 0x00000002, - WGPUCompilationInfoRequestStatus_Unknown = 0x00000003, + 
WGPUCompilationInfoRequestStatus_InstanceDropped = 0x00000001, + WGPUCompilationInfoRequestStatus_Error = 0x00000002, + WGPUCompilationInfoRequestStatus_DeviceLost = 0x00000003, + WGPUCompilationInfoRequestStatus_Unknown = 0x00000004, WGPUCompilationInfoRequestStatus_Force32 = 0x7FFFFFFF } WGPUCompilationInfoRequestStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCompilationMessageType { - WGPUCompilationMessageType_Error = 0x00000000, - WGPUCompilationMessageType_Warning = 0x00000001, - WGPUCompilationMessageType_Info = 0x00000002, + WGPUCompilationMessageType_Error = 0x00000001, + WGPUCompilationMessageType_Warning = 0x00000002, + WGPUCompilationMessageType_Info = 0x00000003, WGPUCompilationMessageType_Force32 = 0x7FFFFFFF } WGPUCompilationMessageType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCreatePipelineAsyncStatus { WGPUCreatePipelineAsyncStatus_Success = 0x00000000, - WGPUCreatePipelineAsyncStatus_ValidationError = 0x00000001, - WGPUCreatePipelineAsyncStatus_InternalError = 0x00000002, - WGPUCreatePipelineAsyncStatus_DeviceLost = 0x00000003, - WGPUCreatePipelineAsyncStatus_DeviceDestroyed = 0x00000004, - WGPUCreatePipelineAsyncStatus_Unknown = 0x00000005, + WGPUCreatePipelineAsyncStatus_InstanceDropped = 0x00000001, + WGPUCreatePipelineAsyncStatus_ValidationError = 0x00000002, + WGPUCreatePipelineAsyncStatus_InternalError = 0x00000003, + WGPUCreatePipelineAsyncStatus_DeviceLost = 0x00000004, + WGPUCreatePipelineAsyncStatus_DeviceDestroyed = 0x00000005, + WGPUCreatePipelineAsyncStatus_Unknown = 0x00000006, WGPUCreatePipelineAsyncStatus_Force32 = 0x7FFFFFFF } WGPUCreatePipelineAsyncStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPUCullMode { - WGPUCullMode_None = 0x00000000, - WGPUCullMode_Front = 0x00000001, - WGPUCullMode_Back = 0x00000002, + WGPUCullMode_Undefined = 0x00000000, + WGPUCullMode_None = 0x00000001, + WGPUCullMode_Front = 0x00000002, + WGPUCullMode_Back = 0x00000003, WGPUCullMode_Force32 = 0x7FFFFFFF } WGPUCullMode WGPU_ENUM_ATTRIBUTE; @@ -394,9 +425,9 @@ typedef enum WGPUDeviceLostReason { } WGPUDeviceLostReason WGPU_ENUM_ATTRIBUTE; typedef enum WGPUErrorFilter { - WGPUErrorFilter_Validation = 0x00000000, - WGPUErrorFilter_OutOfMemory = 0x00000001, - WGPUErrorFilter_Internal = 0x00000002, + WGPUErrorFilter_Validation = 0x00000001, + WGPUErrorFilter_OutOfMemory = 0x00000002, + WGPUErrorFilter_Internal = 0x00000003, WGPUErrorFilter_Force32 = 0x7FFFFFFF } WGPUErrorFilter WGPU_ENUM_ATTRIBUTE; @@ -434,7 +465,6 @@ typedef enum WGPUFeatureName { WGPUFeatureName_DawnInternalUsages = 0x000003EA, WGPUFeatureName_DawnMultiPlanarFormats = 0x000003EB, WGPUFeatureName_DawnNative = 0x000003EC, - WGPUFeatureName_ChromiumExperimentalDp4a = 0x000003ED, WGPUFeatureName_ChromiumExperimentalTimestampQueryInsidePasses = 0x000003EE, WGPUFeatureName_ImplicitDeviceSynchronization = 0x000003EF, WGPUFeatureName_SurfaceCapabilities = 0x000003F0, @@ -455,6 +485,8 @@ typedef enum WGPUFeatureName { WGPUFeatureName_MultiPlanarFormatNv12a = 0x00000400, WGPUFeatureName_FramebufferFetch = 0x00000401, WGPUFeatureName_BufferMapExtendedUsages = 0x00000402, + WGPUFeatureName_AdapterPropertiesMemoryHeaps = 0x00000403, + WGPUFeatureName_AdapterPropertiesD3D = 0x00000404, WGPUFeatureName_SharedTextureMemoryVkDedicatedAllocation = 0x0000044C, WGPUFeatureName_SharedTextureMemoryAHardwareBuffer = 0x0000044D, WGPUFeatureName_SharedTextureMemoryDmaBuf = 0x0000044E, @@ -473,14 +505,16 @@ typedef enum WGPUFeatureName { } WGPUFeatureName WGPU_ENUM_ATTRIBUTE; typedef enum WGPUFilterMode { - WGPUFilterMode_Nearest = 0x00000000, - 
WGPUFilterMode_Linear = 0x00000001, + WGPUFilterMode_Undefined = 0x00000000, + WGPUFilterMode_Nearest = 0x00000001, + WGPUFilterMode_Linear = 0x00000002, WGPUFilterMode_Force32 = 0x7FFFFFFF } WGPUFilterMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUFrontFace { - WGPUFrontFace_CCW = 0x00000000, - WGPUFrontFace_CW = 0x00000001, + WGPUFrontFace_Undefined = 0x00000000, + WGPUFrontFace_CCW = 0x00000001, + WGPUFrontFace_CW = 0x00000002, WGPUFrontFace_Force32 = 0x7FFFFFFF } WGPUFrontFace WGPU_ENUM_ATTRIBUTE; @@ -499,16 +533,17 @@ typedef enum WGPULoadOp { } WGPULoadOp WGPU_ENUM_ATTRIBUTE; typedef enum WGPULoggingType { - WGPULoggingType_Verbose = 0x00000000, - WGPULoggingType_Info = 0x00000001, - WGPULoggingType_Warning = 0x00000002, - WGPULoggingType_Error = 0x00000003, + WGPULoggingType_Verbose = 0x00000001, + WGPULoggingType_Info = 0x00000002, + WGPULoggingType_Warning = 0x00000003, + WGPULoggingType_Error = 0x00000004, WGPULoggingType_Force32 = 0x7FFFFFFF } WGPULoggingType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUMipmapFilterMode { - WGPUMipmapFilterMode_Nearest = 0x00000000, - WGPUMipmapFilterMode_Linear = 0x00000001, + WGPUMipmapFilterMode_Undefined = 0x00000000, + WGPUMipmapFilterMode_Nearest = 0x00000001, + WGPUMipmapFilterMode_Linear = 0x00000002, WGPUMipmapFilterMode_Force32 = 0x7FFFFFFF } WGPUMipmapFilterMode WGPU_ENUM_ATTRIBUTE; @@ -520,47 +555,51 @@ typedef enum WGPUPowerPreference { } WGPUPowerPreference WGPU_ENUM_ATTRIBUTE; typedef enum WGPUPresentMode { - WGPUPresentMode_Fifo = 0x00000000, - WGPUPresentMode_Immediate = 0x00000002, - WGPUPresentMode_Mailbox = 0x00000003, + WGPUPresentMode_Fifo = 0x00000001, + WGPUPresentMode_Immediate = 0x00000003, + WGPUPresentMode_Mailbox = 0x00000004, WGPUPresentMode_Force32 = 0x7FFFFFFF } WGPUPresentMode WGPU_ENUM_ATTRIBUTE; typedef enum WGPUPrimitiveTopology { - WGPUPrimitiveTopology_PointList = 0x00000000, - WGPUPrimitiveTopology_LineList = 0x00000001, - WGPUPrimitiveTopology_LineStrip = 0x00000002, - WGPUPrimitiveTopology_TriangleList = 0x00000003, - WGPUPrimitiveTopology_TriangleStrip = 0x00000004, + WGPUPrimitiveTopology_Undefined = 0x00000000, + WGPUPrimitiveTopology_PointList = 0x00000001, + WGPUPrimitiveTopology_LineList = 0x00000002, + WGPUPrimitiveTopology_LineStrip = 0x00000003, + WGPUPrimitiveTopology_TriangleList = 0x00000004, + WGPUPrimitiveTopology_TriangleStrip = 0x00000005, WGPUPrimitiveTopology_Force32 = 0x7FFFFFFF } WGPUPrimitiveTopology WGPU_ENUM_ATTRIBUTE; typedef enum WGPUQueryType { - WGPUQueryType_Occlusion = 0x00000000, - WGPUQueryType_Timestamp = 0x00000001, + WGPUQueryType_Occlusion = 0x00000001, + WGPUQueryType_Timestamp = 0x00000002, WGPUQueryType_Force32 = 0x7FFFFFFF } WGPUQueryType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUQueueWorkDoneStatus { WGPUQueueWorkDoneStatus_Success = 0x00000000, - WGPUQueueWorkDoneStatus_Error = 0x00000001, - WGPUQueueWorkDoneStatus_Unknown = 0x00000002, - WGPUQueueWorkDoneStatus_DeviceLost = 0x00000003, + WGPUQueueWorkDoneStatus_InstanceDropped = 0x00000001, + WGPUQueueWorkDoneStatus_Error = 0x00000002, + WGPUQueueWorkDoneStatus_Unknown = 0x00000003, + WGPUQueueWorkDoneStatus_DeviceLost = 0x00000004, WGPUQueueWorkDoneStatus_Force32 = 0x7FFFFFFF } WGPUQueueWorkDoneStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPURequestAdapterStatus { WGPURequestAdapterStatus_Success = 0x00000000, - WGPURequestAdapterStatus_Unavailable = 0x00000001, - WGPURequestAdapterStatus_Error = 0x00000002, - WGPURequestAdapterStatus_Unknown = 0x00000003, + WGPURequestAdapterStatus_InstanceDropped = 0x00000001, + 
WGPURequestAdapterStatus_Unavailable = 0x00000002, + WGPURequestAdapterStatus_Error = 0x00000003, + WGPURequestAdapterStatus_Unknown = 0x00000004, WGPURequestAdapterStatus_Force32 = 0x7FFFFFFF } WGPURequestAdapterStatus WGPU_ENUM_ATTRIBUTE; typedef enum WGPURequestDeviceStatus { WGPURequestDeviceStatus_Success = 0x00000000, - WGPURequestDeviceStatus_Error = 0x00000001, - WGPURequestDeviceStatus_Unknown = 0x00000002, + WGPURequestDeviceStatus_InstanceDropped = 0x00000001, + WGPURequestDeviceStatus_Error = 0x00000002, + WGPURequestDeviceStatus_Unknown = 0x00000003, WGPURequestDeviceStatus_Force32 = 0x7FFFFFFF } WGPURequestDeviceStatus WGPU_ENUM_ATTRIBUTE; @@ -599,6 +638,11 @@ typedef enum WGPUSType { WGPUSType_PipelineLayoutPixelLocalStorage = 0x000003F8, WGPUSType_BufferHostMappedPointer = 0x000003F9, WGPUSType_DawnExperimentalSubgroupLimits = 0x000003FA, + WGPUSType_AdapterPropertiesMemoryHeaps = 0x000003FB, + WGPUSType_AdapterPropertiesD3D = 0x000003FC, + WGPUSType_DawnComputePipelineFullSubgroups = 0x000003FD, + WGPUSType_DawnWireWGSLControl = 0x000003FE, + WGPUSType_DawnWGSLBlocklist = 0x000003FF, WGPUSType_SharedTextureMemoryVkImageDescriptor = 0x0000044C, WGPUSType_SharedTextureMemoryVkDedicatedAllocationDescriptor = 0x0000044D, WGPUSType_SharedTextureMemoryAHardwareBufferDescriptor = 0x0000044E, @@ -645,14 +689,15 @@ typedef enum WGPUSharedFenceType { } WGPUSharedFenceType WGPU_ENUM_ATTRIBUTE; typedef enum WGPUStencilOperation { - WGPUStencilOperation_Keep = 0x00000000, - WGPUStencilOperation_Zero = 0x00000001, - WGPUStencilOperation_Replace = 0x00000002, - WGPUStencilOperation_Invert = 0x00000003, - WGPUStencilOperation_IncrementClamp = 0x00000004, - WGPUStencilOperation_DecrementClamp = 0x00000005, - WGPUStencilOperation_IncrementWrap = 0x00000006, - WGPUStencilOperation_DecrementWrap = 0x00000007, + WGPUStencilOperation_Undefined = 0x00000000, + WGPUStencilOperation_Keep = 0x00000001, + WGPUStencilOperation_Zero = 0x00000002, + WGPUStencilOperation_Replace = 0x00000003, + WGPUStencilOperation_Invert = 0x00000004, + WGPUStencilOperation_IncrementClamp = 0x00000005, + WGPUStencilOperation_DecrementClamp = 0x00000006, + WGPUStencilOperation_IncrementWrap = 0x00000007, + WGPUStencilOperation_DecrementWrap = 0x00000008, WGPUStencilOperation_Force32 = 0x7FFFFFFF } WGPUStencilOperation WGPU_ENUM_ATTRIBUTE; @@ -672,19 +717,21 @@ typedef enum WGPUStoreOp { } WGPUStoreOp WGPU_ENUM_ATTRIBUTE; typedef enum WGPUTextureAspect { - WGPUTextureAspect_All = 0x00000000, - WGPUTextureAspect_StencilOnly = 0x00000001, - WGPUTextureAspect_DepthOnly = 0x00000002, - WGPUTextureAspect_Plane0Only = 0x00000003, - WGPUTextureAspect_Plane1Only = 0x00000004, - WGPUTextureAspect_Plane2Only = 0x00000005, + WGPUTextureAspect_Undefined = 0x00000000, + WGPUTextureAspect_All = 0x00000001, + WGPUTextureAspect_StencilOnly = 0x00000002, + WGPUTextureAspect_DepthOnly = 0x00000003, + WGPUTextureAspect_Plane0Only = 0x00000004, + WGPUTextureAspect_Plane1Only = 0x00000005, + WGPUTextureAspect_Plane2Only = 0x00000006, WGPUTextureAspect_Force32 = 0x7FFFFFFF } WGPUTextureAspect WGPU_ENUM_ATTRIBUTE; typedef enum WGPUTextureDimension { - WGPUTextureDimension_1D = 0x00000000, - WGPUTextureDimension_2D = 0x00000001, - WGPUTextureDimension_3D = 0x00000002, + WGPUTextureDimension_Undefined = 0x00000000, + WGPUTextureDimension_1D = 0x00000001, + WGPUTextureDimension_2D = 0x00000002, + WGPUTextureDimension_3D = 0x00000003, WGPUTextureDimension_Force32 = 0x7FFFFFFF } WGPUTextureDimension WGPU_ENUM_ATTRIBUTE; @@ -855,9 +902,10 @@ typedef 
enum WGPUVertexFormat { } WGPUVertexFormat WGPU_ENUM_ATTRIBUTE; typedef enum WGPUVertexStepMode { - WGPUVertexStepMode_Vertex = 0x00000000, - WGPUVertexStepMode_Instance = 0x00000001, - WGPUVertexStepMode_VertexBufferNotUsed = 0x00000002, + WGPUVertexStepMode_Undefined = 0x00000000, + WGPUVertexStepMode_VertexBufferNotUsed = 0x00000001, + WGPUVertexStepMode_Vertex = 0x00000002, + WGPUVertexStepMode_Instance = 0x00000003, WGPUVertexStepMode_Force32 = 0x7FFFFFFF } WGPUVertexStepMode WGPU_ENUM_ATTRIBUTE; @@ -898,6 +946,17 @@ typedef enum WGPUColorWriteMask { } WGPUColorWriteMask WGPU_ENUM_ATTRIBUTE; typedef WGPUFlags WGPUColorWriteMaskFlags WGPU_ENUM_ATTRIBUTE; +typedef enum WGPUHeapProperty { + WGPUHeapProperty_Undefined = 0x00000000, + WGPUHeapProperty_DeviceLocal = 0x00000001, + WGPUHeapProperty_HostVisible = 0x00000002, + WGPUHeapProperty_HostCoherent = 0x00000004, + WGPUHeapProperty_HostUncached = 0x00000008, + WGPUHeapProperty_HostCached = 0x00000010, + WGPUHeapProperty_Force32 = 0x7FFFFFFF +} WGPUHeapProperty WGPU_ENUM_ATTRIBUTE; +typedef WGPUFlags WGPUHeapPropertyFlags WGPU_ENUM_ATTRIBUTE; + typedef enum WGPUMapMode { WGPUMapMode_None = 0x00000000, WGPUMapMode_Read = 0x00000001, @@ -933,6 +992,8 @@ typedef void (*WGPUCallback)(void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCompilationInfoCallback)(WGPUCompilationInfoRequestStatus status, struct WGPUCompilationInfo const * compilationInfo, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateComputePipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPUComputePipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUCreateRenderPipelineAsyncCallback)(WGPUCreatePipelineAsyncStatus status, WGPURenderPipeline pipeline, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef size_t (*WGPUDawnLoadCacheDataFunction)(void const * key, size_t keySize, void * value, size_t valueSize, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUDawnStoreCacheDataFunction)(void const * key, size_t keySize, void const * value, size_t valueSize, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUDeviceLostCallback)(WGPUDeviceLostReason reason, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUErrorCallback)(WGPUErrorType type, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPULoggingCallback)(WGPULoggingType type, char const * message, void * userdata) WGPU_FUNCTION_ATTRIBUTE; @@ -964,6 +1025,12 @@ typedef struct WGPUAdapterProperties { WGPUBool compatibilityMode; } WGPUAdapterProperties WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUAdapterProperties +typedef struct WGPUAdapterPropertiesD3D { + WGPUChainedStructOut chain; + uint32_t shaderModel; +} WGPUAdapterPropertiesD3D WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUBindGroupEntry { WGPUChainedStruct const * nextInChain; uint32_t binding; @@ -1064,6 +1131,27 @@ typedef struct WGPUCopyTextureForBrowserOptions { WGPUBool internalUsage; } WGPUCopyTextureForBrowserOptions WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUCreateComputePipelineAsyncCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPUCreateComputePipelineAsyncCallback callback; + void * userdata; +} WGPUCreateComputePipelineAsyncCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUCreateRenderPipelineAsyncCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + 
WGPUCreateRenderPipelineAsyncCallback callback; + void * userdata; +} WGPUCreateRenderPipelineAsyncCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUInstanceDescriptor +typedef struct WGPUDawnWGSLBlocklist { + WGPUChainedStruct chain; + size_t blocklistedFeatureCount; + const char* const * blocklistedFeatures; +} WGPUDawnWGSLBlocklist WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUAdapterProperties typedef struct WGPUDawnAdapterPropertiesPowerPreference { WGPUChainedStructOut chain; @@ -1080,8 +1168,17 @@ typedef struct WGPUDawnBufferDescriptorErrorInfoFromWireClient { typedef struct WGPUDawnCacheDeviceDescriptor { WGPUChainedStruct chain; char const * isolationKey; + WGPUDawnLoadCacheDataFunction loadDataFunction; + WGPUDawnStoreCacheDataFunction storeDataFunction; + void * functionUserdata; } WGPUDawnCacheDeviceDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUComputePipelineDescriptor +typedef struct WGPUDawnComputePipelineFullSubgroups { + WGPUChainedStruct chain; + WGPUBool requiresFullSubgroups; +} WGPUDawnComputePipelineFullSubgroups WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUCommandEncoderDescriptor typedef struct WGPUDawnEncoderInternalUsageDescriptor { WGPUChainedStruct chain; @@ -1130,6 +1227,14 @@ typedef struct WGPUDawnTogglesDescriptor { const char* const * disabledToggles; } WGPUDawnTogglesDescriptor WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUInstanceDescriptor +typedef struct WGPUDawnWireWGSLControl { + WGPUChainedStruct chain; + WGPUBool enableExperimental; + WGPUBool enableUnsafe; + WGPUBool enableTesting; +} WGPUDawnWireWGSLControl WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUDepthStencilState typedef struct WGPUDepthStencilStateDepthWriteDefinedDawn { WGPUChainedStruct chain; @@ -1203,6 +1308,11 @@ typedef struct WGPULimits { uint32_t maxComputeWorkgroupsPerDimension; } WGPULimits WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUMemoryHeapInfo { + WGPUHeapPropertyFlags properties; + uint64_t size; +} WGPUMemoryHeapInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUMultisampleState { WGPUChainedStruct const * nextInChain; uint32_t count; @@ -1323,6 +1433,13 @@ typedef struct WGPURequestAdapterOptions { WGPUBool compatibilityMode; } WGPURequestAdapterOptions WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPURequestDeviceCallbackInfo { + WGPUChainedStruct const * nextInChain; + WGPUCallbackMode mode; + WGPURequestDeviceCallback callback; + void * userdata; +} WGPURequestDeviceCallbackInfo WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUSamplerBindingLayout { WGPUChainedStruct const * nextInChain; WGPUSamplerBindingType type; @@ -1343,11 +1460,6 @@ typedef struct WGPUSamplerDescriptor { uint16_t maxAnisotropy; } WGPUSamplerDescriptor WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUShaderModuleDescriptor { - WGPUChainedStruct const * nextInChain; - WGPU_NULLABLE char const * label; -} WGPUShaderModuleDescriptor WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUShaderModuleDescriptor typedef struct WGPUShaderModuleSPIRVDescriptor { WGPUChainedStruct chain; @@ -1361,10 +1473,10 @@ typedef struct WGPUShaderModuleWGSLDescriptor { char const * code; } WGPUShaderModuleWGSLDescriptor WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUSharedFenceDescriptor { +typedef struct WGPUShaderModuleDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; -} WGPUSharedFenceDescriptor WGPU_STRUCTURE_ATTRIBUTE; +} WGPUShaderModuleDescriptor WGPU_STRUCTURE_ATTRIBUTE; // Can be chained in WGPUSharedFenceDescriptor 
typedef struct WGPUSharedFenceDXGISharedHandleDescriptor { @@ -1378,11 +1490,6 @@ typedef struct WGPUSharedFenceDXGISharedHandleExportInfo { void * handle; } WGPUSharedFenceDXGISharedHandleExportInfo WGPU_STRUCTURE_ATTRIBUTE; -typedef struct WGPUSharedFenceExportInfo { - WGPUChainedStructOut * nextInChain; - WGPUSharedFenceType type; -} WGPUSharedFenceExportInfo WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSharedFenceDescriptor typedef struct WGPUSharedFenceMTLSharedEventDescriptor { WGPUChainedStruct chain; @@ -1395,6 +1502,16 @@ typedef struct WGPUSharedFenceMTLSharedEventExportInfo { void * sharedEvent; } WGPUSharedFenceMTLSharedEventExportInfo WGPU_STRUCTURE_ATTRIBUTE; +typedef struct WGPUSharedFenceDescriptor { + WGPUChainedStruct const * nextInChain; + WGPU_NULLABLE char const * label; +} WGPUSharedFenceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +typedef struct WGPUSharedFenceExportInfo { + WGPUChainedStructOut * nextInChain; + WGPUSharedFenceType type; +} WGPUSharedFenceExportInfo WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSharedFenceDescriptor typedef struct WGPUSharedFenceVkSemaphoreOpaqueFDDescriptor { WGPUChainedStruct chain; @@ -1431,6 +1548,24 @@ typedef struct WGPUSharedFenceVkSemaphoreZirconHandleExportInfo { uint32_t handle; } WGPUSharedFenceVkSemaphoreZirconHandleExportInfo WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor { + WGPUChainedStruct chain; + void * handle; +} WGPUSharedTextureMemoryDXGISharedHandleDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryEGLImageDescriptor { + WGPUChainedStruct chain; + void * image; +} WGPUSharedTextureMemoryEGLImageDescriptor WGPU_STRUCTURE_ATTRIBUTE; + +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryIOSurfaceDescriptor { + WGPUChainedStruct chain; + void * ioSurface; +} WGPUSharedTextureMemoryIOSurfaceDescriptor WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSharedTextureMemoryDescriptor typedef struct WGPUSharedTextureMemoryAHardwareBufferDescriptor { WGPUChainedStruct chain; @@ -1439,6 +1574,7 @@ typedef struct WGPUSharedTextureMemoryAHardwareBufferDescriptor { typedef struct WGPUSharedTextureMemoryBeginAccessDescriptor { WGPUChainedStruct const * nextInChain; + WGPUBool concurrentRead; WGPUBool initialized; size_t fenceCount; WGPUSharedFence const * fences; @@ -1450,28 +1586,11 @@ typedef struct WGPUSharedTextureMemoryDescriptor { WGPU_NULLABLE char const * label; } WGPUSharedTextureMemoryDescriptor WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryDmaBufDescriptor { - WGPUChainedStruct chain; - int memoryFD; - uint64_t allocationSize; - uint64_t drmModifier; - size_t planeCount; - uint64_t const * planeOffsets; - uint32_t const * planeStrides; -} WGPUSharedTextureMemoryDmaBufDescriptor WGPU_STRUCTURE_ATTRIBUTE; - -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryDXGISharedHandleDescriptor { - WGPUChainedStruct chain; - void * handle; -} WGPUSharedTextureMemoryDXGISharedHandleDescriptor WGPU_STRUCTURE_ATTRIBUTE; - -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryEGLImageDescriptor { - WGPUChainedStruct chain; - void * image; -} WGPUSharedTextureMemoryEGLImageDescriptor WGPU_STRUCTURE_ATTRIBUTE; +typedef struct 
WGPUSharedTextureMemoryDmaBufPlane { + int fd; + uint64_t offset; + uint32_t stride; +} WGPUSharedTextureMemoryDmaBufPlane WGPU_STRUCTURE_ATTRIBUTE; typedef struct WGPUSharedTextureMemoryEndAccessState { WGPUChainedStructOut * nextInChain; @@ -1481,17 +1600,14 @@ typedef struct WGPUSharedTextureMemoryEndAccessState { uint64_t const * signaledValues; } WGPUSharedTextureMemoryEndAccessState WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSharedTextureMemoryDescriptor -typedef struct WGPUSharedTextureMemoryIOSurfaceDescriptor { - WGPUChainedStruct chain; - void * ioSurface; -} WGPUSharedTextureMemoryIOSurfaceDescriptor WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSharedTextureMemoryDescriptor typedef struct WGPUSharedTextureMemoryOpaqueFDDescriptor { WGPUChainedStruct chain; + void const * vkImageCreateInfo; int memoryFD; + uint32_t memoryTypeIndex; uint64_t allocationSize; + WGPUBool dedicatedAllocation; } WGPUSharedTextureMemoryOpaqueFDDescriptor WGPU_STRUCTURE_ATTRIBUTE; // Can be chained in WGPUSharedTextureMemoryDescriptor @@ -1565,12 +1681,6 @@ typedef struct WGPUSurfaceDescriptorFromWaylandSurface { void * surface; } WGPUSurfaceDescriptorFromWaylandSurface WGPU_STRUCTURE_ATTRIBUTE; -// Can be chained in WGPUSurfaceDescriptor -typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { - WGPUChainedStruct chain; - void * coreWindow; -} WGPUSurfaceDescriptorFromWindowsCoreWindow WGPU_STRUCTURE_ATTRIBUTE; - // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsHWND { WGPUChainedStruct chain; @@ -1578,6 +1688,12 @@ typedef struct WGPUSurfaceDescriptorFromWindowsHWND { void * hwnd; } WGPUSurfaceDescriptorFromWindowsHWND WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSurfaceDescriptor +typedef struct WGPUSurfaceDescriptorFromWindowsCoreWindow { + WGPUChainedStruct chain; + void * coreWindow; +} WGPUSurfaceDescriptorFromWindowsCoreWindow WGPU_STRUCTURE_ATTRIBUTE; + // Can be chained in WGPUSurfaceDescriptor typedef struct WGPUSurfaceDescriptorFromWindowsSwapChainPanel { WGPUChainedStruct chain; @@ -1639,6 +1755,13 @@ typedef struct WGPUVertexAttribute { uint32_t shaderLocation; } WGPUVertexAttribute WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUAdapterProperties +typedef struct WGPUAdapterPropertiesMemoryHeaps { + WGPUChainedStructOut chain; + size_t heapCount; + WGPUMemoryHeapInfo const * heapInfo; +} WGPUAdapterPropertiesMemoryHeaps WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUBindGroupDescriptor { WGPUChainedStruct const * nextInChain; WGPU_NULLABLE char const * label; @@ -1701,6 +1824,7 @@ typedef struct WGPUExternalTextureDescriptor { float const * dstTransferFunctionParameters; float const * gamutConversionMatrix; WGPUBool flipY; + WGPUBool mirrored; WGPUExternalTextureRotation rotation; } WGPUExternalTextureDescriptor WGPU_STRUCTURE_ATTRIBUTE; @@ -1775,6 +1899,16 @@ typedef struct WGPURequiredLimits { WGPULimits limits; } WGPURequiredLimits WGPU_STRUCTURE_ATTRIBUTE; +// Can be chained in WGPUSharedTextureMemoryDescriptor +typedef struct WGPUSharedTextureMemoryDmaBufDescriptor { + WGPUChainedStruct chain; + WGPUExtent3D size; + uint32_t drmFormat; + uint64_t drmModifier; + size_t planeCount; + WGPUSharedTextureMemoryDmaBufPlane const * planes; +} WGPUSharedTextureMemoryDmaBufDescriptor WGPU_STRUCTURE_ATTRIBUTE; + typedef struct WGPUSharedTextureMemoryProperties { WGPUChainedStructOut * nextInChain; WGPUTextureUsageFlags usage; @@ -1903,6 +2037,7 @@ extern "C" { #if !defined(WGPU_SKIP_PROCS) typedef void 
(*WGPUProcAdapterPropertiesFreeMembers)(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; +typedef void (*WGPUProcAdapterPropertiesMemoryHeapsFreeMembers)(WGPUAdapterPropertiesMemoryHeaps value) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUInstance (*WGPUProcCreateInstance)(WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcGetInstanceFeatures)(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUProc (*WGPUProcGetProcAddress)(WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; @@ -1916,6 +2051,7 @@ typedef WGPUBool (*WGPUProcAdapterGetLimits)(WGPUAdapter adapter, WGPUSupportedL typedef void (*WGPUProcAdapterGetProperties)(WGPUAdapter adapter, WGPUAdapterProperties * properties) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcAdapterHasFeature)(WGPUAdapter adapter, WGPUFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterRequestDevice)(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * descriptor, WGPURequestDeviceCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcAdapterRequestDeviceF)(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * options, WGPURequestDeviceCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterReference)(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcAdapterRelease)(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; @@ -1995,6 +2131,7 @@ typedef WGPUBuffer (*WGPUProcDeviceCreateBuffer)(WGPUDevice device, WGPUBufferDe typedef WGPUCommandEncoder (*WGPUProcDeviceCreateCommandEncoder)(WGPUDevice device, WGPU_NULLABLE WGPUCommandEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUComputePipeline (*WGPUProcDeviceCreateComputePipeline)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcDeviceCreateComputePipelineAsync)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcDeviceCreateComputePipelineAsyncF)(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBuffer (*WGPUProcDeviceCreateErrorBuffer)(WGPUDevice device, WGPUBufferDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUExternalTexture (*WGPUProcDeviceCreateErrorExternalTexture)(WGPUDevice device) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUShaderModule (*WGPUProcDeviceCreateErrorShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor, char const * errorMessage) WGPU_FUNCTION_ATTRIBUTE; @@ -2005,6 +2142,7 @@ typedef WGPUQuerySet (*WGPUProcDeviceCreateQuerySet)(WGPUDevice device, WGPUQuer typedef WGPURenderBundleEncoder (*WGPUProcDeviceCreateRenderBundleEncoder)(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPURenderPipeline (*WGPUProcDeviceCreateRenderPipeline)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcDeviceCreateRenderPipelineAsync)(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUFuture (*WGPUProcDeviceCreateRenderPipelineAsyncF)(WGPUDevice device, WGPURenderPipelineDescriptor const * 
descriptor, WGPUCreateRenderPipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUSampler (*WGPUProcDeviceCreateSampler)(WGPUDevice device, WGPU_NULLABLE WGPUSamplerDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUShaderModule (*WGPUProcDeviceCreateShaderModule)(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUSwapChain (*WGPUProcDeviceCreateSwapChain)(WGPUDevice device, WGPUSurface surface, WGPUSwapChainDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; @@ -2041,6 +2179,8 @@ typedef void (*WGPUProcExternalTextureRelease)(WGPUExternalTexture externalTextu // Procs of Instance typedef WGPUSurface (*WGPUProcInstanceCreateSurface)(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +typedef size_t (*WGPUProcInstanceEnumerateWGSLLanguageFeatures)(WGPUInstance instance, WGPUWGSLFeatureName * features) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcInstanceHasWGSLLanguageFeature)(WGPUInstance instance, WGPUWGSLFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceProcessEvents)(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcInstanceRequestAdapter)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUFuture (*WGPUProcInstanceRequestAdapterF)(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; @@ -2148,11 +2288,13 @@ typedef WGPUBool (*WGPUProcSharedTextureMemoryBeginAccess)(WGPUSharedTextureMemo typedef WGPUTexture (*WGPUProcSharedTextureMemoryCreateTexture)(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUBool (*WGPUProcSharedTextureMemoryEndAccess)(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryGetProperties)(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; +typedef WGPUBool (*WGPUProcSharedTextureMemoryIsDeviceLost)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemorySetLabel)(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryReference)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSharedTextureMemoryRelease)(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; // Procs of Surface +typedef WGPUTextureFormat (*WGPUProcSurfaceGetPreferredFormat)(WGPUSurface surface, WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSurfaceReference)(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcSurfaceRelease)(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; @@ -2164,6 +2306,7 @@ typedef void (*WGPUProcSwapChainReference)(WGPUSwapChain swapChain) WGPU_FUNCTIO typedef void (*WGPUProcSwapChainRelease)(WGPUSwapChain swapChain) WGPU_FUNCTION_ATTRIBUTE; // Procs of Texture +typedef WGPUTextureView (*WGPUProcTextureCreateErrorView)(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef WGPUTextureView 
(*WGPUProcTextureCreateView)(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; typedef void (*WGPUProcTextureDestroy)(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; typedef uint32_t (*WGPUProcTextureGetDepthOrArrayLayers)(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; @@ -2189,6 +2332,7 @@ typedef void (*WGPUProcTextureViewRelease)(WGPUTextureView textureView) WGPU_FUN #if !defined(WGPU_SKIP_DECLARATIONS) WGPU_EXPORT void wgpuAdapterPropertiesFreeMembers(WGPUAdapterProperties value) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT void wgpuAdapterPropertiesMemoryHeapsFreeMembers(WGPUAdapterPropertiesMemoryHeaps value) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUInstance wgpuCreateInstance(WGPU_NULLABLE WGPUInstanceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuGetInstanceFeatures(WGPUInstanceFeatures * features) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUProc wgpuGetProcAddress(WGPU_NULLABLE WGPUDevice device, char const * procName) WGPU_FUNCTION_ATTRIBUTE; @@ -2202,6 +2346,7 @@ WGPU_EXPORT WGPUBool wgpuAdapterGetLimits(WGPUAdapter adapter, WGPUSupportedLimi WGPU_EXPORT void wgpuAdapterGetProperties(WGPUAdapter adapter, WGPUAdapterProperties * properties) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuAdapterHasFeature(WGPUAdapter adapter, WGPUFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterRequestDevice(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * descriptor, WGPURequestDeviceCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuAdapterRequestDeviceF(WGPUAdapter adapter, WGPU_NULLABLE WGPUDeviceDescriptor const * options, WGPURequestDeviceCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterReference(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuAdapterRelease(WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; @@ -2281,6 +2426,7 @@ WGPU_EXPORT WGPUBuffer wgpuDeviceCreateBuffer(WGPUDevice device, WGPUBufferDescr WGPU_EXPORT WGPUCommandEncoder wgpuDeviceCreateCommandEncoder(WGPUDevice device, WGPU_NULLABLE WGPUCommandEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUComputePipeline wgpuDeviceCreateComputePipeline(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuDeviceCreateComputePipelineAsync(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuDeviceCreateComputePipelineAsyncF(WGPUDevice device, WGPUComputePipelineDescriptor const * descriptor, WGPUCreateComputePipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBuffer wgpuDeviceCreateErrorBuffer(WGPUDevice device, WGPUBufferDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUExternalTexture wgpuDeviceCreateErrorExternalTexture(WGPUDevice device) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateErrorShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor, char const * errorMessage) WGPU_FUNCTION_ATTRIBUTE; @@ -2291,6 +2437,7 @@ WGPU_EXPORT WGPUQuerySet wgpuDeviceCreateQuerySet(WGPUDevice device, WGPUQuerySe WGPU_EXPORT WGPURenderBundleEncoder wgpuDeviceCreateRenderBundleEncoder(WGPUDevice device, WGPURenderBundleEncoderDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPURenderPipeline 
wgpuDeviceCreateRenderPipeline(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuDeviceCreateRenderPipelineAsync(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUFuture wgpuDeviceCreateRenderPipelineAsyncF(WGPUDevice device, WGPURenderPipelineDescriptor const * descriptor, WGPUCreateRenderPipelineAsyncCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUSampler wgpuDeviceCreateSampler(WGPUDevice device, WGPU_NULLABLE WGPUSamplerDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUShaderModule wgpuDeviceCreateShaderModule(WGPUDevice device, WGPUShaderModuleDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUSwapChain wgpuDeviceCreateSwapChain(WGPUDevice device, WGPUSurface surface, WGPUSwapChainDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; @@ -2327,6 +2474,8 @@ WGPU_EXPORT void wgpuExternalTextureRelease(WGPUExternalTexture externalTexture) // Methods of Instance WGPU_EXPORT WGPUSurface wgpuInstanceCreateSurface(WGPUInstance instance, WGPUSurfaceDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT size_t wgpuInstanceEnumerateWGSLLanguageFeatures(WGPUInstance instance, WGPUWGSLFeatureName * features) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuInstanceHasWGSLLanguageFeature(WGPUInstance instance, WGPUWGSLFeatureName feature) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceProcessEvents(WGPUInstance instance) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuInstanceRequestAdapter(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallback callback, void * userdata) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUFuture wgpuInstanceRequestAdapterF(WGPUInstance instance, WGPU_NULLABLE WGPURequestAdapterOptions const * options, WGPURequestAdapterCallbackInfo callbackInfo) WGPU_FUNCTION_ATTRIBUTE; @@ -2434,11 +2583,13 @@ WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryBeginAccess(WGPUSharedTextureMemory WGPU_EXPORT WGPUTexture wgpuSharedTextureMemoryCreateTexture(WGPUSharedTextureMemory sharedTextureMemory, WGPU_NULLABLE WGPUTextureDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryEndAccess(WGPUSharedTextureMemory sharedTextureMemory, WGPUTexture texture, WGPUSharedTextureMemoryEndAccessState * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryGetProperties(WGPUSharedTextureMemory sharedTextureMemory, WGPUSharedTextureMemoryProperties * properties) WGPU_FUNCTION_ATTRIBUTE; +WGPU_EXPORT WGPUBool wgpuSharedTextureMemoryIsDeviceLost(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemorySetLabel(WGPUSharedTextureMemory sharedTextureMemory, char const * label) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryReference(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSharedTextureMemoryRelease(WGPUSharedTextureMemory sharedTextureMemory) WGPU_FUNCTION_ATTRIBUTE; // Methods of Surface +WGPU_EXPORT WGPUTextureFormat wgpuSurfaceGetPreferredFormat(WGPUSurface surface, WGPUAdapter adapter) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSurfaceReference(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuSurfaceRelease(WGPUSurface surface) WGPU_FUNCTION_ATTRIBUTE; @@ -2450,6 +2601,7 @@ WGPU_EXPORT 
void wgpuSwapChainReference(WGPUSwapChain swapChain) WGPU_FUNCTION_A WGPU_EXPORT void wgpuSwapChainRelease(WGPUSwapChain swapChain) WGPU_FUNCTION_ATTRIBUTE; // Methods of Texture +WGPU_EXPORT WGPUTextureView wgpuTextureCreateErrorView(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT WGPUTextureView wgpuTextureCreateView(WGPUTexture texture, WGPU_NULLABLE WGPUTextureViewDescriptor const * descriptor) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT void wgpuTextureDestroy(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; WGPU_EXPORT uint32_t wgpuTextureGetDepthOrArrayLayers(WGPUTexture texture) WGPU_FUNCTION_ATTRIBUTE; diff --git a/src/runtime/webgpu.cpp b/src/runtime/webgpu.cpp index b889ed5e7385..aa4e3fb5a71f 100644 --- a/src/runtime/webgpu.cpp +++ b/src/runtime/webgpu.cpp @@ -328,9 +328,7 @@ WEAK int create_webgpu_context(void *user_context) { << "WGPU: create_webgpu_context (user_context: " << user_context << ")\n"; - WGPUInstanceDescriptor desc{}; - desc.nextInChain = nullptr; - global_instance = wgpuCreateInstance(&desc); + global_instance = wgpuCreateInstance(nullptr); debug(user_context) << "WGPU: wgpuCreateInstance produces: " << global_instance << ")\n"; diff --git a/test/common/gpu_context.h b/test/common/gpu_context.h index ffcbd1c603c0..474e837a91f4 100644 --- a/test/common/gpu_context.h +++ b/test/common/gpu_context.h @@ -186,9 +186,7 @@ inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapt bool success = true; } results; - WGPUInstanceDescriptor desc{}; - desc.nextInChain = nullptr; - results.instance = wgpuCreateInstance(&desc); + results.instance = wgpuCreateInstance(nullptr); auto request_adapter_callback = [](WGPURequestAdapterStatus status, WGPUAdapter adapter, char const *message, void *userdata) { auto *results = (Results *)userdata; @@ -234,12 +232,7 @@ inline bool create_webgpu_context(WGPUInstance *instance_out, WGPUAdapter *adapt WGPUDeviceDescriptor desc{}; desc.nextInChain = nullptr; desc.label = nullptr; -#if defined(__EMSCRIPTEN__) - // ...sigh, really? 
- desc.requiredFeaturesCount = 0; -#else desc.requiredFeatureCount = 0; -#endif desc.requiredFeatures = nullptr; desc.requiredLimits = &requestedLimits; desc.deviceLostCallback = device_lost_callback; From 22581bfc8a3059954045dc5cae33f08b833df57e Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sun, 11 Feb 2024 18:40:09 +0000 Subject: [PATCH 055/186] Remove OpenGLCompute (#8077) * Remove OpenGLCompute This was supposed to be removed in Halide 17 (oops), removing for Halide 18 * Update dynamic_allocation_in_gpu_kernel.cpp * Update dynamic_allocation_in_gpu_kernel.cpp * Update halide_ir.fbs --- Makefile | 13 - README.md | 2 +- README_cmake.md | 18 - apps/CMakeLists.txt | 1 - apps/openglcompute/AndroidManifest.xml | 22 - apps/openglcompute/Makefile | 99 -- apps/openglcompute/build.sh | 9 - apps/openglcompute/build.xml | 20 - apps/openglcompute/jni/Android.mk | 69 -- apps/openglcompute/jni/Application.mk | 7 - apps/openglcompute/jni/oglc_run.cpp | 250 ---- .../jni/oglc_two_kernels_run.cpp | 89 -- .../res/drawable-hdpi/ic_launcher.png | Bin 9397 -> 0 bytes .../res/drawable-ldpi/ic_launcher.png | Bin 2729 -> 0 bytes .../res/drawable-mdpi/ic_launcher.png | Bin 5237 -> 0 bytes .../res/drawable-xhdpi/ic_launcher.png | Bin 14383 -> 0 bytes apps/openglcompute/res/layout/main.xml | 15 - apps/openglcompute/res/values/strings.xml | 4 - .../HalideOpenGLComputeActivity.java | 30 - apps/openglcompute/test_oglc_avg.cpp | 59 - apps/openglcompute/test_two_kernels.cpp | 40 - cmake/HalideGeneratorHelpers.cmake | 16 - packaging/common/Description.txt | 2 +- .../src/halide/halide_/PyEnums.cpp | 2 - .../test/correctness/boundary_conditions.py | 1 - python_bindings/test/correctness/target.py | 3 +- src/BoundSmallAllocations.cpp | 11 +- src/CMakeLists.txt | 7 - src/CodeGen_C.cpp | 4 - src/CodeGen_Internal.cpp | 2 - src/CodeGen_OpenGLCompute_Dev.cpp | 1029 ----------------- src/CodeGen_OpenGLCompute_Dev.h | 23 - src/CodeGen_Vulkan_Dev.cpp | 2 - src/Deserialization.cpp | 2 - src/DeviceAPI.h | 2 - src/DeviceInterface.cpp | 7 - src/FuseGPUThreadLoops.cpp | 7 +- src/IRPrinter.cpp | 3 - src/JITModule.cpp | 63 - src/LLVM_Runtime_Linker.cpp | 21 - src/Lower.cpp | 9 +- src/Module.cpp | 6 +- src/OffloadGPULoops.cpp | 6 +- src/Pipeline.cpp | 4 - src/SelectGPUAPI.h | 2 +- src/Serialization.cpp | 2 - src/StorageFlattening.cpp | 4 +- src/Target.cpp | 15 - src/Target.h | 6 +- src/halide_ir.fbs | 1 - src/runtime/CMakeLists.txt | 5 - src/runtime/HalideRuntime.h | 2 - src/runtime/HalideRuntimeOpenGLCompute.h | 76 -- src/runtime/device_interface.cpp | 6 +- src/runtime/mini_opengl.h | 221 ---- src/runtime/opengl_egl_context.cpp | 181 --- src/runtime/opengl_glx_context.cpp | 156 --- src/runtime/openglcompute.cpp | 990 ---------------- src/runtime/osx_opengl_context.cpp | 118 -- src/runtime/runtime_api.cpp | 7 - test/correctness/async_copy_chain.cpp | 6 - test/correctness/async_device_copy.cpp | 6 - test/correctness/boundary_conditions.cpp | 8 +- test/correctness/device_buffer_copy.cpp | 5 - test/correctness/device_crop.cpp | 5 - test/correctness/device_slice.cpp | 5 - .../dynamic_allocation_in_gpu_kernel.cpp | 4 +- test/correctness/gpu_allocation_cache.cpp | 25 +- test/correctness/gpu_dynamic_shared.cpp | 5 - .../gpu_jit_explicit_copy_to_device.cpp | 2 +- test/correctness/gpu_large_alloc.cpp | 2 +- test/correctness/gpu_mixed_dimensionality.cpp | 2 +- test/correctness/gpu_multi_device.cpp | 8 - test/correctness/gpu_multi_kernel.cpp | 2 +- test/correctness/gpu_reuse_shared_memory.cpp | 4 +- test/correctness/logical.cpp | 20 +- 
test/correctness/math.cpp | 8 +- test/correctness/mul_div_mod.cpp | 2 - test/correctness/newtons_method.cpp | 3 +- test/correctness/parallel_gpu_nested.cpp | 2 +- test/correctness/plain_c_includes.c | 1 - test/correctness/target.cpp | 4 +- .../correctness/vectorized_gpu_allocation.cpp | 6 - 83 files changed, 44 insertions(+), 3862 deletions(-) delete mode 100644 apps/openglcompute/AndroidManifest.xml delete mode 100644 apps/openglcompute/Makefile delete mode 100755 apps/openglcompute/build.sh delete mode 100644 apps/openglcompute/build.xml delete mode 100644 apps/openglcompute/jni/Android.mk delete mode 100644 apps/openglcompute/jni/Application.mk delete mode 100644 apps/openglcompute/jni/oglc_run.cpp delete mode 100644 apps/openglcompute/jni/oglc_two_kernels_run.cpp delete mode 100644 apps/openglcompute/res/drawable-hdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-ldpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-mdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/drawable-xhdpi/ic_launcher.png delete mode 100644 apps/openglcompute/res/layout/main.xml delete mode 100644 apps/openglcompute/res/values/strings.xml delete mode 100644 apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java delete mode 100644 apps/openglcompute/test_oglc_avg.cpp delete mode 100644 apps/openglcompute/test_two_kernels.cpp delete mode 100644 src/CodeGen_OpenGLCompute_Dev.cpp delete mode 100644 src/CodeGen_OpenGLCompute_Dev.h delete mode 100644 src/runtime/HalideRuntimeOpenGLCompute.h delete mode 100644 src/runtime/mini_opengl.h delete mode 100644 src/runtime/opengl_egl_context.cpp delete mode 100644 src/runtime/opengl_glx_context.cpp delete mode 100644 src/runtime/openglcompute.cpp delete mode 100644 src/runtime/osx_opengl_context.cpp diff --git a/Makefile b/Makefile index 04fc41fa4167..e1457ea161e2 100644 --- a/Makefile +++ b/Makefile @@ -126,7 +126,6 @@ WITH_WEBASSEMBLY ?= $(findstring webassembly, $(LLVM_COMPONENTS)) WITH_AMDGPU ?= $(findstring amdgpu, $(LLVM_COMPONENTS)) WITH_OPENCL ?= not-empty WITH_METAL ?= not-empty -WITH_OPENGLCOMPUTE ?= not-empty WITH_D3D12 ?= not-empty WITH_VULKAN ?= not-empty WITH_SPIRV ?= not-empty @@ -163,8 +162,6 @@ OPENCL_LLVM_CONFIG_LIB=$(if $(WITH_OPENCL), , ) METAL_CXX_FLAGS=$(if $(WITH_METAL), -DWITH_METAL, ) METAL_LLVM_CONFIG_LIB=$(if $(WITH_METAL), , ) -OPENGLCOMPUTE_CXX_FLAGS=$(if $(WITH_OPENGLCOMPUTE), -DWITH_OPENGLCOMPUTE, ) - D3D12_CXX_FLAGS=$(if $(WITH_D3D12), -DWITH_D3D12, ) D3D12_LLVM_CONFIG_LIB=$(if $(WITH_D3D12), , ) @@ -218,7 +215,6 @@ CXX_FLAGS += $(AARCH64_CXX_FLAGS) CXX_FLAGS += $(X86_CXX_FLAGS) CXX_FLAGS += $(OPENCL_CXX_FLAGS) CXX_FLAGS += $(METAL_CXX_FLAGS) -CXX_FLAGS += $(OPENGLCOMPUTE_CXX_FLAGS) CXX_FLAGS += $(D3D12_CXX_FLAGS) CXX_FLAGS += $(WEBGPU_CXX_FLAGS) CXX_FLAGS += $(POWERPC_CXX_FLAGS) @@ -345,7 +341,6 @@ endif ifneq ($(TEST_VULKAN), ) VULKAN_LD_FLAGS ?= -lvulkan endif -OPENGL_LD_FLAGS ?= -lGL HOST_OS=linux endif @@ -364,7 +359,6 @@ endif ifneq ($(TEST_METAL), ) METAL_LD_FLAGS ?= -framework Metal -framework Foundation endif -OPENGL_LD_FLAGS ?= -framework OpenGL HOST_OS=os_x endif @@ -476,7 +470,6 @@ SOURCE_FILES = \ CodeGen_Metal_Dev.cpp \ CodeGen_OpenCL_Dev.cpp \ CodeGen_Vulkan_Dev.cpp \ - CodeGen_OpenGLCompute_Dev.cpp \ CodeGen_Posix.cpp \ CodeGen_PowerPC.cpp \ CodeGen_PTX_Dev.cpp \ @@ -670,7 +663,6 @@ HEADER_FILES = \ CodeGen_Metal_Dev.h \ CodeGen_OpenCL_Dev.h \ CodeGen_Vulkan_Dev.h \ - CodeGen_OpenGLCompute_Dev.h \ CodeGen_Posix.h \ CodeGen_PTX_Dev.h \ 
CodeGen_PyTorch.h \ @@ -854,13 +846,9 @@ RUNTIME_CPP_COMPONENTS = \ msan \ msan_stubs \ opencl \ - opengl_egl_context \ - opengl_glx_context \ - openglcompute \ osx_clock \ osx_get_symbol \ osx_host_cpu_count \ - osx_opengl_context \ osx_yield \ posix_aligned_alloc \ posix_allocator \ @@ -931,7 +919,6 @@ RUNTIME_EXPORTED_INCLUDES = $(INCLUDE_DIR)/HalideRuntime.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonDma.h \ $(INCLUDE_DIR)/HalideRuntimeHexagonHost.h \ $(INCLUDE_DIR)/HalideRuntimeOpenCL.h \ - $(INCLUDE_DIR)/HalideRuntimeOpenGLCompute.h \ $(INCLUDE_DIR)/HalideRuntimeMetal.h \ $(INCLUDE_DIR)/HalideRuntimeQurt.h \ $(INCLUDE_DIR)/HalideRuntimeVulkan.h \ diff --git a/README.md b/README.md index 6ebe04107159..c5dfe5507a8b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ currently targets: - CPU architectures: X86, ARM, Hexagon, PowerPC, RISC-V - Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT -- GPU Compute APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Microsoft +- GPU Compute APIs: CUDA, OpenCL, Apple Metal, Microsoft Direct X 12, Vulkan Rather than being a standalone programming language, Halide is embedded in C++. diff --git a/README_cmake.md b/README_cmake.md index 49e2f2feb3f7..3908920450a1 100644 --- a/README_cmake.md +++ b/README_cmake.md @@ -500,23 +500,6 @@ If the CMake version is lower than 3.18, the deprecated [`FindCUDA`][findcuda] module will be used instead. It reads the variable `CUDA_TOOLKIT_ROOT_DIR` instead of `CUDAToolkit_ROOT` above. -TODO(https://github.com/halide/Halide/issues/5633): update this section for OpenGLCompute, which needs some (but maybe not all) of this. - -When targeting OpenGL, the [`FindOpenGL`][findopengl] and [`FindX11`][findx11] -modules will be used to link AOT generated binaries. These modules can be -overridden by setting the following variables: - -| Variable | Description | -|-------------------------|----------------------------------| -| `OPENGL_egl_LIBRARY` | Path to the EGL library. | -| `OPENGL_glu_LIBRARY` | Path to the GLU library. | -| `OPENGL_glx_LIBRARY` | Path to the GLVND GLX library. | -| `OPENGL_opengl_LIBRARY` | Path to the GLVND OpenGL library | -| `OPENGL_gl_LIBRARY` | Path to the OpenGL library. | - -The OpenGL paths will need to be set if you intend to use OpenGL with X11 on -macOS. - Halide also searches for `libpng` and `libjpeg-turbo` through the [`FindPNG`][findpng] and [`FindJPEG`][findjpeg] modules, respectively. They can be overridden by setting the following variables. @@ -1395,7 +1378,6 @@ guidelines you should follow when writing a new app. 
[finddoxygen]: https://cmake.org/cmake/help/latest/module/FindDoxygen.html [findjpeg]: https://cmake.org/cmake/help/latest/module/FindJPEG.html [findopencl]: https://cmake.org/cmake/help/latest/module/FindOpenCL.html -[findopengl]: https://cmake.org/cmake/help/latest/module/FindOpenGL.html [findpng]: https://cmake.org/cmake/help/latest/module/FindPNG.html [findpython3]: https://cmake.org/cmake/help/latest/module/FindPython3.html [findx11]: https://cmake.org/cmake/help/latest/module/FindX11.html diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 1f6abcdc6e64..13d73167e865 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -58,7 +58,6 @@ add_app(max_filter) add_app(nl_means) # add_app(nn_ops) # TODO(#5374): missing CMake build # add_app(onnx) # TODO(#5374): missing CMake build -# add_app(openglcompute) # TODO(#5374): missing CMake build add_app(resize) # add_app(resnet_50) # TODO(#5374): missing CMake build # add_app(simd_op_check) # TODO(#5374): missing CMake build diff --git a/apps/openglcompute/AndroidManifest.xml b/apps/openglcompute/AndroidManifest.xml deleted file mode 100644 index e809beefd0ea..000000000000 --- a/apps/openglcompute/AndroidManifest.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/apps/openglcompute/Makefile b/apps/openglcompute/Makefile deleted file mode 100644 index 4bda85258364..000000000000 --- a/apps/openglcompute/Makefile +++ /dev/null @@ -1,99 +0,0 @@ -include ../support/Makefile.inc - -CXX ?= c++ - -TOP := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))/../..) -.PHONY: all $(TOP) -all: run run-two -HALIDE_LIB := $(TOP)/$(LIBHALIDE_LDFLAGS) -$(HALIDE_LIB): $(TOP) - $(MAKE) -C $(TOP) - -test_%: test_%.cpp - $(CXX) -std=c++17 -I ../../include/ $< -L ../../bin/ -lHalide $(HALIDE_SYSTEM_LIBS) -o $@ -g - -avg_filter_uint32t.o avg_filter_uint32t.h avg_filter_float.o avg_filter_float.h: test_oglc_avg - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s-openglcompute ./$< - -avg_filter_uint32t_arm.o avg_filter_uint32t_arm.h avg_filter_float_arm.o avg_filter_float_arm.h: test_oglc_avg - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s ./$< "_arm" - -AVG_FILTER_SRC = jni/oglc_run.cpp \ - avg_filter_uint32t.o avg_filter_uint32t.h \ - avg_filter_uint32t_arm.o avg_filter_uint32t_arm.h \ - avg_filter_float.o avg_filter_float.h \ - avg_filter_float_arm.o avg_filter_float_arm.h - -libs/armeabi-v7a/oglc_run: $(HALIDE_LIB) $(AVG_FILTER_SRC) - ndk-build libs/armeabi-v7a/oglc_run - -two_kernels_filter.o two_kernels_filter.h: test_two_kernels - LD_LIBRARY_PATH=../../bin DYLD_LIBRARY_PATH=../../bin HL_TARGET=arm-32-android-armv7s-openglcompute ./$< - -TWO_KERNELS_SRC = jni/oglc_two_kernels_run.cpp \ - two_kernels_filter.o two_kernels_filter.h - -libs/armeabi-v7a/oglc_two_kernels_run: $(HALIDE_LIB) $(TWO_KERNELS_SRC) - ndk-build libs/armeabi-v7a/oglc_two_kernels_run libs/armeabi-v7a/liboglc_two_kernels.so - -jni-libs: $(HALIDE_LIB) $(AVG_FILTER_SRC) $(TWO_KERNELS_SRC) - ndk-build libs/armeabi-v7a/liboglc_two_kernels.so libs/armeabi-v7a/liboglc.so - -deploy: libs/armeabi-v7a/oglc_run - adb push libs/armeabi-v7a/oglc_run /mnt/sdcard/ - -define RUN_STEPS -su -mkdir -p /data/tmp -rm -rf /data/tmp/oglc -mkdir /data/tmp/oglc -cd /data/tmp/oglc -pwd -cp /mnt/sdcard/oglc_run . -chmod 777 /data/tmp/oglc/oglc_run -LD_LIBRARY_PATH=. 
./oglc_run -exit -exit -endef -export RUN_STEPS - - -run: deploy - adb logcat -c - sh -c 'echo "$$RUN_STEPS" | adb shell' - adb logcat -d | grep "I oglc" - echo "Done" - -deploy-two: libs/armeabi-v7a/oglc_two_kernels_run - adb push libs/armeabi-v7a/oglc_two_kernels_run /mnt/sdcard/ - - -define RUN_TWO_STEPS -su -mkdir /data/tmp -cd /data/tmp -pwd -cp /mnt/sdcard/oglc_two_kernels_run . -chmod 777 /data/tmp/oglc_two_kernels_run -LD_LIBRARY_PATH=. ./oglc_two_kernels_run -exit -exit -endef -export RUN_TWO_STEPS - -run-two: deploy-two - adb logcat -c - sh -c 'echo "$$RUN_TWO_STEPS" | adb shell' - adb logcat -d | grep "I oglc" - echo "Done" - -clean: - rm -f test_oglc_avg - rm -rf test_oglc_avg.dSYM/ - rm -f avg_filter* - rm -f test_two_kernels - rm -rf test_two_kernels.dSYM/ - rm -rf libs/ - rm -rf obj/ - rm -rf bin/ - rm -rf gen/ diff --git a/apps/openglcompute/build.sh b/apps/openglcompute/build.sh deleted file mode 100755 index e00ac542386f..000000000000 --- a/apps/openglcompute/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -set -e -android update project -p . --target android-21 -make jni-libs -ant debug -adb install -r bin/HelloHalideOpenGLCompute-debug.apk -adb logcat -c -adb shell am start -n com.example.hellohalideopenglcompute/.HalideOpenGLComputeActivity -adb logcat | grep "^I/oglc" diff --git a/apps/openglcompute/build.xml b/apps/openglcompute/build.xml deleted file mode 100644 index 1773fe07f123..000000000000 --- a/apps/openglcompute/build.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - - - - - - - diff --git a/apps/openglcompute/jni/Android.mk b/apps/openglcompute/jni/Android.mk deleted file mode 100644 index 232e91e208cc..000000000000 --- a/apps/openglcompute/jni/Android.mk +++ /dev/null @@ -1,69 +0,0 @@ -LOCAL_PATH:= $(call my-dir) - -# === oglc_run === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_run -LOCAL_SRC_FILES := oglc_run.cpp -LOCAL_STATIC_LIBRARIES := android_native_app_glue -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 avg_filter_uint32t.o avg_filter_uint32t_arm.o avg_filter_float.o avg_filter_float_arm.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_EXECUTABLE) - -# === oglc library === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc -LOCAL_SRC_FILES := oglc_run.cpp -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 avg_filter_uint32t.o avg_filter_uint32t_arm.o avg_filter_float.o avg_filter_float_arm.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_SHARED_LIBRARY) - -# === oglc_two_kernels_run === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_two_kernels_run -LOCAL_SRC_FILES := oglc_two_kernels_run.cpp -LOCAL_STATIC_LIBRARIES := android_native_app_glue -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 two_kernels_filter.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_EXECUTABLE) - -# === oglc_two_kernels library === - -include $(CLEAR_VARS) - -LOCAL_MODULE := oglc_two_kernels -LOCAL_SRC_FILES := oglc_two_kernels_run.cpp -LOCAL_STATIC_LIBRARIES += libOpengl -LOCAL_LDLIBS := -lm -llog -landroid -lEGL -lGLESv2 two_kernels_filter.o -LOCAL_ARM_MODE := arm - -LOCAL_CPPFLAGS += -std=c++17 -I../support -I../../include - -LOCAL_C_INCLUDES += ./ - -include $(BUILD_SHARED_LIBRARY) - -$(call 
import-module,android/native_app_glue) diff --git a/apps/openglcompute/jni/Application.mk b/apps/openglcompute/jni/Application.mk deleted file mode 100644 index 88a9ea14cc27..000000000000 --- a/apps/openglcompute/jni/Application.mk +++ /dev/null @@ -1,7 +0,0 @@ -# TODO(aam): Confirm that application builds and runs for all supported targets: -# APP_ABI := armeabi armeabi-v7a arm64-v8a x86_64 x86 -APP_ABI := armeabi-v7a -APP_PLATFORM := android-17 - -APP_STL := c++_static -LOCAL_C_INCLUDES += ${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/4.8/include diff --git a/apps/openglcompute/jni/oglc_run.cpp b/apps/openglcompute/jni/oglc_run.cpp deleted file mode 100644 index 3378ab555dd1..000000000000 --- a/apps/openglcompute/jni/oglc_run.cpp +++ /dev/null @@ -1,250 +0,0 @@ -#include "avg_filter_float.h" -#include "avg_filter_float_arm.h" -#include "avg_filter_uint32t.h" -#include "avg_filter_uint32t_arm.h" -#include -#include -#include -#include -#include - -#include "HalideBuffer.h" -#include "HalideRuntimeOpenGLCompute.h" - -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "oglc_run", __VA_ARGS__) -#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "oglc_run", __VA_ARGS__) - -using Halide::Runtime::Buffer; - -typedef int (*filter_t)(halide_buffer_t *, halide_buffer_t *); - -struct timing { - filter_t filter; - Buffer<> *input; - Buffer<> *output; - double worst_t = 0; - int worst_rep = 0; - double best_t = DBL_MAX; - int best_rep = 0; - - template - timing(filter_t filter, Buffer *input, Buffer *output) - : filter(filter), input(&input->template as()), output(&output->template as()) { - } - - int run(int n_reps, bool with_copying) { - timeval t1, t2; - for (int i = 0; i < n_reps; i++) { - input->set_host_dirty(); - gettimeofday(&t1, NULL); - int error = filter(*input, *output); - output->device_sync(); - - if (with_copying) { - output->copy_to_host(); - } - gettimeofday(&t2, NULL); - if (error) { - return error; - } - double t = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0; - if (t < best_t) { - best_t = t; - best_rep = i; - } - if (t > worst_t) { - worst_t = t; - worst_rep = i; - } - } - return 0; - } -}; - -template -class Tester; - -template -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return false; // This abstract implementation should never be called -} - -template -bool doCopy(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return false; // This abstract implementation should never be called -} - -template -class Tester { - int debug_level; - -public: - Tester(int _debug_level = 0) - : debug_level(_debug_level) { - } - -private: - bool validate(Buffer actual, Buffer expected) { - int count_mismatches = 0; - actual.for_each_element([&](int x, int y, int c) { - T actual_value = actual(x, y, c); - T expected_value = expected(x, y, c); - const float EPSILON = 0.00001f; - if (abs((double((actual_value - expected_value)) > EPSILON))) { - if (count_mismatches < 100) { - std::ostringstream str; - str << "actual and expected results differ at " - << "(" << x << ", " << y << ", " << c << "):" - << +actual_value << " != " << +expected_value - << "\n"; - LOGI("%s", str.str().c_str()); - } - count_mismatches++; - } - }); - - return count_mismatches == 0; - } - - void print(Buffer buf) { - for (int j = 0; j < std::min(buf.height(), 10); j++) { - std::stringstream oss; - for (int i = 0; i < std::min(buf.width(), 10); i++) { - oss << " ["; - for (int k = 0; k < buf.channels(); k++) { - 
oss << std::fixed << std::setprecision(1); - if (k > 0) { - oss << std::setw(4); - } - oss << +buf(i, j, k); - } - oss << "]"; - } - LOGI("%s", oss.str().c_str()); - } - } - -public: - bool test(Buffer input, - Buffer output, - Buffer output_arm, - filter_t avg_filter, - filter_t avg_filter_arm) { - - // Performance check - input.set_host_dirty(); - timing openglcompute(avg_filter, &input, &output); - input.set_host_dirty(); - timing openglcompute_with_copying(avg_filter, &input, &output); - input.set_host_dirty(); - timing arm(avg_filter_arm, &input, &output_arm); - - const int N_REPS = 10; - arm.run(N_REPS, false); - openglcompute.run(N_REPS, false); - openglcompute_with_copying.run(N_REPS, true); - - LOGI("Out of %d runs best times are:\n" - "openglcompute: %fms(@%d)\n" - "openglcompute(with copy): %fms(@%d)\n" - "ARM: %fms(@%d)\n", - N_REPS, - openglcompute.best_t, openglcompute.best_rep, - openglcompute_with_copying.best_t, openglcompute_with_copying.best_rep, - arm.best_t, arm.best_rep); - LOGI("Out of %d runs worst times are:\n" - "openglcompute: %fms(@%d)\n" - "openglcompute(with copy): %fms(@%d)\n" - "ARM: %fms(@%d)\n", - N_REPS, - openglcompute.worst_t, openglcompute.worst_rep, - openglcompute_with_copying.worst_t, openglcompute_with_copying.worst_rep, - arm.worst_t, arm.worst_rep); - - // Data correctness check - input.set_host_dirty(); - avg_filter(input, output); - LOGI("Filter is done."); - output.device_sync(); - LOGI("Sync is done"); - output.copy_to_host(); - - LOGI("Output arm:"); - print(output_arm); - LOGI("Output openglcompute:"); - print(output); - - bool matches = validate(output, output_arm); - LOGI(matches ? "Test passed.\n" : "Test failed.\n"); - - return matches; - } - - void runTest() { - int width = 4096; - int height = 2048; - int channels = 4; - - auto input = Buffer::make_interleaved(width, height, channels); - LOGI("Allocated memory for %dx%dx%d image", width, height, channels); - - input.for_each_element([&](int i, int j, int k) { - input(i, j, k) = ((i + j) % 2) * 6; - }); - - LOGI("Input :\n"); - print(input); - - auto output = Buffer::make_interleaved(width, height, channels); - auto output_arm = Buffer::make_interleaved(width, height, channels); - - doBlur(this, input, output, output_arm); - } -}; - -template<> -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return tester->test(bt_input, - bt_output, bt_output_arm, - avg_filter_float, - avg_filter_float_arm); -} - -template<> -bool doBlur(Tester *tester, - Buffer bt_input, - Buffer bt_output, - Buffer bt_output_arm) { - return tester->test(bt_input, - bt_output, bt_output_arm, - avg_filter_uint32t, - avg_filter_uint32t_arm); -} - -int main(int argc, char **argv) { - LOGI("\nvvvv vvvv vvvv"); - LOGI("\nTesting uint32_t...\n"); - (new Tester())->runTest(); - LOGI("---- ---- ----"); - LOGI("\nTesting float...\n"); - (new Tester())->runTest(); - - halide_device_release(NULL, halide_openglcompute_device_interface()); - - LOGI("^^^^ ^^^^ ^^^^\n"); -} - -extern "C" { -JNIEXPORT void JNICALL Java_com_example_hellohalideopenglcompute_HalideOpenGLComputeActivity_runTest(JNIEnv *env, jobject obj) { - main(0, NULL); -} -} diff --git a/apps/openglcompute/jni/oglc_two_kernels_run.cpp b/apps/openglcompute/jni/oglc_two_kernels_run.cpp deleted file mode 100644 index 6574de25ae39..000000000000 --- a/apps/openglcompute/jni/oglc_two_kernels_run.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include "two_kernels_filter.h" -#include -#include -#include -#include -#include - -#include 
"HalideBuffer.h" -#include "HalideRuntimeOpenGLCompute.h" - -#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "oglc_run", __VA_ARGS__) -#define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "oglc_run", __VA_ARGS__) - -template -void print(Halide::Runtime::Buffer buf) { - for (int j = 0; j < std::min(buf.height(), 10); j++) { - std::stringstream oss; - for (int i = 0; i < std::min(buf.width(), 10); i++) { - oss << " ["; - for (int k = 0; k < buf.channels(); k++) { - oss << std::fixed << std::setprecision(1); - if (k > 0) { - oss << std::setw(4); - } - oss << +buf(i, j, k); - } - oss << "]"; - } - LOGI("%s", oss.str().c_str()); - } -} - -int main(int argc, char **argv) { - LOGI("\nvvvv vvvv vvvv"); - - int width = 128; - int height = 128; - int channels = 4; - - auto input = Halide::Runtime::Buffer::make_interleaved(width, height, channels); - LOGI("Allocated memory for %dx%dx%d image", width, height, channels); - - input.for_each_element([&](int i, int j, int k) { - input(i, j, k) = ((i + j) % 2) * 6; - }); - - LOGI("Input :\n"); - print(input); - - auto output = Halide::Runtime::Buffer::make_interleaved(width, height, channels); - - two_kernels_filter(input, output); - LOGI("Filter is done."); - output.device_sync(); - LOGI("Sync is done"); - output.copy_to_host(); - - LOGI("Output :\n"); - print(output); - - int count_mismatches = 0; - output.for_each_element([&](int i, int j, int k) { - int32_t output_value = output(i, j, k); - int32_t input_value = input(i, j, k); - if (output_value != input_value) { - if (count_mismatches < 100) { - std::ostringstream str; - str << "output and input results differ at " - << "(" << i << ", " << j << ", " << k << "):" - << output_value << " != " << input_value - << "\n"; - LOGI("%s", str.str().c_str()); - } - count_mismatches++; - } - }); - - LOGI(count_mismatches == 0 ? "Test passed.\n" : "Test failed.\n"); - - halide_device_release(NULL, halide_openglcompute_device_interface()); - - LOGI("^^^^ ^^^^ ^^^^\n"); -} - -extern "C" { -JNIEXPORT void JNICALL Java_com_example_hellohalideopenglcompute_HalideOpenGLComputeActivity_runTwoKernelsTest(JNIEnv *env, jobject obj) { - main(0, NULL); -} -} diff --git a/apps/openglcompute/res/drawable-hdpi/ic_launcher.png b/apps/openglcompute/res/drawable-hdpi/ic_launcher.png deleted file mode 100644 index 96a442e5b8e9394ccf50bab9988cb2316026245d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9397 zcmV;mBud+fP)L`9r|n3#ts(U@pVoQ)(ZPc(6i z8k}N`MvWQ78F(rhG(?6FnFXYo>28{yZ}%O}TvdDT_5P?j=iW=V`8=UNc_}`JbG!ST zs@lK(TWkH+P**sB$A`cEY%Y53cQ}1&6`x-M$Cz&{o9bLU^M-%^mY?+vedlvt$RT-^ zu|w7}IaWaljBq#|I%Mpo!Wc2bbZF3KF9|D%wZe{YFM=hJAv$>j>nhx`=Wis#KG!cJA5x!4)f) zezMz1?Vn$GnZNjbFXH(pK83nn!^3=+^*kTTs5rV9Dq^XS(IKO!mKt5!dSmb3IVCxZ z8TTk5IE)F1V29$G7v#j9d-hy&_pdg8?kT4)zqr>?`}I%W>(?GO%*C&}?Fp|bI*~2&KZ$%^B6R&1~2kA{`CWy+>F-x=z-f{_&vyu_3yp{jtw(*syi% zu3t2|4{c~LJXRt2m>rMg2V_kLltCZ<`m>qcI?BPP?6hf``|e!rZEFszeYQ3f-*nAS zZ+h1$mFwy+7156lkB(k6)!1fUbJCxgIBK38$jj5cC$r&YXN)nr#PY=tJaLc?C_o?j+8H3Q>891JJ9&$l-r+-SG#q)*;r52% z@nlKflb65o%s*Jt)!pw1k{vIoQIvoJ0Y&Msiw0X!qJ)_47G*?aJ6bJFLh_4b$5&1k5wN>du*>6#i7R9T8; z7>EHOV=ue7mo77SJPwER4(A+s?n0JjYK)b}Om6n>ke?0JR=jTI+RFBg_iwb7k%n*2 zR_M0DJ9x+0zxba4(B1y^JQ_Nj6dlP5PGXvSq8fF#mxrFYj3d9(V#jJwt+IqU9+8+D z6C6Us1OI$d8OF!3+Hm1 zW5in zXV^%U35HooOpSmeqlG6e0kUMYNonKp1vr|My9}4-WO+uOxe_c-o&}%voNYHkqtle% z5yQ_^oozSUUNu30EQSAl!Q%(%3G1NXENSMjCL*Vx-Td2~rk(}d z8pT!HZe>1r5EGuz`pgsg@^yQEi=BIa#meLq0!?{TZ}q#}=7UC9_l=w|wv+pP!g4#! 
[GIT binary patch data omitted for the deleted launcher icons under apps/openglcompute/res/ (ic_launcher.png at hdpi, ldpi, mdpi, and xhdpi densities, as listed in the diffstat above); the base85-encoded payloads carry no readable content.]
zu|e$lr34M$iU-{w?Joo(Y{qhgD4~QIkSM}}!O$?MLZbI-s18e=OF&ai&7-M0rh0zYyI+(=47^@pK8?@?t)yRhO zzs%pSswcJ+l9+kcqH%0n*9V;dpM3NE&pVBFsSjxAt=MWGLVz-sxL2ty_6bwL*y%l( z^9>+yo3UI7lth3j7{MAa0$2!WSj1?ejxkiQ4K<7-K?@ef2cKYAaNFUg(T{h&499@8 zfO7ildBY909A~mi5d(n62vetXrh7` z4HzV;U3Zyv?>JqX@EIcrL17PGz;pl_gtaW`qV2(}?K z7!zhaTCssiN~pzE)ZG|bt^v&&Iw!VCuMKp5YG@e$;~cE9-qBhIYucx?3~Lx{30fye zS{fl{!|4FcxRUz?fTWbfM0}x+#ep9=eVP@JqE)w;wWx(pTzXQP1!_hCDgS-E@^?9S!F42HJ_S_#uc_5Su zs5YV8=8;EdD(d~XBf)i7k@eOjOu}f!6L8G}mPQ{ykK7Z1=*K{C7^dQQG~*hqW*BXt zwShMNOtkjDYl9@w(22=Uqtnw^7;U{qm`pPmt+!FL;E8XQ{Y&G*#ZExj-eADv1EkRiA9p=HbW9mXn&pE zx6s<=(T*{$-anb}*Q^f2@NW}!Ypi#4-44eZ5;wFGR z2l-#ffa_PC34p;4_~V9Ch1H=Mop@k2T=ZsZ95ER2~w$V2Qwf@K~R83 zvJIQ6w*fXxCEOy(CETXcuAvj1GDN3@H|;ZhZ>JU*V<1q%=E-}pVf-!#5kQI%P6I0* zTLpFk*7~tCJ3&MYqC=<6ZM^c6Z@7>dv20Zp<}9uM?_~fH0U)$$1VND)+d76o^q=A^ zEr^rEHJg*7*_`x*)CPi!7_L8n$2VUEYYnzlmg6rQKZCm73TFhg)~N(r7^9)J_GT#Y z=E!J+L>qrUGe4>H>r4xD=7=p^O5i)6{5&4r@Eg=yoNE;R%JeoxjiXN3-XX0XM8Z3x+2kseod+K#}a>@yV^%M}^*#iQp1F zAst%zV+r1|H5(QIra@x@LRv&YFN9=BDFGr7sAH&E#DX-22b|;do=c^e;n;zlgR|aA zyY$*QZ{k|5CRq1iVqyY?LIkChclb`g8G$6Wu3oE&%0x0;uh6maSl?4UGb=(U=b9CT zAAD)W^Fp)dRRgSbAYouM5g5E}`|w<2-3dk;YPD)2(M=f5sbl0cDunQcOk3Ku&N5x^1FSJ=M3mZon=-*VILENo0tgU=eUPES)PX*zAoL7o z=^+bdICcU=mYo}9XOEjc^IkZoMNjft0EE-uvH$-*2E<7n^$EZlD+Y?kfE~ZUXxp14 zEf*&Z@EgTT(Y7k=$iK(SA|BR=ybI5Z(;@VwCMZ!$sa_=8wT7h@fN5QG4U zvlvfCab)odtTZ3MLn~IoCYzzuBK6l5SDPdEd-X-eRX!@EFbu5#2NG>lLPR;HL-}yh z`_wi&MC5}HqLgS1BLC{41#goav%lv!HA~s6mwsoR&nay7yEk7xf5)QejjzT(&AaOVO#?>xa{z!6%4qPn@N-<8|7}ThG@fYqze_s}1$89iq|O`10Jds> zYaEiem4=mV>361M;_0g=f=i>8)OmJ>lG;J1CPwF4k%DWP#OL>1TN^ShV9rgEXOi~~ zo@v>AmuiBAwT9R;XvwTawOIhrs)H{7(gpbBM@FC!BA{L{Kms92D$+oBAOK+VhGBg7 zc3)5U{+-ADeGFL39|7~7nBW-O`9f^QpHak8ybYhG0{W>$Q)!!B3u9_nx2~CC?^LgC zw{LpU1qHTp&{+jz9CbniodoVWt?PyotcB^iXFaoWV!JN0<83{suyab>OdC2+=C-z^ z*N%~DOvW?==a`rY)^SNHJ^KfD&w!Ai3aa?hC9_FWO<7cBACBb`&gR+lG2YO;P7w)N z$40Dvd?O~u8W0k=P_IuBrh5qCR6NJtRo;Uu{YcZwM}hWjy#XVYoCUvLpd zn?q7ah~9Dw)-ffue$<-Vr!$MGYy)F7V6=nL-sT&_xx^dO37}>6x)aZ_usS8a%cMPf zzwKh0F>OY;)b6|VyE8_(G-_&JBaQvN3G>W?H+4=hAT(PCWA*%fj=K_LBQ@Gqt;@M| z0ZT|@FlvE~(|`wNGT+_rM8!xctgZCX?71^U5PB0x1YCU0kH~j9c;9A zYgg6?07kd90N`nW-cG@|S^K;O3l@!{FPe@H@;ShX>*$mw_$j6^H?+9E=;4JzVe!A@_?7{ll9hUq1mbgaVweTVAJ>>5RxDy zfyg`1+@W^8a!MHF63fmz-L`Zicf>A}NqK&zoP2oG6*0z51&Nt7Xq#*6oY5hmlvF>Uo>Ti(<_Xtp)F~;ksPsCeiHJgq7 zn$5=R4m)V>q0WihPCt1@ef7GAsEk=IlmzNki#xB|p40kiCCT4D^jduClFfL-Sv@e^ zq6;hk={{Bbz?2dOzty0|8!a3{^g%#iL_dXUZG5(F%43_g;A~0i{de7X?|+~1_Lqu} z|7ndFoN~|&f4=+SEz(T;R$MDCC9*6F4U%CCGKx{`Arwmi!h%2$3aF4ga|D3|00Km= zqm;J_I=921Ib{Opzk;3UNYv8Prgq*kOu|TFhq%dTH7uHSz{U}59Kkd~#0`PT>R4;r z*3qB6=(O->fBDloG%$^<-m+w9!-M}_oKl}V(7!?8r*DX#7%u# zqiRa;J8#t~r@W!xW`h%=JMerO17z636 z>Mb-fJc&3q&`AQ4jHsXxMuey+Q78!%N`#<5P)Z>xNCcroSP&p$2q6&!5-MaMt^Vc| zPeWE~7&-y0wP4542_uOu;-<%xlGq|?IJ|60S##{G0sLlSv?cqe2e#FWpP2z*0cQeKM=O$hoZYsudfZqvbY?RiHsquN31R{S z0>CNg*igOhM72^+CdV655EMRErtjZ%@l}86Iq1lP-m}kvi!p0H>ql3u3HDgW*t#yn z)(sXTTY<6dEliBY7#@kytXt?9ND{yq_^zwxbnKYQFtUpAP7eV{38;XeLZDCx5EUhQ z`T~@D6^gwAJ^dOzQ=dY)M{-|ZKNTkJ85`G@zCy6ewr-p}R9j}CAtu5EK^OvzHZ~P& zv|0v9lWAf^^R`XRg8}?z+r}m>+`HE&c+bRu=EMLn8`!d8f@lwkiS6ouM!Z2XVnZZ} zg!InY5u5{zwn$nAjYgtc4ab!+w-}&k-kf6x*RNUKSE+8n)c*Nu!QvU%V{eOMG!^U^ z^=1XFra|0vXw`w*q(;4(pjowO)HLd~1dUpPxMh*F99k`pjQY$u%^949O_Q+9JP83v zMUYBBDFGFD^A;5(!h-Z#6%nF>M4==R6@+I-Kv03VcSd^?Rj)d7Y^-%mlES^`(fP~X z`^AHcjk>1VWK1eFkTUTo1_RDGXzjddYd9n=qGp}>?Ju|ouQ_`GKKQD?;zM6O@R=Fl zbO;b5X+)SoAHa`qeOsYf6CCRVQYe6QZgVrcYP3V#vZz-yRmNighLdVfZ>5UU7AU}H@0rcd5CEg?Gc!Pt!ZA}W!(}(TI#qBn!3=VaL7hz@xpV7?oe3bJ zdJa5tR(}-sRpORy7`8oOBALjM3)zi_o|!!u`^Dj6v?Eq9p-V)oXiw-F^3s( 
zGX_Y(8W2ebDg9`PDDC6-s_6;lnFH5NW$#Km9BhYhfe8eO#59oT7@;ad$pDTmIw`?u z19cu|KzBaC$g^SR+Cs(-IW&>YlaNb@;PybeXpvLjKQB`Nk&PJuv}<(Jc}K$MQ>Gn| z$j(4JpIye)lw2u7sf`AlXgf>mCCs`G>9a1yW_B=TopzMlh^Axq!)1v$X<=+~8x#*> z-jo->B!r2|b{Jy-R_(+sBeLrzen!~LbaDsrokMPDIlX2NOL%&ue{6q$N8;E;CZA#w zaXtGW05mJzGXFnoKn@VMO;}oV$|Z`snBY<(k#9wosn*!G84wn5zQ5Mn^z?hY4@jTm z+FIb!=Tn-Mwc{J2UW1DA?tu3mx$H*`L^tI?Z91X>{FLJiu_yR&#Cwa5{Qs25|buw&r+a zojE^m|EX=`vJ8(D3BP!vJblLWa-a&W_FxFPjn3@1OY0pXv$fncA!a}d1?L=MU4hmH z1LeJN+<~vh{tHh=Pia~%2s5VciBpgLERGs~6PB<3Z#=sGT1+;!BMM6hgJMd2(`B1G zCAU+_^WY|py4pS^P4t{`%*u!2sbEo;eeC!O-<3yz@6H1}2KFo(&|%a3@0C;vsQnCX zzb};*4=WJ>mMS1Aq-4&K#Y{ajtx0_W5yE!VDZ{PF;$ZANesHv+rAR|EeqT*t+X5T3LfYMTmlO%4pjaGG=pN&O+S| zMsyICJZwfp6nV*ZkR4H2Zk*HWP9M^FIM;pe=}?3SQi=9Bog~@tlSH0yWISNUd4!S) z2{Tyhn4Pu649X_!Z6KweNkh-{b0j3?N1!?Da?|o37v?^|T#kh>!=~ zUj1WZoFtOH{yC1AWgdBTa-i*yI|7N!S>st4(B@EHIuvcKXb&N-H!g^JRGvOpLO^F|o(F{~cf1z(-Y(%2 zIFgPtZS5lWj)P}*sTax1NZK z6_m6>1a0l;kd}PHOh`-<{iOw1IQT+b^!>Ns%y%A!>;Lc@z)46U(~gGc42^aj)>#k{ zq*SO^8~DLbzkyTE+zXfe_>0(Q?kSKc!dQdOfFf;8L=g0#RG6NVh#>LU(5>X0>7I92 zMvR=HnWJ{8>B(MgHx#t9k|bmL)J0xB0T3t#$Z?KMba1{SBkYj6Ac$1ZzS*5McNWBv zI^7xl2jC4SeG?a5a4qI7nTpSU`*k?yBQM2Wci-$WAt6#mSUlU20dUL=DJ1Ik27YtZ z6?oHm$KaAHK7gZ+J_J50^Tlr|C9HAy{Y_Wm zSJz&Qr#9b%Lk>I!A9>$ZIPS1hA%wtWWgPXYfeYFhaCd@5I}DR}-Npw)A_}u`)@SBf zCeUFOoC6R*$*?2(Nyp3G<9-?g-uR-+ap6y2;E_lGBs!em4){nH@zV)p4N&L`gR?9& zjhHe%r0_yBo&*3`XAr0eFFxu`IO@QE#!bt9u>+An5<56z-;4V+ z3C)tn6uTmcdOXoX5arHbvK_{DV2IPJub;JAZdhnw&H4z9oLyZGouSK;XW z-+;HA@nI}kvZw#7wZ4fLz+aZ#fh&IXpLlfbAF#(>3-G~rei<)1;*A*SpOrI>h;pE@ zv$&r})|o>S?SV3bo#j|c(FO&&61G&xkY&~kcs+I6#Ib+2;SSn7GXwg2r)496ps>M= zI)J{6xw$lVG9pt{-(^4mEC8FosUyiD+3mnOQBNO9wHYxubs^4t`4@4*p>M)X_kIW0 z-E;-s@$sMIWk;WbH=KSh7A{w#>;o zN+}=20uVx2fUFPAkcVM;5u`%}DXmsXNdiCuxOz6X9A4QWjN3`Jz5^qCb~|^*zIf{^ zFUE<7zZKWtekrcH;hVT^*_Bv4=TQ9h;Tth9vw#nr_bI&mgnz}%X^XogUW)&DJ$jCa zb_hSa)S|$*!XWiIl;xzkx8|JaT|&mlg{a+%p9M9~;sg94+Tj$7E=07WD$^DFrbJ@^ zLQ$!dt3y|I$UePy+>!P0(_-UpMx@zo%7}%t55c)-eiyGe;a&LNl^?^hzg~;ePk$rM zKI@AZoH{QhssWMABf0`z++;^%uafT zm}kV@W7=tFoDd?X4~aCx$`Gbbsofz=aE_UX5EY^V5rI2805Ubrq^%3YdJcIOrP;7! 
z3u85w%sm`0I^th2cX0`?dBr&xoH`H2Bw%(BLOm_xeERpbr8PgSc0 zr0O1Mra4`5n1OlOrSlwXW4=3LzdM_x5RhpK9)&%1BGf4j>pN?qS?2+zgUudntxx-; z2)ca*x79vpBA$~1>~JuMgl~&63@NEyxqA+u1%Otofkva|%@lX~HqL!nXVFPW!Oo>E z8qYB9_MAM(Xmr*vmc4e9e5VZPTpWQk3T~I&IOlYyA8l6$JpKQBskgK1zm0pelY8Fa2xLiE_7`ioC6%Bo zLCq`xfE~cb6q;iJfOQh3~E(;W$QhLqV%s3Q#Pd=|I0WrxYP z{m9>^18IQ$_kEnuZjVWCWOEWE(V?pVV488gW)ddnI+4hoJf5?%E5TXT8qyPXR6fXP4Cm>~aQT~4j z8T^cv|JtYelpFKR-nQA^q8;*?1Gx4Y8y>s7AOR5*)4CvSmvGFs)m^mjC_2 z(^0QKOGy#{nstk!801$Rf4EeYqKzB0-dRD;S!bQi2;DJ5z%e_c8F7>AI;QmiP>6aM zP{Dw2}f>-}+^|?~^CtC%^tW>h&t5^x5olDZ)IH8OjJRrNZ`+E%^H7pTOB4 zd>L-N`!^^Si@t^+(BX_TEXQM8k?IE=u~JgC^q7X}`E;Wy!Dc{(G*b)iw{X1QFST{U2Bp$xAj>lInhY-&J4ZZj7hcNxrSt!yX_njL)g!;Jp z>g0s@X9!sigGg)J63+QGw8juyExB0>s5)t7qvpPS)G;$3zWJ(ED3zw#vY7_s>hL=q zrZ@@OOS8egIcv$%`Pj5>3_rg56ZqrpKfxLQ{9e5L#s7k0v6xoT9Au8|WKMYJqMt1{ zl~O`Vh0(F?xcc`$!f&ttE+*@nF=N&M=Jw7(5F$lqvj*f8OUN-Sh7vun7E~w%4Anr= zto=$BsaTuTUo3}n=9Ef)Pq`#XP}3FY=A^WVS=WpwKODw;-F)t+PY{>?$6a=^au67d zD0&VWaLq68#@+YbjHm~0*#mbHK=(E)!CB+m-L~3jIdJv)GM*R|wb6c2AMKOX;j*et zkZ4rRw>Phz_>>b<6#yuyxWBvrf&yf%dU@1}4!a3PSYXUuI2DH;y#%U%8!r3R`|!R` zy#jx_?YACb71F~U&UK0W4l!1WfcmOfv(>=QfBS8md;ZDz@$Wu|zCn!x4q1qqb9+$g zZ!gH$5tO1GmOruMdZXE>UGVV_!3igw!xi=B@QK4?YtEmn4FA5>sy(W8^ATfOH&|Ey z=t%v+7dk_~?U`8<{pFbs0M32Wr6?9kxb5l<&#nRQIsbJ0||h!8Pz&|T}y%N2P2E8mafjyef|-+GMNnIb?L7UiI1 zfFy}=Q$4R`fm%d zeLdXL!=wW9DnY&f`RQ}6x@e!*Lrw1o?)omw`!76^ozqYe$-Va8!*1HR38%h&0bY3Q z3wNrmJJoNat{I(=7_D2kO@LaNTG1co!8*pkG&FK`~JDG;YJ*A=mN}`-3J*m zWI%rTQa}g-0j2!91V(2Ucsn`+$aisrw<2F zz(N2Z3n47#FPee<4w;4Z{yQXJ7XL(^U#w+TVe)CAma7wwnA&` zNEq|A-|fw(op>-#J7IrRDn~F0ZP*45>`>~nSTg+}%$dFiuDo<;r*wYCH0J#OJQcSt zy8(MI+7HD-8A53M*B9=`8RyO=Ye51bw22vE%&s;S);TO$v?mtru~68!=z`E3;AH*& zYP?n%H!6h827}nA{zB3uKmd>TzJ`AaMa-k;?_UkDrOJvbK_zCGqG zS_LkU%CBS;J1kY&ktmtD%F}%AScAn1!`rH8H4Wx0=*Pr(4Xvs`-_#<6wCM`TZ0%Xc zGcvoL<}P`1$bR{h)*8e`L~=G@3Z`1Es%^t-Rwx;~xY`;XE(e1!PIGm#g`0n~>A8^Z zS&zRHO5FLeeB0%??zeX$Dg6~Lp5Mj_)1LKZ3X`Rw+)CR1vh9DUz34tQm3ct0m>)7j`{o*_J`~IhWHtD(n@@Liu zIJfs&uKV^1Yquf(mfpYqG4sR>4^bYXo%SD_(3%E{zF1W8SQ#SnDmYJ(pMhr_w6?cnyrMj9+v}s zdu(OaS81acCULxf94EpU$AU`~1yd2KUJyrMr@*WL4&ZD`C|1a`X_f#Kh!uzeND4s| zK!^~6B1joRsRATLkTQax2!sL%5r`rXhX99Qr{J7|(*o8guu~3BS#4X=*qQ+8$AU0? 
z%kc2J-wEmyM;vj2tJfdHjVmfR<&b~DPcOaYd866$zIE{}*FTIGzIX zSQwP#o{JW_&%XCsocNlB*mrOaEXMKhJS=J!VWPSbjxDB7St7QL zuB38tx;^Q*vuECT>rYp09eupF+#7IM2&owLAPW0Y2>PH@(RW6BY|`UFWWjJCB1Z&H zyY$mMK&0y#gdk*#yJbgdwG)G~a8AS67>TZPyTsKTCFNtdIGT-hjvvsZUMqUN&zJUgsK2R0ZCC1 zp(;?IN))ORML~%IRiHvtLaA6rp-@B=MF^t+Dj*2u;JAf2nMAcViqX-n*tBs2#Cmj8MC|07kNe(W+0 z$d2>B{7TH3GaqB46PPl!k3R6`%lVJXzB~Q)yRLm=<*NIqwHlV2bwf$)7i*C4n`{J; zL=Z`Yp@32fg<=s>f%~VH?+-#XDM(EbLKcM}_Bn-O9lIrsMy+IxL!y&>3*#g+3ui(IzkR{wpI^Sq=(EfJ zhs>8gdL6#`%d_!+-uDZ9``70J0KzDAK_s|XR#1u%MgltBpTQ)))uh#MXjVDhhMo}x z7Ol8pbwj>u`8}KOKmH7arD@<0ply@je?RlTrd)mfFK>SA$p;T4NGAjdAMPrTiYf^y zebf|20x}?k5s_d{65FZ|&KR&O?p=+s%~NpjOCnS^7ZAtIT}pglH~kwcsnS&bTbS2@EKBEdP1Bn0PBgumxA@4T2xe)}9)BAIuB z`>yAoU4F-Iqsea3fD8i2@b^|SPErX{fj|_c8z~hf3h7zuktp^kL`5&LA_dWe^hEsn z$Nmbf8IB9+EzII`PP&GcF4?yZLL&v*Sf&}V3R3hl5(o|k;nk!v?nz)7gBm@m5MkF0!SIyT4SR6 z+ViGBn--t;wncE%0#EU+9-Y~5?gPSQ2=9tbG}TKf6@A2H8% z>^2`zES69#^kHb|N%;0vvVw?h+QdlA;B5aOmu_urvpO*#IYJ;E*ITP%1OTH9KtU?v z*PgPEWOhzU)d~W|5RQXTLInaUkRG&{{iLudV|?5HV-I`rAPkF$qB07F9z=z*D@46$ z#^V&*;ct_`q_IY9cqHcj8M~GKyEhZ=Db7bweU05~;Tkbz8g3t6MgPu>i~DmseyDp`}_M6@#}p zXMfV)Gjmp{)C=okM?$bv3W5}@WzneDMI{*#QpBGh-n{vHhaI+`KtbF6j_*gSx_c9W z-KGIj5=JH-!%=)57S4Ey+p=XuY#)2#8;yGF)x*PEme(qpgc(o)&r$);PznPIt{}8d zwiw%Ze^OlW?nYeT-o65yW$q~~M%-$`I*lZ0V%4fgU92aBl;S24Brj?tTYeNL6SXib zik{Md>?ux@g|Jr=gt4x5j}xuaO{4tjB}?}cebXhMwDcWVH#C7;ezj${GGLd((VfRt zk9-#Q-SPlV*!Ln_bI+U5)Z1lTW81Xb3Xz(2VlkR}Tp{XTq+}==Zd0OL_f1xZZYqaM z$80m8n72X(f|FK)sZ-~pS{cEdh5fK@9HXNXsMa@O!Mwwz3}Rcbi!oxB&F?QSIIdWj zx>(6VaVGmk*5<(bg6N3tnEv$EiVjmlm zKuU#5Wh;L1&Bp-%AN|S+IN+dtu>8SW;MiEQQXoi>G#VR3kNlOA0hCa%=}ubL{Rw#g z8>O^z*aor(V1b*ij4|}&n%zkb0KoqRbb1&ct<2Ko0000bbVXQnWMOn=I%9HWVRU5x zGB7bQEigGPGBQ*!IXW{kIx{jYFgH3dFsPDZ%m4rYC3HntbYx+4WjbwdWNBu305UK! pF)c7TEipD!FgH3fH###mEigAaFfey&@l*f+002ovPDHLkV1iQC3p)S+ diff --git a/apps/openglcompute/res/layout/main.xml b/apps/openglcompute/res/layout/main.xml deleted file mode 100644 index 5a8da6d73556..000000000000 --- a/apps/openglcompute/res/layout/main.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/apps/openglcompute/res/values/strings.xml b/apps/openglcompute/res/values/strings.xml deleted file mode 100644 index 3a57a5288983..000000000000 --- a/apps/openglcompute/res/values/strings.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - HelloHalideAndroidOpenGLCompute - diff --git a/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java b/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java deleted file mode 100644 index b9cfb2f2f969..000000000000 --- a/apps/openglcompute/src/com/example/hellohalideopenglcompute/HalideOpenGLComputeActivity.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.example.hellohalideopenglcompute; - -import android.app.Activity; -import android.os.Bundle; -import android.hardware.Camera; -import android.util.Log; -import android.widget.FrameLayout; -import android.view.SurfaceView; - -public class HalideOpenGLComputeActivity extends Activity { - private static final String TAG = "HalideOpenGLComputeActivity"; - - static { - System.loadLibrary("oglc"); - System.loadLibrary("oglc_two_kernels"); - } - private static native void runTest(); - private static native void runTwoKernelsTest(); - - @Override - public void onCreate(Bundle b) { - super.onCreate(b); - Log.d(TAG, "Starting the tests:"); - runTest(); - Log.d(TAG, "Done with first test"); - runTwoKernelsTest(); - Log.d(TAG, "Done"); - finish(); - } -} diff --git a/apps/openglcompute/test_oglc_avg.cpp 
b/apps/openglcompute/test_oglc_avg.cpp deleted file mode 100644 index 346b7e9f7d72..000000000000 --- a/apps/openglcompute/test_oglc_avg.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "Halide.h" - -using namespace Halide; - -void blur(std::string suffix, ImageParam input) { - input.dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Var x("x"), y("y"), c("c"); - - Func clamped("clamped"); - clamped = BoundaryConditions::repeat_edge(input); - - Func blur_x("blur_x"); - blur_x(x, y, c) = (clamped(x - 1, y, c) + - clamped(x, y, c) + - clamped(x + 1, y, c)) / - 3; - - Func result("avg_filter"); - result(x, y, c) = (blur_x(x, y - 1, c) + - blur_x(x, y, c) + - blur_x(x, y + 1, c)) / - 3; - - result.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Target target = get_target_from_environment(); - result.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - Var xi("xi"), yi("yi"); - result.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } else { - Var yi("yi"); - result - .unroll(c) - .split(y, y, yi, 32) - .parallel(y) - .vectorize(x, 4); - blur_x.store_at(result, y) - .compute_at(result, yi) - .reorder(c, x, y) - .unroll(c) - .vectorize(x, 4); - } - - std::string fn_name = std::string("avg_filter") + suffix; - result.compile_to_file(fn_name, {input}, fn_name); -} - -int main(int argc, char **argv) { - ImageParam input_uint32(UInt(32), 3, "input"); - blur(std::string("_uint32t") + (argc > 1 ? argv[1] : ""), input_uint32); - - ImageParam input_float(Float(32), 3, "input"); - blur(std::string("_float") + (argc > 1 ? argv[1] : ""), input_float); -} diff --git a/apps/openglcompute/test_two_kernels.cpp b/apps/openglcompute/test_two_kernels.cpp deleted file mode 100644 index abff1aba5b23..000000000000 --- a/apps/openglcompute/test_two_kernels.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include "Halide.h" - -using namespace Halide; - -int main(int argc, char **argv) { - ImageParam input(UInt(32), 3, "input"); - input.dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Var x, y, c, xi, yi; - Func f("f"); - f(x, y, c) = input(x, y, c) + 1; - f.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - - f.compute_root(); - f.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - Target target = get_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - f.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } - - Func g("g"); - g(x, y, c) = f(x, y, c) - 1; - g.bound(c, 0, 4) - .reorder_storage(c, x, y) - .reorder(c, x, y); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { - g.unroll(c) - .gpu_tile(x, y, xi, yi, 64, 64); - } - g.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); - - std::string fn_name = std::string("two_kernels_filter") + (argc > 1 ? 
argv[1] : ""); - g.compile_to_file(fn_name, {input}, fn_name); - - return 0; -} diff --git a/cmake/HalideGeneratorHelpers.cmake b/cmake/HalideGeneratorHelpers.cmake index d45341536422..3aa380da450e 100644 --- a/cmake/HalideGeneratorHelpers.cmake +++ b/cmake/HalideGeneratorHelpers.cmake @@ -712,22 +712,6 @@ function(_Halide_add_targets_to_runtime TARGET) endfunction() function(_Halide_target_link_gpu_libs TARGET VISIBILITY) - # TODO(https://github.com/halide/Halide/issues/5633): verify that this is correct & necessary for OpenGLCompute - if ("${ARGN}" MATCHES "openglcompute") - if ("${ARGN}" MATCHES "egl") - find_package(OpenGL REQUIRED COMPONENTS OpenGL EGL) - target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::OpenGL OpenGL::EGL) - else () - if ("${ARGN}" MATCHES "linux" OR ("${ARGN}" MATCHES "host" AND Halide_HOST_TARGET MATCHES "linux")) - find_package(X11 REQUIRED) - target_link_libraries(${TARGET} ${VISIBILITY} X11::X11) - endif () - - find_package(OpenGL REQUIRED) - target_link_libraries(${TARGET} ${VISIBILITY} OpenGL::GL) - endif () - endif () - if ("${ARGN}" MATCHES "vulkan") find_package(Vulkan REQUIRED) target_link_libraries(${TARGET} ${VISIBILITY} Vulkan::Vulkan) diff --git a/packaging/common/Description.txt b/packaging/common/Description.txt index 21464255c878..7f11935edb42 100644 --- a/packaging/common/Description.txt +++ b/packaging/common/Description.txt @@ -4,7 +4,7 @@ * CPU architectures: X86, ARM, Hexagon, PowerPC, RISC-V, WebAssembly * Operating systems: Linux, Windows, macOS, Android, iOS, Qualcomm QuRT - * GPU APIs: CUDA, OpenCL, OpenGL Compute Shaders, Apple Metal, Direct X 12 + * GPU APIs: CUDA, OpenCL, Apple Metal, Direct X 12 Rather than being a standalone programming language, Halide is embedded in C++. This means you write C++ code that builds an in-memory representation of a diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index d723d66461d8..e6cede6c6edb 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -25,7 +25,6 @@ void define_enums(py::module &m) { .value("CUDA", DeviceAPI::CUDA) .value("Vulkan", DeviceAPI::Vulkan) .value("OpenCL", DeviceAPI::OpenCL) - .value("OpenGLCompute", DeviceAPI::OpenGLCompute) .value("Metal", DeviceAPI::Metal) .value("Hexagon", DeviceAPI::Hexagon); @@ -137,7 +136,6 @@ void define_enums(py::module &m) { .value("CLDoubles", Target::Feature::CLDoubles) .value("CLHalf", Target::Feature::CLHalf) .value("CLAtomics64", Target::Feature::CLAtomics64) - .value("OpenGLCompute", Target::Feature::OpenGLCompute) .value("EGL", Target::Feature::EGL) .value("UserContext", Target::Feature::UserContext) .value("Profile", Target::Feature::Profile) diff --git a/python_bindings/test/correctness/boundary_conditions.py b/python_bindings/test/correctness/boundary_conditions.py index 32abd12ff0e6..2fa5e8e8c59d 100644 --- a/python_bindings/test/correctness/boundary_conditions.py +++ b/python_bindings/test/correctness/boundary_conditions.py @@ -200,7 +200,6 @@ def test_all(vector_width, target, partition_policy): # https://github.com/halide/Halide/issues/2148 if target.has_feature(hl.TargetFeature.Metal) or \ target.has_feature(hl.TargetFeature.Vulkan) or \ - target.has_feature(hl.TargetFeature.OpenGLCompute) or \ target.has_feature(hl.TargetFeature.D3D12Compute): vector_width_power_max = 2 diff --git a/python_bindings/test/correctness/target.py b/python_bindings/test/correctness/target.py index 7876bc97ecef..a7031c2cd7d1 100644 --- 
a/python_bindings/test/correctness/target.py +++ b/python_bindings/test/correctness/target.py @@ -52,12 +52,11 @@ def test_target(): hl.TargetFeature.JIT, hl.TargetFeature.CUDA, hl.TargetFeature.OpenCL, - hl.TargetFeature.OpenGLCompute, hl.TargetFeature.Debug, ], ) ts = t1.to_string() - assert ts == "arm-32-android-cuda-debug-jit-opencl-openglcompute" + assert ts == "arm-32-android-cuda-debug-jit-opencl" assert hl.Target.validate_target_string(ts) # Expected failures: diff --git a/src/BoundSmallAllocations.cpp b/src/BoundSmallAllocations.cpp index f6a86f8a3e2a..f83a13d99614 100644 --- a/src/BoundSmallAllocations.cpp +++ b/src/BoundSmallAllocations.cpp @@ -74,9 +74,7 @@ class BoundSmallAllocations : public IRMutator { } bool must_be_constant(MemoryType memory_type) const { - return (memory_type == MemoryType::Register || - (device_api == DeviceAPI::OpenGLCompute && - memory_type == MemoryType::GPUShared)); + return memory_type == MemoryType::Register; } Stmt visit(const Realize *op) override { @@ -125,13 +123,6 @@ class BoundSmallAllocations : public IRMutator { << "Allocation " << op->name << " has a dynamic size. " << "Only fixed-size allocations can be stored in registers. " << "Try storing on the heap or stack instead."; - - user_assert(!(device_api == DeviceAPI::OpenGLCompute && - op->memory_type == MemoryType::GPUShared)) - << "Allocation " << op->name << " has a dynamic size. " - << "Only fixed-size allocations can be stored in shared memory " - << "in OpenGL compute shaders. Try storing in MemoryType::Heap " - << "instead."; } const int64_t *size_ptr = bound.defined() ? as_const_int(bound) : nullptr; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfb092d29bf0..77453fbce0a9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,7 +35,6 @@ set(HEADER_FILES CodeGen_LLVM.h CodeGen_Metal_Dev.h CodeGen_OpenCL_Dev.h - CodeGen_OpenGLCompute_Dev.h CodeGen_Posix.h CodeGen_PTX_Dev.h CodeGen_PyTorch.h @@ -206,7 +205,6 @@ set(SOURCE_FILES CodeGen_LLVM.cpp CodeGen_Metal_Dev.cpp CodeGen_OpenCL_Dev.cpp - CodeGen_OpenGLCompute_Dev.cpp CodeGen_Posix.cpp CodeGen_PowerPC.cpp CodeGen_PTX_Dev.cpp @@ -612,11 +610,6 @@ if (TARGET_D3D12COMPUTE) target_compile_definitions(Halide PRIVATE WITH_D3D12) endif () -option(TARGET_OPENGLCOMPUTE "Include OpenGLCompute target" ON) -if (TARGET_OPENGLCOMPUTE) - target_compile_definitions(Halide PRIVATE WITH_OPENGLCOMPUTE) -endif () - if (TARGET_VULKAN) message(STATUS "Enabling Vulkan target") target_compile_definitions(Halide PRIVATE WITH_VULKAN) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 3939edc4a678..89c18cb8ab28 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -30,7 +30,6 @@ extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[]; -extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[]; extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeWebGPU_h[]; @@ -307,9 +306,6 @@ CodeGen_C::~CodeGen_C() { if (target.has_feature(Target::OpenCL)) { stream << halide_internal_runtime_header_HalideRuntimeOpenCL_h << "\n"; } - if 
(target.has_feature(Target::OpenGLCompute)) { - stream << halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h << "\n"; - } if (target.has_feature(Target::D3D12Compute)) { stream << halide_internal_runtime_header_HalideRuntimeD3D12Compute_h << "\n"; } diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 2fc5b5cae0df..78fc4224fb61 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -64,7 +64,6 @@ bool function_takes_user_context(const std::string &name) { "halide_memoization_cache_release", "halide_cuda_run", "halide_opencl_run", - "halide_openglcompute_run", "halide_metal_run", "halide_d3d12compute_run", "halide_vulkan_run", @@ -90,7 +89,6 @@ bool function_takes_user_context(const std::string &name) { "halide_vtcm_free", "halide_cuda_initialize_kernels", "halide_opencl_initialize_kernels", - "halide_openglcompute_initialize_kernels", "halide_metal_initialize_kernels", "halide_d3d12compute_initialize_kernels", "halide_vulkan_initialize_kernels", diff --git a/src/CodeGen_OpenGLCompute_Dev.cpp b/src/CodeGen_OpenGLCompute_Dev.cpp deleted file mode 100644 index f2f0949f33fd..000000000000 --- a/src/CodeGen_OpenGLCompute_Dev.cpp +++ /dev/null @@ -1,1029 +0,0 @@ -#include "CodeGen_OpenGLCompute_Dev.h" -#include "CSE.h" -#include "CodeGen_C.h" -#include "CodeGen_GPU_Dev.h" -#include "Debug.h" -#include "Deinterleave.h" -#include "FindIntrinsics.h" -#include "IRMatch.h" -#include "IRMutator.h" -#include "IROperator.h" -#include "Simplify.h" -#include -#include -#include - -namespace Halide { -namespace Internal { - -using std::ostringstream; -using std::string; -using std::vector; - -namespace { - -char get_lane_suffix(int i) { - internal_assert(i >= 0 && i < 4); - return "rgba"[i]; -} - -class CodeGen_OpenGLCompute_C : public CodeGen_C { -public: - CodeGen_OpenGLCompute_C(std::ostream &s, const Target &t); - void add_kernel(const Stmt &stmt, - const std::string &name, - const std::vector &args); - -protected: - Type map_type(const Type &); - - std::string print_name(const std::string &name) override; - std::string print_type(Type type, AppendSpaceIfNeeded space_option = DoNotAppendSpace) override; - - using CodeGen_C::visit; - - void visit(const Cast *) override; - - void visit(const FloatImm *) override; - void visit(const UIntImm *) override; - void visit(const IntImm *) override; - - void visit(const Max *op) override; - void visit(const Min *op) override; - - void visit(const Mod *) override; - - // these have specific functions - // in GLSL that operate on vectors - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GT *) override; - void visit(const GE *) override; - - void visit(const Shuffle *) override; - - void visit(const For *) override; - void visit(const Ramp *op) override; - void visit(const Broadcast *op) override; - void visit(const Load *op) override; - void visit(const Store *op) override; - void visit(const Call *op) override; - void visit(const Allocate *op) override; - void visit(const Free *op) override; - void visit(const Select *op) override; - void visit(const Evaluate *op) override; - - const std::map builtin = { - {"abs", "abs"}, - {"abs_f32", "abs"}, - {"acos_f32", "acos"}, - {"acosh_f32", "acosh"}, - {"asin_f32", "asin"}, - {"asinh_f32", "asinh"}, - {"atan2_f32", "atan"}, // also called atan in GLSL - {"atan_f32", "atan"}, - {"atanh_f32", "atanh"}, - {"ceil_f32", "ceil"}, - {"cos_f32", "cos"}, - {"cosh_f32", "cosh"}, - 
{"equal", "equal"}, - {"exp_f32", "exp"}, - {"fast_inverse_sqrt_f32", "inversesqrt"}, - {"floor_f32", "floor"}, - {"greaterThan", "greaterThan"}, - {"greaterThanEqual", "greaterThanEqual"}, - {"isnan", "isnan"}, - {"lessThan", "lessThan"}, - {"lessThanEqual", "lessThanEqual"}, - {"log_f32", "log"}, - {"max", "max"}, - {"min", "min"}, - {"mix", "mix"}, - {"mod", "mod"}, - {"notEqual", "notEqual"}, - {"sin_f32", "sin"}, - {"sinh_f32", "sinh"}, - {"sqrt_f32", "sqrt"}, - {"tan_f32", "tan"}, - {"tanh_f32", "tanh"}, - {"trunc_f32", "trunc"}, - }; - int workgroup_size[3] = {0, 0, 0}; - - // Maps each buffer with whether its base type is a vector. - std::map buffer_is_vector; -}; - -CodeGen_OpenGLCompute_C::CodeGen_OpenGLCompute_C(std::ostream &s, const Target &t) - : CodeGen_C(s, t) { -} - -// Maps Halide types to appropriate GLSL types or emit error if no equivalent -// type is available. -Type CodeGen_OpenGLCompute_C::map_type(const Type &type) { - Type result = type; - if (type.is_scalar()) { - if (type.is_float()) { - user_assert(type.bits() <= 32) - << "GLSL: Can't represent a float with " << type.bits() << " bits.\n"; - result = Float(32); - } else if (type.is_bool()) { - // unchanged - } else if (type.is_int() && type.bits() <= 32) { - result = Int(32); - } else if (type.is_uint() && type.bits() <= 32) { - result = UInt(32); - } else { - user_error << "GLSL: Can't represent type '" << type << "'.\n"; - } - } else { - user_assert(type.lanes() <= 4) - << "GLSL: vector types wider than 4 aren't supported\n"; - user_assert(type.is_bool() || type.is_int() || type.is_uint() || type.is_float()) - << "GLSL: Can't represent vector type '" << type << "'.\n"; - Type scalar_type = type.element_of(); - result = map_type(scalar_type).with_lanes(type.lanes()); - } - return result; -} - -// Identifiers containing double underscores '__' are reserved in GLSL, so we -// have to use a different name mangling scheme than in the C code generator. 
-string CodeGen_OpenGLCompute_C::print_name(const string &name) { - const string mangled = CodeGen_C::print_name(name); - return replace_all(mangled, "__", "XX"); -} - -string CodeGen_OpenGLCompute_C::print_type(Type type, AppendSpaceIfNeeded space) { - ostringstream oss; - type = map_type(type); - if (type.is_scalar()) { - if (type.is_float()) { - oss << "float"; - } else if (type.is_bool()) { - oss << "bool"; - } else if (type.is_int()) { - oss << "int"; - } else if (type.is_uint()) { - oss << "uint"; - } else { - internal_error << "GLSL: invalid type '" << type << "' encountered.\n"; - } - } else { - if (type.is_float()) { - // no prefix for float vectors - } else if (type.is_bool()) { - oss << "b"; - } else if (type.is_int()) { - oss << "i"; - } else if (type.is_uint()) { - oss << "u"; - } else { - internal_error << "GLSL: invalid type '" << type << "' encountered.\n"; - } - oss << "vec" << type.lanes(); - } - - if (space == AppendSpace) { - oss << " "; - } - - return oss.str(); -} - -string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { - return "gl_LocalInvocationID.x"; - } else if (ends_with(name, ".__thread_id_y")) { - return "gl_LocalInvocationID.y"; - } else if (ends_with(name, ".__thread_id_z")) { - return "gl_LocalInvocationID.z"; - } else if (ends_with(name, ".__thread_id_w")) { - internal_error << "4-dimension loops with " << name << " are not supported\n"; - } else if (ends_with(name, ".__block_id_x")) { - return "gl_WorkGroupID.x"; - } else if (ends_with(name, ".__block_id_y")) { - return "gl_WorkGroupID.y"; - } else if (ends_with(name, ".__block_id_z")) { - return "gl_WorkGroupID.z"; - } else if (ends_with(name, ".__block_id_w")) { - internal_error << "4-dimension loops with " << name << " are not supported\n"; - } - internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; - return ""; -} - -int thread_loop_workgroup_index(const string &name) { - string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z", - ".__thread_id_w"}; - for (size_t i = 0; i < sizeof(ids) / sizeof(string); i++) { - if (ends_with(name, ids[i])) { - return i; - } - } - return -1; -} - -void CodeGen_OpenGLCompute_C::visit(const FloatImm *op) { - ostringstream oss; - // Print integral numbers with trailing ".0". For fractional numbers use a - // precision of 9 digits, which should be enough to recover the binary - // float unambiguously from the decimal representation (if iostreams - // implements correct rounding). - const float truncated = (op->value < 0 ? std::ceil(op->value) : std::floor(op->value)); - if (truncated == op->value) { - oss << std::fixed << std::setprecision(1) << op->value; - } else { - oss << std::setprecision(9) << op->value; - } - id = oss.str(); -} - -void CodeGen_OpenGLCompute_C::visit(const UIntImm *op) { - if (op->type == Bool()) { - if (op->value == 1) { - id = "true"; - } else { - id = "false"; - } - } else { - id = std::to_string(op->value) + "u"; - } -} - -void CodeGen_OpenGLCompute_C::visit(const Max *op) { - print_expr(Call::make(op->type, "max", {op->a, op->b}, Call::PureExtern)); -} - -void CodeGen_OpenGLCompute_C::visit(const Min *op) { - print_expr(Call::make(op->type, "min", {op->a, op->b}, Call::PureExtern)); -} - -void CodeGen_OpenGLCompute_C::visit(const Mod *op) { - if (op->type.is_int() || op->type.is_uint()) { - // Just exploit the Euclidean identity - // FIXME: Why doesn't lower_euclidean_mod work for glsl? 
- // https://github.com/halide/Halide/issues/4979 - Expr zero = make_zero(op->type); - Expr equiv = select(op->a == zero, zero, - op->a - (op->a / op->b) * op->b); - equiv = common_subexpression_elimination(equiv); - print_expr(equiv); - } else { - print_expr(Call::make(op->type, "mod", {op->a, op->b}, Call::Extern)); - } -} - -// The following comparisons are defined for ivec and vec -// types, so we don't use call_builtin -void CodeGen_OpenGLCompute_C::visit(const EQ *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "equal", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const NE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "notEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const LT *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "lessThan", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const LE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "lessThanEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const GT *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "greaterThan", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const GE *op) { - if (op->type.is_vector()) { - print_expr(Call::make(op->type, "greaterThanEqual", {op->a, op->b}, Call::Extern)); - } else { - CodeGen_C::visit(op); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Shuffle *op) { - // The halide Shuffle represents the llvm intrinisc - // shufflevector, however, for GLSL its use is limited to swizzling - // up to a four channel vec type. - - internal_assert(op->vectors.size() == 1); - - int shuffle_lanes = op->type.lanes(); - internal_assert(shuffle_lanes <= 4); - - string expr = print_expr(op->vectors[0]); - - // Create a swizzle expression for the shuffle - string swizzle; - for (int i = 0; i != shuffle_lanes; ++i) { - int channel = op->indices[i]; - internal_assert(channel < 4) << "Shuffle of invalid channel"; - swizzle += get_lane_suffix(channel); - } - - print_assignment(op->type, expr + "." + swizzle); -} - -void CodeGen_OpenGLCompute_C::visit(const Call *op) { - if (op->is_intrinsic(Call::gpu_thread_barrier)) { - internal_assert(op->args.size() == 1) << "gpu_thread_barrier() intrinsic must specify memory fence type.\n"; - - const auto *fence_type_ptr = as_const_int(op->args[0]); - internal_assert(fence_type_ptr) << "gpu_thread_barrier() parameter is not a constant integer.\n"; - auto fence_type = *fence_type_ptr; - - stream << get_indent() << "barrier();\n"; - - // barrier() is an execution barrier; for memory behavior, we'll use the - // least-common-denominator groupMemoryBarrier(), because other fence types - // require extensions or GL 4.3 as a minumum. - if (fence_type & CodeGen_GPU_Dev::MemoryFenceType::Device || - fence_type & CodeGen_GPU_Dev::MemoryFenceType::Shared) { - stream << "groupMemoryBarrier();\n"; - } - print_assignment(op->type, "0"); - } else if (op->is_intrinsic(Call::lerp)) { - // Implement lerp using GLSL's mix() function, which always uses - // floating point arithmetic. 
- Expr zero_val = op->args[0]; - Expr one_val = op->args[1]; - Expr weight = op->args[2]; - - internal_assert(weight.type().is_uint() || weight.type().is_float()); - if (weight.type().is_uint()) { - // Normalize integer weights to [0.0f, 1.0f] range. - internal_assert(weight.type().bits() < 32); - weight = Div::make(Cast::make(Float(32), weight), - Cast::make(Float(32), weight.type().max())); - } else if (op->type.is_uint()) { - // Round float weights down to next multiple of (1/op->type.imax()) - // to give same results as lerp based on integer arithmetic. - internal_assert(op->type.bits() < 32); - weight = floor(weight * op->type.max()) / op->type.max(); - } - - Type result_type = Float(32, op->type.lanes()); - Expr e = Call::make(result_type, "mix", {zero_val, one_val, weight}, Call::Extern); - - if (!op->type.is_float()) { - // Mirror rounding implementation of Halide's integer lerp. - e = Cast::make(op->type, floor(e + 0.5f)); - } - print_expr(e); - return; - } else if (op->is_intrinsic(Call::abs)) { - internal_assert(op->args.size() == 1); - Expr a = op->args[0]; - Type target_type = map_type(op->type); - if (op->type != Int(32)) { - print_assignment(target_type, print_type(target_type) + "(abs(" + print_expr(a) + "))"); - } else { - print_assignment(target_type, "abs(" + print_expr(a) + ")"); - } - return; - } else if (op->is_intrinsic(Call::absd)) { - internal_assert(op->args.size() == 2); - Expr a = op->args[0]; - Expr b = op->args[1]; - Expr e = cast(op->type, select(a < b, b - a, a - b)); - print_expr(e); - return; - } else if (op->is_intrinsic(Call::return_second)) { - internal_assert(op->args.size() == 2); - // Simply discard the first argument, which is generally a call to - // 'halide_printf'. - print_assignment(op->type, print_expr(op->args[1])); - return; - } else if (op->is_intrinsic(Call::round)) { - print_assignment(op->type, "roundEven(" + print_expr(op->args[0]) + ")"); - return; - } else if (op->name == "fast_inverse_f32") { - print_expr(make_one(op->type) / op->args[0]); - return; - } else if (op->name == "fast_inverse_sqrt_f32") { - print_expr(make_one(op->type) / sqrt(op->args[0])); - return; - } else if (op->name == "pow_f32") { - if (can_prove(op->args[0] > 0)) { - ostringstream rhs; - rhs << "pow(" << print_expr(op->args[0]) << ", " << print_expr(op->args[1]) << ")"; - print_assignment(op->type, rhs.str()); - return; - } else { - ostringstream base; - string a = print_expr(op->args[0]); - string b = print_expr(op->args[1]); - base << "pow(abs(" << a << "), " << b << ")"; - string c = print_assignment(op->type, base.str()); - Expr a_var = is_const(op->args[0]) ? op->args[0] : Variable::make(op->type, a); - Expr b_var = is_const(op->args[1]) ? op->args[1] : Variable::make(op->type, b); - Expr c_var = Variable::make(op->type, c); - // OpenGL isn't required to produce NaNs, so we return - // zero in the undefined case. 
- Expr equiv = select(a_var > 0 || b_var % 2 == 0, c_var, - b_var % 2 == 1, -c_var, - 0.0f); - print_expr(simplify(equiv)); - return; - } - } else if (op->is_intrinsic(Call::shift_right)) { - print_assignment(op->type, print_expr(op->args[0]) + " >> " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::shift_left)) { - print_assignment(op->type, print_expr(op->args[0]) + " << " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_not)) { - print_assignment(op->type, "~" + print_expr(op->args[0])); - } else if (op->is_intrinsic(Call::bitwise_and)) { - print_assignment(op->type, print_expr(op->args[0]) + " & " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_or)) { - print_assignment(op->type, print_expr(op->args[0]) + " | " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::bitwise_xor)) { - print_assignment(op->type, print_expr(op->args[0]) + " ^ " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::div_round_to_zero)) { - print_assignment(op->type, print_expr(op->args[0]) + " / " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::mod_round_to_zero)) { - print_assignment(op->type, print_expr(op->args[0]) + " % " + print_expr(op->args[1])); - } else if (op->is_intrinsic(Call::saturating_cast)) { - Expr e = lower_intrinsic(op); - print_expr(e); - return; - } else { - auto it = builtin.find(op->name); - if (it == builtin.end()) { - user_error << "GLSL: unknown function '" << op->name << "' encountered.\n"; - } - - ostringstream rhs; - rhs << it->second << "("; - for (size_t i = 0; i < op->args.size(); i++) { - if (i > 0) { - rhs << ", "; - } - rhs << print_expr(op->args[i]); - } - rhs << ")"; - print_assignment(op->type, rhs.str()); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Cast *op) { - Type value_type = op->value.type(); - // If both types are represented by the same GLSL type, no explicit cast - // is necessary. - Type target_type = map_type(op->type); - if (target_type == map_type(value_type)) { - Expr value = op->value; - if (value_type.code() == Type::Float) { - // float->int conversions may need explicit truncation if an - // integer type is embedded into a float. (Note: overflows are - // considered undefined behavior, so we do nothing about values - // that are out of range of the target type.) - if (op->type.code() == Type::UInt) { - value = simplify(floor(value)); - } else if (op->type.code() == Type::Int) { - value = simplify(trunc(value)); - } - } - // FIXME: Overflow is not UB for most Halide types - // https://github.com/halide/Halide/issues/4975 - value.accept(this); - } else { - print_assignment(target_type, print_type(target_type) + "(" + print_expr(op->value) + ")"); - } -} - -void CodeGen_OpenGLCompute_C::visit(const For *loop) { - user_assert(loop->for_type != ForType::GPULane) - << "The OpenGLCompute backend does not support the gpu_lanes() scheduling directive."; - - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; - internal_assert(is_const_zero(loop->min)); - - debug(4) << "loop extent is " << loop->extent << "\n"; - // - // Need to extract workgroup size. 
- // - int index = thread_loop_workgroup_index(loop->name); - if (index >= 0) { - const IntImm *int_limit = loop->extent.as(); - user_assert(int_limit != nullptr) << "For OpenGLCompute workgroup size must be a constant integer.\n"; - int new_workgroup_size = int_limit->value; - user_assert(workgroup_size[index] == 0 || - workgroup_size[index] == new_workgroup_size) - << "OpenGLCompute requires all gpu kernels have same workgroup size, " - << "but two different ones were encountered " << workgroup_size[index] - << " and " << new_workgroup_size - << " in dimension " << index << ".\n"; - workgroup_size[index] = new_workgroup_size; - debug(4) << "Workgroup size for index " << index << " is " << workgroup_size[index] << "\n"; - } - - stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) - << " = int(" << simt_intrinsic(loop->name) << ");\n"; - - loop->body.accept(this); - - } else { - user_assert(loop->for_type != ForType::Parallel) - << "Cannot use parallel loops inside OpenGLCompute kernel\n"; - CodeGen_C::visit(loop); - } -} - -void CodeGen_OpenGLCompute_C::visit(const Ramp *op) { - if (op->lanes > 4) { - internal_error << "GLSL: ramp lanes " << op->lanes << " is not supported\n"; - } - - ostringstream rhs; - // Print the sequence vec(0, 1, 2, ...). - rhs << print_type(op->type) << "("; - for (int i = 0; i < op->type.lanes(); i++) { - rhs << i; - if (i != op->type.lanes() - 1) { - rhs << ", "; - } - } - rhs << ")"; - - // Multiply by the stride and add the base. - rhs << " * " << print_expr(op->stride) << " + " << print_expr(op->base); - - print_assignment(op->type, rhs.str()); -} - -void CodeGen_OpenGLCompute_C::visit(const Broadcast *op) { - string id_value = print_expr(op->value); - ostringstream oss; - oss << print_type(op->type.with_lanes(op->lanes)) << "(" << id_value << ")"; - print_assignment(op->type.with_lanes(op->lanes), oss.str()); -} - -void CodeGen_OpenGLCompute_C::visit(const Load *op) { - user_assert(is_const_one(op->predicate)) << "GLSL: predicated load is not supported.\n"; - // https://github.com/halide/Halide/issues/4975 - - string name = print_name(op->name); - if (!allocations.contains(op->name)) { - name += ".data"; - } - - // If the index is scalar, just index the buffer using the index. - if (op->type.is_scalar()) { - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - string rhs = name + "[" + index_id + "]"; - print_assignment(op->type, rhs); - return; - } - - // If this is a dense vector load and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined() && buffer_is_vector[op->name]) { - string index_id = print_expr(ramp_base / op->type.lanes()); - string rhs = name + "[" + index_id + "]"; - print_assignment(op->type, rhs); - return; - } - - // Gather vector elements. 
- internal_assert(op->type.is_vector()); - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - string rhs = print_type(op->type) + "("; - for (int i = 0; i < op->type.lanes(); i++) { - rhs += name + "[" + index_id + "[" + std::to_string(i) + "]]"; - if (i != op->type.lanes() - 1) { - rhs += ", "; - } - } - rhs += ")"; - print_assignment(op->type, rhs); -} - -void CodeGen_OpenGLCompute_C::visit(const Store *op) { - user_assert(is_const_one(op->predicate)) << "GLSL: predicated store is not supported.\n"; - // https://github.com/halide/Halide/issues/4975 - - string name = print_name(op->name); - if (!allocations.contains(op->name)) { - name += ".data"; - } - - string value_id = print_expr(op->value); - - // If the index is scalar, just index the buffer using the index. - if (op->value.type().is_scalar()) { - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - stream << get_indent() << name << "[" << index_id << "] = "; - stream << value_id << ";\n"; - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); - return; - } - - // If this is a dense vector store and the buffer has a vector base type, - // then index the buffer using the base of the ramp divided by the number - // of lanes. - Expr ramp_base = strided_ramp_base(op->index); - if (ramp_base.defined() && buffer_is_vector[op->name]) { - string index_id = print_expr(ramp_base / op->value.type().lanes()); - stream << get_indent() << name << "[" << index_id << "] = "; - stream << value_id << ";\n"; - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); - return; - } - - // Scatter vector elements. - internal_assert(op->value.type().is_vector()); - internal_assert(!buffer_is_vector[op->name]); - string index_id = print_expr(op->index); - for (int i = 0; i < op->value.type().lanes(); i++) { - string sub_index_id = index_id + "[" + std::to_string(i) + "]"; - stream << get_indent() << name << "[" << sub_index_id << "] = "; - stream << value_id << "[" << std::to_string(i) << "];\n"; - } - - // Need a cache clear on stores to avoid reusing stale loaded - // values from before the store. - cache.clear(); -} - -void CodeGen_OpenGLCompute_C::visit(const Select *op) { - ostringstream rhs; - string true_val = print_expr(op->true_value); - string false_val = print_expr(op->false_value); - string cond = print_expr(op->condition); - if (op->type.is_scalar()) { - rhs << cond << " ? " << true_val << " : " << false_val; - } else { - rhs << print_type(op->type) << "("; - for (int i = 0; i < op->type.lanes(); i++) { - string index = "[" + std::to_string(i) + "]"; - rhs << cond << index << " ? 
" - << true_val << index << " : " - << false_val << index; - if (i != op->type.lanes() - 1) { - rhs << ", "; - } - } - rhs << ")"; - } - print_assignment(op->type, rhs.str()); -} - -class CodeGen_OpenGLCompute_Dev : public CodeGen_GPU_Dev { -public: - CodeGen_OpenGLCompute_Dev(const Target &target); - - // CodeGen_GPU_Dev interface - void add_kernel(Stmt stmt, - const std::string &name, - const std::vector &args) override; - - void init_module() override; - - std::vector compile_to_src() override; - - std::string get_current_kernel_name() override; - - void dump() override; - - std::string print_gpu_name(const std::string &name) override; - - std::string api_unique_name() override { - return "openglcompute"; - } - bool kernel_run_takes_types() const override { - return true; - } - -protected: - std::ostringstream src_stream; - std::string cur_kernel_name; - CodeGen_OpenGLCompute_C glc; -}; - -CodeGen_OpenGLCompute_Dev::CodeGen_OpenGLCompute_Dev(const Target &target) - : glc(src_stream, target) { -} - -void CodeGen_OpenGLCompute_Dev::add_kernel(Stmt s, - const string &name, - const vector &args) { - debug(2) << "CodeGen_OpenGLCompute_Dev::compile " << name << "\n"; - - // TODO: do we have to uniquify these names, or can we trust that they are safe? - cur_kernel_name = name; - glc.add_kernel(s, name, args); -} - -namespace { -class FindSharedAllocations : public IRVisitor { - using IRVisitor::visit; - - void visit(const Allocate *op) override { - op->body.accept(this); - if (op->memory_type == MemoryType::GPUShared) { - allocs.push_back(op); - } - } - -public: - vector allocs; -}; - -// Check if all loads and stores to the member 'buffer' are dense, aligned, and -// have the same number of lanes. If this is indeed the case then the 'lanes' -// member stores the number of lanes in those loads and stores. -class CheckAlignedDenseVectorLoadStore : public IRVisitor { -public: - // True if all loads and stores from the buffer are dense, aligned, and all - // have the same number of lanes, false otherwise. - bool are_all_dense = true; - - // The number of lanes in the loads and stores. If the number of lanes is - // variable, then are_all_dense is set to false regardless, and this value - // is undefined. Initially set to -1 before any dense operation is - // discovered. - int lanes = -1; - - CheckAlignedDenseVectorLoadStore(string buffer) - : buffer(std::move(buffer)) { - } - -private: - // The name of the buffer to check. 
- string buffer; - - using IRVisitor::visit; - - void visit(const Load *op) override { - IRVisitor::visit(op); - - if (op->name != buffer) { - return; - } - - if (op->type.is_scalar()) { - are_all_dense = false; - return; - } - - Expr ramp_base = strided_ramp_base(op->index); - if (!ramp_base.defined()) { - are_all_dense = false; - return; - } - - if ((op->alignment.modulus % op->type.lanes() != 0) || - (op->alignment.remainder % op->type.lanes() != 0)) { - are_all_dense = false; - return; - } - - if (lanes != -1 && op->type.lanes() != lanes) { - are_all_dense = false; - return; - } - - lanes = op->type.lanes(); - } - - void visit(const Store *op) override { - IRVisitor::visit(op); - - if (op->name != buffer) { - return; - } - - if (op->value.type().is_scalar()) { - are_all_dense = false; - return; - } - - Expr ramp_base = strided_ramp_base(op->index); - if (!ramp_base.defined()) { - are_all_dense = false; - return; - } - - if ((op->alignment.modulus % op->value.type().lanes() != 0) || - (op->alignment.remainder % op->value.type().lanes() != 0)) { - are_all_dense = false; - return; - } - - if (lanes != -1 && op->value.type().lanes() != lanes) { - are_all_dense = false; - return; - } - - lanes = op->value.type().lanes(); - } -}; -} // namespace - -void CodeGen_OpenGLCompute_C::add_kernel(const Stmt &s, - const string &name, - const vector &args) { - - debug(2) << "Adding OpenGLCompute kernel " << name << "\n"; - cache.clear(); - - if (target.os == Target::Android) { - stream << "#version 310 es\n" - << "#extension GL_ANDROID_extension_pack_es31a : require\n"; - } else if (target.has_feature(Target::EGL)) { - stream << "#version 310 es\n"; - } else { - stream << "#version 430\n"; - } - stream << "float float_from_bits(int x) { return intBitsToFloat(int(x)); }\n"; - stream << "#define halide_maybe_unused(x) (void)(x)\n"; - - for (size_t i = 0; i < args.size(); i++) { - if (args[i].is_buffer) { - // - // layout(binding = 10) buffer buffer10 { - // vec3 data[]; - // } inBuffer; - // - CheckAlignedDenseVectorLoadStore check_dense(args[i].name); - s.accept(&check_dense); - int lanes = check_dense.are_all_dense ? check_dense.lanes : 1; - buffer_is_vector[args[i].name] = lanes > 1; - stream << "layout(binding=" << i << ")" - << " buffer buffer" << i << " { " - << print_type(args[i].type.with_lanes(lanes)) << " data[]; } " - << print_name(args[i].name) << ";\n"; - } else { - stream << "layout(location = " << i << ") uniform " << print_type(args[i].type) - << " " << print_name(args[i].name) << ";\n"; - } - } - - // Find all the shared allocations and declare them at global scope. - FindSharedAllocations fsa; - s.accept(&fsa); - for (const Allocate *op : fsa.allocs) { - internal_assert(op->extents.size() == 1 && is_const(op->extents[0])); - stream << "shared " - << print_type(op->type) << " " - << print_name(op->name) << "[" - << op->extents[0] << "];\n"; - } - - // We'll figure out the workgroup size while traversing the stmt - workgroup_size[0] = 0; - workgroup_size[1] = 0; - workgroup_size[2] = 0; - - stream << "void main()\n{\n"; - indent += 2; - print(s); - indent -= 2; - stream << "}\n"; - - // Declare the workgroup size. 
- indent += 2; - stream << "layout(local_size_x = " << workgroup_size[0]; - if (workgroup_size[1] > 1) { - stream << ", local_size_y = " << workgroup_size[1]; - } - if (workgroup_size[2] > 1) { - stream << ", local_size_z = " << workgroup_size[2]; - } - stream << ") in;\n// end of kernel " << name << "\n"; - indent -= 2; -} - -void CodeGen_OpenGLCompute_Dev::init_module() { - src_stream.str(""); - src_stream.clear(); - cur_kernel_name = ""; -} - -void CodeGen_OpenGLCompute_C::visit(const Allocate *op) { - debug(2) << "OpenGLCompute: Allocate " << op->name << " of type " << op->type << " on device\n"; - - stream << get_indent(); - Allocation alloc; - alloc.type = op->type; - allocations.push(op->name, alloc); - - internal_assert(!op->extents.empty()); - Expr extent = 1; - for (const Expr &e : op->extents) { - extent *= e; - } - extent = simplify(extent); - internal_assert(is_const(extent)); - - if (op->memory_type != MemoryType::GPUShared) { - stream << "{\n"; - indent += 2; - stream << get_indent(); - // Shared allocations were already declared at global scope. - stream << print_type(op->type) << " " - << print_name(op->name) << "[" - << op->extents[0] << "];\n"; - } - op->body.accept(this); - - if (op->memory_type != MemoryType::GPUShared) { - indent -= 2; - stream << get_indent() << "}\n"; - } - - buffer_is_vector[op->name] = op->type.is_vector(); -} - -void CodeGen_OpenGLCompute_C::visit(const Free *op) { - debug(2) << "OpenGLCompute: Free on device for " << op->name << "\n"; - - allocations.pop(op->name); -} - -void CodeGen_OpenGLCompute_C::visit(const Evaluate *op) { - if (is_const(op->value)) { - return; - } - print_expr(op->value); -} - -void CodeGen_OpenGLCompute_C::visit(const IntImm *op) { - if (op->type == Int(32)) { - // GL seems to interpret some large int immediates as uints. - id = "int(" + std::to_string(op->value) + ")"; - } else { - id = print_type(op->type) + "(" + std::to_string(op->value) + ")"; - } -} - -vector CodeGen_OpenGLCompute_Dev::compile_to_src() { - string str = src_stream.str(); - debug(1) << "GLSL Compute source:\n" - << str << "\n"; - vector buffer(str.begin(), str.end()); - buffer.push_back(0); - return buffer; -} - -string CodeGen_OpenGLCompute_Dev::get_current_kernel_name() { - return cur_kernel_name; -} - -void CodeGen_OpenGLCompute_Dev::dump() { - std::cerr << src_stream.str() << "\n"; -} - -std::string CodeGen_OpenGLCompute_Dev::print_gpu_name(const std::string &name) { - return name; -} - -} // namespace - -std::unique_ptr new_CodeGen_OpenGLCompute_Dev(const Target &target) { - return std::make_unique(target); -} - -} // namespace Internal -} // namespace Halide diff --git a/src/CodeGen_OpenGLCompute_Dev.h b/src/CodeGen_OpenGLCompute_Dev.h deleted file mode 100644 index f0a63c885909..000000000000 --- a/src/CodeGen_OpenGLCompute_Dev.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef HALIDE_CODEGEN_OPENGLCOMPUTE_DEV_H -#define HALIDE_CODEGEN_OPENGLCOMPUTE_DEV_H - -/** \file - * Defines the code-generator for producing GLSL kernel code for OpenGL Compute. 
- */ - -#include - -namespace Halide { - -struct Target; - -namespace Internal { - -struct CodeGen_GPU_Dev; - -std::unique_ptr new_CodeGen_OpenGLCompute_Dev(const Target &target); - -} // namespace Internal -} // namespace Halide - -#endif diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index b86c99f9269e..61b365f2f7aa 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -291,8 +291,6 @@ class CodeGen_Vulkan_Dev : public CodeGen_GPU_Dev { // have the same number of lanes. If this is indeed the case then the 'lanes' // member stores the number of lanes in those loads and stores. // -// FIXME: Refactor this and the version in CodeGen_OpenGLCompute_Dev to a common place! -// class CheckAlignedDenseVectorLoadStore : public IRVisitor { public: // True if all loads and stores from the buffer are dense, aligned, and all diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 33fa3b36e78e..551acfcdebf2 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -244,8 +244,6 @@ DeviceAPI Deserializer::deserialize_device_api(Serialize::DeviceAPI device_api) return DeviceAPI::CUDA; case Serialize::DeviceAPI::OpenCL: return DeviceAPI::OpenCL; - case Serialize::DeviceAPI::OpenGLCompute: - return DeviceAPI::OpenGLCompute; case Serialize::DeviceAPI::Metal: return DeviceAPI::Metal; case Serialize::DeviceAPI::Hexagon: diff --git a/src/DeviceAPI.h b/src/DeviceAPI.h index 1f67aaf7b048..12476a23b724 100644 --- a/src/DeviceAPI.h +++ b/src/DeviceAPI.h @@ -18,7 +18,6 @@ enum class DeviceAPI { Default_GPU, CUDA, OpenCL, - OpenGLCompute, Metal, Hexagon, HexagonDma, @@ -34,7 +33,6 @@ const DeviceAPI all_device_apis[] = {DeviceAPI::None, DeviceAPI::Default_GPU, DeviceAPI::CUDA, DeviceAPI::OpenCL, - DeviceAPI::OpenGLCompute, DeviceAPI::Metal, DeviceAPI::Hexagon, DeviceAPI::HexagonDma, diff --git a/src/DeviceInterface.cpp b/src/DeviceInterface.cpp index 9a0cb2f97e99..27f6b549ee7d 100644 --- a/src/DeviceInterface.cpp +++ b/src/DeviceInterface.cpp @@ -94,8 +94,6 @@ const halide_device_interface_t *get_device_interface_for_device_api(DeviceAPI d name = "opencl"; } else if (d == DeviceAPI::CUDA) { name = "cuda"; - } else if (d == DeviceAPI::OpenGLCompute) { - name = "openglcompute"; } else if (d == DeviceAPI::Hexagon) { name = "hexagon"; } else if (d == DeviceAPI::HexagonDma) { @@ -154,8 +152,6 @@ DeviceAPI get_default_device_api_for_target(const Target &target) { return DeviceAPI::OpenCL; } else if (target.has_feature(Target::CUDA)) { return DeviceAPI::CUDA; - } else if (target.has_feature(Target::OpenGLCompute)) { - return DeviceAPI::OpenGLCompute; } else if (target.arch != Target::Hexagon && target.has_feature(Target::HVX)) { return DeviceAPI::Hexagon; } else if (target.has_feature(Target::HexagonDma)) { @@ -192,9 +188,6 @@ Expr make_device_interface_call(DeviceAPI device_api, MemoryType memory_type) { case DeviceAPI::Metal: interface_name = "halide_metal_device_interface"; break; - case DeviceAPI::OpenGLCompute: - interface_name = "halide_openglcompute_device_interface"; - break; case DeviceAPI::Hexagon: interface_name = "halide_hexagon_device_interface"; break; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index cd59fd470d38..ef5a75344bb8 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -627,7 +627,6 @@ class ExtractSharedAndHeapAllocations : public IRMutator { if (!may_merge_allocs_of_different_type && mem_allocs[free_spaces[i]].group[0].type != alloc.type) { - // Types must also match for 
OpenGLCompute continue; } @@ -649,7 +648,6 @@ class ExtractSharedAndHeapAllocations : public IRMutator { if (!may_merge_allocs_of_different_type && mem_allocs[free_spaces[i]].group[0].type != alloc.type) { - // Types must also match for OpenGLCompute continue; } @@ -760,7 +758,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { // lifetimes, and then cluster the groups according to which // ones can share a single allocation. For cuda, opencl, and // similar we get one big combined allocation per memory - // type. For vulkan, openglcompute and direct3d, we also separate by + // type. For vulkan and direct3d, we also separate by // element type. map, vector> clustered_allocs; @@ -1034,8 +1032,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { : device_api(d), thread_id_var_name(unique_name('t')), num_threads_var_name(unique_name('t')), - may_merge_allocs_of_different_type(device_api != DeviceAPI::OpenGLCompute && - device_api != DeviceAPI::D3D12Compute && + may_merge_allocs_of_different_type(device_api != DeviceAPI::D3D12Compute && device_api != DeviceAPI::Vulkan && device_api != DeviceAPI::WebGPU) { } diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index 52cb3714268c..bc03dd124d9a 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -99,9 +99,6 @@ ostream &operator<<(ostream &out, const DeviceAPI &api) { case DeviceAPI::OpenCL: out << ""; break; - case DeviceAPI::OpenGLCompute: - out << ""; - break; case DeviceAPI::Metal: out << ""; break; diff --git a/src/JITModule.cpp b/src/JITModule.cpp index 0d37c07284c3..ffd8949d4ca1 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -58,48 +58,6 @@ typedef struct CUctx_st *CUcontext; typedef struct cl_context_st *cl_context; typedef struct cl_command_queue_st *cl_command_queue; -void load_opengl(bool needs_egl) { -#if defined(__linux__) - if (have_symbol("glXGetCurrentContext") && have_symbol("glDeleteTextures")) { - debug(1) << "OpenGL support code already linked in...\n"; - } else { - debug(1) << "Looking for OpenGL support code...\n"; - string error; - if (needs_egl) { - // NVIDIA EGL prefers users to load libOpenGL.so instead of libGL.so - // The way we're using it, it seems like libGL.so.1 is a valid fallback. - // See here for more details: https://developer.nvidia.com/blog/linking-opengl-server-side-rendering - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libOpenGL.so.0", &error); - if (!error.empty()) { - debug(1) << "Could not find libOpenGL.so.0 when EGL requested. 
Falling back to libGL.so.1\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error); - } - user_assert(error.empty()) << "Could not find libOpenGL.so.0 or libGL.so.1\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libEGL.so.1", &error); - user_assert(error.empty()) << "Could not find libEGL.so.1\n"; - } else { - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libGL.so.1", &error); - user_assert(error.empty()) << "Could not find libGL.so\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("libX11.so.6", &error); - user_assert(error.empty()) << "Could not find libX11.so.6\n"; - } - } -#elif defined(__APPLE__) - if (have_symbol("aglCreateContext") && have_symbol("glDeleteTextures")) { - debug(1) << "OpenGL support code already linked in...\n"; - } else { - debug(1) << "Looking for OpenGL support code...\n"; - string error; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("/System/Library/Frameworks/AGL.framework/AGL", &error); - user_assert(error.empty()) << "Could not find AGL.framework\n"; - llvm::sys::DynamicLibrary::LoadLibraryPermanently("/System/Library/Frameworks/OpenGL.framework/OpenGL", &error); - user_assert(error.empty()) << "Could not find OpenGL.framework\n"; - } -#else - internal_error << "JIT support for OpenGL on anything other than linux or OS X not yet implemented\n"; -#endif -} - void load_metal() { #if defined(__APPLE__) if (have_symbol("MTLCreateSystemDefaultDevice")) { @@ -766,7 +724,6 @@ enum RuntimeKind { OpenCL, Metal, CUDA, - OpenGLCompute, // NOTE: this feature is deprecated and will be removed in Halide 17 Hexagon, D3D12Compute, Vulkan, @@ -774,7 +731,6 @@ enum RuntimeKind { OpenCLDebug, MetalDebug, CUDADebug, - OpenGLComputeDebug, // NOTE: this feature is deprecated and will be removed in Halide 17 HexagonDebug, D3D12ComputeDebug, VulkanDebug, @@ -812,7 +768,6 @@ JITModule &make_module(llvm::Module *for_module, Target target, one_gpu.set_feature(Target::Metal, false); one_gpu.set_feature(Target::CUDA, false); one_gpu.set_feature(Target::HVX, false); - one_gpu.set_feature(Target::OpenGLCompute, false); one_gpu.set_feature(Target::D3D12Compute, false); one_gpu.set_feature(Target::Vulkan, false); one_gpu.set_feature(Target::WebGPU, false); @@ -847,17 +802,6 @@ JITModule &make_module(llvm::Module *for_module, Target target, one_gpu.set_feature(Target::CUDA); module_name += "cuda"; break; - case OpenGLComputeDebug: - one_gpu.set_feature(Target::Debug); - one_gpu.set_feature(Target::OpenGLCompute); - module_name = "debug_openglcompute"; - load_opengl(one_gpu.has_feature(Target::EGL)); - break; - case OpenGLCompute: - one_gpu.set_feature(Target::OpenGLCompute); - module_name += "openglcompute"; - load_opengl(one_gpu.has_feature(Target::EGL)); - break; case HexagonDebug: one_gpu.set_feature(Target::Debug); one_gpu.set_feature(Target::HVX); @@ -1065,13 +1009,6 @@ std::vector JITSharedRuntime::get(llvm::Module *for_module, const Tar result.push_back(m); } } - if (target.has_feature(Target::OpenGLCompute)) { - auto kind = target.has_feature(Target::Debug) ? OpenGLComputeDebug : OpenGLCompute; - JITModule m = make_module(for_module, target, kind, result, create); - if (m.compiled()) { - result.push_back(m); - } - } if (target.has_feature(Target::HVX)) { auto kind = target.has_feature(Target::Debug) ? 
HexagonDebug : Hexagon; JITModule m = make_module(for_module, target, kind, result, create); diff --git a/src/LLVM_Runtime_Linker.cpp b/src/LLVM_Runtime_Linker.cpp index ad65bdc2ebc2..609fbc3467bc 100644 --- a/src/LLVM_Runtime_Linker.cpp +++ b/src/LLVM_Runtime_Linker.cpp @@ -111,13 +111,9 @@ DECLARE_CPP_INITMOD(module_jit_ref_count) DECLARE_CPP_INITMOD(msan) DECLARE_CPP_INITMOD(msan_stubs) DECLARE_CPP_INITMOD(opencl) -DECLARE_CPP_INITMOD(opengl_egl_context) -DECLARE_CPP_INITMOD(opengl_glx_context) -DECLARE_CPP_INITMOD(openglcompute) DECLARE_CPP_INITMOD(osx_clock) DECLARE_CPP_INITMOD(osx_get_symbol) DECLARE_CPP_INITMOD(osx_host_cpu_count) -DECLARE_CPP_INITMOD(osx_opengl_context) DECLARE_CPP_INITMOD(osx_yield) DECLARE_CPP_INITMOD(posix_aligned_alloc) DECLARE_CPP_INITMOD(posix_allocator) @@ -1211,23 +1207,6 @@ std::unique_ptr get_initial_module_for_target(Target t, llvm::LLVM modules.push_back(get_initmod_opencl(c, bits_64, debug)); } } - if (t.has_feature(Target::OpenGLCompute)) { - modules.push_back(get_initmod_openglcompute(c, bits_64, debug)); - if (t.os == Target::Android) { - // Only platform that supports OpenGL Compute for now. - modules.push_back(get_initmod_opengl_egl_context(c, bits_64, debug)); - } else if (t.os == Target::Linux) { - if (t.has_feature(Target::EGL)) { - modules.push_back(get_initmod_opengl_egl_context(c, bits_64, debug)); - } else { - modules.push_back(get_initmod_opengl_glx_context(c, bits_64, debug)); - } - } else if (t.os == Target::OSX) { - modules.push_back(get_initmod_osx_opengl_context(c, bits_64, debug)); - } else { - // You're on your own to provide definitions of halide_opengl_get_proc_address and halide_opengl_create_context - } - } if (t.has_feature(Target::Metal)) { modules.push_back(get_initmod_metal(c, bits_64, debug)); if (t.arch == Target::ARM) { diff --git a/src/Lower.cpp b/src/Lower.cpp index 37c4bac07efb..74af1aeffe28 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -211,7 +211,6 @@ void lower_impl(const vector &output_funcs, bool will_inject_host_copies = (t.has_gpu_feature() || - t.has_feature(Target::OpenGLCompute) || t.has_feature(Target::HexagonDma) || (t.arch != Target::Hexagon && (t.has_feature(Target::HVX)))); @@ -251,11 +250,10 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // OpenGL relies on GPU var canonicalization occurring before + // Vulkan relies on GPU var canonicalization occurring before // storage flattening. 
if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan) || - t.has_feature(Target::OpenGLCompute)) { + t.has_feature(Target::Vulkan)) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); @@ -327,8 +325,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after vectorizing:", s); if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan) || - t.has_feature(Target::OpenGLCompute)) { + t.has_feature(Target::Vulkan)) { debug(1) << "Injecting per-block gpu synchronization...\n"; s = fuse_gpu_thread_loops(s); log("Lowering after injecting per-block gpu synchronization:", s); diff --git a/src/Module.cpp b/src/Module.cpp index a00ff25e7d59..5bece0d7ebdd 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -332,7 +332,7 @@ struct ModuleContents { /** This is a copy of the code throughout the lowering process, which * reflects best the actual pipeline, without introducing device-specific * generated code from device-specific offloads (such as Cuda PTX, - * OpenGL Compute, etc...). In other words, we'd like to keep this + * etc...). In other words, we'd like to keep this * conceptually relevant and human-readable. */ Stmt conceptual_code; }; @@ -560,10 +560,6 @@ const Internal::Stmt &Module::get_conceptual_stmt() const { void Module::compile(const std::map &output_files) const { validate_outputs(output_files); - if (target().has_feature(Target::OpenGLCompute)) { - user_warning << "WARNING: OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.\n"; - } - // Minor but worthwhile optimization: if all of the output files are of types that won't // ever rely on submodules (e.g.: toplevel declarations in C/C++), don't bother resolving // the submodules, which can call compile_to_buffer(). diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 46e6544036b7..77a57efc1149 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -5,7 +5,6 @@ #include "CodeGen_GPU_Dev.h" #include "CodeGen_Metal_Dev.h" #include "CodeGen_OpenCL_Dev.h" -#include "CodeGen_OpenGLCompute_Dev.h" #include "CodeGen_PTX_Dev.h" #include "CodeGen_Vulkan_Dev.h" #include "CodeGen_WebGPU_Dev.h" @@ -166,7 +165,7 @@ class InjectGpuOffload : public IRMutator { return a.type.bits() > b.type.bits(); } else { // Ensure that buffer arguments come first: - // for many OpenGL/Compute systems, the + // for some GPU systems, the // legal indices for buffer args are much // more restrictive than for scalar args, // and scalar args can be 'grown' by @@ -267,9 +266,6 @@ class InjectGpuOffload : public IRMutator { // host arch or os. 
device_target.os = Target::OSUnknown; device_target.arch = Target::ArchUnknown; - if (target.has_feature(Target::OpenGLCompute)) { - cgdev[DeviceAPI::OpenGLCompute] = new_CodeGen_OpenGLCompute_Dev(device_target); - } if (target.has_feature(Target::CUDA)) { cgdev[DeviceAPI::CUDA] = new_CodeGen_PTX_Dev(device_target); } diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index c605d2038248..536b8994e686 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -957,10 +957,6 @@ void Pipeline::realize(JITUserContext *context, Target target = t; user_assert(defined()) << "Can't realize an undefined Pipeline\n"; - if (t.has_feature(Target::OpenGLCompute)) { - user_warning << "WARNING: OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.\n"; - } - debug(2) << "Realizing Pipeline for " << target << "\n"; if (target.has_unknowns()) { diff --git a/src/SelectGPUAPI.h b/src/SelectGPUAPI.h index ecb424364bb9..a38572d4946d 100644 --- a/src/SelectGPUAPI.h +++ b/src/SelectGPUAPI.h @@ -16,7 +16,7 @@ namespace Internal { /** Replace for loops with GPU_Default device_api with an actual * device API depending on what's enabled in the target. Choose the - * first of the following: opencl, cuda, openglcompute, opengl */ + * first of the following: opencl, cuda */ Stmt select_gpu_api(const Stmt &s, const Target &t); } // namespace Internal diff --git a/src/Serialization.cpp b/src/Serialization.cpp index f8be69271ff0..144d79af7e5e 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -214,8 +214,6 @@ Serialize::DeviceAPI Serializer::serialize_device_api(const DeviceAPI &device_ap return Serialize::DeviceAPI::CUDA; case DeviceAPI::OpenCL: return Serialize::DeviceAPI::OpenCL; - case DeviceAPI::OpenGLCompute: - return Serialize::DeviceAPI::OpenGLCompute; case DeviceAPI::Metal: return Serialize::DeviceAPI::Metal; case DeviceAPI::Hexagon: diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 223a33837c7a..d7e7c50002f6 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -422,8 +422,7 @@ class FlattenDimensions : public IRMutator { // Create image_load("name", name.buffer, x - x_min, x_extent, // y - y_min, y_extent, ...). Extents can be used by - // successive passes. OpenGL, for example, uses them - // for coordinate normalization. + // successive passes. vector args(2); args[0] = op->name; args[1] = buffer_var; @@ -600,7 +599,6 @@ Stmt storage_flattening(Stmt s, const vector &outputs, const map &env, const Target &target) { - // The OpenGL backend requires loop mins to be zero'd at this point. 
s = zero_gpu_loop_mins(s); // Make an environment that makes it easier to figure out which diff --git a/src/Target.cpp b/src/Target.cpp index c824fea1c928..082b5103bd0b 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -498,7 +498,6 @@ const std::map feature_name_map = { {"cl_doubles", Target::CLDoubles}, {"cl_half", Target::CLHalf}, {"cl_atomics64", Target::CLAtomics64}, - {"openglcompute", Target::OpenGLCompute}, {"egl", Target::EGL}, {"user_context", Target::UserContext}, {"profile", Target::Profile}, @@ -983,9 +982,6 @@ bool Target::supported() const { #if !defined(WITH_METAL) bad |= has_feature(Target::Metal); #endif -#if !defined(WITH_OPENGLCOMPUTE) - bad |= has_feature(Target::OpenGLCompute); -#endif #if !defined(WITH_D3D12) bad |= has_feature(Target::D3D12Compute); #endif @@ -1059,7 +1055,6 @@ bool Target::has_gpu_feature() const { has_feature(OpenCL) || has_feature(Metal) || has_feature(D3D12Compute) || - has_feature(OpenGLCompute) || has_feature(Vulkan) || has_feature(WebGPU)); } @@ -1118,14 +1113,12 @@ bool Target::supports_type(const Type &t) const { if (t.bits() == 64) { if (t.is_float()) { return (!has_feature(Metal) && - !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)) && (!has_feature(Vulkan) || has_feature(Target::VulkanFloat64)) && !has_feature(WebGPU)); } else { return (!has_feature(Metal) && - !has_feature(OpenGLCompute) && !has_feature(D3D12Compute) && (!has_feature(Vulkan) || has_feature(Target::VulkanInt64)) && !has_feature(WebGPU)); @@ -1157,8 +1150,6 @@ bool Target::supports_type(const Type &t, DeviceAPI device) const { // Shader Model 5.x can optionally support double-precision; 64-bit int // types are not supported. return t.bits() < 64; - } else if (device == DeviceAPI::OpenGLCompute) { - return t.bits() < 64; } else if (device == DeviceAPI::Vulkan) { if (t.is_float() && t.bits() == 64) { return has_feature(Target::VulkanFloat64); @@ -1214,9 +1205,6 @@ DeviceAPI Target::get_required_device_api() const { if (has_feature(Target::OpenCL)) { return DeviceAPI::OpenCL; } - if (has_feature(Target::OpenGLCompute)) { - return DeviceAPI::OpenGLCompute; - } if (has_feature(Target::Vulkan)) { return DeviceAPI::Vulkan; } @@ -1232,8 +1220,6 @@ Target::Feature target_feature_for_device_api(DeviceAPI api) { return Target::CUDA; case DeviceAPI::OpenCL: return Target::OpenCL; - case DeviceAPI::OpenGLCompute: - return Target::OpenGLCompute; case DeviceAPI::Metal: return Target::Metal; case DeviceAPI::Hexagon: @@ -1333,7 +1319,6 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result) Metal, NoNEON, OpenCL, - OpenGLCompute, Vulkan, WebGPU, diff --git a/src/Target.h b/src/Target.h index 97c141f308e5..20730a313883 100644 --- a/src/Target.h +++ b/src/Target.h @@ -109,7 +109,6 @@ struct Target { CLDoubles = halide_target_feature_cl_doubles, CLHalf = halide_target_feature_cl_half, CLAtomics64 = halide_target_feature_cl_atomic64, - OpenGLCompute = halide_target_feature_openglcompute, // NOTE: This feature is deprecated and will be removed in Halide 17. EGL = halide_target_feature_egl, UserContext = halide_target_feature_user_context, Profile = halide_target_feature_profile, @@ -234,10 +233,7 @@ struct Target { /** Is a fully feature GPU compute runtime enabled? I.e. is * Func::gpu_tile and similar going to work? Currently includes - * CUDA, OpenCL, Metal and D3D12Compute. We do not include OpenGL, - * because it is not capable of gpgpu, and is not scheduled via - * Func::gpu_tile. 
- * TODO: Should OpenGLCompute be included here? */ + * CUDA, OpenCL, Metal and D3D12Compute. */ bool has_gpu_feature() const; /** Does this target allow using a certain type. Generally all diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index d91222d62f65..01a987b6f430 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -98,7 +98,6 @@ enum DeviceAPI: byte { Default_GPU, CUDA, OpenCL, - OpenGLCompute, Metal, Hexagon, HexagonDma, diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index b1331ed07e52..039fae2d1b11 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -43,13 +43,9 @@ set(RUNTIME_CPP msan msan_stubs opencl - opengl_egl_context - opengl_glx_context - openglcompute osx_clock osx_get_symbol osx_host_cpu_count - osx_opengl_context osx_yield posix_aligned_alloc posix_allocator @@ -135,7 +131,6 @@ set(RUNTIME_HEADER_FILES HalideRuntimeHexagonHost.h HalideRuntimeMetal.h HalideRuntimeOpenCL.h - HalideRuntimeOpenGLCompute.h HalideRuntimeQurt.h HalideRuntimeVulkan.h HalideRuntimeWebGPU.h diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 7b84e44f6928..d8ae1268fbaf 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1356,8 +1356,6 @@ typedef enum halide_target_feature_t { halide_target_feature_cl_doubles, ///< Enable double support on OpenCL targets halide_target_feature_cl_atomic64, ///< Enable 64-bit atomics operations on OpenCL targets - halide_target_feature_openglcompute, ///< Enable OpenGL Compute runtime. NOTE: This feature is deprecated and will be removed in Halide 17. - halide_target_feature_user_context, ///< Generated code takes a user_context pointer as first argument halide_target_feature_profile, ///< Launch a sampling profiler alongside the Halide pipeline that monitors and reports the runtime used by each Func diff --git a/src/runtime/HalideRuntimeOpenGLCompute.h b/src/runtime/HalideRuntimeOpenGLCompute.h deleted file mode 100644 index f460703b798b..000000000000 --- a/src/runtime/HalideRuntimeOpenGLCompute.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H -#define HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H - -// Don't include HalideRuntime.h if the contents of it were already pasted into a generated header above this one -#ifndef HALIDE_HALIDERUNTIME_H - -#include "HalideRuntime.h" - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/** \file - * Routines specific to the Halide OpenGL Compute runtime. - */ - -#define HALIDE_RUNTIME_OPENGLCOMPUTE - -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern const struct halide_device_interface_t *halide_openglcompute_device_interface(); - -/** These are forward declared here to allow clients to override the - * Halide Glsl runtime. Do not call them. */ -// @{ - -/** This function sets up OpenGL context, loads relevant GL functions, then - * compiles src OpenGL compute shader into OpenGL program and stores it for future use. - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_openglcompute_initialize_kernels(void *user_context, void **state_ptr, - const char *src, int size); - -/** This function triggers execution of OpenGL program built around compute shader. - * Execution of the shader is parallelized into given number of blocks and threads. 
- * - * This function doesn't wait for the completion of the shader, but it sets memory - * barrier which forces successive retrieval of output data to wait until shader is done. - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_openglcompute_run(void *user_context, - void *state_ptr, - const char *entry_name, - int blocksX, int blocksY, int blocksZ, - int threadsX, int threadsY, int threadsZ, - int shared_mem_bytes, - struct halide_type_t arg_types[], - void *args[], - int8_t is_buffer[]); - -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr); -// @} - -/** This function retrieves pointers to OpenGL API functions. - * - * You may have to implement this yourself. Halide only provides implementations - * for some platforms." - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern void *halide_opengl_get_proc_address(void *user_context, const char *name); - -/** This function creates an OpenGL context for use by the OpenGL backend. - * - * You may have to implement this yourself as well. Halide only provides - * implementations for some platforms." - */ -HALIDE_ATTRIBUTE_DEPRECATED("OpenGLCompute is deprecated in Halide 16 and will be removed in Halide 17.") -extern int halide_opengl_create_context(void *user_context); - -#ifdef __cplusplus -} // End extern "C" -#endif - -#endif // HALIDE_HALIDERUNTIMEOPENGLCOMPUTE_H diff --git a/src/runtime/device_interface.cpp b/src/runtime/device_interface.cpp index 710d1259678d..1625a6698ccc 100644 --- a/src/runtime/device_interface.cpp +++ b/src/runtime/device_interface.cpp @@ -473,10 +473,8 @@ WEAK int halide_default_buffer_copy(void *user_context, struct halide_buffer_t * // The right thing is that all devices have to support // device-to-device and device-to/from-arbitrarty-pointer. This - // means there will always have to be a device specifc version of - // this function and the default can go away or fail. At present - // there are some devices, e.g. OpenGL and OpenGLCompute, for which - // this is not yet implemented. + // means there will always have to be a device specific version of + // this function and the default can go away or fail. 
return halide_error_code_device_buffer_copy_failed; } diff --git a/src/runtime/mini_opengl.h b/src/runtime/mini_opengl.h deleted file mode 100644 index 1101fcd5a24c..000000000000 --- a/src/runtime/mini_opengl.h +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef MINI_OPENGL_H -#define MINI_OPENGL_H - -// ---------- OpenGL core (1.3 and earlier) ---------- - -typedef char GLchar; -typedef unsigned char GLubyte; -typedef unsigned int GLenum; -typedef unsigned char GLboolean; -typedef int GLint; -typedef unsigned int GLuint; -typedef int GLsizei; -typedef ptrdiff_t GLsizeiptr; -typedef float GLfloat; -typedef double GLdouble; -typedef void GLvoid; - -#define GL_NO_ERROR 0x0 -#define GL_FALSE 0x0 -#define GL_TRUE 0x1 -#define GL_TRIANGLES 0x0004 -#define GL_TRIANGLE_STRIP 0x0005 -#define GL_CULL_FACE 0x0B44 -#define GL_DEPTH_TEST 0x0B71 -#define GL_VIEWPORT 0x0BA2 -#define GL_PACK_ALIGNMENT 0x0D05 -#define GL_UNPACK_ALIGNMENT 0x0CF5 -#define GL_UNPACK_ROW_LENGTH 0x0CF2 -#define GL_PACK_ROW_LENGTH 0x0D02 -#define GL_TEXTURE_2D 0x0DE1 -#define GL_TEXTURE_WIDTH 0x1000 -#define GL_TEXTURE_HEIGHT 0x1001 -#define GL_BYTE 0x1400 -#define GL_UNSIGNED_BYTE 0x1401 -#define GL_SHORT 0x1402 -#define GL_UNSIGNED_SHORT 0x1403 -#define GL_INT 0x1404 -#define GL_UNSIGNED_INT 0x1405 -#define GL_FLOAT 0x1406 -#define GL_MODELVIEW 0x1700 -#define GL_PROJECTION 0x1701 -#define GL_RED 0x1903 -#define GL_RGB 0x1907 -#define GL_RGBA 0x1908 -#define GL_LUMINANCE 0x1909 -#define GL_LUMINANCE_ALPHA 0x190A -#define GL_VERSION 0x1F02 -#define GL_EXTENSIONS 0x1F03 -#define GL_NEAREST 0x2600 -#define GL_TEXTURE_MAG_FILTER 0x2800 -#define GL_TEXTURE_MIN_FILTER 0x2801 -#define GL_TEXTURE_WRAP_S 0x2802 -#define GL_TEXTURE_WRAP_T 0x2803 -#define GL_CLAMP_TO_EDGE 0x812F -#define GL_TEXTURE0 0x84C0 -#define GL_TEXTURE1 0x84C1 -#define GL_TEXTURE2 0x84C2 -#define GL_TEXTURE3 0x84C3 -#define GL_ACTIVE_TEXTURE 0x84E0 -#define GL_TEXTURE_BINDING_2D 0x8069 -#define GL_ACTIVE_UNIFORMS 0x8B86 - -typedef void (*PFNGLACTIVETEXTUREPROC)(GLenum texture); -typedef void (*PFNGLBINDTEXTUREPROC)(GLenum target, GLuint texture); -typedef void (*PFNGLDISABLEPROC)(GLenum cap); -typedef void (*PFNGLDELETETEXTURESPROC)(GLsizei n, const GLuint *textures); -typedef void (*PFNGLDRAWBUFFERSPROC)(GLsizei n, const GLenum *bufs); -typedef void (*PFNGLDRAWELEMENTSPROC)(GLenum mode, GLsizei count, GLenum type, const GLvoid *indices); -typedef void (*PFNGLGENTEXTURESPROC)(GLsizei n, GLuint *textures); -typedef GLenum (*PFNGLGETERRORPROC)(); -typedef const GLubyte *(*PFNGLGETSTRINGPROC)(GLenum name); -typedef void (*PFNGLGETTEXIMAGEPROC)(GLenum target, GLint level, - GLenum format, GLenum type, - GLvoid *pixels); -typedef void (*PFNGLLOADIDENTITYPROC)(); -typedef void (*PFNGLMATRIXMODEPROC)(GLenum mode); -typedef void (*PFNGLORTHOPROC)(GLdouble left, GLdouble right, - GLdouble bottom, GLdouble top, - GLdouble near_val, GLdouble far_val); -typedef void (*PFNGLPIXELSTOREIPROC)(GLenum pname, GLint param); - -typedef void (*PFNGLGETTEXLEVELPARAMETERIVPROC)(GLenum target, GLint level, - GLenum pname, GLint *params); -typedef void (*PFNGLTEXIMAGE2DPROC)(GLenum target, GLint level, - GLint internalFormat, - GLsizei width, GLsizei height, - GLint border, GLenum format, GLenum type, - const GLvoid *pixels); -typedef void (*PFNGLTEXPARAMETERIPROC)(GLenum target, GLenum pname, GLint param); -typedef void (*PFNGLTEXSUBIMAGE2DPROC)(GLenum target, GLint level, - GLint xoffset, GLint yoffset, - GLsizei width, GLsizei height, - GLenum format, GLenum type, - const GLvoid *data); -typedef 
void (*PFNGLVIEWPORTPROC)(GLint x, GLint y, GLsizei width, GLsizei height); -typedef void (*PFNGLREADPIXELS)(GLint x, GLint y, - GLsizei width, GLsizei height, - GLenum format, GLenum type, - GLvoid *pixels); - -// ---------- OpenGL 1.5 ---------- - -#define GL_ARRAY_BUFFER 0x8892 -#define GL_ELEMENT_ARRAY_BUFFER 0x8893 -#define GL_STATIC_DRAW 0x88E4 -#define GL_ARRAY_BUFFER_BINDING 0x8894 -#define GL_ELEMENT_ARRAY_BUFFER_BINDING 0x8895 - -typedef void (*PFNGLGENBUFFERSPROC)(GLsizei n, GLuint *buffers); -typedef void (*PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint *buffers); -typedef void (*PFNGLBINDBUFFERPROC)(GLenum target, GLuint buffer); -typedef void (*PFNGLBUFFERDATAPROC)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage); - -// ---------- OpenGL 2.0 ---------- - -#define GL_FRAGMENT_SHADER 0x8B30 -#define GL_VERTEX_SHADER 0x8B31 -#define GL_COMPILE_STATUS 0x8B81 -#define GL_LINK_STATUS 0x8B82 -#define GL_INFO_LOG_LENGTH 0x8B84 -#define GL_IMPLEMENTATION_COLOR_READ_FORMAT 0x8B9B -#define GL_IMPLEMENTATION_COLOR_READ_TYPE 0x8B9A -#define GL_CURRENT_PROGRAM 0x8B8D -#define GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS 0x8B4D -#define GL_MAX_VERTEX_ATTRIBS 0x8869 -#define GL_VERTEX_ATTRIB_ARRAY_ENABLED 0x8622 - -typedef void (*PFNGLATTACHSHADERPROC)(GLuint program, GLuint shader); -typedef void (*PFNGLCOMPILESHADERPROC)(GLuint shader); -typedef GLuint (*PFNGLCREATEPROGRAMPROC)(); -typedef GLuint (*PFNGLCREATESHADERPROC)(GLenum type); -typedef void (*PFNGLDELETEPROGRAMPROC)(GLuint program); -typedef void (*PFNGLDELETESHADERPROC)(GLuint shader); -typedef void (*PFNGLDISABLEVERTEXATTRIBARRAYPROC)(GLuint index); -typedef void (*PFNGLENABLEVERTEXATTRIBARRAYPROC)(GLuint index); -typedef GLint (*PFNGLGETATTRIBLOCATIONPROC)(GLuint program, const GLchar *name); -typedef void (*PFNGLGETPROGRAMIVPROC)(GLuint program, GLenum pname, GLint *params); -typedef void (*PFNGLGETPROGRAMINFOLOGPROC)(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -typedef void (*PFNGLGETSHADERIVPROC)(GLuint shader, GLenum pname, GLint *params); -typedef void (*PFNGLGETSHADERINFOLOGPROC)(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog); -typedef GLint (*PFNGLGETUNIFORMLOCATIONPROC)(GLuint program, const GLchar *name); -typedef void (*PFNGLLINKPROGRAMPROC)(GLuint program); -typedef void (*PFNGLSHADERSOURCEPROC)(GLuint shader, GLsizei count, const GLchar **string, const GLint *length); -typedef void (*PFNGLUNIFORM1FPROC)(GLuint location, GLfloat value); -typedef void (*PFNGLUNIFORM1IPROC)(GLuint location, GLint value); -typedef void (*PFNGLUNIFORM1IVPROC)(GLint location, GLsizei count, const GLint *value); -typedef void (*PFNGLUNIFORM2IVPROC)(GLint location, GLsizei count, const GLint *value); -typedef void (*PFNGLUNIFORM1FVPROC)(GLint location, GLsizei count, const GLfloat *value); -typedef void (*PFNGLUSEPROGRAMPROC)(GLuint program); -typedef void (*PFNGLVERTEXATTRIBPOINTERPROC)(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const GLvoid *pointer); -typedef void (*PFNGLGETINTEGERV)(GLenum pname, GLint *data); -typedef void (*PFNGLGETBOOLEANV)(GLenum pname, GLboolean *data); -typedef void (*PFNGLFINISHPROC)(); -typedef void (*PFNGLGETVERTEXATTRIBIVPROC)(GLuint index, GLenum pname, GLint *params); - -// ---------- OpenGL 3.0 ---------- - -#define GL_MAJOR_VERSION 0x821B -#define GL_MINOR_VERSION 0x821C -#define GL_NUM_EXTENSIONS 0x821D -#define GL_RG 0x8227 -#define GL_R32F 0x822E -#define GL_RG32F 0x8230 -#define GL_RGBA32F 0x8814 -#define GL_RGB32F 
0x8815 -#define GL_LUMINANCE32F 0x8818 -#define GL_VERTEX_ARRAY_BINDING 0x85B5 - -// GL_ARB_framebuffer_object -#define GL_FRAMEBUFFER_COMPLETE 0x8CD5 -#define GL_COLOR_ATTACHMENT0 0x8CE0 -#define GL_FRAMEBUFFER 0x8D40 -#define GL_FRAMEBUFFER_BINDING 0x8CA6 - -typedef void (*PFNGLBINDFRAMEBUFFERPROC)(GLenum target, GLuint framebuffer); -typedef GLenum (*PFNGLCHECKFRAMEBUFFERSTATUSPROC)(GLenum target); -typedef void (*PFNGLDELETEFRAMEBUFFERSPROC)(GLsizei n, const GLuint *framebuffers); -typedef void (*PFNGLFRAMEBUFFERTEXTURE2DPROC)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); -typedef void (*PFNGLGENFRAMEBUFFERSPROC)(GLsizei n, GLuint *framebuffers); - -typedef void (*PFNGLGENVERTEXARRAYS)(GLsizei n, GLuint *arrays); -typedef void (*PFNGLBINDVERTEXARRAY)(GLuint array); -typedef void (*PFNGLDELETEVERTEXARRAYS)(GLsizei n, const GLuint *arrays); -typedef const GLubyte *(*PFNGLGETSTRINGI)(GLenum name, GLuint index); -typedef void (*PFNDRAWBUFFERS)(GLsizei n, const GLenum *bufs); - -// ---------- OpenGL ES 3.1 ---------- - -#define GL_TEXTURE_BUFFER_EXT 0x8c2a - -#define GL_COMPUTE_SHADER 0x91B9 -#define GL_DYNAMIC_COPY 0x88ea - -#define GL_READ_ONLY 0x88B8 -#define GL_WRITE_ONLY 0x88B9 - -#define GL_MAP_READ_BIT 0x0001 -#define GL_MAP_WRITE_BIT 0x0002 - -#define GL_SHADER_STORAGE_BUFFER 0x90D2 - -#define GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT 0x00000001 -#define GL_BUFFER_UPDATE_BARRIER_BIT 0x00000200 -#define GL_ALL_BARRIER_BITS 0xFFFFFFFF - -typedef unsigned int GLbitfield; -typedef ptrdiff_t GLintptr; - -typedef void (*PFNGLTEXBUFFEREXTPROC)(GLenum target, GLenum internalformat, GLuint buffer); -typedef void (*PFNGLBINDIMAGETEXTUREPROC)(GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format); -typedef void (*PFNGLMEMORYBARRIERPROC)(GLbitfield barriers); -typedef void *(*PFNGLMAPBUFFERRANGEPROC)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access); -typedef void (*PFNGLDISPATCHCOMPUTEPROC)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z); -typedef void (*PFNGLUNMAPBUFFERPROC)(GLenum target); -typedef void (*PFNGLBINDBUFFERBASEPROC)(GLenum target, GLuint index, GLuint buffer); -typedef void (*PFNGLDELETEBUFFERSPROC)(GLsizei n, const GLuint *buffers); - -typedef void (*PFNGLGETACTIVEUNIFORM)(GLuint program, GLuint index, GLsizei bufSize, GLsizei *length, GLint *size, GLenum *type, GLchar *name); -typedef GLint (*PFNGLGETUNIFORMLOCATION)(GLuint program, const GLchar *name); - -#endif // MINI_OPENGL_H diff --git a/src/runtime/opengl_egl_context.cpp b/src/runtime/opengl_egl_context.cpp deleted file mode 100644 index a41e51ee67a1..000000000000 --- a/src/runtime/opengl_egl_context.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" - -extern "C" { - -#define EGLAPI -#define EGLAPIENTRY -#define EGLAPIENTRYP EGLAPIENTRY * - -typedef int32_t EGLint; -typedef unsigned int EGLBoolean; -typedef unsigned int EGLenum; -typedef void *EGLContext; -typedef void *EGLDisplay; -typedef void *EGLNativeDisplayType; -typedef void *EGLConfig; -typedef void *EGLSurface; -typedef void *EGLDeviceEXT; - -typedef EGLBoolean(EGLAPIENTRYP PFNEGLQUERYDEVICESEXTPROC)( - EGLint max_devices, EGLDeviceEXT *devices, EGLint *num_devices); -typedef EGLDisplay(EGLAPIENTRYP PFNEGLGETPLATFORMDISPLAYEXTPROC)( - EGLenum platform, void *native_display, const EGLint *attrib_list); - -#define EGL_NO_CONTEXT ((EGLContext)0) -#define EGL_DEFAULT_DISPLAY ((EGLNativeDisplayType)0) -#define 
EGL_NO_DISPLAY ((EGLDisplay)0) -#define EGL_NO_SURFACE ((EGLSurface)0) - -#define EGL_SUCCESS 0x3000 - -#define EGL_ALPHA_SIZE 0x3021 -#define EGL_BLUE_SIZE 0x3022 -#define EGL_GREEN_SIZE 0x3023 -#define EGL_RED_SIZE 0x3024 -#define EGL_SURFACE_TYPE 0x3033 -#define EGL_NONE 0x3038 -#define EGL_RENDERABLE_TYPE 0x3040 -#define EGL_HEIGHT 0x3056 -#define EGL_WIDTH 0x3057 -#define EGL_CONTEXT_CLIENT_VERSION 0x3098 - -#define EGL_PLATFORM_DEVICE_EXT 0x313F - -#define EGL_PBUFFER_BIT 0x0001 -#define EGL_OPENGL_ES2_BIT 0x0004 - -#define EGL_FALSE 0 -#define EGL_TRUE 1 - -EGLAPI EGLint EGLAPIENTRY eglGetError(void); -EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext(void); -EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id); -EGLAPI EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor); -EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, - EGLConfig *configs, EGLint config_size, - EGLint *num_config); -EGLAPI EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, - EGLContext share_context, - const EGLint *attrib_list); -EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config, - const EGLint *attrib_list); -EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, - EGLSurface read, EGLContext ctx); - -EGLAPI void *eglGetProcAddress(const char *procname); - -extern int strcmp(const char *, const char *); - -WEAK int halide_opengl_create_context(void *user_context) { - if (eglGetCurrentContext() != EGL_NO_CONTEXT) { - return halide_error_code_success; - } - - EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY); - if (display == EGL_NO_DISPLAY || !eglInitialize(display, nullptr, nullptr)) { - PFNEGLQUERYDEVICESEXTPROC eglQueryDevicesEXT = - reinterpret_cast( - eglGetProcAddress("eglQueryDevicesEXT")); - if (eglQueryDevicesEXT == nullptr) { - return halide_error_code_generic_error; - } - - PFNEGLGETPLATFORMDISPLAYEXTPROC eglGetPlatformDisplayEXT = - reinterpret_cast( - eglGetProcAddress("eglGetPlatformDisplayEXT")); - if (eglGetPlatformDisplayEXT == nullptr) { - return halide_error_code_generic_error; - } - - const int kMaxDevices = 32; - EGLDeviceEXT egl_devices[kMaxDevices]; - EGLint num_devices = 0; - EGLint egl_error = eglGetError(); - if (!eglQueryDevicesEXT(kMaxDevices, egl_devices, &num_devices) || - egl_error != EGL_SUCCESS) { - return halide_error_code_generic_error; - } - - EGLBoolean initialized = EGL_FALSE; - for (EGLint i = 0; i < num_devices; ++i) { - display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, - egl_devices[i], nullptr); - if (eglGetError() == EGL_SUCCESS && display != EGL_NO_DISPLAY) { - int major, minor; - initialized = eglInitialize(display, &major, &minor); - if (eglGetError() == EGL_SUCCESS && initialized == EGL_TRUE) { - break; - } - } - } - - if (eglGetError() != EGL_SUCCESS || initialized != EGL_TRUE) { - error(user_context) << "Could not initialize EGL display"; - return halide_error_code_generic_error; - } - } - - EGLint attribs[] = { - EGL_SURFACE_TYPE, - EGL_PBUFFER_BIT, - EGL_RENDERABLE_TYPE, - EGL_OPENGL_ES2_BIT, - EGL_RED_SIZE, - 8, - EGL_GREEN_SIZE, - 8, - EGL_BLUE_SIZE, - 8, - EGL_ALPHA_SIZE, - 8, - EGL_NONE, - }; - EGLConfig config; - int numconfig; - EGLBoolean result = eglChooseConfig(display, attribs, &config, 1, &numconfig); - if (result != EGL_TRUE || numconfig != 1) { - debug(user_context) << "eglChooseConfig(): config not found: " - << " result=" << (int)result - << " 
eglGetError=" << eglGetError() - << " numConfig=" << numconfig; - error(user_context) << "eglChooseConfig(): config not found."; - return halide_error_code_generic_error; - } - - EGLint context_attribs[] = { - EGL_CONTEXT_CLIENT_VERSION, 2, - EGL_NONE}; - EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, - context_attribs); - if (context == EGL_NO_CONTEXT) { - error(user_context) << "eglCreateContext failed."; - return halide_error_code_generic_error; - } - - EGLint surface_attribs[] = { - EGL_WIDTH, 1, - EGL_HEIGHT, 1, - EGL_NONE}; - EGLSurface surface = eglCreatePbufferSurface(display, config, surface_attribs); - if (surface == EGL_NO_SURFACE) { - error(user_context) << "Error: Could not create EGL window surface."; - return halide_error_code_generic_error; - } - - result = eglMakeCurrent(display, surface, surface, context); - if (result != EGL_TRUE) { - debug(user_context) << "eglMakeCurrent fails: " - << " result=" << (int)result - << " eglGetError=" << eglGetError(); - error(user_context) << "eglMakeCurrent failed."; - return halide_error_code_generic_error; - } - return halide_error_code_success; -} - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - return (void *)eglGetProcAddress(name); -} - -} // extern "C" diff --git a/src/runtime/opengl_glx_context.cpp b/src/runtime/opengl_glx_context.cpp deleted file mode 100644 index 093285668806..000000000000 --- a/src/runtime/opengl_glx_context.cpp +++ /dev/null @@ -1,156 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" - -extern "C" { - -typedef void *GLXContext; -typedef void *GLXFBConfig; -typedef int Bool; -typedef void Display; - -typedef void (*__GLXextFuncPtr)(); -extern __GLXextFuncPtr glXGetProcAddressARB(const char *); -extern void *XOpenDisplay(void *); -extern int XDefaultScreen(void *); -extern int glXQueryExtension(void *, void *, void *); -extern const char *glXQueryExtensionsString(Display *dpy, int screen); -extern GLXContext glXCreateNewContext(void *, void *, int, void *, int); -extern void **glXChooseFBConfig(void *, int, const int *, int *); -extern unsigned long glXCreatePbuffer(void *, void *, const int *); -extern int XFree(void *); -extern int XSync(void *, int); -extern GLXContext glXGetCurrentContext(); -extern int glXMakeContextCurrent(void *, unsigned long, unsigned long, void *); - -#define GLX_RENDER_TYPE 0x8011 -#define GLX_RGBA_TYPE 0x8014 -#define GLX_RGBA_BIT 1 -#define GLX_RED_SIZE 8 -#define GLX_GREEN_SIZE 8 -#define GLX_BLUE_SIZE 8 -#define GLX_ALPHA_SIZE 8 - -#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091 -#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092 -typedef GLXContext (*glXCreateContextAttribsARBProc)(Display *, GLXFBConfig, GLXContext, Bool, const int *); - -} // extern "C" - -namespace Halide { -namespace Runtime { -namespace Internal { - -// Helper to check for extension string presence. Adapted from: -// http://www.opengl.org/resources/features/OGLextensions/ -WEAK bool glx_extension_supported(const char *extlist, const char *extension) { - // Extension names should not have spaces. - if (strchr(extension, ' ') != nullptr || *extension == '\0') { - return false; - } - - const char *start = extlist; - while (const char *pos = strstr(start, extension)) { - const char *end = pos + strlen(extension); - // Ensure the found match is a full word, not a substring. 
- if ((pos == start || pos[-1] == ' ') && - (*end == ' ' || *end == '\0')) { - return true; - } - start = end; - } - return false; -} - -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -extern "C" { - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - return (void *)glXGetProcAddressARB(name); -} - -// Initialize OpenGL -WEAK int halide_opengl_create_context(void *user_context) { - const int desired_major_version = 3; - const int desired_minor_version = 2; - - if (glXGetCurrentContext()) { - // Already have a context - return halide_error_code_success; - } - - void *dpy = XOpenDisplay(nullptr); - if (!dpy) { - error(user_context) << "Could not open X11 display."; - return halide_error_code_generic_error; - } - - // GLX supported? - if (!glXQueryExtension(dpy, nullptr, nullptr)) { - error(user_context) << "GLX not supported by X server."; - return halide_error_code_generic_error; - } - - int screen = XDefaultScreen(dpy); - - int attribs[] = { - GLX_RENDER_TYPE, GLX_RGBA_BIT, - GLX_RED_SIZE, 8, - GLX_GREEN_SIZE, 8, - GLX_BLUE_SIZE, 8, - GLX_ALPHA_SIZE, 8, - 0}; - int num_configs = 0; - void **fbconfigs = glXChooseFBConfig(dpy, screen, attribs, &num_configs); - if (!num_configs) { - error(user_context) << "Could not get framebuffer config."; - return halide_error_code_generic_error; - } - void *fbconfig = fbconfigs[0]; - - const char *glxexts = glXQueryExtensionsString(dpy, screen); - void *share_list = nullptr; - int direct = 1; - void *context = nullptr; - - glXCreateContextAttribsARBProc glXCreateContextAttribsARB = nullptr; - glXCreateContextAttribsARB = (glXCreateContextAttribsARBProc) - glXGetProcAddressARB("glXCreateContextAttribsARB"); - - if (glx_extension_supported(glxexts, "GLX_ARB_create_context") && - glXCreateContextAttribsARB) { - int context_attribs[] = { - GLX_CONTEXT_MAJOR_VERSION_ARB, desired_major_version, - GLX_CONTEXT_MINOR_VERSION_ARB, desired_minor_version, - 0}; - context = glXCreateContextAttribsARB(dpy, fbconfig, share_list, direct, - context_attribs); - } - if (!context) { - // Open a legacy context - context = glXCreateNewContext(dpy, fbconfig, GLX_RGBA_TYPE, share_list, direct); - } - if (!context) { - error(user_context) << "Could not create OpenGL context."; - return halide_error_code_generic_error; - } - - int pbuffer_attribs[] = { - 0x8041 /* GLX_PBUFFER_WIDTH */, 32, - 0x8040 /* GLX_PBUFFER_HEIGHT */, 32, - 0}; - unsigned long pbuffer = glXCreatePbuffer(dpy, fbconfig, pbuffer_attribs); - - XFree(fbconfigs); - XSync(dpy, 0); - - if (!glXMakeContextCurrent(dpy, pbuffer, pbuffer, context)) { - error(user_context) << "Could not make context current."; - return halide_error_code_generic_error; - } - - return halide_error_code_success; -} -} diff --git a/src/runtime/openglcompute.cpp b/src/runtime/openglcompute.cpp deleted file mode 100644 index edb1327d90a9..000000000000 --- a/src/runtime/openglcompute.cpp +++ /dev/null @@ -1,990 +0,0 @@ -// Ignore deprecation warnings inside our own runtime -#define HALIDE_ALLOW_DEPRECATED 1 - -#include "HalideRuntimeOpenGLCompute.h" -#include "device_buffer_utils.h" -#include "device_interface.h" -#include "mini_opengl.h" -#include "printer.h" - -// Implementation note: all function that directly or indirectly access the -// runtime state in halide_openglcompute_state must be declared as WEAK, otherwise -// the behavior at runtime is undefined. - -// List of all OpenGL functions used by the runtime. 
The list is used to -// declare and initialize the dispatch table in OpenGLState below. -// -// grep "global_state." ../../src/runtime/openglcompute.cpp | sed -n "s/^\(.*\)global_state\.\([^(]*\).*/\2/p" | sort | uniq -// +GetError, GetString -// -CheckAndReportError -// -#define USED_GL_FUNCTIONS \ - GLFUNC(PFNGLATTACHSHADERPROC, AttachShader); \ - GLFUNC(PFNGLBINDBUFFERPROC, BindBuffer); \ - GLFUNC(PFNGLBINDBUFFERBASEPROC, BindBufferBase); \ - GLFUNC(PFNGLBUFFERDATAPROC, BufferData); \ - GLFUNC(PFNGLCREATEPROGRAMPROC, CreateProgram); \ - GLFUNC(PFNGLCOMPILESHADERPROC, CompileShader); \ - GLFUNC(PFNGLCREATESHADERPROC, CreateShader); \ - GLFUNC(PFNGLDELETEBUFFERSPROC, DeleteBuffers); \ - GLFUNC(PFNGLDELETEPROGRAMPROC, DeleteProgram); \ - GLFUNC(PFNGLDELETESHADERPROC, DeleteShader); \ - GLFUNC(PFNGLDISPATCHCOMPUTEPROC, DispatchCompute); \ - GLFUNC(PFNGLFINISHPROC, Finish); \ - GLFUNC(PFNGLGENBUFFERSPROC, GenBuffers); \ - GLFUNC(PFNGLGETERRORPROC, GetError); \ - GLFUNC(PFNGLGETPROGRAMINFOLOGPROC, GetProgramInfoLog); \ - GLFUNC(PFNGLGETPROGRAMIVPROC, GetProgramiv); \ - GLFUNC(PFNGLGETSHADERINFOLOGPROC, GetShaderInfoLog); \ - GLFUNC(PFNGLGETSHADERIVPROC, GetShaderiv); \ - GLFUNC(PFNGLGETSTRINGPROC, GetString); \ - GLFUNC(PFNGLLINKPROGRAMPROC, LinkProgram); \ - GLFUNC(PFNGLMAPBUFFERRANGEPROC, MapBufferRange); \ - GLFUNC(PFNGLMEMORYBARRIERPROC, MemoryBarrier); \ - GLFUNC(PFNGLSHADERSOURCEPROC, ShaderSource); \ - GLFUNC(PFNGLUNIFORM1IPROC, Uniform1i); \ - GLFUNC(PFNGLUNIFORM1IPROC, Uniform1ui); \ - GLFUNC(PFNGLUNIFORM1FPROC, Uniform1f); \ - GLFUNC(PFNGLUNMAPBUFFERPROC, UnmapBuffer); \ - GLFUNC(PFNGLUSEPROGRAMPROC, UseProgram); \ - GLFUNC(PFNGLGETACTIVEUNIFORM, GetActiveUniform); \ - GLFUNC(PFNGLGETUNIFORMLOCATION, GetUniformLocation); - -using namespace Halide::Runtime::Internal; - -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGLCompute { - -extern WEAK halide_device_interface_t openglcompute_device_interface; - -WEAK const char *gl_error_name(int32_t err) { - switch (err) { - case 0x500: - return "GL_INVALID_ENUM"; - break; - case 0x501: - return "GL_INVALID_VALUE"; - break; - case 0x502: - return "GL_INVALID_OPERATION"; - break; - case 0x503: - return "GL_STACK_OVERFLOW"; - break; - case 0x504: - return "GL_STACK_UNDERFLOW"; - break; - case 0x505: - return "GL_OUT_OF_MEMORY"; - break; - case 0x506: - return "GL_INVALID_FRAMEBUFFER_OPERATION"; - break; - case 0x507: - return "GL_CONTEXT_LOST"; - break; - case 0x8031: - return "GL_TABLE_TOO_LARGE"; - break; - default: - break; - } - return ""; -} - -struct HalideMalloc { - ALWAYS_INLINE HalideMalloc(void *user_context, size_t size) - : user_context(user_context), ptr(halide_malloc(user_context, size)) { - } - ALWAYS_INLINE ~HalideMalloc() { - halide_free(user_context, ptr); - } - void *const user_context; - void *const ptr; -}; - -struct KernelInfo { - char *kernel_name; - GLuint program_id; - KernelInfo *next; -}; - -struct ModuleState { - KernelInfo *kernel; - ModuleState *next; -}; - -WEAK KernelInfo *find_kernel_by_name(const char *kernel_name, const ModuleState *module) { - KernelInfo *kernel = module->kernel; - while (kernel && strcmp(kernel_name, kernel->kernel_name) != 0) { - kernel = kernel->next; - } - return kernel; -} - -// All persistent state maintained by the runtime. 
-struct GlobalState { - void init(); - int CheckAndReportError(void *user_context, const char *location); - - bool initialized; - - // Declare pointers used OpenGL functions -#define GLFUNC(PTYPE, VAR) PTYPE VAR - USED_GL_FUNCTIONS; -#undef GLFUNC -}; - -WEAK int GlobalState::CheckAndReportError(void *user_context, const char *location) { - GLenum err = GetError(); - if (err == GL_NO_ERROR) { - return halide_error_code_success; - } - - error(user_context) << "OpenGL error " << gl_error_name(err) << "(" << (int)err << ")" - << " at " << location << "."; - return halide_error_code_generic_error; -} - -WEAK GlobalState global_state; - -// A list of module-specific state. Each module corresponds to a single Halide filter -WEAK ModuleState *state_list; - -// ---------- Helper functions ---------- - -WEAK void debug_buffer(void *user_context, halide_buffer_t *buf) { - debug(user_context) - << " device: " << buf->device << "\n" - << " texture_id: " << (GLuint)buf->device << "\n" - << " host: " << buf->host << "\n" - << " extent: " << buf->dim[0].extent << " " << buf->dim[1].extent - << " " << buf->dim[2].extent << " " << buf->dim[3].extent << "\n" - << " stride: " << buf->dim[0].stride << " " << buf->dim[1].stride - << " " << buf->dim[2].stride << " " << buf->dim[3].stride << "\n" - << " min: " << buf->dim[0].min << " " << buf->dim[1].min - << " " << buf->dim[2].min << " " << buf->dim[3].min << "\n" - << " type: " << buf->type << "\n" - << " host_dirty: " << buf->host_dirty() << "\n" - << " device_dirty: " << buf->device_dirty() << "\n"; -} - -WEAK void GlobalState::init() { - initialized = false; -#define GLFUNC(type, name) name = nullptr; - USED_GL_FUNCTIONS; -#undef GLFUNC -} - -WEAK int load_gl_func(void *user_context, const char *name, void **ptr, bool required) { - void *p = halide_opengl_get_proc_address(user_context, name); - if (!p && required) { - error(user_context) << "Could not load function pointer for " << name; - return halide_error_code_symbol_not_found; - } - *ptr = p; - return halide_error_code_success; -} - -// Initialize the OpenGL-specific parts of the runtime. -WEAK int halide_openglcompute_init(void *user_context) { - if (global_state.initialized) { - return halide_error_code_success; - } - - global_state.init(); - - // Make a context if there isn't one - auto result = halide_opengl_create_context(user_context); - if (result) { - return result; - } - - // Initialize pointers to OpenGL functions. -#define GLFUNC(TYPE, VAR) \ - if (load_gl_func(user_context, "gl" #VAR, (void **)&global_state.VAR, true) < 0) { \ - error(user_context) << "Failed to load function: gl" #VAR; \ - return halide_error_code_symbol_not_found; \ - } - USED_GL_FUNCTIONS; -#undef GLFUNC - - debug(user_context) << "Halide running on " << global_state.GetString(GL_VERSION) << "\n"; - - global_state.initialized = true; - return halide_error_code_success; -} - -// Release all data allocated by the runtime. -// -// The OpenGL context itself is generally managed by the host application, so -// we leave it untouched. 
-WEAK int halide_openglcompute_device_release(void *user_context) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) << "OpenGLCompute: halide_openglcompute_device_release(user_context: " - << user_context << ")\n"; - - ModuleState *mod = state_list; - while (mod) { - KernelInfo *kernel = mod->kernel; - while (kernel) { - KernelInfo *next_kernel = kernel->next; - global_state.DeleteProgram(kernel->program_id); - free(kernel->kernel_name); - free(kernel); - kernel = next_kernel; - } - mod->kernel = nullptr; - ModuleState *next = mod->next; - // do not call free(mod) to avoid dangling pointers: the module state - // is still referenced in the code generated by Halide (see - // CodeGen_GPU_Host::get_module_state). - mod = next; - } - - global_state = GlobalState(); - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms\n"; -#endif - - return halide_error_code_success; -} - -// Allocate a new texture matching the dimension and color format of the -// specified buffer. -WEAK int halide_openglcompute_device_malloc(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) << "OpenGLCompute: halide_openglcompute_device_malloc (user_context: " - << user_context << ", buf: " << buf << ")\n"; - - auto result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - - size_t size = buf->size_in_bytes(); - halide_abort_if_false(user_context, size != 0); - - if (buf->device) { - // This buffer already has a device allocation - debug(user_context) << "openglcompute_device_malloc: This buffer already has a " - "device allocation\n"; - return halide_error_code_success; - } - - for (int i = 0; i < buf->dimensions; i++) { - halide_abort_if_false(user_context, buf->dim[i].stride >= 0); - } - - debug(user_context) << " allocating buffer, " - << "extents: " << buf->dim[0].extent << "x" - << buf->dim[1].extent << "x" << buf->dim[2].extent << "x" - << buf->dim[3].extent << " " - << "strides: " << buf->dim[0].stride << "x" - << buf->dim[1].stride << "x" << buf->dim[2].stride << "x" - << buf->dim[3].stride << " " - << "(type: " << buf->type << ")\n"; - - result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - debug(user_context) << "openglcompute_device_malloc: initialization completed.\n"; - - if (!buf) { - return halide_error_code_buffer_argument_is_null; - } - - GLuint the_buffer; - global_state.GenBuffers(1, &the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: GenBuffers"); - if (result) { - return result; - } - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - // OpenGLCompute only supports int32, uint32, and float data - // types, all of which are 4 bytes. We'll inflate the size for - // smaller types. 
- size *= (4 / buf->type.bytes()); - halide_abort_if_false(user_context, size != 0); - global_state.BufferData(GL_ARRAY_BUFFER, size, nullptr, GL_DYNAMIC_COPY); - result = global_state.CheckAndReportError(user_context, "oglc: BufferData"); - if (result) { - return result; - } - - buf->device = the_buffer; - buf->device_interface = &openglcompute_device_interface; - buf->device_interface->impl->use_module(); - - debug(user_context) << "Allocated dev_buffer(i.e. vbo) " << the_buffer << "\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for malloc\n"; -#endif - - return halide_error_code_success; -} - -WEAK int halide_openglcompute_device_free(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized in call to halide_openglcompute_device_free."; - return halide_error_code_generic_error; - } - - if (buf->device == 0) { - return halide_error_code_success; - } - GLuint the_buffer = (GLuint)buf->device; - - debug(user_context) << "OGLC: halide_openglcompute_device_free (" - << "user_context: " << user_context - << ", the_buffer:" << the_buffer - << ")\n"; - - global_state.DeleteBuffers(1, &the_buffer); - - buf->device = 0; - buf->device_interface->impl->release_module(); - buf->device_interface = nullptr; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for free\n"; -#endif - - return halide_error_code_success; -} - -namespace { - -template -ALWAYS_INLINE void converting_copy_memory_helper(const device_copy ©, int d, int64_t src_off, int64_t dst_off) { - // Skip size-1 dimensions - while (d >= 0 && copy.extent[d] == 1) { - d--; - } - - if (d == -1) { - const Source *from = (Source *)(copy.src + src_off); - Dest *to = (Dest *)(copy.dst + dst_off); - for (uint64_t index = 0; index < copy.chunk_size; index++) { - *to++ = (Dest)*from++; - } - } else { - for (uint64_t i = 0; i < copy.extent[d]; i++) { - converting_copy_memory_helper(copy, d - 1, src_off, dst_off); - src_off += copy.src_stride_bytes[d]; - dst_off += copy.dst_stride_bytes[d]; - } - } -} - -} // namespace -// Copy image data from host memory to texture. 
-WEAK int halide_openglcompute_copy_to_device(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_copy_to_device)."; - return halide_error_code_generic_error; - } - - GLuint the_buffer = (GLuint)buf->device; - debug(user_context) << "OGLC: halide_openglcompute_copy_to_device (" - << "user_context: " << user_context - << ", buf: " << buf - << ", the_buffer:" << the_buffer << ")\n"; - - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - auto result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - size_t size = buf->number_of_elements() * 4; - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - debug(user_context) << "Calling global_state.MapBufferRange(GL_ARRAY_BUFFER, 0, " << (uint64_t)size << ", GL_MAP_READ_BIT|GL_MAP_WRITE_BIT)\n"; - void *device_data = global_state.MapBufferRange(GL_ARRAY_BUFFER, - 0, - size, - GL_MAP_READ_BIT | GL_MAP_WRITE_BIT); - result = global_state.CheckAndReportError(user_context, "oglc: MapBufferRange"); - if (result) { - return result; - } - halide_buffer_t buf_copy = *buf; - buf_copy.device = (uint64_t)device_data; - device_copy dev_copy = make_host_to_device_copy(&buf_copy); - - if (buf->type.code == halide_type_int) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_uint) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. 
- dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_float) { - if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, dev_copy.src_begin, 0); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit floating-point."; - return halide_error_code_generic_error; - } - } - global_state.UnmapBuffer(GL_ARRAY_BUFFER); - - debug(user_context) << " copied " << ((unsigned)size) << " bytes from " << buf->host << " to the device.\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for copy to dev\n"; -#endif - return halide_error_code_success; -} - -// Copy image data from texture back to host memory. -WEAK int halide_openglcompute_copy_to_host(void *user_context, halide_buffer_t *buf) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_copy_to_host)."; - return halide_error_code_generic_error; - } - - GLuint the_buffer = (GLuint)buf->device; - size_t size = buf->size_in_bytes(); - halide_abort_if_false(user_context, size != 0); - - debug(user_context) << "OGLC: halide_openglcompute_copy_to_host (" - << "user_context: " << user_context - << ", buf: " << buf - << ", the_buffer:" << the_buffer - << ", size=" << (unsigned)size << ")\n"; - - global_state.BindBuffer(GL_ARRAY_BUFFER, the_buffer); - auto result = global_state.CheckAndReportError(user_context, "oglc: BindBuffer"); - if (result) { - return result; - } - - void *device_data = global_state.MapBufferRange(GL_ARRAY_BUFFER, - 0, - size, - GL_MAP_READ_BIT); - result = global_state.CheckAndReportError(user_context, "oglc: MapBufferRange"); - if (result) { - return result; - } - - halide_buffer_t buf_copy = *buf; - buf_copy.device = (uint64_t)device_data; - device_copy dev_copy = make_device_to_host_copy(&buf_copy); - - if (buf->type.code == halide_type_int) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. - // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_uint) { - if (buf->type.bits == 8) { - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 16) { - // Convert chunk_size in bytes to the number of items to be copied. 
- // This doesn't happen for the 8-bit case because it would be a division by one, - // and it doesn't happen for the 32-bit case as there is no data conversion and memcpy - // is used. - dev_copy.chunk_size /= 2; - converting_copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit integers."; - return halide_error_code_generic_error; - } - } else if (buf->type.code == halide_type_float) { - if (buf->type.bits == 32) { - copy_memory_helper(dev_copy, MAX_COPY_DIMS - 1, 0, dev_copy.src_begin); - } else { - error(user_context) << "OpenGLCompute does not support 64-bit floating-point."; - return halide_error_code_generic_error; - } - } - - global_state.UnmapBuffer(GL_ARRAY_BUFFER); - - debug(user_context) << " copied " << (unsigned)size << " bytes to the host.\n"; - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for copy to host\n"; -#endif - - return halide_error_code_success; -} - -} // namespace OpenGLCompute -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -using namespace Halide::Runtime::Internal::OpenGLCompute; - -// Create wrappers that satisfy old naming conventions - -extern "C" { - -WEAK int halide_openglcompute_run(void *user_context, void *state_ptr, - const char *entry_name, int blocksX, int blocksY, - int blocksZ, int threadsX, int threadsY, int threadsZ, - int shared_mem_bytes, halide_type_t arg_types[], void *args[], - int8_t arg_is_buffer[]) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - debug(user_context) - << "OpenGLCompute: halide_openglcompute_run (user_context: " << user_context << ", " - << "entry: " << entry_name << ", " - << "blocks: " << blocksX << "x" << blocksY << "x" << blocksZ << ", " - << "threads: " << threadsX << "x" << threadsY << "x" << threadsZ << ", " - << "shmem: " << shared_mem_bytes << "\n"; - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_run)."; - return halide_error_code_generic_error; - } - - ModuleState *mod = (ModuleState *)state_ptr; - if (!mod) { - error(user_context) << "Internal error: module state is nullptr."; - return halide_error_code_generic_error; - } - - KernelInfo *kernel = find_kernel_by_name(entry_name, mod); - if (!kernel) { - error(user_context) << "Internal error: unknown kernel named '" << entry_name << "'"; - return halide_error_code_generic_error; - } - - global_state.UseProgram(kernel->program_id); - auto result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run UseProgram"); - if (result) { - return result; - } - - // Populate uniforms with values passed in arguments. - // Order of the passed arguments matches what was generated for this kernel. - int i = 0; - while (arg_types[i].bits != 0) { - debug(user_context) << " args " << i - << " " << arg_types[i] - << " [" << (*((void **)args[i])) << " ...] 
" - << arg_is_buffer[i] << "\n"; - if (arg_is_buffer[i] == 0) { - if (arg_types[i].code == halide_type_int) { - int value; - if (arg_types[i].bits == 8) { - value = *((int8_t *)args[i]); - } else if (arg_types[i].bits == 16) { - value = *((int16_t *)args[i]); - } else if (arg_types[i].bits == 32) { - value = *((int32_t *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1i(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1i"); - if (result) { - return result; - } - } else if (arg_types[i].code == halide_type_uint) { - unsigned value; - if (arg_types[i].bits == 8 || - arg_types[i].bits == 1) { - value = *((uint8_t *)args[i]); - } else if (arg_types[i].bits == 16) { - value = *((uint16_t *)args[i]); - } else if (arg_types[i].bits == 32) { - value = *((uint32_t *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1ui(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1ui"); - if (result) { - return result; - } - } else if (arg_types[i].code == halide_type_float) { - float value; - if (arg_types[i].bits == 32) { - value = *((float *)args[i]); - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - global_state.Uniform1f(i, value); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run Uniform1f"); - if (result) { - return result; - } - } else { - error(user_context) << "Cannot pass argument of type " << arg_types[i] << " to GL shader"; - return halide_error_code_generic_error; - } - } else { - uint64_t arg_value = ((halide_buffer_t *)args[i])->device; - - GLuint the_buffer = (GLuint)arg_value; - global_state.BindBufferBase(GL_SHADER_STORAGE_BUFFER, i, the_buffer); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run BindBufferBase"); - if (result) { - return result; - } - } - i++; - } - global_state.DispatchCompute(blocksX, blocksY, blocksZ); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run DispatchCompute"); - if (result) { - return result; - } - - global_state.MemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT); - result = global_state.CheckAndReportError(user_context, "halide_openglcompute_run MemoryBarrier"); - if (result) { - return result; - } - -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for run\n"; -#endif - - return halide_error_code_success; -} - -WEAK int halide_openglcompute_device_sync(void *user_context, halide_buffer_t *) { -#ifdef DEBUG_RUNTIME - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - if (!global_state.initialized) { - error(user_context) << "OpenGL runtime not initialized (halide_openglcompute_device_sync)."; - return halide_error_code_generic_error; - } - global_state.Finish(); -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms for sync\n"; -#endif - return halide_error_code_success; -} - -namespace { -WEAK char *get_kernel_name(const char *start, const char *end) { - const size_t 
kernel_name_length = end - start; - char *kernel_name = (char *)malloc(kernel_name_length + 1); - memcpy(kernel_name, start, kernel_name_length); - kernel_name[kernel_name_length] = '\0'; - return kernel_name; -} -} // namespace - -// Called at the beginning of a code block generated by Halide. This function -// is responsible for setting up the OpenGL environment and compiling the GLSL -// code into a compute shader. -WEAK int halide_openglcompute_initialize_kernels(void *user_context, void **state_ptr, - const char *src, int size) { -#ifdef DEBUG_RUNTIME - halide_start_clock(user_context); - uint64_t t_before = halide_current_time_ns(user_context); -#endif - - auto result = halide_openglcompute_init(user_context); - if (result) { - return result; - } - - ModuleState **state = (ModuleState **)state_ptr; - ModuleState *module = *state; - if (!module) { - module = (ModuleState *)malloc(sizeof(ModuleState)); - module->kernel = nullptr; - module->next = state_list; - state_list = module; - *state = module; - } - - if (module->kernel) { - return halide_error_code_success; - } - - const char *END_OF_KERNEL_MARKER = "\n// end of kernel "; - const size_t END_OF_KERNEL_MARKER_LENGTH = strlen(END_OF_KERNEL_MARKER); - - while (true) { - const char *end_of_kernel_marker = strstr(src, END_OF_KERNEL_MARKER); - if (!end_of_kernel_marker) { - break; // end of kernels sources is reached - } - - const char *just_before_kernel_name = end_of_kernel_marker + END_OF_KERNEL_MARKER_LENGTH; - const char *just_beyond_kernel_name = strstr(just_before_kernel_name, "\n"); - if (!just_beyond_kernel_name) { - error(user_context) << "Failed to find kernel name."; - return halide_error_code_generic_error; - } - - char *kernel_name = get_kernel_name(just_before_kernel_name, just_beyond_kernel_name); - - size_t src_len = just_beyond_kernel_name - src; - - KernelInfo *kernel = (KernelInfo *)malloc(sizeof(KernelInfo)); - kernel->kernel_name = kernel_name; - kernel->next = module->kernel; - module->kernel = kernel; - - GLuint shader = global_state.CreateShader(GL_COMPUTE_SHADER); - result = global_state.CheckAndReportError(user_context, "create shader"); - if (result) { - return result; - } - const GLchar *sources = {src}; - const GLint sources_lengths = {(GLint)src_len}; - -#ifdef DEBUG_RUNTIME - print(user_context) << "Compute shader source for: " << kernel_name; - halide_print(user_context, src); -#endif - - global_state.ShaderSource(shader, 1, &sources, &sources_lengths); - result = global_state.CheckAndReportError(user_context, "shader source"); - if (result) { - return result; - } - global_state.CompileShader(shader); - result = global_state.CheckAndReportError(user_context, "compile shader"); - if (result) { - return result; - } - - GLint shader_ok = 0; - global_state.GetShaderiv(shader, GL_COMPILE_STATUS, &shader_ok); - if (shader_ok != GL_TRUE) { - debug(user_context) << "Could not compile shader:\n"; - GLint log_len; - global_state.GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_len); - HalideMalloc log_tmp(user_context, log_len); - if (log_tmp.ptr) { - char *log = (char *)log_tmp.ptr; - global_state.GetShaderInfoLog(shader, log_len, nullptr, log); - debug(user_context) << log << "\n"; - } - global_state.DeleteShader(shader); - error(user_context) << "Could not compile shader."; - return halide_error_code_generic_error; - } - - // Link GLSL program - GLuint program = global_state.CreateProgram(); - global_state.AttachShader(program, shader); - result = global_state.CheckAndReportError(user_context, "attach shader"); - 
if (result) { - return result; - } - global_state.LinkProgram(program); - result = global_state.CheckAndReportError(user_context, "link program"); - if (result) { - return result; - } - - // Release the individual shaders - global_state.DeleteShader(shader); - - GLint status; - global_state.GetProgramiv(program, GL_LINK_STATUS, &status); - if (!status) { - GLint log_len; - global_state.GetProgramiv(program, GL_INFO_LOG_LENGTH, &log_len); - HalideMalloc log_tmp(user_context, log_len); - if (log_tmp.ptr) { - char *log = (char *)log_tmp.ptr; - global_state.GetProgramInfoLog(program, log_len, nullptr, log); - debug(user_context) << "Could not link GLSL program:\n" - << log << "\n"; - } - global_state.DeleteProgram(program); - error(user_context) << "Could not link GLSL program."; - return halide_error_code_generic_error; - } - kernel->program_id = program; - -#ifdef DEBUG_RUNTIME - GLint i; - GLint count; - - GLint size; // size of the variable - GLenum type; // type of the variable (float, vec3 or mat4, etc) - - const GLsizei bufSize = 64; // maximum name length - GLchar name[bufSize]; // variable name in GLSL - GLsizei length; // name length - - global_state.GetProgramiv(program, GL_ACTIVE_UNIFORMS, &count); - debug(user_context) << "Active Uniforms: " << count << "\n"; - - for (i = 0; i < count; i++) { - global_state.GetActiveUniform(program, (GLuint)i, bufSize, &length, &size, &type, name); - GLint loc = global_state.GetUniformLocation(program, name); - debug(user_context) << "Uniform " << i << " Type: " << type << " Name: " << name << " location: " << loc << "\n"; - } -#endif - src += src_len; // moving on to the next kernel - } -#ifdef DEBUG_RUNTIME - uint64_t t_after = halide_current_time_ns(user_context); - debug(user_context) << " Time: " << (t_after - t_before) / 1.0e6 - << " ms\n"; -#endif - - return halide_error_code_success; -} - -WEAK void halide_openglcompute_finalize_kernels(void *user_context, void *state_ptr) { -} - -WEAK int halide_openglcompute_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) { - return halide_default_device_and_host_malloc(user_context, buf, &openglcompute_device_interface); -} - -WEAK int halide_openglcompute_device_and_host_free(void *user_context, struct halide_buffer_t *buf) { - return halide_default_device_and_host_free(user_context, buf, &openglcompute_device_interface); -} - -WEAK const struct halide_device_interface_t *halide_openglcompute_device_interface() { - return &openglcompute_device_interface; -} - -} // extern "C" - -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGLCompute { - -WEAK halide_device_interface_impl_t openglcompute_device_interface_impl = { - halide_use_jit_module, - halide_release_jit_module, - halide_openglcompute_device_malloc, - halide_openglcompute_device_free, - halide_openglcompute_device_sync, - halide_openglcompute_device_release, - halide_openglcompute_copy_to_host, - halide_openglcompute_copy_to_device, - halide_openglcompute_device_and_host_malloc, - halide_openglcompute_device_and_host_free, - halide_default_buffer_copy, - halide_default_device_crop, - halide_default_device_slice, - halide_default_device_release_crop, - halide_default_device_wrap_native, - halide_default_device_detach_native, -}; - -WEAK halide_device_interface_t openglcompute_device_interface = { - halide_device_malloc, - halide_device_free, - halide_device_sync, - halide_device_release, - halide_copy_to_host, - halide_copy_to_device, - halide_device_and_host_malloc, - 
halide_device_and_host_free, - halide_buffer_copy, - halide_device_crop, - halide_device_slice, - halide_device_release_crop, - halide_device_wrap_native, - halide_device_detach_native, - nullptr, - &openglcompute_device_interface_impl}; - -} // namespace OpenGLCompute -} // namespace Internal -} // namespace Runtime -} // namespace Halide diff --git a/src/runtime/osx_opengl_context.cpp b/src/runtime/osx_opengl_context.cpp deleted file mode 100644 index 734d94b039ab..000000000000 --- a/src/runtime/osx_opengl_context.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "HalideRuntime.h" -#include "printer.h" -#include "scoped_mutex_lock.h" - -#define USE_AGL 0 -#if USE_AGL -extern "C" void *aglChoosePixelFormat(void *, int, const int *); -extern "C" void *aglCreateContext(void *, void *); -extern "C" int aglGetError(); -extern "C" void aglDestroyPixelFormat(void *); -extern "C" unsigned char aglSetCurrentContext(void *); -#endif - -#if !USE_AGL -namespace Halide { -namespace Runtime { -namespace Internal { -namespace OpenGL { - -WEAK halide_mutex cgl_functions_mutex; -WEAK bool cgl_initialized = false; -WEAK int (*CGLChoosePixelFormat)(int *attributes, void **pixel_format_result, int *num_formats); -WEAK int (*CGLCreateContext)(void *pixel_format, void *share_context, void **context_Result); -WEAK int (*CGLDestroyPixelFormat)(void *); -WEAK int (*CGLSetCurrentContext)(void *); - -} // namespace OpenGL -} // namespace Internal -} // namespace Runtime -} // namespace Halide - -using namespace Halide::Runtime::Internal::OpenGL; -#endif - -extern "C" { - -WEAK void *halide_opengl_get_proc_address(void *user_context, const char *name) { - static void *dylib = nullptr; - if (!dylib) { - dylib = halide_load_library( - "/System/Library/Frameworks/OpenGL.framework/Versions/Current/OpenGL"); - if (!dylib) { - return nullptr; - } - } - return halide_get_library_symbol(dylib, name); -} - -// Initialize OpenGL -WEAK int halide_opengl_create_context(void *user_context) { -#if USE_AGL - void *ctx = nullptr; - - int attrib[] = {4 /* AGL_RGBA */, 0 /* Sentinel */}; - void *pf = aglChoosePixelFormat(nullptr, 0, attrib); - if (!pf) { - error(user_context) << "Could not create pixel format."; - return halide_error_code_generic_error; - } - ctx = aglCreateContext(pf, nullptr); - if (!ctx || aglGetError()) { - error(user_context) << "Could not create context."; - return halide_error_code_generic_error; - } - aglDestroyPixelFormat(pf); - if (!aglSetCurrentContext(ctx)) { - error(user_context) << "Could not activate OpenGL context."; - return halide_error_code_generic_error; - } -#else - { // locking scope - ScopedMutexLock lock(&cgl_functions_mutex); - - if (!cgl_initialized) { - if ((CGLChoosePixelFormat = - (int (*)(int *, void **, int *))halide_opengl_get_proc_address(user_context, "CGLChoosePixelFormat")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLCreateContext = - (int (*)(void *, void *, void **))halide_opengl_get_proc_address(user_context, "CGLCreateContext")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLDestroyPixelFormat = - (int (*)(void *))halide_opengl_get_proc_address(user_context, "CGLDestroyPixelFormat")) == nullptr) { - return halide_error_code_generic_error; - } - if ((CGLSetCurrentContext = - (int (*)(void *))halide_opengl_get_proc_address(user_context, "CGLSetCurrentContext")) == nullptr) { - return halide_error_code_generic_error; - } - } - cgl_initialized = true; - } - - void *ctx = nullptr; - int attribs[] = { - /* 5 kCGLPFADoubleBuffer */ - 72, 
// kCGLPFANoRecovery - 96, // kCGLPFAAllowOfflineRenderers - 99, // kCGLPFAOpenGLProfile - 0x1000, // kCGLOGLPVersion_Legacy -- 0x3200 is kCGLOGLPVersion_3_2_Core -- kCGLOGLPVersion_GL4_Core is 0x4100 - 0 // sentinel ending list - }; - - void *fmt; - int numFormats = 0; - if (CGLChoosePixelFormat(attribs, &fmt, &numFormats) != 0) { - return halide_error_code_generic_error; - } - if (CGLCreateContext(fmt, nullptr, &ctx) != 0) { - CGLDestroyPixelFormat(fmt); - return halide_error_code_generic_error; - } - CGLSetCurrentContext(ctx); -#endif - return halide_error_code_success; -} -} diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index 5c64391b6259..a8651ae081a6 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -7,7 +7,6 @@ #include "HalideRuntimeHexagonHost.h" #include "HalideRuntimeMetal.h" #include "HalideRuntimeOpenCL.h" -#include "HalideRuntimeOpenGLCompute.h" #include "HalideRuntimeQurt.h" #include "HalideRuntimeVulkan.h" #include "HalideRuntimeWebGPU.h" @@ -160,12 +159,6 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_opencl_set_device_type, (void *)&halide_opencl_set_platform_name, (void *)&halide_opencl_wrap_cl_mem, - (void *)&halide_opengl_create_context, - (void *)&halide_opengl_get_proc_address, - (void *)&halide_openglcompute_device_interface, - (void *)&halide_openglcompute_initialize_kernels, - (void *)&halide_openglcompute_finalize_kernels, - (void *)&halide_openglcompute_run, (void *)&halide_pointer_to_string, (void *)&halide_print, (void *)&halide_profiler_get_pipeline_state, diff --git a/test/correctness/async_copy_chain.cpp b/test/correctness/async_copy_chain.cpp index 45b014c4bd8b..ae8623d446bb 100644 --- a/test/correctness/async_copy_chain.cpp +++ b/test/correctness/async_copy_chain.cpp @@ -25,12 +25,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - // Make a list of extern pipeline stages (just copies) all async // and connected by double buffers, then try various nestings of // them. This is a stress test of the async extern storage folding diff --git a/test/correctness/async_device_copy.cpp b/test/correctness/async_device_copy.cpp index 6e579c77b65f..3fc73d1c6139 100644 --- a/test/correctness/async_device_copy.cpp +++ b/test/correctness/async_device_copy.cpp @@ -22,12 +22,6 @@ int main(int argc, char **argv) { return 0; } - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute as it does not support copy_to_host/device() yet" - " (halide_buffer_copy is unimplemented in that backend).\n"); - return 0; - } - // Compute frames on GPU/CPU, and then sum then on // CPU/GPU. async() lets us overlap the CPU computation with the // copies. 
diff --git a/test/correctness/boundary_conditions.cpp b/test/correctness/boundary_conditions.cpp index def2d410226b..61422d130d01 100644 --- a/test/correctness/boundary_conditions.cpp +++ b/test/correctness/boundary_conditions.cpp @@ -20,12 +20,7 @@ bool expect_eq(T actual, T expected) { void schedule_test(Func f, int vector_width, Partition partition_policy, const Target &t) { if (vector_width != 1) { - if (t.has_feature(Target::OpenGLCompute)) { - // Vector stores not yet supported in OpenGLCompute backend - f.unroll(x, vector_width); - } else { - f.vectorize(x, vector_width); - } + f.vectorize(x, vector_width); } f.partition(x, partition_policy); f.partition(y, partition_policy); @@ -388,7 +383,6 @@ int main(int argc, char **argv) { int vector_width_max = 32; if (target.has_feature(Target::Metal) || target.has_feature(Target::Vulkan) || - target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) || target.has_feature(Target::WebGPU)) { // https://github.com/halide/Halide/issues/2148 diff --git a/test/correctness/device_buffer_copy.cpp b/test/correctness/device_buffer_copy.cpp index 9179ac83cd24..31ff92b4ae85 100644 --- a/test/correctness/device_buffer_copy.cpp +++ b/test/correctness/device_buffer_copy.cpp @@ -32,11 +32,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test copy to device.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_crop.cpp b/test/correctness/device_crop.cpp index ee4b900bc1f3..44fa0a4b2bde 100644 --- a/test/correctness/device_crop.cpp +++ b/test/correctness/device_crop.cpp @@ -30,11 +30,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place cropping.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/device_slice.cpp b/test/correctness/device_slice.cpp index 0b9e3ca5bbcb..3bebc6bbb541 100644 --- a/test/correctness/device_slice.cpp +++ b/test/correctness/device_slice.cpp @@ -32,11 +32,6 @@ int main(int argc, char **argv) { return 0; } - if (target.has_feature(Target::OpenGLCompute)) { - printf("Skipping test for OpenGLCompute, as it does not support device crops, slices, or copies\n"); - return 0; - } - printf("Test in-place slicing.\n"); { Halide::Runtime::Buffer gpu_buf = make_gpu_buffer(hexagon_rpc); diff --git a/test/correctness/dynamic_allocation_in_gpu_kernel.cpp b/test/correctness/dynamic_allocation_in_gpu_kernel.cpp index 242b0e94ba06..9ba586a04a7d 100644 --- a/test/correctness/dynamic_allocation_in_gpu_kernel.cpp +++ b/test/correctness/dynamic_allocation_in_gpu_kernel.cpp @@ -4,7 +4,7 @@ using namespace Halide; int main(int argc, char **argv) { Target t(get_jit_target_from_environment()); - if (!t.has_gpu_feature() && !t.has_feature(Target::OpenGLCompute)) { + if (!t.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } @@ -23,7 +23,7 @@ int main(int argc, char **argv) { // All of the f's have a dynamic size required (it depends on p), // so we'll store them in global memory ("Heap"). On cuda we get - // one big heap allocation. On openglcompute/d3d we should get one + // one big heap allocation. 
On d3d we should get one // allocation per coalesced group, and groups can only be // coalesced if the types match, so we get an allocation for // [f1,f3,f6], another for [f2,f4], and a third for f5. diff --git a/test/correctness/gpu_allocation_cache.cpp b/test/correctness/gpu_allocation_cache.cpp index cbb864bd6409..51b03e73f82c 100644 --- a/test/correctness/gpu_allocation_cache.cpp +++ b/test/correctness/gpu_allocation_cache.cpp @@ -140,21 +140,16 @@ int main(int argc, char **argv) { // Now run all at the same time to check for concurrency issues. - // FIXME: Skipping OpenGLCompute, which has concurrency - // issues. Probably due to using the GL context on the wrong - // thread. - if (!target.has_feature(Target::OpenGLCompute)) { - Halide::Tools::ThreadPool pool(1); - std::vector> futures; - futures.emplace_back(pool.async(test1, true)); - futures.emplace_back(pool.async(test1, true)); - futures.emplace_back(pool.async(test2, true)); - futures.emplace_back(pool.async(test2, true)); - futures.emplace_back(pool.async(test3, true)); - futures.emplace_back(pool.async(test3, true)); - for (auto &f : futures) { - f.get(); - } + Halide::Tools::ThreadPool pool(1); + std::vector> futures; + futures.emplace_back(pool.async(test1, true)); + futures.emplace_back(pool.async(test1, true)); + futures.emplace_back(pool.async(test2, true)); + futures.emplace_back(pool.async(test2, true)); + futures.emplace_back(pool.async(test3, true)); + futures.emplace_back(pool.async(test3, true)); + for (auto &f : futures) { + f.get(); } // Vulkan will OOM unless allocation cache is used ... skip this since we just ran the same tests above concurrently diff --git a/test/correctness/gpu_dynamic_shared.cpp b/test/correctness/gpu_dynamic_shared.cpp index a956c6831afc..f98636ea8905 100644 --- a/test/correctness/gpu_dynamic_shared.cpp +++ b/test/correctness/gpu_dynamic_shared.cpp @@ -10,11 +10,6 @@ int main(int argc, char **argv) { return 0; } - if (t.has_feature(Target::OpenGLCompute)) { - printf("[SKIP] Skipping test for OpenGLCompute, as it does not support dynamically-sized shared memory\n"); - return 0; - } - if (t.has_feature(Target::Vulkan)) { const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan); assert(interface->compute_capability != nullptr); diff --git a/test/correctness/gpu_jit_explicit_copy_to_device.cpp b/test/correctness/gpu_jit_explicit_copy_to_device.cpp index bfa57b40d80d..2b234e7f9d06 100644 --- a/test/correctness/gpu_jit_explicit_copy_to_device.cpp +++ b/test/correctness/gpu_jit_explicit_copy_to_device.cpp @@ -5,7 +5,7 @@ using namespace Halide; int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); - if (!target.has_gpu_feature() && !target.has_feature(Target::OpenGLCompute)) { + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } diff --git a/test/correctness/gpu_large_alloc.cpp b/test/correctness/gpu_large_alloc.cpp index 6800cf12248a..da3022172a60 100644 --- a/test/correctness/gpu_large_alloc.cpp +++ b/test/correctness/gpu_large_alloc.cpp @@ -21,7 +21,7 @@ int main(int argc, char **argv) { g(x, y) = clamp(f(x, y), 20, 100); Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { + if (target.has_gpu_feature()) { Var xi, yi; f.compute_root().gpu_tile(x, y, xi, yi, 16, 16); g.compute_root().gpu_tile(x, y, xi, yi, 16, 16); diff --git a/test/correctness/gpu_mixed_dimensionality.cpp b/test/correctness/gpu_mixed_dimensionality.cpp index 
b3decba3ee76..f76eb15efe80 100644 --- a/test/correctness/gpu_mixed_dimensionality.cpp +++ b/test/correctness/gpu_mixed_dimensionality.cpp @@ -5,7 +5,7 @@ using namespace Halide; int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); - if (!target.has_gpu_feature() && !target.has_feature(Target::OpenGLCompute)) { + if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); return 0; } diff --git a/test/correctness/gpu_multi_device.cpp b/test/correctness/gpu_multi_device.cpp index ea9be0e4eb82..ad1b0223f551 100644 --- a/test/correctness/gpu_multi_device.cpp +++ b/test/correctness/gpu_multi_device.cpp @@ -39,14 +39,6 @@ struct MultiDevicePipeline { .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::Metal); current_stage++; } - if (jit_target.has_feature(Target::OpenGLCompute)) { - stage[current_stage](x, y, c) = stage[current_stage - 1](x, y, c) + 69; - stage[current_stage] - .compute_root() - .reorder(c, x, y) - .gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenGLCompute); - current_stage++; - } } void run(Buffer &result) { diff --git a/test/correctness/gpu_multi_kernel.cpp b/test/correctness/gpu_multi_kernel.cpp index 722c720c78c9..66e21b6896e5 100644 --- a/test/correctness/gpu_multi_kernel.cpp +++ b/test/correctness/gpu_multi_kernel.cpp @@ -16,7 +16,7 @@ int main(int argc, char *argv[]) { kernel3(x) = cast(round(x + kernel2(x))); Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { + if (target.has_gpu_feature()) { kernel1.gpu_tile(x, xi, 32).compute_root(); kernel2.gpu_tile(x, xi, 32).compute_root(); kernel3.gpu_tile(x, xi, 32); diff --git a/test/correctness/gpu_reuse_shared_memory.cpp b/test/correctness/gpu_reuse_shared_memory.cpp index 422775ac2021..37e932d78273 100644 --- a/test/correctness/gpu_reuse_shared_memory.cpp +++ b/test/correctness/gpu_reuse_shared_memory.cpp @@ -189,9 +189,7 @@ int main(int argc, char **argv) { } printf("Running dynamic shared test\n"); - if (t.has_feature(Target::OpenGLCompute) && memory_type == MemoryType::GPUShared) { - printf("Skipping test because GL doesn't support dynamic sizes for shared memory\n"); - } else if (t.has_feature(Target::Vulkan) && ((t.os == Target::IOS) || t.os == Target::OSX)) { + if (t.has_feature(Target::Vulkan) && ((t.os == Target::IOS) || t.os == Target::OSX)) { printf("Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamic sizes for shared memory)!\n"); } else { if (dynamic_shared_test(memory_type) != 0) { diff --git a/test/correctness/logical.cpp b/test/correctness/logical.cpp index 50ef8df9421f..1bd134bc37f4 100644 --- a/test/correctness/logical.cpp +++ b/test/correctness/logical.cpp @@ -31,9 +31,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -67,9 +65,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -101,9 +97,7 @@ int main(int argc, char **argv) { if (target.has_gpu_feature()) { 
f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -133,9 +127,7 @@ int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - f.vectorize(xi, 4); - } + f.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { f.hexagon().vectorize(x, 128); } else { @@ -193,9 +185,7 @@ int main(int argc, char **argv) { } if (target.has_gpu_feature()) { gpu.gpu_tile(x, y, xi, yi, 16, 16); - if (!target.has_feature(Target::OpenGLCompute)) { - gpu.vectorize(xi, 4); - } + gpu.vectorize(xi, 4); } else if (target.has_feature(Target::HVX)) { gpu.hexagon().vectorize(x, 128); } else { diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp index 618a30ea104a..e45b1876918d 100644 --- a/test/correctness/math.cpp +++ b/test/correctness/math.cpp @@ -50,7 +50,6 @@ bool relatively_equal(value_t a, value_t b, Target target) { // For HLSL, try again with a lower error threshold, as it might be using // fast but approximated trigonometric functions: if (target.supports_device_api(DeviceAPI::D3D12Compute) || - target.supports_device_api(DeviceAPI::OpenGLCompute) || target.supports_device_api(DeviceAPI::WebGPU)) { // this threshold value has been empirically determined since there // is no clear documentation on the precision of these algorithms @@ -299,12 +298,7 @@ int main(int argc, char **argv) { call_1_float_types(ceil, 256, -25, 25); call_1_float_types(trunc, 256, -25, 25); - if (get_jit_target_from_environment().has_feature(Target::OpenGLCompute)) { - // GLSL isn't required to support NaN, so keep things real - call_2_float_types(pow, 256, 0.0, 10.0, -4.0f, 4.0f); - } else { - call_2_float_types(pow, 256, -10.0, 10.0, -4.0f, 4.0f); - } + call_2_float_types(pow, 256, -10.0, 10.0, -4.0f, 4.0f); const int8_t int8_min = std::numeric_limits::min(); const int16_t int16_min = std::numeric_limits::min(); diff --git a/test/correctness/mul_div_mod.cpp b/test/correctness/mul_div_mod.cpp index f4f41c8fc9f8..8eca8141bba2 100644 --- a/test/correctness/mul_div_mod.cpp +++ b/test/correctness/mul_div_mod.cpp @@ -556,8 +556,6 @@ int main(int argc, char **argv) { for (int i = 2; i <= 4; i *= 2) { vector_widths.push_back(i); } - } else if (target.has_feature(Target::OpenGLCompute)) { - // Vector load/store unimplemented } else if (target.has_feature(Target::HVX)) { vector_widths.push_back(128); } else { diff --git a/test/correctness/newtons_method.cpp b/test/correctness/newtons_method.cpp index bdd8652b28a9..fa1d4744eedd 100644 --- a/test/correctness/newtons_method.cpp +++ b/test/correctness/newtons_method.cpp @@ -59,9 +59,8 @@ int find_pi() { T secant_result = evaluate_may_gpu(g()[0]); - // Trig in vulkan/openglcompute/d3d12 is approximate + // Trig in vulkan/d3d12 is approximate float tolerance = target.has_feature(Target::Vulkan) || - target.has_feature(Target::OpenGLCompute) || target.has_feature(Target::D3D12Compute) ? 
1e-5f : 1e-20f; diff --git a/test/correctness/parallel_gpu_nested.cpp b/test/correctness/parallel_gpu_nested.cpp index a7e604b8435a..53ddcc768e3a 100644 --- a/test/correctness/parallel_gpu_nested.cpp +++ b/test/correctness/parallel_gpu_nested.cpp @@ -14,7 +14,7 @@ int main(int argc, char **argv) { f(x, y, z) = x * y + z * k + 1; Target t = get_jit_target_from_environment(); - if (t.has_gpu_feature() && !t.has_feature(Target::OpenGLCompute)) { + if (t.has_gpu_feature()) { Var xi, yi; f.gpu_tile(x, y, xi, yi, 16, 16); } else if (t.has_feature(Target::HVX)) { diff --git a/test/correctness/plain_c_includes.c b/test/correctness/plain_c_includes.c index 65a436014cbd..0caadc695f9a 100644 --- a/test/correctness/plain_c_includes.c +++ b/test/correctness/plain_c_includes.c @@ -10,7 +10,6 @@ #include "HalideRuntimeHexagonHost.h" #include "HalideRuntimeMetal.h" #include "HalideRuntimeOpenCL.h" -#include "HalideRuntimeOpenGLCompute.h" #include "HalideRuntimeQurt.h" int main(int argc, char **argv) { diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp index 8fc03b589a73..7c8fcbe4d15f 100644 --- a/test/correctness/target.cpp +++ b/test/correctness/target.cpp @@ -52,9 +52,9 @@ int main(int argc, char **argv) { // Full specification round-trip, crazy features t1 = Target(Target::Android, Target::ARM, 32, {Target::JIT, Target::CUDA, Target::OpenCL, - Target::OpenGLCompute, Target::Debug}); + Target::Debug}); ts = t1.to_string(); - if (ts != "arm-32-android-cuda-debug-jit-opencl-openglcompute") { + if (ts != "arm-32-android-cuda-debug-jit-opencl") { printf("to_string failure: %s\n", ts.c_str()); return 1; } diff --git a/test/correctness/vectorized_gpu_allocation.cpp b/test/correctness/vectorized_gpu_allocation.cpp index 9435509c2c6c..2a157cc93ada 100644 --- a/test/correctness/vectorized_gpu_allocation.cpp +++ b/test/correctness/vectorized_gpu_allocation.cpp @@ -11,12 +11,6 @@ int main(int argc, char **argv) { return 0; } - if (t.has_feature(Target::OpenGLCompute)) { - printf("[SKIP] No support for vector loads and stores in OpenGLCompute yet\n"); - // https://github.com/halide/Halide/issues/4979 - return 0; - } - // Fill input buffer. Buffer input(2, 2, 3); Buffer output(2, 2, 3); From 9c3615b07285d263dd0e617b61acb793e49f2c7d Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Sun, 11 Feb 2024 10:41:01 -0800 Subject: [PATCH 056/186] Add checks to prevent people from using negative split factors (#8076) * Add checks to prevent people from using negative split factors Our analysis passes assume that loop maxes are greater than loop mins, so negative split factors cause sufficient havoc that not even output bounds queries are safe. These are therefore checked on pipeline entry. This is a new way for output bounds queries to throw errors (in addition to the buffer pointers themselves being null, and maybe some buffer constraints). Testing this, I realized these errors were getting thrown twice, because the output buffer bounds query in Pipeline::realize was built around two recursive calls to realize, and both were calling the custom error handler. In addition to reporting errors in this class twice, this implies several other inefficiencies, e.g. jit call args were being prepped twice. I reworked it to be built around two calls to call_jit_code instead. 
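Roughly, the kind of pipeline this now rejects cleanly looks like the sketch below (illustrative only, modeled on the new negative_split_factors test added in this patch; the handler and variable names are placeholders, not part of the change itself):

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    // Swallow the error so the process can report it instead of aborting.
    void note_error(JITUserContext *, const char *msg) {
        printf("Saw expected error: %s\n", msg);
    }

    int main() {
        Param<int> split_factor;      // split factor only known at runtime
        Func f;
        Var x;
        f(x) = x;
        f.parallel(x, split_factor);  // cannot be validated at compile time
        split_factor.set(-17);        // not strictly positive

        f.jit_handlers().custom_error = note_error;
        f.realize({32});              // now trips the new
                                      // halide_error_split_factor_not_positive
                                      // check on pipeline entry
        return 0;
    }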
Fixes #7938 * Add test to cmakelists * Remove pointless target arg to call_jit_code It has to be the same as the cached target in the receiving object anyway --- Makefile | 2 + src/AddSplitFactorChecks.cpp | 68 +++++++++++++++++++++ src/AddSplitFactorChecks.h | 25 ++++++++ src/CMakeLists.txt | 6 +- src/Callable.cpp | 2 +- src/JITModule.cpp | 4 +- src/JITModule.h | 2 +- src/Lower.cpp | 5 ++ src/Pipeline.cpp | 62 +++++++++++++------ src/Pipeline.h | 3 +- src/runtime/HalideRuntime.h | 6 ++ src/runtime/errors.cpp | 9 +++ src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/negative_split_factors.cpp | 40 ++++++++++++ 15 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 src/AddSplitFactorChecks.cpp create mode 100644 src/AddSplitFactorChecks.h create mode 100644 test/correctness/negative_split_factors.cpp diff --git a/Makefile b/Makefile index e1457ea161e2..b73b1632a0eb 100644 --- a/Makefile +++ b/Makefile @@ -442,6 +442,7 @@ SOURCE_FILES = \ AddAtomicMutex.cpp \ AddImageChecks.cpp \ AddParameterChecks.cpp \ + AddSplitFactorChecks.cpp \ AlignLoads.cpp \ AllocationBoundsInference.cpp \ ApplySplit.cpp \ @@ -637,6 +638,7 @@ HEADER_FILES = \ AddAtomicMutex.h \ AddImageChecks.h \ AddParameterChecks.h \ + AddSplitFactorChecks.h \ AlignLoads.h \ AllocationBoundsInference.h \ ApplySplit.h \ diff --git a/src/AddSplitFactorChecks.cpp b/src/AddSplitFactorChecks.cpp new file mode 100644 index 000000000000..74ec033ebb4f --- /dev/null +++ b/src/AddSplitFactorChecks.cpp @@ -0,0 +1,68 @@ +#include "AddSplitFactorChecks.h" +#include "Definition.h" +#include "Function.h" +#include "IR.h" +#include "IROperator.h" +#include "Simplify.h" + +namespace Halide { +namespace Internal { + +namespace { + +void check_all_split_factors(const Function &f, const Definition &def, std::vector *stmts) { + const StageSchedule &sched = def.schedule(); + for (const Split &split : sched.splits()) { + if (split.split_type != Split::SplitVar) { + continue; + } + if (is_positive_const(split.factor)) { + // Common-case optimization + continue; + } + Expr positive = simplify(split.factor > 0); + if (is_const_one(positive)) { + // We statically proved it + continue; + } + // We need a runtime check that says: if the condition is + // entered, the split factor will be positive. We can still + // assume the pipeline preconditions, because they will be + // checked before this. 
+ std::ostringstream factor_str; + factor_str << split.factor; + Expr error = Call::make(Int(32), "halide_error_split_factor_not_positive", + {f.name(), + split_string(split.old_var, ".").back(), + split_string(split.outer, ".").back(), + split_string(split.inner, ".").back(), + factor_str.str(), split.factor}, + Call::Extern); + stmts->push_back(AssertStmt::make(positive, error)); + } + + for (const auto &s : def.specializations()) { + check_all_split_factors(f, s.definition, stmts); + } +} + +} // namespace + +Stmt add_split_factor_checks(const Stmt &s, const std::map &env) { + // Check split factors are strictly positive + std::vector stmts; + + for (const auto &p : env) { + const Function &f = p.second; + check_all_split_factors(f, f.definition(), &stmts); + for (const auto &u : f.updates()) { + check_all_split_factors(f, u, &stmts); + } + } + + stmts.push_back(s); + return Block::make(stmts); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/AddSplitFactorChecks.h b/src/AddSplitFactorChecks.h new file mode 100644 index 000000000000..8db610043808 --- /dev/null +++ b/src/AddSplitFactorChecks.h @@ -0,0 +1,25 @@ +#ifndef HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H +#define HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H + +/** \file + * + * Defines the lowering pass that adds the assertions that all split factors are + * strictly positive. + */ +#include + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +class Function; + +/** Insert checks that all split factors that depend on scalar parameters are + * strictly positive. */ +Stmt add_split_factor_checks(const Stmt &s, const std::map &env); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77453fbce0a9..cca681661c35 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ set(HEADER_FILES AddAtomicMutex.h AddImageChecks.h AddParameterChecks.h + AddSplitFactorChecks.h AlignLoads.h AllocationBoundsInference.h ApplySplit.h @@ -22,7 +23,7 @@ set(HEADER_FILES Bounds.h BoundsInference.h BoundConstantExtentLoops.h - BoundSmallAllocations.h + BoundSmallAllocations.h Buffer.h Callable.h CanonicalizeGPUVars.h @@ -178,6 +179,7 @@ set(SOURCE_FILES AddAtomicMutex.cpp AddImageChecks.cpp AddParameterChecks.cpp + AddSplitFactorChecks.cpp AlignLoads.cpp AllocationBoundsInference.cpp ApplySplit.cpp @@ -546,7 +548,7 @@ set_target_properties(Halide PROPERTIES # Note that we (deliberately) redeclare these versions here, even though the macros # with identical versions are expected to be defined in source; this allows us to # ensure that the versions defined between all build systems are identical. -target_compile_definitions(Halide PUBLIC +target_compile_definitions(Halide PUBLIC HALIDE_VERSION_MAJOR=${Halide_VERSION_MAJOR} HALIDE_VERSION_MINOR=${Halide_VERSION_MINOR} HALIDE_VERSION_PATCH=${Halide_VERSION_PATCH}) diff --git a/src/Callable.cpp b/src/Callable.cpp index 905155e52254..95a34ed455b1 100644 --- a/src/Callable.cpp +++ b/src/Callable.cpp @@ -192,7 +192,7 @@ Callable::FailureFn Callable::check_fcci(size_t argc, const FullCallCheckInfo *a JITFuncCallContext jit_call_context(context, contents->saved_jit_handlers); - int exit_status = contents->jit_cache.call_jit_code(contents->jit_cache.jit_target, argv); + int exit_status = contents->jit_cache.call_jit_code(argv); // If we're profiling, report runtimes and reset profiler stats. 
contents->jit_cache.finish_profiling(context); diff --git a/src/JITModule.cpp b/src/JITModule.cpp index ffd8949d4ca1..735f782f67c1 100644 --- a/src/JITModule.cpp +++ b/src/JITModule.cpp @@ -1113,7 +1113,7 @@ Target JITCache::get_compiled_jit_target() const { return jit_target; } -int JITCache::call_jit_code(const Target &target, const void *const *args) { +int JITCache::call_jit_code(const void *const *args) { #if defined(__has_feature) #if __has_feature(memory_sanitizer) user_warning << "MSAN does not support JIT compilers of any sort, and will report " @@ -1122,7 +1122,7 @@ int JITCache::call_jit_code(const Target &target, const void *const *args) { "compilation for Halide code."; #endif #endif - if (target.arch == Target::WebAssembly) { + if (get_compiled_jit_target().arch == Target::WebAssembly) { internal_assert(wasm_module.contents.defined()); return wasm_module.run(args); } else { diff --git a/src/JITModule.h b/src/JITModule.h index 467fb82db207..59b4c3a4f9a0 100644 --- a/src/JITModule.h +++ b/src/JITModule.h @@ -300,7 +300,7 @@ struct JITCache { Target get_compiled_jit_target() const; - int call_jit_code(const Target &target, const void *const *args); + int call_jit_code(const void *const *args); void finish_profiling(JITUserContext *context); }; diff --git a/src/Lower.cpp b/src/Lower.cpp index 74af1aeffe28..ba0918831fc8 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -9,6 +9,7 @@ #include "AddAtomicMutex.h" #include "AddImageChecks.h" #include "AddParameterChecks.h" +#include "AddSplitFactorChecks.h" #include "AllocationBoundsInference.h" #include "AsyncProducers.h" #include "BoundConstantExtentLoops.h" @@ -182,6 +183,10 @@ void lower_impl(const vector &output_funcs, s = bounds_inference(s, outputs, order, fused_groups, env, func_bounds, t); log("Lowering after computation bounds inference:", s); + debug(1) << "Asserting that all split factors are positive...\n"; + s = add_split_factor_checks(s, env); + log("Lowering after asserting that all split factors are positive:", s); + debug(1) << "Removing extern loops...\n"; s = remove_extern_loops(s); log("Lowering after removing extern loops:", s); diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp index 536b8994e686..79d1701a2593 100644 --- a/src/Pipeline.cpp +++ b/src/Pipeline.cpp @@ -570,7 +570,18 @@ Target Pipeline::get_compiled_jit_target() const { void Pipeline::compile_jit(const Target &target_arg) { user_assert(defined()) << "Pipeline is undefined\n"; - Target target = target_arg.with_feature(Target::JIT).with_feature(Target::UserContext); + Target target = target_arg; + + if (target.has_unknowns()) { + // If we've already jit-compiled for a specific target, use that. + target = get_compiled_jit_target(); + if (target.has_unknowns()) { + // Otherwise get the target from the environment + target = get_jit_target_from_environment(); + } + } + + target.set_features({Target::JIT, Target::UserContext}); // If we're re-jitting for the same target, we can just keep the old jit module. 
if (get_compiled_jit_target() == target) { @@ -751,17 +762,37 @@ Realization Pipeline::realize(JITUserContext *context, bufs.emplace_back(t, nullptr, sizes); } } - Realization r(std::move(bufs)); + Realization r{std::move(bufs)}; + + compile_jit(target); + JITUserContext empty_user_context = {}; + if (!context) { + context = &empty_user_context; + } + JITFuncCallContext jit_context(context, jit_handlers()); + JITCallArgs args(contents->inferred_args.size() + r.size()); + RealizationArg arg{r}; + prepare_jit_call_arguments(arg, contents->jit_cache.jit_target, + &context, true, args); + // Do an output bounds query if we can. Otherwise just assume the // output size is good. + int exit_status = 0; if (!target.has_feature(Target::NoBoundsQuery)) { - realize(context, r, target); + exit_status = call_jit_code(args); } - for (size_t i = 0; i < r.size(); i++) { - r[i].allocate(); + if (exit_status == 0) { + // Make the output allocations + for (size_t i = 0; i < r.size(); i++) { + r[i].allocate(); + } + // Do the actual computation + exit_status = call_jit_code(args); } - // Do the actual computation - realize(context, r, target); + + // If we're profiling, report runtimes and reset profiler stats. + contents->jit_cache.finish_profiling(context); + jit_context.finalize(exit_status); // Crop back to the requested size if necessary bool needs_crop = false; @@ -943,8 +974,8 @@ Pipeline::make_externs_jit_module(const Target &target, return result; } -int Pipeline::call_jit_code(const Target &target, const JITCallArgs &args) { - return contents->jit_cache.call_jit_code(target, args.store); +int Pipeline::call_jit_code(const JITCallArgs &args) { + return contents->jit_cache.call_jit_code(args.store); } void Pipeline::realize(RealizationArg outputs, const Target &t) { @@ -959,15 +990,6 @@ void Pipeline::realize(JITUserContext *context, debug(2) << "Realizing Pipeline for " << target << "\n"; - if (target.has_unknowns()) { - // If we've already jit-compiled for a specific target, use that. - target = get_compiled_jit_target(); - if (target.has_unknowns()) { - // Otherwise get the target from the environment - target = get_jit_target_from_environment(); - } - } - // We need to make a context for calling the jitted function to // carry the the set of custom handlers. Here's how handlers get // called when running jitted code: @@ -1041,7 +1063,7 @@ void Pipeline::realize(JITUserContext *context, // exception. debug(2) << "Calling jitted function\n"; - int exit_status = call_jit_code(target, args); + int exit_status = call_jit_code(args); debug(2) << "Back from jitted function. Exit status was " << exit_status << "\n"; // If we're profiling, report runtimes and reset profiler stats. 
@@ -1111,7 +1133,7 @@ void Pipeline::infer_input_bounds(JITUserContext *context, } Internal::debug(2) << "Calling jitted function\n"; - int exit_status = call_jit_code(contents->jit_cache.jit_target, args); + int exit_status = call_jit_code(args); jit_context.finalize(exit_status); Internal::debug(2) << "Back from jitted function\n"; bool changed = false; diff --git a/src/Pipeline.h b/src/Pipeline.h index 19272b2ed68d..37537db04fb7 100644 --- a/src/Pipeline.h +++ b/src/Pipeline.h @@ -149,7 +149,6 @@ class Pipeline { private: Internal::IntrusivePtr contents; - // For the three method below, precisely one of the first two args should be non-null void prepare_jit_call_arguments(RealizationArg &output, const Target &target, JITUserContext **user_context, bool is_bounds_inference, Internal::JITCallArgs &args_result); @@ -160,7 +159,7 @@ class Pipeline { static AutoSchedulerFn find_autoscheduler(const std::string &autoscheduler_name); - int call_jit_code(const Target &target, const Internal::JITCallArgs &args); + int call_jit_code(const Internal::JITCallArgs &args); // Get the value of contents->jit_target, but reality-check that the contents // sensibly match the value. Return Target() if not jitted. diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index d8ae1268fbaf..64034b8be328 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1242,6 +1242,10 @@ enum halide_error_code_t { /** An explicit storage bound provided is too small to store * all the values produced by the function. */ halide_error_code_storage_bound_too_small = -45, + + /** A factor used to split a loop was discovered to be zero or negative at + * runtime. */ + halide_error_code_split_factor_not_positive = -46, }; /** Halide calls the functions below on various error conditions. The @@ -1316,6 +1320,8 @@ extern int halide_error_device_dirty_with_no_device_support(void *user_context, extern int halide_error_storage_bound_too_small(void *user_context, const char *func_name, const char *var_name, int provided_size, int required_size); extern int halide_error_device_crop_failed(void *user_context); +extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor); + // @} /** Optional features a compilation Target can have. diff --git a/src/runtime/errors.cpp b/src/runtime/errors.cpp index 003dde531dfc..0879cc4a7c60 100644 --- a/src/runtime/errors.cpp +++ b/src/runtime/errors.cpp @@ -291,4 +291,13 @@ WEAK int halide_error_device_crop_failed(void *user_context) { return halide_error_code_device_crop_failed; } +WEAK int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor) { + error(user_context) << "In schedule for func " << func_name + << ", the factor used to split the variable " << orig + << " into " << outer << " and " << inner << " is " << factor_str + << ". This evaluated to " << factor << ", which is not strictly positive. 
" + << "Consider using max(" << factor_str << ", 1) instead."; + return halide_error_code_split_factor_not_positive; +} + } // extern "C" diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index a8651ae081a6..db8ada2f4b8e 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -85,6 +85,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_error_param_too_small_u64, (void *)&halide_error_requirement_failed, (void *)&halide_error_specialize_fail, + (void *)&halide_error_split_factor_not_positive, (void *)&halide_error_unaligned_host_ptr, (void *)&halide_error_storage_bound_too_small, (void *)&halide_error_device_crop_failed, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 3b946edda6d9..f77393a21114 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -222,6 +222,7 @@ tests(GROUPS correctness multiple_outputs.cpp mux.cpp narrow_predicates.cpp + negative_split_factors.cpp nested_tail_strategies.cpp newtons_method.cpp non_nesting_extern_bounds_query.cpp diff --git a/test/correctness/negative_split_factors.cpp b/test/correctness/negative_split_factors.cpp new file mode 100644 index 000000000000..bc032022b60f --- /dev/null +++ b/test/correctness/negative_split_factors.cpp @@ -0,0 +1,40 @@ +#include "Halide.h" +#include "halide_test_dirs.h" + +#include +#include + +using namespace Halide; + +bool error_occurred = false; +void my_error_handler(JITUserContext *user_context, const char *msg) { + error_occurred = true; +} + +int main(int argc, char **argv) { + // Trying to realize a Pipeline with a negative or zero split factor should + // error out cleanly, and not for example segfault because the output bounds + // query returned a garbage buffer. + + Param split; + + Func f; + Var x; + + f(x) = x; + f.parallel(x, split); + + split.set(-17); + + f.jit_handlers().custom_error = my_error_handler; + + f.realize({32}); + + if (!error_occurred) { + printf("There was supposed to be an error!\n"); + return 1; + } + + printf("Success!\n"); + return 0; +} From ada6345a8a1416ee5a29796f1f0f684df6c5f976 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 12 Feb 2024 10:10:00 -0800 Subject: [PATCH 057/186] Fix rfactor adding too many pure loops (#8086) When you rfactor an update definition, the new update definition must use all the pure vars of the Func, even though the one you're rfactoring may not have used them all. We also want to preserve any scheduling already done to the pure vars, so we want to preserve the dims list and splits list from the original definition. The code accounted for this by checking the dims list for any missing pure vars and adding them at the end (just before Var::outermost()), but this didn't account for the fact that they may no longer exist in the dims list due to splits that didn't reuse the outer name. In these circumstances we could end up with too many pure loops. E.g. if x has been split into xo and xi, then the code was adding a loop for x even though there were already loops for xo and xi, which of course produces garbage output. This PR instead just checks which pure vars are actually used in the update definition up front, and then uses that to tell which ones should be added. 
Fixes #7890 --- src/Func.cpp | 26 +++++++++++++++++++++++--- test/correctness/fuzz_schedule.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/Func.cpp b/src/Func.cpp index 978d2b19a436..7e0995cc33b5 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -788,6 +788,17 @@ Func Stage::rfactor(vector> preserved) { vector &args = definition.args(); vector &values = definition.values(); + // Figure out which pure vars were used in this update definition. + std::set pure_vars_used; + internal_assert(args.size() == dim_vars.size()); + for (size_t i = 0; i < args.size(); i++) { + if (const Internal::Variable *var = args[i].as()) { + if (var->name == dim_vars[i].name()) { + pure_vars_used.insert(var->name); + } + } + } + // Check whether the operator is associative and determine the operator and // its identity for each value in the definition if it is a Tuple const auto &prover_result = prove_associativity(func_name, args, values); @@ -1012,16 +1023,20 @@ Func Stage::rfactor(vector> preserved) { // Determine the dims of the new update definition + // The new update definition needs all the pure vars of the Func, but the + // one we're rfactoring may not have used them all. Add any missing ones to + // the dims list. + // Add pure Vars from the original init definition to the dims list // if they are not already in the list for (const Var &v : dim_vars) { - const auto &iter = std::find_if(dims.begin(), dims.end(), - [&v](const Dim &dim) { return var_name_match(dim.var, v.name()); }); - if (iter == dims.end()) { + if (!pure_vars_used.count(v.name())) { Dim d = {v.name(), ForType::Serial, DeviceAPI::None, DimType::PureVar, Partition::Auto}; + // Insert it just before Var::outermost dims.insert(dims.end() - 1, d); } } + // Then, we need to remove lifted RVars from the dims list for (const string &rv : rvars_removed) { remove(rv); @@ -1888,6 +1903,11 @@ Stage &Stage::reorder(const std::vector &vars) { dims_old.swap(dims); + // We're not allowed to reorder Var::outermost inwards (rfactor assumes it's + // the last one). 
+ user_assert(dims.back().var == Var::outermost().name()) + << "Var::outermost() may not be reordered inside any other var.\n"; + return *this; } diff --git a/test/correctness/fuzz_schedule.cpp b/test/correctness/fuzz_schedule.cpp index a774335a07bf..78fe9e0cb757 100644 --- a/test/correctness/fuzz_schedule.cpp +++ b/test/correctness/fuzz_schedule.cpp @@ -202,6 +202,31 @@ int main(int argc, char **argv) { check_blur_output(buf, correct); } + // https://github.com/halide/Halide/issues/7890 + { + Func input("input"); + Func local_sum("local_sum"); + Func blurry("blurry"); + Var x("x"), y("y"); + RVar yryf; + input(x, y) = 2 * x + 5 * y; + RDom r(-2, 5, -2, 5, "rdom_r"); + local_sum(x, y) = 0; + local_sum(x, y) += input(x + r.x, y + r.y); + blurry(x, y) = cast(local_sum(x, y) / 25); + + Var yo, yi, xo, xi, u; + blurry.split(y, yo, yi, 2, TailStrategy::Auto); + local_sum.split(x, xo, xi, 4, TailStrategy::Auto); + local_sum.update(0).split(x, xo, xi, 1, TailStrategy::Auto); + local_sum.update(0).rfactor(r.x, u); + blurry.store_root(); + local_sum.compute_root(); + Pipeline p({blurry}); + auto buf = p.realize({32, 32}); + check_blur_output(buf, correct); + } + // https://github.com/halide/Halide/issues/8054 { ImageParam input(Float(32), 2, "input"); From d8cfed69531b0e1a29955115e5f3148209d657df Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 13 Feb 2024 13:47:09 -0800 Subject: [PATCH 058/186] Forward the partition methods from generator outputs (#8090) --- src/Generator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Generator.h b/src/Generator.h index 99d106056842..e819bd2a88a8 100644 --- a/src/Generator.h +++ b/src/Generator.h @@ -2280,6 +2280,8 @@ class GeneratorOutputBase : public GIOBase { HALIDE_FORWARD_METHOD(Func, align_bounds) HALIDE_FORWARD_METHOD(Func, align_extent) HALIDE_FORWARD_METHOD(Func, align_storage) + HALIDE_FORWARD_METHOD(Func, always_partition) + HALIDE_FORWARD_METHOD(Func, always_partition_all) HALIDE_FORWARD_METHOD_CONST(Func, args) HALIDE_FORWARD_METHOD(Func, bound) HALIDE_FORWARD_METHOD(Func, bound_extent) @@ -2303,9 +2305,12 @@ class GeneratorOutputBase : public GIOBase { HALIDE_FORWARD_METHOD(Func, hexagon) HALIDE_FORWARD_METHOD(Func, in) HALIDE_FORWARD_METHOD(Func, memoize) + HALIDE_FORWARD_METHOD(Func, never_partition) + HALIDE_FORWARD_METHOD(Func, never_partition_all) HALIDE_FORWARD_METHOD_CONST(Func, num_update_definitions) HALIDE_FORWARD_METHOD_CONST(Func, outputs) HALIDE_FORWARD_METHOD(Func, parallel) + HALIDE_FORWARD_METHOD(Func, partition) HALIDE_FORWARD_METHOD(Func, prefetch) HALIDE_FORWARD_METHOD(Func, print_loop_nest) HALIDE_FORWARD_METHOD(Func, rename) From c8f43f3b9a8b44fab5e500cd5c9af7204090b182 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 13 Feb 2024 13:47:19 -0800 Subject: [PATCH 059/186] Parallelize some tests (#8078) * Parallelize some tests This reduces the time taken to run all correctness tests from 8:15 to 3:15 on my machine. 
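The converted tests all follow the same pattern: queue each task on a shared thread pool and join on futures. A simplified sketch is below; it assumes the ThreadPool<T> interface from test/common/halide_thread_pool.h, and `run_one_task`, `tasks`, and `sharder` are placeholders for the per-test specifics.

```
// Sketch of the per-test parallelism pattern used by the tests in this change.
Halide::Tools::ThreadPool<bool> pool;
std::vector<std::future<bool>> futures;
for (size_t t = 0; t < tasks.size(); t++) {
    if (!sharder.should_run(t)) continue;
    futures.push_back(pool.async([&, t]() { return run_one_task(tasks[t]); }));
}
for (auto &f : futures) {
    if (!f.get()) {
        return 1;  // a task reported failure
    }
}
```
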
* The FIXME is actually fine * Remove debug print * Fix when we're willing to run x86 code in simd_op_check * Use separate imageparams per task * Deep-copy the LoopLevels * Make float16_t neon op check test at least build * Revert accidental serialization * Throw return values from callable into the void We don't have a custom error handler in place, so they're always zero * Skip test under ASAN * Fix unintentional change to test --- src/Schedule.cpp | 6 +- test/correctness/float16_t_neon_op_check.cpp | 59 +---- test/correctness/simd_op_check.h | 243 ++++++++++++++----- test/correctness/simd_op_check_hvx.cpp | 12 +- test/correctness/simd_op_check_wasm.cpp | 5 + test/correctness/simd_op_check_x86.cpp | 12 +- test/correctness/unroll_huge_mux.cpp | 13 +- test/correctness/vector_cast.cpp | 12 +- test/correctness/vector_math.cpp | 13 +- test/correctness/vector_reductions.cpp | 15 +- 10 files changed, 239 insertions(+), 151 deletions(-) diff --git a/src/Schedule.cpp b/src/Schedule.cpp index a2a34f34862e..72737b596e91 100644 --- a/src/Schedule.cpp +++ b/src/Schedule.cpp @@ -354,9 +354,9 @@ FuncSchedule FuncSchedule::deep_copy( internal_assert(contents.defined()) << "Cannot deep-copy undefined FuncSchedule\n"; FuncSchedule copy; - copy.contents->store_level = contents->store_level; - copy.contents->compute_level = contents->compute_level; - copy.contents->hoist_storage_level = contents->hoist_storage_level; + copy.contents->store_level.set(contents->store_level); + copy.contents->compute_level.set(contents->compute_level); + copy.contents->hoist_storage_level.set(contents->hoist_storage_level); copy.contents->storage_dims = contents->storage_dims; copy.contents->bounds = contents->bounds; copy.contents->estimates = contents->estimates; diff --git a/test/correctness/float16_t_neon_op_check.cpp b/test/correctness/float16_t_neon_op_check.cpp index a83db47758f5..33d2541cbd4a 100644 --- a/test/correctness/float16_t_neon_op_check.cpp +++ b/test/correctness/float16_t_neon_op_check.cpp @@ -64,7 +64,7 @@ class SimdOpCheck : public SimdOpCheckTest { // bits, 192 bits, and 256 bits for everything. 
struct TestParams { const int bits; - ImageParam in_f; + std::function in_f; std::vector> vl_params; Expr f_1, f_2, f_3, u_1, i_1; }; @@ -77,7 +77,7 @@ class SimdOpCheck : public SimdOpCheckTest { for (auto &test_param : test_params) { // outer loop for {fp32, fp16} const int bits = test_param.bits; - ImageParam in_f = test_param.in_f; + auto in_f = test_param.in_f; Expr f_1 = test_param.f_1; Expr f_2 = test_param.f_2; Expr f_3 = test_param.f_3; @@ -256,7 +256,7 @@ class SimdOpCheck : public SimdOpCheckTest { suffix_map.emplace(tasks.back().name, suffix); } - void compile_and_check(Func error, const string &op, const string &name, int vector_width, std::ostringstream &error_msg) override { + void compile_and_check(Func error, const string &op, const string &name, int vector_width, const std::vector &arg_types, std::ostringstream &error_msg) override { std::string fn_name = "test_" + name; std::string file_name = output_directory + fn_name; @@ -315,52 +315,11 @@ class SimdOpCheck : public SimdOpCheckTest { } // namespace int main(int argc, char **argv) { - Target host = get_host_target(); - Target hl_target = get_target_from_environment(); - Target jit_target = get_jit_target_from_environment(); - printf("host is: %s\n", host.to_string().c_str()); - printf("HL_TARGET is: %s\n", hl_target.to_string().c_str()); - printf("HL_JIT_TARGET is: %s\n", jit_target.to_string().c_str()); - - // Only for 64bit target with fp16 feature - if (!(hl_target.arch == Target::ARM && hl_target.bits == 64 && hl_target.has_feature(Target::ARMFp16))) { - Halide::Internal::Test::Sharder::accept_sharded_status(); - printf("[SKIP] To run this test, set HL_TARGET=arm-64--arm_fp16. \n"); - return 0; - } - - // Create Test Object - // Use smaller dimension than default(768, 128) to avoid fp16 overflow in reduction test case - SimdOpCheck test(hl_target, 384, 32); - - if (!test.can_run_code()) { - printf("[WARN] To run verification of realization, set HL_JIT_TARGET=arm-64--arm_fp16. \n"); - } - - if (argc > 1) { - test.filter = argv[1]; - } - - if (getenv("HL_SIMD_OP_CHECK_FILTER")) { - test.filter = getenv("HL_SIMD_OP_CHECK_FILTER"); - } - - if (argc > 2) { - // Don't forget: if you want to run the standard tests to a specific output - // directory, you'll need to invoke with the first arg enclosed - // in quotes (to avoid it being wildcard-expanded by the shell): - // - // correctness_simd_op_check "*" /path/to/output - // - test.output_directory = argv[2]; - } - - bool success = test.test_all(); - - if (!success) { - return 1; - } - - printf("Success!\n"); + // FIXME + printf("[SKIP] Test is currently broken. See https://github.com/halide/Halide/issues/8083"); return 0; + + return SimdOpCheckTest::main( + argc, argv, + {Target("arm-64-linux-arm_fp16")}); } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 7b1057b7f3ea..fce3172132ba 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -3,11 +3,59 @@ #include "Halide.h" #include "halide_test_dirs.h" +#include "halide_thread_pool.h" #include "test_sharding.h" #include #include +namespace { + +using namespace Halide; + +// Some exprs of each type to use in checked expressions. These will turn +// into loads to thread-local image params. 
+Expr input(const Type &t, const Expr &arg) { + return Internal::Call::make(t, "input", {arg}, Internal::Call::Extern); +} +Expr in_f16(const Expr &arg) { + return input(Float(16), arg); +} +Expr in_bf16(const Expr &arg) { + return input(BFloat(16), arg); +} +Expr in_f32(const Expr &arg) { + return input(Float(32), arg); +} +Expr in_f64(const Expr &arg) { + return input(Float(64), arg); +} +Expr in_i8(const Expr &arg) { + return input(Int(8), arg); +} +Expr in_i16(const Expr &arg) { + return input(Int(16), arg); +} +Expr in_i32(const Expr &arg) { + return input(Int(32), arg); +} +Expr in_i64(const Expr &arg) { + return input(Int(64), arg); +} +Expr in_u8(const Expr &arg) { + return input(UInt(8), arg); +} +Expr in_u16(const Expr &arg) { + return input(UInt(16), arg); +} +Expr in_u32(const Expr &arg) { + return input(UInt(32), arg); +} +Expr in_u64(const Expr &arg) { + return input(UInt(64), arg); +} +} // namespace + namespace Halide { struct TestResult { std::string op; @@ -33,32 +81,18 @@ class SimdOpCheckTest { std::string filter{"*"}; std::string output_directory{Internal::get_test_tmp_dir()}; std::vector tasks; - std::mt19937 rng; Target target; - ImageParam in_f32{Float(32), 1, "in_f32"}; - ImageParam in_f64{Float(64), 1, "in_f64"}; - ImageParam in_f16{Float(16), 1, "in_f16"}; - ImageParam in_bf16{BFloat(16), 1, "in_bf16"}; - ImageParam in_i8{Int(8), 1, "in_i8"}; - ImageParam in_u8{UInt(8), 1, "in_u8"}; - ImageParam in_i16{Int(16), 1, "in_i16"}; - ImageParam in_u16{UInt(16), 1, "in_u16"}; - ImageParam in_i32{Int(32), 1, "in_i32"}; - ImageParam in_u32{UInt(32), 1, "in_u32"}; - ImageParam in_i64{Int(64), 1, "in_i64"}; - ImageParam in_u64{UInt(64), 1, "in_u64"}; - - const std::vector image_params{in_f32, in_f64, in_f16, in_bf16, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64}; - const std::vector arg_types{in_f32, in_f64, in_f16, in_bf16, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64}; int W; int H; + int rng_seed; + using Sharder = Halide::Internal::Test::Sharder; SimdOpCheckTest(const Target t, int w, int h) - : target(t), W(w), H(h) { + : target(t), W(w), H(h), rng_seed(0) { target = target .with_feature(Target::NoBoundsQuery) .with_feature(Target::NoAsserts) @@ -67,7 +101,7 @@ class SimdOpCheckTest { virtual ~SimdOpCheckTest() = default; void set_seed(int seed) { - rng.seed(seed); + rng_seed = seed; } virtual bool can_run_code() const { @@ -112,7 +146,12 @@ class SimdOpCheckTest { return can_run_the_code; } - virtual void compile_and_check(Func error, const std::string &op, const std::string &name, int vector_width, std::ostringstream &error_msg) { + virtual void compile_and_check(Func error, + const std::string &op, + const std::string &name, + int vector_width, + const std::vector &arg_types, + std::ostringstream &error_msg) { std::string fn_name = "test_" + name; std::string file_name = output_directory + fn_name; @@ -197,6 +236,56 @@ class SimdOpCheckTest { TestResult check_one(const std::string &op, const std::string &name, int vector_width, Expr e) { std::ostringstream error_msg; + // Map the input calls in the Expr to loads to local + // imageparams, so that we're not sharing state across threads. 
+ std::vector image_params{ + ImageParam{Float(32), 1, "in_f32"}, + ImageParam{Float(64), 1, "in_f64"}, + ImageParam{Float(16), 1, "in_f16"}, + ImageParam{BFloat(16), 1, "in_bf16"}, + ImageParam{Int(8), 1, "in_i8"}, + ImageParam{UInt(8), 1, "in_u8"}, + ImageParam{Int(16), 1, "in_i16"}, + ImageParam{UInt(16), 1, "in_u16"}, + ImageParam{Int(32), 1, "in_i32"}, + ImageParam{UInt(32), 1, "in_u32"}, + ImageParam{Int(64), 1, "in_i64"}, + ImageParam{UInt(64), 1, "in_u64"}}; + + for (auto &p : image_params) { + const int alignment_bytes = image_param_alignment(); + p.set_host_alignment(alignment_bytes); + const int alignment = alignment_bytes / p.type().bytes(); + p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); + } + + const std::vector arg_types(image_params.begin(), image_params.end()); + + class HookUpImageParams : public Internal::IRMutator { + using Internal::IRMutator::visit; + + Expr visit(const Internal::Call *op) override { + if (op->name == "input") { + for (auto &p : image_params) { + if (p.type() == op->type) { + return p(mutate(op->args[0])); + } + } + } else if (op->call_type == Internal::Call::Halide && !op->func.weak) { + Internal::Function f(op->func); + f.mutate(this); + } + return Internal::IRMutator::visit(op); + } + const std::vector &image_params; + + public: + HookUpImageParams(const std::vector &image_params) + : image_params(image_params) { + } + } hook_up_image_params(image_params); + e = hook_up_image_params.mutate(e); + class HasInlineReduction : public Internal::IRVisitor { using Internal::IRVisitor::visit; void visit(const Internal::Call *op) override { @@ -250,42 +339,70 @@ class SimdOpCheckTest { Halide::Func error("error_" + name); error() = Halide::cast(maximum(absd(f(r_check.x, r_check.y), f_scalar(r_check.x, r_check.y)))); - setup_images(); - compile_and_check(error, op, name, vector_width, error_msg); + compile_and_check(error, op, name, vector_width, arg_types, error_msg); bool can_run_the_code = can_run_code(); if (can_run_the_code) { Target run_target = get_run_target(); - error.infer_input_bounds({}, run_target); - // Fill the inputs with noise - for (auto p : image_params) { - Halide::Buffer<> buf = p.get(); - if (!buf.defined()) continue; - assert(buf.data()); - Type t = buf.type(); - // For floats/doubles, we only use values that aren't - // subject to rounding error that may differ between - // vectorized and non-vectorized versions - if (t == Float(32)) { - buf.as().for_each_value([&](float &f) { f = (rng() & 0xfff) / 8.0f - 0xff; }); - } else if (t == Float(64)) { - buf.as().for_each_value([&](double &f) { f = (rng() & 0xfff) / 8.0 - 0xff; }); - } else if (t == Float(16)) { - buf.as().for_each_value([&](float16_t &f) { f = float16_t((rng() & 0xff) / 8.0f - 0xf); }); - } else { - // Random bits is fine - for (uint32_t *ptr = (uint32_t *)buf.data(); - ptr != (uint32_t *)buf.data() + buf.size_in_bytes() / 4; - ptr++) { - // Never use the top four bits, to avoid - // signed integer overflow. 
- *ptr = ((uint32_t)rng()) & 0x0fffffff; + // Make some unallocated input buffers + std::vector> inputs(image_params.size()); + + std::vector args(image_params.size()); + for (size_t i = 0; i < args.size(); i++) { + args[i] = image_params[i]; + inputs[i] = Runtime::Buffer<>(args[i].type, nullptr, 0); + } + auto callable = error.compile_to_callable(args, run_target); + + Runtime::Buffer output = Runtime::Buffer::make_scalar(); + output(0) = 1; // To ensure we'll fail if it's never written to + + // Do the bounds query call + assert(inputs.size() == 12); + (void)callable(inputs[0], inputs[1], inputs[2], inputs[3], + inputs[4], inputs[5], inputs[6], inputs[7], + inputs[8], inputs[9], inputs[10], inputs[11], + output); + + std::mt19937 rng; + rng.seed(rng_seed); + + // Allocate the input buffers and fill them with noise + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].size_in_bytes()) { + inputs[i].allocate(); + + Type t = inputs[i].type(); + // For floats/doubles, we only use values that aren't + // subject to rounding error that may differ between + // vectorized and non-vectorized versions + if (t == Float(32)) { + inputs[i].as().for_each_value([&](float &f) { f = (rng() & 0xfff) / 8.0f - 0xff; }); + } else if (t == Float(64)) { + inputs[i].as().for_each_value([&](double &f) { f = (rng() & 0xfff) / 8.0 - 0xff; }); + } else if (t == Float(16)) { + inputs[i].as().for_each_value([&](float16_t &f) { f = float16_t((rng() & 0xff) / 8.0f - 0xf); }); + } else { + // Random bits is fine + for (uint32_t *ptr = (uint32_t *)inputs[i].data(); + ptr != (uint32_t *)inputs[i].data() + inputs[i].size_in_bytes() / 4; + ptr++) { + // Never use the top four bits, to avoid + // signed integer overflow. + *ptr = ((uint32_t)rng()) & 0x0fffffff; + } } } } - Realization r = error.realize(); - double e = Buffer(r[0])(); + + // Do the real call + (void)callable(inputs[0], inputs[1], inputs[2], inputs[3], + inputs[4], inputs[5], inputs[6], inputs[7], + inputs[8], inputs[9], inputs[10], inputs[11], + output); + + double e = output(0); // Use a very loose tolerance for floating point tests. 
The // kinds of bugs we're looking for are codegen bugs that // return the wrong value entirely, not floating point @@ -329,16 +446,10 @@ class SimdOpCheckTest { tasks.emplace_back(Task{op, name, vector_width, e}); } virtual void add_tests() = 0; - virtual void setup_images() { - for (auto p : image_params) { - p.reset(); - - const int alignment_bytes = 16; - p.set_host_alignment(alignment_bytes); - const int alignment = alignment_bytes / p.type().bytes(); - p.dim(0).set_min((p.dim(0).min() / alignment) * alignment); - } + virtual int image_param_alignment() { + return 16; } + virtual bool test_all() { /* First add some tests based on the target */ add_tests(); @@ -348,21 +459,33 @@ class SimdOpCheckTest { const std::string run_target_str = run_target.to_string(); Sharder sharder; - bool success = true; + + Halide::Tools::ThreadPool pool; + std::vector> futures; + for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - auto result = check_one(task.op, task.name, task.vector_width, task.expr); + futures.push_back(pool.async([&]() { + return check_one(task.op, task.name, task.vector_width, task.expr); + })); + } + + for (auto &f : futures) { + auto result = f.get(); constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - success = false; + // The thread-pool destructor will block until in-progress tasks + // are done, and then will discard any tasks that haven't been + // launched yet. + return false; } } - return success; + return true; } template diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 450ef3f06fe6..29bdde4a9163 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -23,16 +23,10 @@ class SimdOpCheckHVX : public SimdOpCheckTest { SimdOpCheckHVX(Target t, int w = 768 /*256*3*/, int h = 128) : SimdOpCheckTest(t, w, h) { } - void setup_images() override { - for (auto p : image_params) { - p.reset(); - // HVX needs 128 byte alignment - constexpr int kHostAlignmentBytes = 128; - p.set_host_alignment(kHostAlignmentBytes); - Expr min = p.dim(0).min(); - p.dim(0).set_min((min / 128) * 128); - } + int image_param_alignment() override { + return 128; } + void add_tests() override { Expr f32_1 = in_f32(x), f32_2 = in_f32(x + 16), f32_3 = in_f32(x + 32); Expr f64_1 = in_f64(x), f64_2 = in_f64(x + 16), f64_3 = in_f64(x + 32); diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp index 56e2e4231876..2045b42699f4 100644 --- a/test/correctness/simd_op_check_wasm.cpp +++ b/test/correctness/simd_op_check_wasm.cpp @@ -533,6 +533,11 @@ class SimdOpCheckWASM : public SimdOpCheckTest { } // namespace int main(int argc, char **argv) { +#ifdef HALIDE_INTERNAL_USING_ASAN + printf("[SKIP] This test causes an ASAN crash relating to ASAN's use of sigaltstack. 
It doesn't seem to be due to a bug in the test itself (see https://github.com/halide/Halide/pull/8078#issuecomment-1935407878)"); + return 0; +#endif + return SimdOpCheckTest::main( argc, argv, { diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index 990e4e886307..b4c086ce0fc3 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -663,15 +663,15 @@ int main(int argc, char **argv) { // Always turn on f16c when using avx. Sandy Bridge had avx without // f16c, but f16c is orthogonal to everything else, so there's no // real reason to test avx without it. - Target("x86-64-linux-sse41-avx-f16c"), - Target("x86-64-linux-sse41-avx-f16c-avx2"), + Target("x86-64-linux-sse41-avx-f16c-fma"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2"), // See above: don't test avx512 without extra features, the test // isn't yet set up to test it properly. // Target("x86-64-linux-sse41-avx-avx2-avx512"), // Target("x86-64-linux-sse41-avx-avx2-avx512-avx512_knl"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), - Target("x86-64-linux-sse41-avx-f16c-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), + Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), }); } diff --git a/test/correctness/unroll_huge_mux.cpp b/test/correctness/unroll_huge_mux.cpp index 233ee038c4e8..b24420fe68bb 100644 --- a/test/correctness/unroll_huge_mux.cpp +++ b/test/correctness/unroll_huge_mux.cpp @@ -12,7 +12,7 @@ int main(int argc, char **argv) { Var x; std::vector exprs; - for (int i = 0; i < 10000; i++) { + for (int i = 0; i < 5000; i++) { exprs.push_back(x & i); } @@ -21,17 +21,6 @@ int main(int argc, char **argv) { f.bound(x, 0, (int)exprs.size()); f.unroll(x); - // For 10000 expressions in the mux, this test uses more than 8MB - // in stack because the simplifier's Block visitor is still - // recursive and has a large stack frame. We'll put a cap on it to - // at least make sure the problem doesn't get worse. If this test - // crashes try raising the cap to see if we have a stack size - // regression. 
- // - // https://github.com/halide/Halide/issues/6238 - - set_compiler_stack_size(16 * 1024 * 1024); - f.compile_jit(); printf("Success!\n"); diff --git a/test/correctness/vector_cast.cpp b/test/correctness/vector_cast.cpp index 3b6eae0fa2e6..575d97842176 100644 --- a/test/correctness/vector_cast.cpp +++ b/test/correctness/vector_cast.cpp @@ -1,6 +1,6 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" - #include using namespace Halide; @@ -164,11 +164,17 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; + Halide::Tools::ThreadPool pool; + std::vector> futures; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (!task.fn()) { - exit(1); + futures.push_back(pool.async(task.fn)); + } + + for (auto &f : futures) { + if (!f.get()) { + return 1; } } diff --git a/test/correctness/vector_math.cpp b/test/correctness/vector_math.cpp index 6e7f19a8bb1e..c5036fd1346f 100644 --- a/test/correctness/vector_math.cpp +++ b/test/correctness/vector_math.cpp @@ -1,4 +1,5 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" #include @@ -742,11 +743,19 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; + + std::vector> futures; + + Halide::Tools::ThreadPool pool; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (!task.fn(task.lanes, task.seed)) { - exit(1); + futures.push_back(pool.async(task.fn, task.lanes, task.seed)); + } + + for (auto &f : futures) { + if (!f.get()) { + return 1; } } diff --git a/test/correctness/vector_reductions.cpp b/test/correctness/vector_reductions.cpp index f1c250cfec3d..9db9475e7fca 100644 --- a/test/correctness/vector_reductions.cpp +++ b/test/correctness/vector_reductions.cpp @@ -1,4 +1,5 @@ #include "Halide.h" +#include "halide_thread_pool.h" #include "test_sharding.h" using namespace Halide; @@ -194,15 +195,17 @@ int main(int argc, char **argv) { using Sharder = Halide::Internal::Test::Sharder; Sharder sharder; - Target prev_target; + + std::vector> futures; + Halide::Tools::ThreadPool pool; for (size_t t = 0; t < tasks.size(); t++) { if (!sharder.should_run(t)) continue; const auto &task = tasks.at(t); - if (task.target != prev_target) { - std::cout << "vector_reductions: Testing with " << task.target << "\n"; - prev_target = task.target; - } - task.fn(); + futures.push_back(pool.async(task.fn)); + } + + for (auto &f : futures) { + f.wait(); } std::cout << "Success!\n"; From 6edea167432abc11bd3c324c144f7dccb33d7574 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Wed, 14 Feb 2024 20:26:27 +0000 Subject: [PATCH 060/186] Allow disabling of mutlithreading in simd op check (#8096) simd_op_check_xtensa is not threadsafe at present --- test/correctness/simd_op_check.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index fce3172132ba..f386b7efc094 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -450,6 +450,10 @@ class SimdOpCheckTest { return 16; } + virtual bool use_multiple_threads() const { + return true; + } + virtual bool test_all() { /* First add some tests based on the target */ add_tests(); @@ -460,7 +464,10 @@ class SimdOpCheckTest { Sharder sharder; - Halide::Tools::ThreadPool pool; + Halide::Tools::ThreadPool pool( + 
use_multiple_threads() ? + Halide::Tools::ThreadPool::num_processors_online() : + 1); std::vector> futures; for (size_t t = 0; t < tasks.size(); t++) { From 40a622fa15f369a68a03e7e32529e39c54e9f0a2 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 14 Feb 2024 23:34:23 +0300 Subject: [PATCH 061/186] clang does not support `_Float16` when targeting i386 (#8085) See https://github.com/halide/Halide/issues/7678 --- src/runtime/HalideRuntime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 64034b8be328..b235117e9f5e 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -91,7 +91,7 @@ extern "C" { // Ideally there would be a better way to detect if the type // is supported, even in a compiler independent fashion, but // coming up with one has proven elusive. -#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) +#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) && !defined(__i386__) #if defined(__is_identifier) #if !__is_identifier(_Float16) #define HALIDE_CPP_COMPILER_HAS_FLOAT16 From f2d750f355fccadcd03af51bcb58af724719859c Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 14 Feb 2024 23:35:52 +0300 Subject: [PATCH 062/186] tests: correctness/float16_t: mark `__extendhfsf2` with default visibility (#8084) ``` [2336/4154] /usr/bin/clang++-17 -DHALIDE_ENABLE_RTTI -DHALIDE_VERSION_MAJOR=17 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -DHALIDE_WITH_EXCEPTIONS -I/build/halide-17.0.0/test/common -I/build/halide-17.0.0/tools -I/build/halide-17.0.0/build/stage-1/halide/include -g -fdebug-default-version=4 -fprofile-use=/build/halide-17.0.0/build-profile/default.profdata -fcs-profile-generate -Xclang -mllvm -Xclang -vp-counters-per-site=100.0 -fuse-ld=lld-17 -Wl,--build-id=sha1 -std=c++17 -flto=thin -fPIE -fvisibility=hidden -fvisibility-inlines-hidden -Winvalid-pch -Xclang -include-pch -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx -MD -MT test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -MF test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o.d -o test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -c /build/halide-17.0.0/test/correctness/float16_t.cpp <...> ld.lld-17: error: undefined hidden symbol: __extendhfsf2 >>> referenced by float16_t.cpp:391 (/build/halide-17.0.0/test/correctness/float16_t.cpp:391) >>> lto.tmp:(main) >>> did you mean: __extendbfsf2 >>> defined in: /lib/x86_64-linux-gnu/libgcc_s.so.1 clang++-17: error: linker command failed with exit code 1 (use -v to see invocation) ``` --- test/correctness/float16_t.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/correctness/float16_t.cpp b/test/correctness/float16_t.cpp index d135e8108fa7..d4399b008f0a 100644 --- a/test/correctness/float16_t.cpp +++ b/test/correctness/float16_t.cpp @@ -12,7 +12,7 @@ extern "C" { // In Clang 15 and later, this function is passed a uint16... but in the xmm0 register on x86-64. // So we'll declare it as a float and just grab the upper 16 bits. 
-__attribute__((weak)) float __extendhfsf2(float actually_a_float16) { +__attribute__((weak, visibility("default"))) float __extendhfsf2(float actually_a_float16) { uint16_t data; memcpy(&data, &actually_a_float16, sizeof(data)); return (float)Halide::float16_t::make_from_bits(data); @@ -20,7 +20,7 @@ __attribute__((weak)) float __extendhfsf2(float actually_a_float16) { #else -__attribute__((weak)) float __extendhfsf2(uint16_t data) { +__attribute__((weak, visibility("default"))) float __extendhfsf2(uint16_t data) { return (float)Halide::float16_t::make_from_bits(data); } From b5825618d186a24b8ff55bf0d810b88546133805 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 14 Feb 2024 13:57:09 -0800 Subject: [PATCH 063/186] Fix reduce_expr_modulo of vector in Solve.cpp (#8089) * Fix reduce_expr_modulo of vector in Solve.cpp * Fix test --- src/Solve.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Solve.cpp b/src/Solve.cpp index 22bd14e44412..b25719cff8c7 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -394,7 +394,7 @@ class SolveExpression : public IRMutator { if (a_uses_var && !b_uses_var) { const int64_t *ib = as_const_int(b); auto is_multiple_of_b = [&](const Expr &e) { - if (ib) { + if (ib && op->type.is_scalar()) { int64_t r = 0; return reduce_expr_modulo(e, *ib, &r) && r == 0; } else { @@ -1478,6 +1478,9 @@ void solve_test() { check_solve(min(x + y, x - z), x + min(y, 0 - z)); check_solve(max(x + y, x - z), x + max(y, 0 - z)); + check_solve((5 * Broadcast::make(x, 4) + y) / 5, + Broadcast::make(x, 4) + (Broadcast::make(y, 4) / 5)); + debug(0) << "Solve test passed\n"; } From 9a740b584e63cc67e841f134d61d79502e973252 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 14 Feb 2024 14:41:51 -0800 Subject: [PATCH 064/186] [Vulkan] Region allocator fixes for memory requirements and allocations (#8087) * Add region allocator tests that check alignment, nearest_multiple and collect routines * Fix can_split() routine to use conformed sizes so that split allocation matches Fix region size accounting so that coalesce never has zero size regions to merge * Fix aligned_offset() routine to check for zero alignment (which means no constraint) * Fix ifdef for internal debugging * Clean up debug internal log messages * Use memory_requirements to determine nearest_multiple during initialization Query memory_requirements for each region, and reallocate if driver requires additional device memory * Formatting pass --------- Co-authored-by: Derek Gerstmann --- src/runtime/internal/block_allocator.h | 126 ++++++------ src/runtime/internal/memory_arena.h | 2 +- src/runtime/internal/memory_resources.h | 2 +- src/runtime/internal/region_allocator.h | 246 +++++++++++++++--------- src/runtime/vulkan_memory.h | 36 +++- test/runtime/block_allocator.cpp | 180 ++++++++++++++++- 6 files changed, 424 insertions(+), 168 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 3ff850e5b19f..feee56a4e531 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -126,7 +126,7 @@ BlockAllocator *BlockAllocator::create(void *user_context, const Config &cfg, co allocators.system.allocate(user_context, sizeof(BlockAllocator))); if (result == nullptr) { - error(user_context) << "BlockAllocator: Failed to create instance! Out of memory!\n"; + error(user_context) << "BlockAllocator: Failed to create instance! 
Out of memory\n"; return nullptr; } @@ -160,12 +160,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "dedicated=" << (request.dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(request.properties.usage) << " " << "caching=" << halide_memory_caching_name(request.properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -180,7 +180,7 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -288,7 +288,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl if (result == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Failed to allocate region of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; #endif // allocator has enough free space, but not enough contiguous space // -- collect and try to reallocate @@ -302,17 +302,17 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { if (!is_compatible_block(block, properties)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... incompatible properties!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << "\n" - << " block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << "\n" - << " block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << "\n"; - debug(user_context) << " request_size=" << (uint32_t)size << "\n" - << " request_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " request_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " request_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " + << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " + << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " + << "request_size=" << (uint32_t)size << " " + << "request_usage=" << halide_memory_usage_name(properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; @@ -320,20 +320,20 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo if (dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip blocks that can't be dedicated to a single allocation return false; } else if (block->memory.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip dedicated blocks that are already allocated return false; @@ -355,16 +355,16 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro const BlockResource *block = static_cast(block_entry->value); if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: found suitable block ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? 
"true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: found suitable block (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "request_size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif return block_entry; } @@ -373,13 +373,13 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: couldn't find suitable block!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? "true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: couldn't find suitable block! (" + << "user_context=" << (void *)(user_context) << " " + << "request_size=" << (uint32_t)size << " " + << "dedicated=" << (dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(properties.usage) << " " + << "caching=" << halide_memory_caching_name(properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif } return block_entry; @@ -388,22 +388,22 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro BlockAllocator::BlockEntry * BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: reserving block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: reserving block ... ! (" + << "requested_size=" << (uint32_t)size << " " + << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: creating block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? 
"true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: creating block ... ! (" + << "requested_size=" << (uint32_t)size << " " + << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; #endif block_entry = create_block_entry(user_context, properties, size, dedicated); } @@ -422,14 +422,14 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " - << "block_resource=" << (void *)(block) << ")...\n"; + << "block_resource=" << (void *)(block) << ")..."; #endif halide_abort_if_false(user_context, block != nullptr); RegionAllocator *region_allocator = RegionAllocator::create( user_context, block, {allocators.system, allocators.region}); if (region_allocator == nullptr) { - error(user_context) << "BlockAllocator: Failed to create new region allocator!\n"; + error(user_context) << "BlockAllocator: Failed to create new region allocator\n"; return nullptr; } @@ -440,7 +440,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " - << "region_allocator=" << (void *)(region_allocator) << ")...\n"; + << "region_allocator=" << (void *)(region_allocator) << ")..."; #endif if (region_allocator == nullptr) { return 0; @@ -459,13 +459,13 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { error(user_context) << "BlockAllocator: No free blocks found! 
Maximum block count reached (" - << (int32_t)(config.maximum_block_count) << ")!\n"; + << (int32_t)(config.maximum_block_count) << ")\n"; return nullptr; } BlockEntry *block_entry = block_list.append(user_context); if (block_entry == nullptr) { - debug(user_context) << "BlockAllocator: Failed to allocate new block entry!\n"; + debug(user_context) << "BlockAllocator: Failed to allocate new block entry\n"; return nullptr; } @@ -473,7 +473,7 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p debug(user_context) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; + << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); @@ -492,7 +492,7 @@ int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Bloc #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << ")...\n"; + << "block=" << (void *)(block_entry->value) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -506,7 +506,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc debug(user_context) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; + << "deallocator=" << (void *)(allocators.block.deallocate) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -520,7 +520,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; + debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); MemoryBlock *memory_block = &(block->memory); @@ -531,7 +531,7 @@ int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) int BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")...\n"; + debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); MemoryBlock *memory_block = &(block->memory); diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h index 5953e12e470a..7d6c33da8f5d 100644 --- a/src/runtime/internal/memory_arena.h +++ b/src/runtime/internal/memory_arena.h @@ -271,7 +271,7 @@ void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index void *entry_ptr = lookup_entry(user_context, block, index); block->free_index = block->indices[index]; 
block->status[index] = AllocationStatus::InUse; -#if DEBUG_RUNTIME_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL memset(entry_ptr, 0, config.entry_size); #endif return entry_ptr; diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index e30afb0dd4ea..d41fa57304fb 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -127,7 +127,7 @@ ALWAYS_INLINE bool is_power_of_two_alignment(size_t x) { // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { halide_abort_if_false(nullptr, is_power_of_two_alignment(alignment)); - return (offset + (alignment - 1)) & ~(alignment - 1); + return (alignment == 0) ? (offset) : (offset + (alignment - 1)) & ~(alignment - 1); } // Returns a suitable alignment such that requested alignment is a suitable diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 13c6b69f12e7..02c2cd7e6aa0 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -73,7 +73,7 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size) const; + bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); @@ -155,7 +155,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " << "-- requested size (" << (int32_t)(request.size) << " bytes) " - << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; + << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } @@ -164,15 +164,15 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)"; #endif return nullptr; } - if (can_split(block_region, request.size)) { + if (can_split(block_region, request.size, request.alignment)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; + << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; #endif split_block_region(user_context, block_region, request.size, request.alignment); } @@ -200,9 +200,6 @@ int RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { } release_block_region(user_context, block_region); free_block_region(user_context, block_region); - if (can_coalesce(block_region)) { - block_region = coalesce_block_regions(user_context, block_region); - } return 0; } @@ -232,8 +229,10 @@ bool RegionAllocator::is_last_block_region(void *user_context, const BlockRegion bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest 
&request) const { if (!is_available(region)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not available! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not available! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -241,8 +240,10 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // skip incompatible block regions for this request if (!is_compatible_block_region(region, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... incompatible properties! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... incompatible properties! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -253,8 +254,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // is the adjusted size larger than the current region? if (actual_size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not enough space for adjusted size! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not enough space for adjusted size! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(actual_size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } @@ -262,8 +267,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c // will the adjusted size fit within the remaining unallocated space? if ((actual_size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable block region! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " found suitable block region! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(actual_size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return true; // you betcha } @@ -272,20 +281,29 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: find block region ( " + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; +#endif BlockRegion *block_region = block->regions; while (block_region != nullptr) { if (is_block_region_suitable_for_request(user_context, block_region, request)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable region ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: found suitable region ( " + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_region; } @@ -299,13 +317,13 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: couldn't find suitable region!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: couldn't find suitable region! (" + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } @@ -342,12 +360,12 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << "RegionAllocator: Freeing unused region to coalesce (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -361,7 +379,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!"; #endif prev_region->next_ptr = block_region->next_ptr; @@ -379,7 +397,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)"; #endif if (next_region->next_ptr) { @@ -393,8 +411,10 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size) const { - return (block_region && (block_region->memory.size > size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { + size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); + size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); + return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); } BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { @@ -406,7 +426,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block_region->block_ptr->reserved << " " - << ")\n"; + << ")"; #endif 
halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -420,18 +440,20 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion size_t empty_size = block_region->memory.size - split_size; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment \n" - << " requested_size=" << (uint32_t)size << "\n" - << " split_size=" << (uint32_t)split_size << "\n" - << " requested_alignment=" << (uint32_t)alignment << " " - << " required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << " actual_alignment=" << (uint32_t)actual_alignment << ")\n"; + debug(user_context) << "RegionAllocator: Conforming size and alignment (" + << "requested_size=" << (uint32_t)size << " " + << "split_size=" << (uint32_t)split_size << " " + << "split_offset=" << (uint32_t)split_size << " " + << "empty_size=" << (uint32_t)empty_size << " " + << "requested_alignment=" << (uint32_t)alignment << " " + << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " + << "actual_alignment=" << (uint32_t)actual_alignment << ")"; #endif #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; #endif BlockRegion *next_region = block_region->next_ptr; @@ -453,7 +475,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Creating block region (" + debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " << "offset=" << (uint32_t)offset << " " << "size=" << (uint32_t)size << " " @@ -461,8 +483,16 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo << "dedicated=" << (dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(properties.usage) << " " << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; #endif + size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); + size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); + size_t actual_offset = aligned_offset(offset, actual_alignment); + + if (actual_size == 0) { + error(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; + return nullptr; + } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { @@ -470,16 +500,6 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo return nullptr; } -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Added block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; -#endif - - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - block_region->memory.handle = nullptr; block_region->memory.offset = actual_offset; block_region->memory.size = actual_size; @@ -490,11 +510,13 @@ BlockRegion *RegionAllocator::create_block_region(void *user_context, const Memo block_region->usage_count = 0; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Creating region (" + debug(user_context) << "RegionAllocator: Created block region allocation (" + << "user_context=" << (void *)(user_context) << " " << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << ")\n"; + << ")"; #endif return block_region; @@ -504,7 +526,12 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_ptr=" << ((block_region) ? ((void *)block_region->block_ptr) : nullptr) << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << ((block_region) ? (uint32_t)(block_region->usage_count) : 0) << " " + << "memory_offset=" << ((block_region) ? (uint32_t)(block_region->memory.offset) : 0) << " " + << "memory_size=" << ((block_region) ? (uint32_t)(block_region->memory.size) : 0) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << ") ... 
"; #endif if (block_region == nullptr) { return 0; @@ -517,12 +544,13 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block if (block_region->status != AllocationStatus::Available) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Releasing region (" + debug(user_context) << " releasing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " - << ")\n"; + << ")"; #endif block->reserved -= block_region->memory.size; @@ -535,7 +563,7 @@ int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_region=" << (void *)(block_region) << ") ..."; #endif block_region->usage_count = 0; @@ -549,7 +577,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) - << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; + << " offset=" << (int32_t)block_region->memory.offset << ")"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); @@ -560,25 +588,25 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r memory_region->is_owner = true; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Allocating region (" + debug(user_context) << " allocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } else { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Re-using region (" + debug(user_context) << " re-using region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } block_region->status = block_region->memory.dedicated ? 
AllocationStatus::Dedicated : AllocationStatus::InUse; @@ -590,24 +618,26 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " + << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)(block_region) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "status=" << (uint32_t)block_region->status << " " - << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; + << "usage_count=" << (uint32_t)block_region->usage_count << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << " deallocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif + // NOTE: Deallocate but leave memory size as is, so that coalesce can compute region merging sizes halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); - block_region->memory.size = 0; - block_region->memory.offset = 0; block_region->memory.handle = nullptr; } block_region->usage_count = 0; @@ -618,7 +648,7 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re int RegionAllocator::release(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing all regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif BlockRegion *block_region = block->regions; @@ -635,45 +665,67 @@ int RegionAllocator::release(void *user_context) { bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Collecting free block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; - uint32_t count = 0; + uint32_t collected_count = 0; + uint32_t remaining_count = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { + debug(user_context) << " checking region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << (uint32_t)(block_region->usage_count) << " " + << "status=" << (uint32_t)(block_region->status) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")"; + if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL - count++; + collected_count++; debug(user_context) << " collecting region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) 
<< " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif block_region = coalesce_block_regions(user_context, block_region); has_collected = true; + } else { +#ifdef DEBUG_RUNTIME_INTERNAL + remaining_count++; +#endif } if (is_last_block_region(user_context, block_region)) { break; } block_region = block_region->next_ptr; } +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << " scanned active regions (" + << "block_ptr=" << (void *)block << " " + << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << " " + << ")"; +#endif if (has_collected) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " collected unused regions (" << "block_ptr=" << (void *)block << " " - << "region_count=" << (uint32_t)count << " " - << "collected=" << (uint32_t)(reserved - block->reserved) << " " - << ")\n"; + << "collected_count=" << (uint32_t)collected_count << " " + << "remaining_count=" << (uint32_t)remaining_count << " " + << "reclaimed=" << (uint32_t)(reserved - block->reserved) << " " + << ")"; #endif } return has_collected; @@ -682,23 +734,27 @@ bool RegionAllocator::collect(void *user_context) { int RegionAllocator::destroy(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying all block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif - for (BlockRegion *block_region = block->regions; block_region != nullptr;) { - - if (is_last_block_region(user_context, block_region)) { - destroy_block_region(user_context, block_region); - block_region = nullptr; - } else { - BlockRegion *prev_region = block_region; - block_region = block_region->next_ptr; - destroy_block_region(user_context, prev_region); + if (block->regions != nullptr) { + for (BlockRegion *block_region = block->regions; block_region != nullptr;) { + + if (is_last_block_region(user_context, block_region)) { + destroy_block_region(user_context, block_region); + block_region = nullptr; + } else { + BlockRegion *prev_region = block_region; + block_region = block_region->next_ptr; + destroy_block_region(user_context, prev_region); + } } } block->reserved = 0; block->regions = nullptr; block->allocator = nullptr; - MemoryArena::destroy(user_context, arena); + if (arena != nullptr) { + MemoryArena::destroy(user_context, arena); + } arena = nullptr; return 0; } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 70a6bda64e5d..96535f3446ba 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -614,7 +614,8 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" << "size=" << (uint32_t)block->size << ", " - << "alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; @@ -630,6 +631,9 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block if (memory_requirements.alignment > block->properties.alignment) { block->properties.alignment = memory_requirements.alignment; } + if (memory_requirements.alignment > block->properties.nearest_multiple) { + block->properties.nearest_multiple = memory_requirements.alignment; + } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -867,6 +871,36 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; return halide_error_code_device_malloc_failed; } + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. + // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + VkMemoryRequirements memory_requirements = {0}; + vkGetBufferMemoryRequirements(instance->device, *buffer, &memory_requirements); + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + if (memory_requirements.size > region->size) { + vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); +#ifdef DEBUG_RUNTIME + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; +#endif + create_info.size = memory_requirements.size; + VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); + if (result != VK_SUCCESS) { + error(user_context) << "VulkanRegionAllocator: Failed to recreate buffer!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; + return halide_error_code_device_malloc_failed; + } + } + #ifdef DEBUG_RUNTIME debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n"; #endif diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b56c817e1f4e..b2190f63b592 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -21,7 +21,7 @@ int allocate_block(void *user_context, MemoryBlock *block) { << "block=" << (void *)(block) << " " << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -34,7 +34,7 @@ int deallocate_block(void *user_context, MemoryBlock *block) { << "block=" << (void *)(block) << " " << "block_size=" << int32_t(block->size) << " " << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -47,7 +47,7 @@ int allocate_region(void *user_context, MemoryRegion *region) { << "region=" << (void *)(region) << " " << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) 
<< " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -60,7 +60,7 @@ int deallocate_region(void *user_context, MemoryRegion *region) { << "region=" << (void *)(region) << " " << "region_size=" << int32_t(region->size) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " - << ") !\n"; + << ") ..."; return halide_error_code_success; } @@ -74,7 +74,173 @@ int main(int argc, char **argv) { MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; - // test class interface + // test region allocator class interface + { + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == request.size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->reclaim(user_context, r1); + HALIDE_CHECK(user_context, allocated_region_memory == (1 * request.size)); + + MemoryRegion *r3 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r3 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->retain(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->release(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->reclaim(user_context, r3); + instance->release(user_context, r1); + + // [r1 = available] [r2 = in use] [r3 = available] ... 
no contiguous regions + HALIDE_CHECK(user_context, false == instance->collect(user_context)); + + // release r2 to make three consecutive regions to collect + instance->release(user_context, r2); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test region allocator nearest_multiple padding + { + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == padded_size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + + instance->release(user_context, r1); + instance->release(user_context, r2); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two 
half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test block allocator class interface { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; @@ -116,7 +282,7 @@ int main(int argc, char **argv) { debug(user_context) << "Test : block_allocator::destroy (" << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " - << ") !\n"; + << ") ..."; HALIDE_CHECK(user_context, allocated_block_memory == 0); HALIDE_CHECK(user_context, allocated_region_memory == 0); @@ -125,7 +291,7 @@ int main(int argc, char **argv) { debug(user_context) << "Test : block_allocator::destroy (" << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " - << ") !\n"; + << ") ..."; HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } From e6e1b6f2dfa42120613b8fc0b9ea7768454fff9d Mon Sep 17 00:00:00 2001 From: Alex Reinking Date: Wed, 14 Feb 2024 17:58:55 -0800 Subject: [PATCH 065/186] Ensure string(REPLACE) is called with the right number of arguments (#8097) --- dependencies/wasm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependencies/wasm/CMakeLists.txt b/dependencies/wasm/CMakeLists.txt index 7c0a02b377f1..c5122a042dd5 100644 --- a/dependencies/wasm/CMakeLists.txt +++ b/dependencies/wasm/CMakeLists.txt @@ -164,7 +164,7 @@ function(find_node_js) execute_process(COMMAND "${NODE_JS_EXECUTABLE}" --version OUTPUT_VARIABLE NODE_JS_VERSION_RAW OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REPLACE "v" "" NODE_JS_VERSION ${NODE_JS_VERSION_RAW}) + string(REPLACE "v" "" NODE_JS_VERSION "${NODE_JS_VERSION_RAW}") if (NODE_JS_VERSION VERSION_LESS "16.13") message(FATAL_ERROR "Halide requires Node v16.13 or later, but found ${NODE_JS_VERSION_RAW} at ${NODE_JS_EXECUTABLE}. 
Please set NODE_JS_EXECUTABLE on the CMake command line.") From 2855ca31aa12c990d58ab1c4cab0dff2be4abea4 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 15 Feb 2024 09:06:36 -0800 Subject: [PATCH 066/186] Strip asserts right at the end of lowering (#8094) The simplifier exploits asserts to make simplification. When compiling with NoAsserts, certain assertions aren't ever introduced, which means that the simplifier can't exploit certain things that we know to be true. Mostly this has a negative effect on code size. E.g. tail cases get generated even though they are actually dead code. This PR keeps all the assertions right until the end of lowering, when it strips them in a dedicated pass. This reduces object file size for a large production blob of Halide code by ~10%, without measurably affecting runtime. --- Makefile | 2 + src/AddImageChecks.cpp | 39 ++++-------- src/CMakeLists.txt | 2 + src/Lower.cpp | 7 +++ src/ScheduleFunctions.cpp | 6 +- src/StripAsserts.cpp | 121 ++++++++++++++++++++++++++++++++++++++ src/StripAsserts.h | 18 ++++++ 7 files changed, 164 insertions(+), 31 deletions(-) create mode 100644 src/StripAsserts.cpp create mode 100644 src/StripAsserts.h diff --git a/Makefile b/Makefile index b73b1632a0eb..72c05619e3ea 100644 --- a/Makefile +++ b/Makefile @@ -603,6 +603,7 @@ SOURCE_FILES = \ StorageFlattening.cpp \ StorageFolding.cpp \ StrictifyFloat.cpp \ + StripAsserts.cpp \ Substitute.cpp \ Target.cpp \ Tracing.cpp \ @@ -785,6 +786,7 @@ HEADER_FILES = \ StorageFlattening.h \ StorageFolding.h \ StrictifyFloat.h \ + StripAsserts.h \ Substitute.h \ Target.h \ Tracing.h \ diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp index dfe9ae88c85f..77d8015f32b9 100644 --- a/src/AddImageChecks.cpp +++ b/src/AddImageChecks.cpp @@ -162,7 +162,6 @@ Stmt add_image_checks_inner(Stmt s, const FuncValueBounds &fb, bool will_inject_host_copies) { - bool no_asserts = t.has_feature(Target::NoAsserts); bool no_bounds_query = t.has_feature(Target::NoBoundsQuery); // First hunt for all the referenced buffers @@ -618,12 +617,9 @@ Stmt add_image_checks_inner(Stmt s, replace_with_constrained[name] = constrained_var; } - Expr error = 0; - if (!no_asserts) { - error = Call::make(Int(32), "halide_error_constraint_violated", - {name, var, constrained_var_str, constrained_var}, - Call::Extern); - } + Expr error = Call::make(Int(32), "halide_error_constraint_violated", + {name, var, constrained_var_str, constrained_var}, + Call::Extern); // Check the var passed in equals the constrained version (when not in inference mode) asserts_constrained.push_back(AssertStmt::make(var == constrained_var, error)); @@ -679,14 +675,12 @@ Stmt add_image_checks_inner(Stmt s, } }; - if (!no_asserts) { - // Inject the code that checks the host pointers. - prepend_stmts(&asserts_host_non_null); - prepend_stmts(&asserts_host_alignment); - prepend_stmts(&asserts_device_not_dirty); - prepend_stmts(&dims_no_overflow_asserts); - prepend_lets(&lets_overflow); - } + // Inject the code that checks the host pointers. + prepend_stmts(&asserts_host_non_null); + prepend_stmts(&asserts_host_alignment); + prepend_stmts(&asserts_device_not_dirty); + prepend_stmts(&dims_no_overflow_asserts); + prepend_lets(&lets_overflow); // Replace uses of the var with the constrained versions in the // rest of the program. We also need to respect the existence of @@ -698,15 +692,10 @@ Stmt add_image_checks_inner(Stmt s, // all in reverse order compared to execution, as we incrementally // prepending code. 
- // Inject the code that checks the constraints are correct. We - // need these regardless of how NoAsserts is set, because they are - // what gets Halide to actually exploit the constraint. + // Inject the code that checks the constraints are correct. prepend_stmts(&asserts_constrained); - - if (!no_asserts) { - prepend_stmts(&asserts_required); - prepend_stmts(&asserts_type_checks); - } + prepend_stmts(&asserts_required); + prepend_stmts(&asserts_type_checks); // Inject the code that returns early for inference mode. if (!no_bounds_query) { @@ -714,9 +703,7 @@ Stmt add_image_checks_inner(Stmt s, prepend_stmts(&buffer_rewrites); } - if (!no_asserts) { - prepend_stmts(&asserts_proposed); - } + prepend_stmts(&asserts_proposed); // Inject the code that defines the proposed sizes. prepend_lets(&lets_proposed); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cca681661c35..557574f284c4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -156,6 +156,7 @@ set(HEADER_FILES StorageFlattening.h StorageFolding.h StrictifyFloat.h + StripAsserts.h Substitute.h Target.h Tracing.h @@ -340,6 +341,7 @@ set(SOURCE_FILES StorageFlattening.cpp StorageFolding.cpp StrictifyFloat.cpp + StripAsserts.cpp Substitute.cpp Target.cpp Tracing.cpp diff --git a/src/Lower.cpp b/src/Lower.cpp index ba0918831fc8..560e0353c7a4 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -68,6 +68,7 @@ #include "StorageFlattening.h" #include "StorageFolding.h" #include "StrictifyFloat.h" +#include "StripAsserts.h" #include "Substitute.h" #include "Tracing.h" #include "TrimNoOps.h" @@ -427,6 +428,12 @@ void lower_impl(const vector &output_funcs, s = hoist_prefetches(s); log("Lowering after hoisting prefetches:", s); + if (t.has_feature(Target::NoAsserts)) { + debug(1) << "Stripping asserts...\n"; + s = strip_asserts(s); + log("Lowering after stripping asserts:", s); + } + debug(1) << "Lowering after final simplification:\n" << s << "\n\n"; diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index c575cd47477d..aa45841253b7 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -1368,11 +1368,7 @@ class InjectFunctionRealization : public IRMutator { // This is also the point at which we inject explicit bounds // for this realization. - if (target.has_feature(Target::NoAsserts)) { - return s; - } else { - return inject_explicit_bounds(s, func); - } + return inject_explicit_bounds(s, func); } Stmt build_realize_function_from_group(Stmt s, int func_index) { diff --git a/src/StripAsserts.cpp b/src/StripAsserts.cpp new file mode 100644 index 000000000000..9d9c667f4db1 --- /dev/null +++ b/src/StripAsserts.cpp @@ -0,0 +1,121 @@ +#include "StripAsserts.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "IRVisitor.h" +#include + +namespace Halide { +namespace Internal { + +namespace { + +bool may_discard(const Expr &e) { + class MayDiscard : public IRVisitor { + using IRVisitor::visit; + + void visit(const Call *op) override { + // Extern calls that are side-effecty in the sense that you can't + // move them around in the IR, but we're free to discard because + // they're just getters. 
+ static const std::set discardable{ + Call::buffer_get_dimensions, + Call::buffer_get_min, + Call::buffer_get_extent, + Call::buffer_get_stride, + Call::buffer_get_max, + Call::buffer_get_host, + Call::buffer_get_device, + Call::buffer_get_device_interface, + Call::buffer_get_shape, + Call::buffer_get_host_dirty, + Call::buffer_get_device_dirty, + Call::buffer_get_type}; + + if (!(op->is_pure() || + discardable.count(op->name))) { + result = false; + } + } + + public: + bool result = true; + } d; + e.accept(&d); + + return d.result; +} + +class StripAsserts : public IRMutator { + using IRMutator::visit; + + // We're going to track which symbols are used so that we can strip lets we + // don't need after removing the asserts. + std::set used; + + // Drop all assert stmts. Assumes that you don't want any side-effects from + // the condition. + Stmt visit(const AssertStmt *op) override { + return Evaluate::make(0); + } + + Expr visit(const Variable *op) override { + used.insert(op->name); + return op; + } + + Expr visit(const Load *op) override { + used.insert(op->name); + return IRMutator::visit(op); + } + + Stmt visit(const Store *op) override { + used.insert(op->name); + return IRMutator::visit(op); + } + + // Also dead-code eliminate any let stmts wrapped around asserts + Stmt visit(const LetStmt *op) override { + Stmt body = mutate(op->body); + if (is_no_op(body)) { + if (may_discard(op->value)) { + return body; + } else { + // We visit the value just to keep the used variable set + // accurate. + mutate(op->value); + return Evaluate::make(op->value); + } + } else if (body.same_as(op->body)) { + mutate(op->value); + return op; + } else if (may_discard(op->value) && !used.count(op->name)) { + return body; + } else { + mutate(op->value); + return LetStmt::make(op->name, op->value, body); + } + } + + Stmt visit(const Block *op) override { + Stmt first = mutate(op->first); + Stmt rest = mutate(op->rest); + if (first.same_as(op->first) && rest.same_as(op->rest)) { + return op; + } else if (is_no_op(rest)) { + return first; + } else if (is_no_op(first)) { + return rest; + } else { + return Block::make(first, rest); + } + } +}; + +} // namespace + +Stmt strip_asserts(const Stmt &s) { + return StripAsserts().mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/StripAsserts.h b/src/StripAsserts.h new file mode 100644 index 000000000000..48b22b3a5218 --- /dev/null +++ b/src/StripAsserts.h @@ -0,0 +1,18 @@ +#ifndef HALIDE_STRIP_ASSERTS_H +#define HALIDE_STRIP_ASSERTS_H + +/** \file + * Defines the lowering pass that strips asserts when NoAsserts is set. 
+ */ + +#include "Expr.h" + +namespace Halide { +namespace Internal { + +Stmt strip_asserts(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif From d9668c5bcf7325cd669bf34f55d40f8c935453cb Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 15 Feb 2024 17:57:16 +0000 Subject: [PATCH 067/186] Fix clang-tidy error in runtime.printer.h (parameter shadows member) (#8074) --- src/runtime/.clang-tidy | 2 ++ src/runtime/printer.h | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime/.clang-tidy b/src/runtime/.clang-tidy index 9c335b626fdf..4032f4ea3d9c 100644 --- a/src/runtime/.clang-tidy +++ b/src/runtime/.clang-tidy @@ -73,6 +73,8 @@ Checks: > bugprone-use-after-move, bugprone-virtual-near-miss, + clang-diagnostic-shadow-field, + misc-confusable-identifiers, -misc-const-correctness, -misc-definitions-in-headers, diff --git a/src/runtime/printer.h b/src/runtime/printer.h index 6a379561dbe5..af07a21730fd 100644 --- a/src/runtime/printer.h +++ b/src/runtime/printer.h @@ -184,8 +184,8 @@ namespace { template class HeapPrinter : public PrinterBase { public: - NEVER_INLINE explicit HeapPrinter(void *user_context) - : PrinterBase(user_context, (char *)malloc(buffer_length), buffer_length) { + NEVER_INLINE explicit HeapPrinter(void *user_context_) + : PrinterBase(user_context_, (char *)malloc(buffer_length), buffer_length) { if (!start) { allocation_error(); } @@ -247,8 +247,8 @@ class StackPrinter : public PrinterBase { char scratch[buffer_length]; public: - explicit StackPrinter(void *user_context) - : PrinterBase(user_context, scratch, buffer_length) { + explicit StackPrinter(void *user_context_) + : PrinterBase(user_context_, scratch, buffer_length) { static_assert(buffer_length <= 256, "StackPrinter is meant only for small buffer sizes; you are probably making a mistake."); } }; From 4fc1e57ea34f267ec1ea085bbab228569429170a Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Fri, 16 Feb 2024 13:58:23 -0800 Subject: [PATCH 068/186] Fix an issue where the Halide compiler hits an internal error for bool types in widening intrinsics. (#8099) * Fix an issue where the Halide compiler hits an internal error when bool types are used with e.g. widening_mul. This situation did not arise from user code doing this directly, but rather through some chain o lowering with float16 types. The test cases added to correctness_intrinsics target the issue directly and do fail without the fix. I did not add broader coverage for bool types and intrinsics as it would require more thinking. Most of them overflow for the true/true case and thus are of questionable use, however widening operations cannot overflow... Certainly we could define the language to forbid this, but currently the frontend does not do so. As indicated above, the use case driving this was not using bool arithmetic to begin with. * Formatting. --- src/FindIntrinsics.cpp | 273 ++++++++++++++++++-------------- test/correctness/intrinsics.cpp | 5 + 2 files changed, 159 insertions(+), 119 deletions(-) diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index a77a7b1798f3..febd88d2399b 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -13,6 +13,8 @@ using namespace Halide::ConciseCasts; namespace { +// This routine provides a guard on the return type of intrisics such that only +// these types will ever be considered in the visiting that happens here. 
bool find_intrinsics_for_type(const Type &t) { // Currently, we only try to find and replace intrinsics for vector types that aren't bools. return t.is_vector() && t.bits() >= 8; @@ -28,17 +30,36 @@ Expr narrow(Expr a) { return Cast::make(result_type, std::move(a)); } +// Check a type to make sure it can be narrowed. find_intrinsics_for_type +// attempts to prevent this code from narrowing in cases that do not work, but +// it is incomplete for two reasons: +// +// - Arguments can be narrowed and that guard is only on return type, which +// are different for widening operations. +// +// - find_intrinsics_for_type does not cull out float16, and it probably +// should not as while it's ok to skip matching bool things, float16 things +// are useful. +bool can_narrow(const Type &t) { + return (t.is_float() && t.bits() >= 32) || + t.bits() >= 8; +} + Expr lossless_narrow(const Expr &x) { - return lossless_cast(x.type().narrow(), x); + return can_narrow(x.type()) ? lossless_cast(x.type().narrow(), x) : Expr(); } // Remove a widening cast even if it changes the sign of the result. Expr strip_widening_cast(const Expr &x) { - Expr narrow = lossless_narrow(x); - if (narrow.defined()) { - return narrow; + if (can_narrow(x.type())) { + Expr narrow = lossless_narrow(x); + if (narrow.defined()) { + return narrow; + } + return lossless_cast(x.type().narrow().with_code(halide_type_uint), x); + } else { + return Expr(); } - return lossless_cast(x.type().narrow().with_code(halide_type_uint), x); } Expr saturating_narrow(const Expr &a) { @@ -217,16 +238,18 @@ class FindIntrinsics : public IRMutator { // Try widening both from the same signedness as the result, and from uint. for (halide_type_code_t code : {op->type.code(), halide_type_uint}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); - if (narrow_a.defined() && narrow_b.defined()) { - Expr result = widening_add(narrow_a, narrow_b); - if (result.type() != op->type) { - result = Cast::make(op->type, result); + if (narrow_a.defined() && narrow_b.defined()) { + Expr result = widening_add(narrow_a, narrow_b); + if (result.type() != op->type) { + result = Cast::make(op->type, result); + } + return mutate(result); } - return mutate(result); } } @@ -235,41 +258,43 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - // Pulling casts out of VectorReduce nodes breaks too much codegen, skip for now. - Expr narrow_a = (a.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, a); - Expr narrow_b = (b.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, b); - - // This case should have been handled by the above check for widening_add. - internal_assert(!(narrow_a.defined() && narrow_b.defined())) - << "find_intrinsics failed to find a widening_add: " << a << " + " << b << "\n"; - - if (narrow_a.defined()) { - Expr result; - if (b.type().code() != narrow_a.type().code()) { - // Need to do a safe reinterpret. 
- Type t = b.type().with_code(code); - result = widen_right_add(cast(t, b), narrow_a); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_add(b, narrow_a); - } - internal_assert(result.type() == op->type); - return mutate(result); - } else if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_add(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_add(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + // Pulling casts out of VectorReduce nodes breaks too much codegen, skip for now. + Expr narrow_a = (a.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, a); + Expr narrow_b = (b.node_type() == IRNodeType::VectorReduce) ? Expr() : lossless_cast(narrow, b); + + // This case should have been handled by the above check for widening_add. + internal_assert(!(narrow_a.defined() && narrow_b.defined())) + << "find_intrinsics failed to find a widening_add: " << a << " + " << b << "\n"; + + if (narrow_a.defined()) { + Expr result; + if (b.type().code() != narrow_a.type().code()) { + // Need to do a safe reinterpret. + Type t = b.type().with_code(code); + result = widen_right_add(cast(t, b), narrow_a); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_add(b, narrow_a); + } + internal_assert(result.type() == op->type); + return mutate(result); + } else if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_add(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_add(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -294,22 +319,24 @@ class FindIntrinsics : public IRMutator { // Try widening both from the same type as the result, and from uint. 
for (halide_type_code_t code : {op->type.code(), halide_type_uint}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); - if (narrow_a.defined() && narrow_b.defined()) { - Expr negative_narrow_b = lossless_negate(narrow_b); - Expr result; - if (negative_narrow_b.defined()) { - result = widening_add(narrow_a, negative_narrow_b); - } else { - result = widening_sub(narrow_a, narrow_b); - } - if (result.type() != op->type) { - result = Cast::make(op->type, result); + if (narrow_a.defined() && narrow_b.defined()) { + Expr negative_narrow_b = lossless_negate(narrow_b); + Expr result; + if (negative_narrow_b.defined()) { + result = widening_add(narrow_a, negative_narrow_b); + } else { + result = widening_sub(narrow_a, narrow_b); + } + if (result.type() != op->type) { + result = Cast::make(op->type, result); + } + return mutate(result); } - return mutate(result); } } @@ -324,22 +351,24 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_b = lossless_cast(narrow, b); - - if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_sub(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_sub(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_b = lossless_cast(narrow, b); + + if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_sub(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_sub(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -401,40 +430,42 @@ class FindIntrinsics : public IRMutator { // Yes we do an duplicate code, but we want to check the op->type.code() first, // and the opposite as well. for (halide_type_code_t code : {op->type.code(), halide_type_uint, halide_type_int}) { - Type narrow = op->type.narrow().with_code(code); - Expr narrow_a = lossless_cast(narrow, a); - Expr narrow_b = lossless_cast(narrow, b); - - // This case should have been handled by the above check for widening_mul. - internal_assert(!(narrow_a.defined() && narrow_b.defined())) - << "find_intrinsics failed to find a widening_mul: " << a << " + " << b << "\n"; - - if (narrow_a.defined()) { - Expr result; - if (b.type().code() != narrow_a.type().code()) { - // Need to do a safe reinterpret. 
- Type t = b.type().with_code(code); - result = widen_right_mul(cast(t, b), narrow_a); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_mul(b, narrow_a); - } - internal_assert(result.type() == op->type); - return mutate(result); - } else if (narrow_b.defined()) { - Expr result; - if (a.type().code() != narrow_b.type().code()) { - // Need to do a safe reinterpret. - Type t = a.type().with_code(code); - result = widen_right_mul(cast(t, a), narrow_b); - internal_assert(result.type() != op->type); - result = cast(op->type, result); - } else { - result = widen_right_mul(a, narrow_b); + if (can_narrow(op->type)) { + Type narrow = op->type.narrow().with_code(code); + Expr narrow_a = lossless_cast(narrow, a); + Expr narrow_b = lossless_cast(narrow, b); + + // This case should have been handled by the above check for widening_mul. + internal_assert(!(narrow_a.defined() && narrow_b.defined())) + << "find_intrinsics failed to find a widening_mul: " << a << " + " << b << "\n"; + + if (narrow_a.defined()) { + Expr result; + if (b.type().code() != narrow_a.type().code()) { + // Need to do a safe reinterpret. + Type t = b.type().with_code(code); + result = widen_right_mul(cast(t, b), narrow_a); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_mul(b, narrow_a); + } + internal_assert(result.type() == op->type); + return mutate(result); + } else if (narrow_b.defined()) { + Expr result; + if (a.type().code() != narrow_b.type().code()) { + // Need to do a safe reinterpret. + Type t = a.type().with_code(code); + result = widen_right_mul(cast(t, a), narrow_b); + internal_assert(result.type() != op->type); + result = cast(op->type, result); + } else { + result = widen_right_mul(a, narrow_b); + } + internal_assert(result.type() == op->type); + return mutate(result); } - internal_assert(result.type() == op->type); - return mutate(result); } } } @@ -853,21 +884,25 @@ class FindIntrinsics : public IRMutator { } else if (op->is_intrinsic(Call::widening_add) && (op->type.bits() >= 16)) { internal_assert(op->args.size() == 2); for (halide_type_code_t t : {op->type.code(), halide_type_uint}) { - Type narrow_t = op->type.narrow().narrow().with_code(t); - Expr narrow_a = lossless_cast(narrow_t, op->args[0]); - Expr narrow_b = lossless_cast(narrow_t, op->args[1]); - if (narrow_a.defined() && narrow_b.defined()) { - return mutate(Cast::make(op->type, widening_add(narrow_a, narrow_b))); + if (can_narrow(op->type)) { + Type narrow_t = op->type.narrow().narrow().with_code(t); + Expr narrow_a = lossless_cast(narrow_t, op->args[0]); + Expr narrow_b = lossless_cast(narrow_t, op->args[1]); + if (narrow_a.defined() && narrow_b.defined()) { + return mutate(Cast::make(op->type, widening_add(narrow_a, narrow_b))); + } } } } else if (op->is_intrinsic(Call::widening_sub) && (op->type.bits() >= 16)) { internal_assert(op->args.size() == 2); for (halide_type_code_t t : {op->type.code(), halide_type_uint}) { - Type narrow_t = op->type.narrow().narrow().with_code(t); - Expr narrow_a = lossless_cast(narrow_t, op->args[0]); - Expr narrow_b = lossless_cast(narrow_t, op->args[1]); - if (narrow_a.defined() && narrow_b.defined()) { - return mutate(Cast::make(op->type, widening_sub(narrow_a, narrow_b))); + if (can_narrow(op->type)) { + Type narrow_t = op->type.narrow().narrow().with_code(t); + Expr narrow_a = lossless_cast(narrow_t, op->args[0]); + Expr narrow_b = lossless_cast(narrow_t, op->args[1]); + if 
(narrow_a.defined() && narrow_b.defined()) { + return mutate(Cast::make(op->type, widening_sub(narrow_a, narrow_b))); + } } } } diff --git a/test/correctness/intrinsics.cpp b/test/correctness/intrinsics.cpp index 19f9c610b099..339a5c2525e5 100644 --- a/test/correctness/intrinsics.cpp +++ b/test/correctness/intrinsics.cpp @@ -121,6 +121,8 @@ Expr make_leaf(Type t, const char *name) { } int main(int argc, char **argv) { + Expr i1x = make_leaf(Int(1, 4), "i1x"); + Expr i1y = make_leaf(Int(1, 4), "i1y"); Expr i8x = make_leaf(Int(8, 4), "i8x"); Expr i8y = make_leaf(Int(8, 4), "i8y"); Expr i8z = make_leaf(Int(8, 4), "i8w"); @@ -150,15 +152,18 @@ int main(int argc, char **argv) { // check(u32(u8x) * 256, u32(widening_shift_left(u8x, u8(8)))); // Check widening arithmetic + check(i8(i1x) + i1y, widening_add(i1x, i1y)); check(i16(i8x) + i8y, widening_add(i8x, i8y)); check(u16(u8x) + u8y, widening_add(u8x, u8y)); check(i16(u8x) + u8y, i16(widening_add(u8x, u8y))); check(f32(f16x) + f32(f16y), widening_add(f16x, f16y)); + check(i8(i1x) - i1y, widening_sub(i1x, i1y)); check(i16(i8x) - i8y, widening_sub(i8x, i8y)); check(i16(u8x) - u8y, widening_sub(u8x, u8y)); check(f32(f16x) - f32(f16y), widening_sub(f16x, f16y)); + check(i8(i1x) * i1y, widening_mul(i1x, i1y)); check(i16(i8x) * i8y, widening_mul(i8x, i8y)); check(u16(u8x) * u8y, widening_mul(u8x, u8y)); check(i32(i8x) * i8y, i32(widening_mul(i8x, i8y))); From c4d56c6202476274280b8251810cd926913b499c Mon Sep 17 00:00:00 2001 From: Tarushii Goel Date: Mon, 19 Feb 2024 17:46:15 -0500 Subject: [PATCH 069/186] Small Tutorial Fix (#8111) * Update lesson_17_predicated_rdom.cpp * Update lesson_17_predicated_rdom.cpp --- tutorial/lesson_17_predicated_rdom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/lesson_17_predicated_rdom.cpp b/tutorial/lesson_17_predicated_rdom.cpp index 77c43bdb55c2..b36fc49a773a 100644 --- a/tutorial/lesson_17_predicated_rdom.cpp +++ b/tutorial/lesson_17_predicated_rdom.cpp @@ -38,7 +38,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); circle(x, y) = x + y; - // Say we want an update that squares the values inside a + // Say we want an update that multiplies by two the values inside a // circular region centered at (3, 3) with radius of 3. To do // this, we first define the minimal bounding box over the // circular region using an RDom. From 46132176ff262a337c7cd4acb2839c18d49b6911 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 09:13:15 -0800 Subject: [PATCH 070/186] Optionally print the time taken by each lowering pass (#8116) * Optionally print the time taken by each lowering pass I've been copy-pasting this from branch to branch, but I should just check it in. This is useful for performance optimization of the compiler itself. 
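
For reference, here is a minimal way to exercise it from a JIT driver
(illustrative only, not part of this patch; the pipeline is a placeholder and
setenv is the POSIX call):

```cpp
#include "Halide.h"
#include <cstdlib>

int main() {
    // Any non-empty value enables the report; the logger only checks that
    // the variable is set.
    setenv("HL_TIME_LOWERING_PASSES", "1", /*overwrite=*/1);

    Halide::Func f("f");
    Halide::Var x("x");
    f(x) = x * 2;

    // Lowering runs here; the sorted per-pass timings are printed when the
    // logger is destroyed at the end of lowering.
    f.compile_jit();
    return 0;
}
```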
--- src/Lower.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/Lower.cpp b/src/Lower.cpp index 560e0353c7a4..6b56f23fcff9 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -91,15 +91,39 @@ namespace { class LoweringLogger { Stmt last_written; + std::chrono::time_point last_time; + std::vector> timings; + bool time_lowering_passes = false; public: + LoweringLogger() { + last_time = std::chrono::high_resolution_clock::now(); + static bool should_time = !get_env_variable("HL_TIME_LOWERING_PASSES").empty(); + time_lowering_passes = should_time; + } + void operator()(const string &message, const Stmt &s) { + auto t = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = t - last_time; if (!s.same_as(last_written)) { debug(2) << message << "\n" << s << "\n"; last_written = s; + last_time = t; } else { debug(2) << message << " (unchanged)\n\n"; + last_time = t; + } + timings.emplace_back(diff.count() * 1000, message); + } + + ~LoweringLogger() { + if (time_lowering_passes) { + debug(0) << "Lowering pass runtimes:\n"; + std::sort(timings.begin(), timings.end()); + for (const auto &p : timings) { + debug(0) << " " << p.first << " ms : " << p.second << "\n"; + } } } }; From ef31bf95f056ee7ee3c6eb76f6ac3690ad8f4f5f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 09:13:43 -0800 Subject: [PATCH 071/186] Do less redundant work in UnpackBuffers (#8104) We were redundantly creating a handle Variable every time we encountered something like foo.stride.0, instead of just the first time we encounter a Variable that refers to an input Parameter/Buffer. Speeds up this already-fast lowering pass by 10% or so. No measurable impact on total lowering time. --- src/UnpackBuffers.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/UnpackBuffers.cpp b/src/UnpackBuffers.cpp index 2f5b5a65bef6..3040f0b273ba 100644 --- a/src/UnpackBuffers.cpp +++ b/src/UnpackBuffers.cpp @@ -26,37 +26,46 @@ class FindBufferSymbols : public IRVisitor { void visit_param(const string &ref_name, const Parameter ¶m) { if (param.defined() && param.is_buffer()) { const string &name = param.name(); - buffers[name] = - BufferInfo{Variable::make(type_of(), name + ".buffer", param), - param.dimensions()}; + auto r = buffers.try_emplace(name); + if (r.second) { + // It's the first time we've seen this Parameter + r.first->second.handle = Variable::make(type_of(), name + ".buffer", param); + r.first->second.dimensions = param.dimensions(); + } } } void visit_buffer(const string &ref_name, const Buffer<> &buffer) { if (buffer.defined()) { const string &name = buffer.name(); - buffers[name] = - BufferInfo{Variable::make(type_of(), name + ".buffer", buffer), - buffer.dimensions()}; + auto r = buffers.try_emplace(name); + if (r.second) { + // It's the first time we've seen this Buffer + r.first->second.handle = Variable::make(type_of(), name + ".buffer", buffer); + r.first->second.dimensions = buffer.dimensions(); + } } } void visit(const Variable *op) override { - visit_param(op->name, op->param); - visit_buffer(op->name, op->image); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + visit_buffer(op->name, op->image); + } } void visit(const Load *op) override { - visit_param(op->name, op->param); - visit_buffer(op->name, op->image); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + visit_buffer(op->name, 
op->image); + } IRVisitor::visit(op); } void visit(const Store *op) override { - visit_param(op->name, op->param); - symbols.insert(op->name); + if (symbols.insert(op->name).second) { + visit_param(op->name, op->param); + } IRVisitor::visit(op); } From 57164dfe3d98e0e27bb44bb0efc525d8c5411e00 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 22 Feb 2024 10:52:54 -0800 Subject: [PATCH 072/186] Avoid redundant scope lookups (#8103) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. Scopes are different enough from maps that the interface really needs to be distinct. 
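
Roughly, the token-based flow looks like this (illustrative sketch, not code
from this patch; it assumes the usual Halide::Internal namespace, and the
caller supplies the scope and value):

```cpp
void example(Scope<Expr> &scope, const Expr &value) {
    // push returns an opaque PushToken (an iterator into the underlying map).
    auto token = scope.push("x", value);

    // ... recursively visit the body with "x" bound ...

    // Mutable access to the innermost binding, ignoring containing scopes.
    if (Expr *e = scope.shallow_find("x")) {
        *e = simplify(*e);
    }

    // Pop using the token; no fresh lookup of "x" is needed.
    scope.pop(token);
}
```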
--- src/Bounds.cpp | 65 +++++++++++----------- src/CSE.cpp | 4 +- src/ClampUnsafeAccesses.cpp | 6 ++- src/CodeGen_ARM.cpp | 5 +- src/CodeGen_C.cpp | 5 +- src/CodeGen_D3D12Compute_Dev.cpp | 5 +- src/CodeGen_Hexagon.cpp | 11 ++-- src/CodeGen_LLVM.cpp | 5 +- src/CodeGen_Metal_Dev.cpp | 9 ++-- src/CodeGen_OpenCL_Dev.cpp | 8 +-- src/CodeGen_Posix.cpp | 4 +- src/CodeGen_Vulkan_Dev.cpp | 28 +++++----- src/CodeGen_WebGPU_Dev.cpp | 8 +-- src/CodeGen_X86.cpp | 38 +++++++------ src/EliminateBoolVectors.cpp | 4 +- src/ExprUsesVar.h | 4 +- src/FindIntrinsics.cpp | 4 +- src/FuseGPUThreadLoops.cpp | 16 +++--- src/HexagonOptimize.cpp | 32 +++++------ src/LICM.cpp | 4 +- src/LoopCarry.cpp | 13 +++-- src/LowerWarpShuffles.cpp | 17 +++--- src/ModulusRemainder.cpp | 4 +- src/Monotonic.cpp | 4 +- src/Prefetch.cpp | 7 ++- src/PrintLoopNest.cpp | 15 +++--- src/Scope.h | 93 +++++++++++++++++++++++--------- src/Simplify.cpp | 43 +++++++-------- src/Simplify_Exprs.cpp | 32 +++++------ src/Simplify_Stmts.cpp | 20 +++---- src/SlidingWindow.cpp | 7 ++- src/Solve.cpp | 18 +++---- src/StageStridedLoads.cpp | 8 +-- src/StmtToHTML.cpp | 4 +- src/StorageFlattening.cpp | 5 +- src/UniquifyVariableNames.cpp | 7 ++- src/VectorizeLoops.cpp | 4 +- 37 files changed, 305 insertions(+), 261 deletions(-) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index a08bb0b9ad61..16fd69f3e8fb 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -406,13 +406,12 @@ class Bounds : public IRVisitor { if (const_bound) { bounds_of_type(op->type); - if (scope.contains(op->name)) { - const Interval &scope_interval = scope.get(op->name); - if (scope_interval.has_upper_bound() && is_const(scope_interval.max)) { - interval.max = Interval::make_min(interval.max, scope_interval.max); + if (const Interval *scope_interval = scope.find(op->name)) { + if (scope_interval->has_upper_bound() && is_const(scope_interval->max)) { + interval.max = Interval::make_min(interval.max, scope_interval->max); } - if (scope_interval.has_lower_bound() && is_const(scope_interval.min)) { - interval.min = Interval::make_max(interval.min, scope_interval.min); + if (scope_interval->has_lower_bound() && is_const(scope_interval->min)) { + interval.min = Interval::make_max(interval.min, scope_interval->min); } } @@ -429,8 +428,8 @@ class Bounds : public IRVisitor { } } } else { - if (scope.contains(op->name)) { - interval = scope.get(op->name); + if (const Interval *in = scope.find(op->name)) { + interval = *in; } else if (op->type.is_vector()) { // Uh oh, we need to take the min/max lane of some unknown vector. Treat as unbounded. bounds_of_type(op->type); @@ -2054,11 +2053,10 @@ class FindInnermostVar : public IRVisitor { int innermost_depth = -1; void visit(const Variable *op) override { - if (vars_depth.contains(op->name)) { - int depth = vars_depth.get(op->name); - if (depth > innermost_depth) { + if (const int *depth = vars_depth.find(op->name)) { + if (*depth > innermost_depth) { innermost_var = op->name; - innermost_depth = depth; + innermost_depth = *depth; } } } @@ -2545,16 +2543,17 @@ class BoxesTouched : public IRGraphVisitor { // If this let stmt is a redefinition of a previous one, we should // remove the old let stmt from the 'children' map since it is // no longer valid at this point. 
- if ((f.vi.instance > 0) && let_stmts.contains(op->name)) { - const Expr &val = let_stmts.get(op->name); - CollectVars collect(op->name); - val.accept(&collect); - f.old_let_vars = collect.vars; - - VarInstance old_vi = VarInstance(f.vi.var, f.vi.instance - 1); - for (const auto &v : f.old_let_vars) { - internal_assert(vars_renaming.count(v)); - children[get_var_instance(v)].erase(old_vi); + if (f.vi.instance > 0) { + if (const Expr *val = let_stmts.find(op->name)) { + CollectVars collect(op->name); + val->accept(&collect); + f.old_let_vars = collect.vars; + + VarInstance old_vi = VarInstance(f.vi.var, f.vi.instance - 1); + for (const auto &v : f.old_let_vars) { + internal_assert(vars_renaming.count(v)); + children[get_var_instance(v)].erase(old_vi); + } } } let_stmts.push(op->name, op->value); @@ -2756,17 +2755,17 @@ class BoxesTouched : public IRGraphVisitor { expr_uses_var(box[i].min, l.min_name))) || (box[i].has_upper_bound() && (expr_uses_var(box[i].max, l.max_name) || expr_uses_var(box[i].max, l.min_name)))) { - internal_assert(let_stmts.contains(l.var)); - const Expr &val = let_stmts.get(l.var); - v_bound = bounds_of_expr_in_scope(val, scope, func_bounds); + const Expr *val = let_stmts.find(l.var); + internal_assert(val); + v_bound = bounds_of_expr_in_scope(*val, scope, func_bounds); bool fixed = v_bound.min.same_as(v_bound.max); v_bound.min = simplify(v_bound.min); v_bound.max = fixed ? v_bound.min : simplify(v_bound.max); - internal_assert(scope.contains(l.var)); - const Interval &old_bound = scope.get(l.var); - v_bound.max = simplify(min(v_bound.max, old_bound.max)); - v_bound.min = simplify(max(v_bound.min, old_bound.min)); + const Interval *old_bound = scope.find(l.var); + internal_assert(old_bound); + v_bound.max = simplify(min(v_bound.max, old_bound->max)); + v_bound.min = simplify(max(v_bound.min, old_bound->min)); } if (box[i].has_lower_bound()) { @@ -3017,14 +3016,14 @@ class BoxesTouched : public IRGraphVisitor { } Expr min_val, max_val; - if (scope.contains(op->name + ".loop_min")) { - min_val = scope.get(op->name + ".loop_min").min; + if (const Interval *in = scope.find(op->name + ".loop_min")) { + min_val = in->min; } else { min_val = bounds_of_expr_in_scope(op->min, scope, func_bounds).min; } - if (scope.contains(op->name + ".loop_max")) { - max_val = scope.get(op->name + ".loop_max").max; + if (const Interval *in = scope.find(op->name + ".loop_max")) { + max_val = in->max; } else { max_val = bounds_of_expr_in_scope(op->extent, scope, func_bounds).max; max_val += bounds_of_expr_in_scope(op->min, scope, func_bounds).max; diff --git a/src/CSE.cpp b/src/CSE.cpp index 7d39fcc90dc5..d8ecd619db81 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -201,8 +201,8 @@ class RemoveLets : public IRGraphMutator { Scope scope; Expr visit(const Variable *op) override { - if (scope.contains(op->name)) { - return scope.get(op->name); + if (const Expr *e = scope.find(op->name)) { + return *e; } else { return op; } diff --git a/src/ClampUnsafeAccesses.cpp b/src/ClampUnsafeAccesses.cpp index 5e2e1f5d5b2e..b3dd9ddc235e 100644 --- a/src/ClampUnsafeAccesses.cpp +++ b/src/ClampUnsafeAccesses.cpp @@ -50,8 +50,10 @@ struct ClampUnsafeAccesses : IRMutator { } Expr visit(const Variable *var) override { - if (is_inside_indexing && let_var_inside_indexing.contains(var->name)) { - let_var_inside_indexing.ref(var->name) = true; + if (is_inside_indexing) { + if (bool *b = let_var_inside_indexing.shallow_find(var->name)) { + *b = true; + } } return var; } diff --git a/src/CodeGen_ARM.cpp 
b/src/CodeGen_ARM.cpp index 9c6525703f16..7852532183bf 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -82,13 +82,14 @@ class SubstituteInStridedLoads : public IRMutator { Expr visit(const Shuffle *op) override { int stride = op->slice_stride(); const Variable *var = op->vectors[0].as(); + const Expr *vec = nullptr; if (var && poisoned_vars.count(var->name) == 0 && op->vectors.size() == 1 && 2 <= stride && stride <= 4 && op->slice_begin() < stride && - loads.contains(var->name)) { - return Shuffle::make_slice({loads.get(var->name)}, op->slice_begin(), op->slice_stride(), op->type.lanes()); + (vec = loads.find(var->name))) { + return Shuffle::make_slice({*vec}, op->slice_begin(), op->slice_stride(), op->type.lanes()); } else { return IRMutator::visit(op); } diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 89c18cb8ab28..b0cdcb3e956c 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1936,8 +1936,9 @@ void CodeGen_C::visit(const Load *op) { user_assert(is_const_one(op->predicate)) << "Predicated scalar load is not supported by C backend.\n"; string id_index = print_expr(op->index); - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type.element_of() == t.element_of()); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type.element_of() == t.element_of()); if (type_cast_needed) { const char *const_flag = output_kind == CPlusPlusImplementation ? " const" : ""; rhs << "((" << print_type(t.element_of()) << const_flag << " *)" << name << ")"; diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index c8e45ea2ae09..4fd614cc0dfc 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -592,8 +592,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const Load *op) { string id_index = print_expr(op->index); // Get the rhs just for the cache. - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == op->type); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type == op->type); ostringstream rhs; if (type_cast_needed) { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 9463a4c921aa..a77e9c7c1a76 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -221,8 +221,8 @@ class SloppyUnpredicateLoadsAndStores : public IRMutator { } } } else if (const Variable *op = e.as()) { - if (monotonic_vectors.contains(op->name)) { - return monotonic_vectors.get(op->name); + if (const auto *p = monotonic_vectors.find(op->name)) { + return *p; } } else if (const Let *op = e.as()) { auto v = get_extreme_lanes(op->value); @@ -2245,10 +2245,9 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) { codegen(alloc->body); // If there was no early free, free it now. 
- if (allocations.contains(alloc->name)) { - Allocation alloc_obj = allocations.get(alloc->name); - internal_assert(alloc_obj.destructor); - trigger_destructor(alloc_obj.destructor_function, alloc_obj.destructor); + if (const Allocation *alloc_obj = allocations.find(alloc->name)) { + internal_assert(alloc_obj->destructor); + trigger_destructor(alloc_obj->destructor_function, alloc_obj->destructor); allocations.pop(alloc->name); sym_pop(alloc->name); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index a5c32cf83cc7..8922461524c5 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1268,7 +1268,8 @@ void CodeGen_LLVM::sym_pop(const string &name) { llvm::Value *CodeGen_LLVM::sym_get(const string &name, bool must_succeed) const { // look in the symbol table - if (!symbol_table.contains(name)) { + llvm::Value *const *v = symbol_table.find(name); + if (!v) { if (must_succeed) { std::ostringstream err; err << "Symbol not found: " << name << "\n"; @@ -1283,7 +1284,7 @@ llvm::Value *CodeGen_LLVM::sym_get(const string &name, bool must_succeed) const return nullptr; } } - return symbol_table.get(name); + return *v; } bool CodeGen_LLVM::sym_exists(const string &name) const { diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 69d47279e9ae..79060294798e 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -390,8 +390,9 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Load *op) { string id_index = print_expr(op->index); // Get the rhs just for the cache. - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == op->type); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && + alloc->type == op->type); ostringstream rhs; if (type_cast_needed) { rhs << "((" << get_memory_space(op->name) << " " @@ -467,8 +468,8 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const Store *op) { << id_value << "[" << i << "];\n"; } } else { - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == t); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && alloc->type == t); string id_index = print_expr(op->index); stream << get_indent(); diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index 52feed53f9e0..c86e483cc5a8 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -484,8 +484,8 @@ string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_array_access(const string &na const Type &type, const string &id_index) { ostringstream rhs; - bool type_cast_needed = !(allocations.contains(name) && - allocations.get(name).type == type); + const auto *alloc = allocations.find(name); + bool type_cast_needed = !(alloc && alloc->type == type); if (type_cast_needed) { rhs << "((" << get_memory_space(name) << " " @@ -583,8 +583,8 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const Store *op) { // For atomicAdd, we check if op->value - store[index] is independent of store. // The atomicAdd operations in OpenCL only supports integers so we also check that. 
bool is_atomic_add = t.is_int_or_uint() && !expr_uses_var(delta, op->name); - bool type_cast_needed = !(allocations.contains(op->name) && - allocations.get(op->name).type == t); + const auto *alloc = allocations.find(op->name); + bool type_cast_needed = !(alloc && alloc->type == t); auto print_store_var = [&]() { if (type_cast_needed) { stream << "((" diff --git a/src/CodeGen_Posix.cpp b/src/CodeGen_Posix.cpp index af508194b06e..f812b63cce9d 100644 --- a/src/CodeGen_Posix.cpp +++ b/src/CodeGen_Posix.cpp @@ -342,8 +342,8 @@ void CodeGen_Posix::free_allocation(const std::string &name) { } string CodeGen_Posix::get_allocation_name(const std::string &n) { - if (allocations.contains(n)) { - return allocations.get(n).name; + if (const auto *alloc = allocations.find(n)) { + return alloc->name; } else { return n; } diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 61b365f2f7aa..39dd65b67671 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -1539,10 +1539,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Load *op) { user_assert(is_const_one(op->predicate)) << "Predicated loads not supported by SPIR-V codegen\n"; // Construct the pointer to read from - internal_assert(symbol_table.contains(op->name)); - SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId variable_id = id_and_storage_class.first; - SpvStorageClass storage_class = id_and_storage_class.second; + const SymbolIdStorageClassPair *id_and_storage_class = symbol_table.find(op->name); + internal_assert(id_and_storage_class); + SpvId variable_id = id_and_storage_class->first; + SpvStorageClass storage_class = id_and_storage_class->second; internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); @@ -1576,10 +1576,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Store *op) { op->value.accept(this); SpvId value_id = builder.current_id(); - internal_assert(symbol_table.contains(op->name)); - SymbolIdStorageClassPair id_and_storage_class = symbol_table.get(op->name); - SpvId variable_id = id_and_storage_class.first; - SpvStorageClass storage_class = id_and_storage_class.second; + const SymbolIdStorageClassPair *id_and_storage_class = symbol_table.find(op->name); + internal_assert(id_and_storage_class); + SpvId variable_id = id_and_storage_class->first; + SpvStorageClass storage_class = id_and_storage_class->second; internal_assert(variable_id != SpvInvalidId); internal_assert(((uint32_t)storage_class) < ((uint32_t)SpvStorageClassMax)); @@ -1665,9 +1665,10 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + intrinsic.first; // Intrinsics are inserted when adding the kernel - internal_assert(symbol_table.contains(intrinsic_var_name)); - SpvId intrinsic_id = symbol_table.get(intrinsic_var_name).first; - SpvStorageClass storage_class = symbol_table.get(intrinsic_var_name).second; + const auto *intrin = symbol_table.find(intrinsic_var_name); + internal_assert(intrin); + SpvId intrinsic_id = intrin->first; + SpvStorageClass storage_class = intrin->second; // extract and cast to the extent type (which is what's expected by Halide's for loops) Type unsigned_type = UInt(32); @@ -1908,8 +1909,9 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Allocate *op) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Free *op) { debug(3) << "Vulkan: Popping allocation called " 
<< op->name << " off the symbol table\n"; - internal_assert(symbol_table.contains(op->name)); - SpvId variable_id = symbol_table.get(op->name).first; + const auto *id = symbol_table.find(op->name); + internal_assert(id); + SpvId variable_id = id->first; storage_access_map.erase(variable_id); symbol_table.pop(op->name); } diff --git a/src/CodeGen_WebGPU_Dev.cpp b/src/CodeGen_WebGPU_Dev.cpp index 08d3a542f41b..de55113ff695 100644 --- a/src/CodeGen_WebGPU_Dev.cpp +++ b/src/CodeGen_WebGPU_Dev.cpp @@ -684,8 +684,8 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const Load *op) { // Get the allocation type, which may be different from the result type. Type alloc_type = result_type; - if (allocations.contains(op->name)) { - alloc_type = allocations.get(op->name).type; + if (const auto *alloc = allocations.find(op->name)) { + alloc_type = alloc->type; } else if (workgroup_allocations.count(op->name)) { alloc_type = workgroup_allocations.at(op->name)->type; } @@ -826,8 +826,8 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const Store *op) { // Get the allocation type, which may be different from the value type. Type alloc_type = value_type; - if (allocations.contains(op->name)) { - alloc_type = allocations.get(op->name).type; + if (const auto *alloc = allocations.find(op->name)) { + alloc_type = alloc->type; } else if (workgroup_allocations.count(op->name)) { alloc_type = workgroup_allocations.at(op->name)->type; } diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 8d87f4c1937e..0320e64b5ae5 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -866,28 +866,32 @@ void CodeGen_X86::visit(const Allocate *op) { } void CodeGen_X86::visit(const Load *op) { - if (mem_type.contains(op->name) && mem_type.get(op->name) == MemoryType::AMXTile) { - const Ramp *ramp = op->index.as(); - internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; - Value *ptr = codegen_buffer_pointer(op->name, op->type, ramp->base); - LoadInst *load = builder->CreateAlignedLoad(llvm_type_of(upgrade_type_for_storage(op->type)), ptr, llvm::Align(op->type.bytes())); - add_tbaa_metadata(load, op->name, op->index); - value = load; - return; + if (const auto *mt = mem_type.find(op->name)) { + if (*mt == MemoryType::AMXTile) { + const Ramp *ramp = op->index.as(); + internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; + Value *ptr = codegen_buffer_pointer(op->name, op->type, ramp->base); + LoadInst *load = builder->CreateAlignedLoad(llvm_type_of(upgrade_type_for_storage(op->type)), ptr, llvm::Align(op->type.bytes())); + add_tbaa_metadata(load, op->name, op->index); + value = load; + return; + } } CodeGen_Posix::visit(op); } void CodeGen_X86::visit(const Store *op) { - if (mem_type.contains(op->name) && mem_type.get(op->name) == MemoryType::AMXTile) { - Value *val = codegen(op->value); - Halide::Type value_type = op->value.type(); - const Ramp *ramp = op->index.as(); - internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; - Value *ptr = codegen_buffer_pointer(op->name, value_type, ramp->base); - StoreInst *store = builder->CreateAlignedStore(val, ptr, llvm::Align(value_type.bytes())); - add_tbaa_metadata(store, op->name, op->index); - return; + if (const auto *mt = mem_type.find(op->name)) { + if (*mt == MemoryType::AMXTile) { + Value *val = codegen(op->value); + Halide::Type value_type = op->value.type(); + const Ramp *ramp = op->index.as(); + internal_assert(ramp) << "Expected AMXTile to have index ramp\n"; + Value *ptr = codegen_buffer_pointer(op->name, value_type, ramp->base); + 
StoreInst *store = builder->CreateAlignedStore(val, ptr, llvm::Align(value_type.bytes())); + add_tbaa_metadata(store, op->name, op->index); + return; + } } CodeGen_Posix::visit(op); } diff --git a/src/EliminateBoolVectors.cpp b/src/EliminateBoolVectors.cpp index cebfe0f0019b..62cdbdbef5b5 100644 --- a/src/EliminateBoolVectors.cpp +++ b/src/EliminateBoolVectors.cpp @@ -15,8 +15,8 @@ class EliminateBoolVectors : public IRMutator { Scope lets; Expr visit(const Variable *op) override { - if (lets.contains(op->name)) { - return Variable::make(lets.get(op->name), op->name); + if (const Type *t = lets.find(op->name)) { + return Variable::make(*t, op->name); } else { return op; } diff --git a/src/ExprUsesVar.h b/src/ExprUsesVar.h index 3bf129d259f7..84c3f7ae23d4 100644 --- a/src/ExprUsesVar.h +++ b/src/ExprUsesVar.h @@ -36,8 +36,8 @@ class ExprUsesVars : public IRGraphVisitor { void visit_name(const std::string &name) { if (vars.contains(name)) { result = true; - } else if (scope.contains(name)) { - include(scope.get(name)); + } else if (const Expr *e = scope.find(name)) { + IRGraphVisitor::include(*e); } } diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index febd88d2399b..d453d0134c29 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -1118,8 +1118,8 @@ class SubstituteInWideningLets : public IRMutator { Scope replacements; Expr visit(const Variable *op) override { - if (replacements.contains(op->name)) { - return replacements.get(op->name); + if (const Expr *e = replacements.find(op->name)) { + return *e; } else { return op; } diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index ef5a75344bb8..abde50d62e1f 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -1140,21 +1140,21 @@ class ExtractRegisterAllocations : public IRMutator { } Expr visit(const Load *op) override { - string new_name = op->name; - if (alloc_renaming.contains(op->name)) { - new_name = alloc_renaming.get(op->name); + const string *new_name = alloc_renaming.find(op->name); + if (!new_name) { + new_name = &(op->name); } - return Load::make(op->type, new_name, mutate(op->index), + return Load::make(op->type, *new_name, mutate(op->index), op->image, op->param, mutate(op->predicate), op->alignment); } Stmt visit(const Store *op) override { - string new_name = op->name; - if (alloc_renaming.contains(op->name)) { - new_name = alloc_renaming.get(op->name); + const string *new_name = alloc_renaming.find(op->name); + if (!new_name) { + new_name = &(op->name); } - return Store::make(new_name, mutate(op->value), mutate(op->index), + return Store::make(*new_name, mutate(op->value), mutate(op->index), op->param, mutate(op->predicate), op->alignment); } diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index b76a9eb1cfef..deabd95d1d1b 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1357,8 +1357,8 @@ class EliminateInterleaves : public IRMutator { } if (const Load *load = x.as()) { - if (buffers.contains(load->name)) { - return buffers.get(load->name) != BufferState::NotInterleaved; + if (const auto *state = buffers.find(load->name)) { + return *state != BufferState::NotInterleaved; } } @@ -1398,8 +1398,8 @@ class EliminateInterleaves : public IRMutator { } if (const Load *load = x.as()) { - if (buffers.contains(load->name)) { - return buffers.get(load->name) != BufferState::NotInterleaved; + if (const auto *state = buffers.find(load->name)) { + return *state != BufferState::NotInterleaved; } } @@ -1816,34 +1816,33 @@ class 
EliminateInterleaves : public IRMutator { Expr value = mutate(op->value); Expr index = mutate(op->index); - if (buffers.contains(op->name)) { + if (BufferState *state = buffers.shallow_find(op->name)) { // When inspecting the stores to a buffer, update the state. - BufferState &state = buffers.ref(op->name); if (!is_const_one(predicate) || !op->value.type().is_vector()) { // TODO(psuriana): This store is predicated. Mark the buffer as // not interleaved for now. - state = BufferState::NotInterleaved; + *state = BufferState::NotInterleaved; } else if (yields_removable_interleave(value)) { // The value yields a removable interleave. If we aren't tracking // this buffer, mark it as interleaved. - if (state == BufferState::Unknown) { - state = BufferState::Interleaved; + if (*state == BufferState::Unknown) { + *state = BufferState::Interleaved; } } else if (!yields_interleave(value)) { // The value does not yield an interleave. Mark the // buffer as not interleaved. - state = BufferState::NotInterleaved; + *state = BufferState::NotInterleaved; } else { // If the buffer yields an interleave, but is not an // interleave itself, we don't want to change the // buffer state. } - internal_assert(aligned_buffer_access.contains(op->name) && "Buffer not found in scope"); - bool &aligned_accesses = aligned_buffer_access.ref(op->name); + bool *aligned_accesses = aligned_buffer_access.shallow_find(op->name); + internal_assert(aligned_accesses) << "Buffer not found in scope"; int64_t aligned_offset = 0; if (!alignment_analyzer.is_aligned(op, &aligned_offset)) { - aligned_accesses = false; + *aligned_accesses = false; } } if (deinterleave_buffers.contains(op->name)) { @@ -1872,12 +1871,13 @@ class EliminateInterleaves : public IRMutator { // which is only true if any of the stores are // actually interleaved (and don't just yield an // interleave). 
- internal_assert(aligned_buffer_access.contains(op->name) && "Buffer not found in scope"); - bool &aligned_accesses = aligned_buffer_access.ref(op->name); + bool *aligned_accesses = aligned_buffer_access.shallow_find(op->name); + internal_assert(aligned_accesses) << "Buffer not found in scope"; + int64_t aligned_offset = 0; if (!alignment_analyzer.is_aligned(op, &aligned_offset)) { - aligned_accesses = false; + *aligned_accesses = false; } } else { // This is not a double vector load, so we can't diff --git a/src/LICM.cpp b/src/LICM.cpp index 641f4982a3e2..719b41442cfc 100644 --- a/src/LICM.cpp +++ b/src/LICM.cpp @@ -350,8 +350,8 @@ class GroupLoopInvariants : public IRMutator { const Scope &depth; void visit(const Variable *op) override { - if (depth.contains(op->name)) { - result = std::max(result, depth.get(op->name)); + if (const int *d = depth.find(op->name)) { + result = std::max(result, *d); } } diff --git a/src/LoopCarry.cpp b/src/LoopCarry.cpp index 050cdfbfc8d9..bfc2abc8ddf1 100644 --- a/src/LoopCarry.cpp +++ b/src/LoopCarry.cpp @@ -27,8 +27,8 @@ Expr is_linear(const Expr &e, const Scope &linear) { return Expr(); } if (const Variable *v = e.as()) { - if (linear.contains(v->name)) { - return linear.get(v->name); + if (const Expr *e = linear.find(v->name)) { + return *e; } else { return make_zero(v->type); } @@ -140,18 +140,17 @@ class StepForwards : public IRGraphMutator { using IRGraphMutator::visit; Expr visit(const Variable *op) override { - if (linear.contains(op->name)) { - Expr step = linear.get(op->name); - if (!step.defined()) { + if (const Expr *step = linear.find(op->name)) { + if (!step->defined()) { // It's non-linear success = false; return op; - } else if (is_const_zero(step)) { + } else if (is_const_zero(*step)) { // It's a known inner constant return op; } else { // It's linear - return Expr(op) + step; + return Expr(op) + *step; } } else { // It's some external constant diff --git a/src/LowerWarpShuffles.cpp b/src/LowerWarpShuffles.cpp index 79332c9336e5..ad48c37db78f 100644 --- a/src/LowerWarpShuffles.cpp +++ b/src/LowerWarpShuffles.cpp @@ -149,8 +149,8 @@ class DetermineAllocStride : public IRVisitor { } else if (const Variable *var = e.as()) { if (var->name == lane_var) { return 1; - } else if (dependent_vars.contains(var->name)) { - return dependent_vars.get(var->name); + } else if (const Expr *e = dependent_vars.find(var->name)) { + return *e; } else { return 0; } @@ -475,8 +475,9 @@ class LowerWarpShuffles : public IRMutator { if ((lt && equal(lt->a, this_lane) && is_const(lt->b)) || (le && equal(le->a, this_lane) && is_const(le->b))) { Expr condition = mutate(op->condition); - internal_assert(bounds.contains(this_lane_name)); - Interval interval = bounds.get(this_lane_name); + const Interval *in = bounds.find(this_lane_name); + internal_assert(in); + Interval interval = *in; interval.max = lt ? simplify(lt->b - 1) : le->b; ScopedBinding bind(bounds, this_lane_name, interval); Stmt then_case = mutate(op->then_case); @@ -488,10 +489,10 @@ class LowerWarpShuffles : public IRMutator { } Stmt visit(const Store *op) override { - if (allocation_info.contains(op->name)) { + if (const auto *alloc = allocation_info.find(op->name)) { Expr idx = mutate(op->index); Expr value = mutate(op->value); - Expr stride = allocation_info.get(op->name).stride; + Expr stride = alloc->stride; internal_assert(stride.defined() && warp_size.defined()); // Reduce the index to an index in my own stripe. 
We have @@ -639,9 +640,9 @@ class LowerWarpShuffles : public IRMutator { } Expr visit(const Load *op) override { - if (allocation_info.contains(op->name)) { + if (const auto *alloc = allocation_info.find(op->name)) { Expr idx = mutate(op->index); - Expr stride = allocation_info.get(op->name).stride; + Expr stride = alloc->stride; // Break the index into lane and stripe components Expr lane = simplify(reduce_expr(idx / stride, warp_size, bounds), true, bounds); diff --git a/src/ModulusRemainder.cpp b/src/ModulusRemainder.cpp index cfccce1da786..13b3c72a181d 100644 --- a/src/ModulusRemainder.cpp +++ b/src/ModulusRemainder.cpp @@ -110,8 +110,8 @@ void ComputeModulusRemainder::visit(const Reinterpret *) { } void ComputeModulusRemainder::visit(const Variable *op) { - if (scope.contains(op->name)) { - result = scope.get(op->name); + if (const auto *m = scope.find(op->name)) { + result = *m; } else { result = ModulusRemainder{}; } diff --git a/src/Monotonic.cpp b/src/Monotonic.cpp index dd8e17d5b177..fee151f00a22 100644 --- a/src/Monotonic.cpp +++ b/src/Monotonic.cpp @@ -280,8 +280,8 @@ class DerivativeBounds : public IRVisitor { void visit(const Variable *op) override { if (op->name == var) { result = ConstantInterval::single_point(1); - } else if (scope.contains(op->name)) { - result = scope.get(op->name); + } else if (const auto *r = scope.find(op->name)) { + result = *r; } else { result = ConstantInterval::single_point(0); } diff --git a/src/Prefetch.cpp b/src/Prefetch.cpp index c0fb1f5c9a64..144b1950c5cd 100644 --- a/src/Prefetch.cpp +++ b/src/Prefetch.cpp @@ -86,10 +86,9 @@ class InjectPrefetch : public IRMutator { using IRMutator::visit; Box get_buffer_bounds(const string &name, int dims) { - if (buffer_bounds.contains(name)) { - const Box &b = buffer_bounds.ref(name); - internal_assert((int)b.size() == dims); - return b; + if (const Box *b = buffer_bounds.find(name)) { + internal_assert((int)b->size() == dims); + return *b; } // It is an external buffer. diff --git a/src/PrintLoopNest.cpp b/src/PrintLoopNest.cpp index 52f1c319951a..9d38efaaf80a 100644 --- a/src/PrintLoopNest.cpp +++ b/src/PrintLoopNest.cpp @@ -94,12 +94,16 @@ class PrintLoopNest : public IRVisitor { Expr min_val = op->min, extent_val = op->extent; const Variable *min_var = min_val.as(); const Variable *extent_var = extent_val.as(); - if (min_var && constants.contains(min_var->name)) { - min_val = constants.get(min_var->name); + if (min_var) { + if (const Expr *e = constants.find(min_var->name)) { + min_val = *e; + } } - if (extent_var && constants.contains(extent_var->name)) { - extent_val = constants.get(extent_var->name); + if (extent_var) { + if (const Expr *e = constants.find(extent_var->name)) { + extent_val = *e; + } } if (extent_val.defined() && is_const(extent_val) && @@ -151,9 +155,8 @@ class PrintLoopNest : public IRVisitor { void visit(const LetStmt *op) override { if (is_const(op->value)) { - constants.push(op->name, op->value); + ScopedBinding bind(constants, op->name, op->value); op->body.accept(this); - constants.pop(op->name); } else { op->body.accept(this); } diff --git a/src/Scope.h b/src/Scope.h index 9d1cc43e1164..94d9eb9c165b 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -150,7 +150,39 @@ class Scope { return iter->second.top_ref(); } - /** Tests if a name is in scope */ + /** Returns a const pointer to an entry if it exists in this scope or any + * containing scope, or nullptr if it does not. Use this instead of if + * (scope.contains(foo)) { ... scope.get(foo) ... } to avoid doing two + * lookups. 
*/ + template::value>::type> + const T2 *find(const std::string &name) const { + typename std::map>::const_iterator iter = table.find(name); + if (iter == table.end() || iter->second.empty()) { + if (containing_scope) { + return containing_scope->find(name); + } else { + return nullptr; + } + } + return &(iter->second.top_ref()); + } + + /** A version of find that returns a non-const pointer, but ignores + * containing scope. */ + template::value>::type> + T2 *shallow_find(const std::string &name) { + typename std::map>::iterator iter = table.find(name); + if (iter == table.end() || iter->second.empty()) { + return nullptr; + } else { + return &(iter->second.top_ref()); + } + } + + /** Tests if a name is in scope. If you plan to use the value if it is, call + * find instead. */ bool contains(const std::string &name) const { typename std::map>::const_iterator iter = table.find(name); if (iter == table.end() || iter->second.empty()) { @@ -173,19 +205,28 @@ class Scope { } } - /** Add a new (name, value) pair to the current scope. Hide old - * values that have this name until we pop this name. + struct PushToken { + typename std::map>::iterator iter; + }; + + /** Add a new (name, value) pair to the current scope. Hide old values that + * have this name until we pop this name. Returns a token that can be used + * to pop the same value without doing a fresh lookup. */ template::value>::type> - void push(const std::string &name, T2 &&value) { - table[name].push(std::forward(value)); + PushToken push(const std::string &name, T2 &&value) { + auto it = table.try_emplace(name).first; + it->second.push(std::forward(value)); + return PushToken{it}; } template::value>::type> - void push(const std::string &name) { - table[name].push(); + PushToken push(const std::string &name) { + auto it = table.try_emplace(name).first; + it->second.push(); + return PushToken{it}; } /** A name goes out of scope. Restore whatever its old value @@ -201,6 +242,14 @@ class Scope { } } + /** Pop a name using a token returned by push instead of a string. */ + void pop(PushToken p) { + p.iter->second.pop(); + if (p.iter->second.empty()) { + table.erase(p.iter); + } + } + /** Iterate through the scope. Does not capture any containing scope. */ class const_iterator { typename std::map>::const_iterator iter; @@ -271,20 +320,17 @@ std::ostream &operator<<(std::ostream &stream, const Scope &s) { template struct ScopedBinding { Scope *scope = nullptr; - std::string name; + typename Scope::PushToken token; ScopedBinding() = default; ScopedBinding(Scope &s, const std::string &n, T value) - : scope(&s), name(n) { - scope->push(name, std::move(value)); + : scope(&s), token(scope->push(n, std::move(value))) { } ScopedBinding(bool condition, Scope &s, const std::string &n, const T &value) - : scope(condition ? &s : nullptr), name(n) { - if (condition) { - scope->push(name, value); - } + : scope(condition ? &s : nullptr), + token(condition ? 
scope->push(n, value) : typename Scope::PushToken{}) { } bool bound() const { @@ -293,7 +339,7 @@ struct ScopedBinding { ~ScopedBinding() { if (scope) { - scope->pop(name); + scope->pop(token); } } @@ -301,7 +347,7 @@ struct ScopedBinding { ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), - name(std::move(that.name)) { + token(that.token) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } @@ -313,20 +359,17 @@ struct ScopedBinding { template<> struct ScopedBinding { Scope<> *scope; - std::string name; + Scope<>::PushToken token; ScopedBinding(Scope<> &s, const std::string &n) - : scope(&s), name(n) { - scope->push(name); + : scope(&s), token(scope->push(n)) { } ScopedBinding(bool condition, Scope<> &s, const std::string &n) - : scope(condition ? &s : nullptr), name(n) { - if (condition) { - scope->push(name); - } + : scope(condition ? &s : nullptr), + token(condition ? scope->push(n) : Scope<>::PushToken{}) { } ~ScopedBinding() { if (scope) { - scope->pop(name); + scope->pop(token); } } @@ -334,7 +377,7 @@ struct ScopedBinding { ScopedBinding(const ScopedBinding &that) = delete; ScopedBinding(ScopedBinding &&that) noexcept : scope(that.scope), - name(std::move(that.name)) { + token(that.token) { // The move constructor must null out scope, so we don't try to pop it that.scope = nullptr; } diff --git a/src/Simplify.cpp b/src/Simplify.cpp index 339ef2917c83..61cf7886cb70 100644 --- a/src/Simplify.cpp +++ b/src/Simplify.cpp @@ -34,8 +34,8 @@ Simplify::Simplify(bool r, const Scope *bi, const Scopecontains(iter.name())) { - bounds.alignment = ai->get(iter.name()); + if (const auto *a = ai->find(iter.name())) { + bounds.alignment = *a; } if (bounds.min_defined || bounds.max_defined || bounds.alignment.modulus != 1) { @@ -74,18 +74,18 @@ std::pair, bool> Simplify::mutate_with_changes(const std::vect void Simplify::found_buffer_reference(const string &name, size_t dimensions) { for (size_t i = 0; i < dimensions; i++) { string stride = name + ".stride." + std::to_string(i); - if (var_info.contains(stride)) { - var_info.ref(stride).old_uses++; + if (auto *info = var_info.shallow_find(stride)) { + info->old_uses++; } string min = name + ".min." 
+ std::to_string(i); - if (var_info.contains(min)) { - var_info.ref(min).old_uses++; + if (auto *info = var_info.shallow_find(min)) { + info->old_uses++; } } - if (var_info.contains(name)) { - var_info.ref(name).old_uses++; + if (auto *info = var_info.shallow_find(name)) { + info->old_uses++; } } @@ -187,8 +187,8 @@ void Simplify::ScopedFact::learn_upper_bound(const Variable *v, int64_t val) { ExprInfo b; b.max_defined = true; b.max = val; - if (simplify->bounds_and_alignment_info.contains(v->name)) { - b.intersect(simplify->bounds_and_alignment_info.get(v->name)); + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { + b.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, b); bounds_pop_list.push_back(v); @@ -198,8 +198,8 @@ void Simplify::ScopedFact::learn_lower_bound(const Variable *v, int64_t val) { ExprInfo b; b.min_defined = true; b.min = val; - if (simplify->bounds_and_alignment_info.contains(v->name)) { - b.intersect(simplify->bounds_and_alignment_info.get(v->name)); + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { + b.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, b); bounds_pop_list.push_back(v); @@ -228,10 +228,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { // TODO: Visiting it again is inefficient Simplify::ExprInfo expr_info; simplify->mutate(eq->b, &expr_info); - if (simplify->bounds_and_alignment_info.contains(v->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { // We already know something about this variable and don't want to suppress it. - auto existing_knowledge = simplify->bounds_and_alignment_info.get(v->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, expr_info); bounds_pop_list.push_back(v); @@ -245,10 +244,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { // TODO: Visiting it again is inefficient Simplify::ExprInfo expr_info; simplify->mutate(eq->a, &expr_info); - if (simplify->bounds_and_alignment_info.contains(vb->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(vb->name)) { // We already know something about this variable and don't want to suppress it. - auto existing_knowledge = simplify->bounds_and_alignment_info.get(vb->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(vb->name, expr_info); bounds_pop_list.push_back(vb); @@ -257,10 +255,9 @@ void Simplify::ScopedFact::learn_true(const Expr &fact) { Simplify::ExprInfo expr_info; expr_info.alignment.modulus = *modulus; expr_info.alignment.remainder = *remainder; - if (simplify->bounds_and_alignment_info.contains(v->name)) { + if (const auto *info = simplify->bounds_and_alignment_info.find(v->name)) { // We already know something about this variable and don't want to suppress it. 
- auto existing_knowledge = simplify->bounds_and_alignment_info.get(v->name); - expr_info.intersect(existing_knowledge); + expr_info.intersect(*info); } simplify->bounds_and_alignment_info.push(v->name, expr_info); bounds_pop_list.push_back(v); @@ -417,8 +414,8 @@ bool can_prove(Expr e, const Scope &bounds) { Expr visit(const Variable *op) override { auto it = vars.find(op->name); - if (lets.contains(op->name)) { - return Variable::make(op->type, lets.get(op->name)); + if (const std::string *n = lets.find(op->name)) { + return Variable::make(op->type, *n); } else if (it == vars.end()) { std::string name = "v" + std::to_string(count++); vars[op->name] = name; diff --git a/src/Simplify_Exprs.cpp b/src/Simplify_Exprs.cpp index a8e5fcce1a8d..b5fcc96ac0cd 100644 --- a/src/Simplify_Exprs.cpp +++ b/src/Simplify_Exprs.cpp @@ -221,35 +221,32 @@ Expr Simplify::visit(const VectorReduce *op, ExprInfo *bounds) { } Expr Simplify::visit(const Variable *op, ExprInfo *bounds) { - if (bounds_and_alignment_info.contains(op->name)) { - const ExprInfo &b = bounds_and_alignment_info.get(op->name); + if (const ExprInfo *b = bounds_and_alignment_info.find(op->name)) { if (bounds) { - *bounds = b; + *bounds = *b; } - if (b.min_defined && b.max_defined && b.min == b.max) { - return make_const(op->type, b.min); + if (b->min_defined && b->max_defined && b->min == b->max) { + return make_const(op->type, b->min); } } - if (var_info.contains(op->name)) { - auto &info = var_info.ref(op->name); - + if (auto *info = var_info.shallow_find(op->name)) { // if replacement is defined, we should substitute it in (unless // it's a var that has been hidden by a nested scope). - if (info.replacement.defined()) { - internal_assert(info.replacement.type() == op->type) + if (info->replacement.defined()) { + internal_assert(info->replacement.type() == op->type) << "Cannot replace variable " << op->name << " of type " << op->type - << " with expression of type " << info.replacement.type() << "\n"; - info.new_uses++; + << " with expression of type " << info->replacement.type() << "\n"; + info->new_uses++; // We want to remutate the replacement, because we may be // injecting it into a context where it is known to be a // constant (e.g. due to an if). - return mutate(info.replacement, bounds); + return mutate(info->replacement, bounds); } else { // This expression was not something deemed // substitutable - no replacement is defined. - info.old_uses++; + info->old_uses++; return op; } } else { @@ -321,15 +318,14 @@ Expr Simplify::visit(const Load *op, ExprInfo *bounds) { // unreachable loads. 
if (is_const_one(op->predicate)) { string alloc_extent_name = op->name + ".total_extent_bytes"; - if (bounds_and_alignment_info.contains(alloc_extent_name)) { + if (const auto *alloc_info = bounds_and_alignment_info.find(alloc_extent_name)) { if (index_info.max_defined && index_info.max < 0) { in_unreachable = true; return unreachable(op->type); } - const ExprInfo &alloc_info = bounds_and_alignment_info.get(alloc_extent_name); - if (alloc_info.max_defined && index_info.min_defined) { + if (alloc_info->max_defined && index_info.min_defined) { int index_min_bytes = index_info.min * op->type.bytes(); - if (index_min_bytes > alloc_info.max) { + if (index_min_bytes > alloc_info->max) { in_unreachable = true; return unreachable(op->type); } diff --git a/src/Simplify_Stmts.cpp b/src/Simplify_Stmts.cpp index 11b146ecdc6a..f6cb81345961 100644 --- a/src/Simplify_Stmts.cpp +++ b/src/Simplify_Stmts.cpp @@ -305,19 +305,19 @@ Stmt Simplify::visit(const Store *op) { // but perhaps the branch was hard to prove constant true or false. This // provides an alternative mechanism to simplify these unreachable stores. string alloc_extent_name = op->name + ".total_extent_bytes"; - if (is_const_one(op->predicate) && - bounds_and_alignment_info.contains(alloc_extent_name)) { - if (index_info.max_defined && index_info.max < 0) { - in_unreachable = true; - return Evaluate::make(unreachable()); - } - const ExprInfo &alloc_info = bounds_and_alignment_info.get(alloc_extent_name); - if (alloc_info.max_defined && index_info.min_defined) { - int index_min_bytes = index_info.min * op->value.type().bytes(); - if (index_min_bytes > alloc_info.max) { + if (is_const_one(op->predicate)) { + if (const auto *alloc_info = bounds_and_alignment_info.find(alloc_extent_name)) { + if (index_info.max_defined && index_info.max < 0) { in_unreachable = true; return Evaluate::make(unreachable()); } + if (alloc_info->max_defined && index_info.min_defined) { + int index_min_bytes = index_info.min * op->value.type().bytes(); + if (index_min_bytes > alloc_info->max) { + in_unreachable = true; + return Evaluate::make(unreachable()); + } + } } } diff --git a/src/SlidingWindow.cpp b/src/SlidingWindow.cpp index ab25ad32bc87..dfb50d714e37 100644 --- a/src/SlidingWindow.cpp +++ b/src/SlidingWindow.cpp @@ -69,10 +69,9 @@ class ExpandExpr : public IRMutator { const Scope &scope; Expr visit(const Variable *var) override { - if (scope.contains(var->name)) { - Expr expr = scope.get(var->name); - debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; - return expr; + if (const Expr *expr = scope.find(var->name)) { + debug(4) << "Fully expanded " << var->name << " -> " << *expr << "\n"; + return *expr; } else { return var; } diff --git a/src/Solve.cpp b/src/Solve.cpp index b25719cff8c7..09245d90bf24 100644 --- a/src/Solve.cpp +++ b/src/Solve.cpp @@ -786,17 +786,15 @@ class SolveExpression : public IRMutator { if (op->name == var) { uses_var = true; return op; - } else if (scope.contains(op->name)) { - CacheEntry e = scope.get(op->name); - uses_var = uses_var || e.uses_var; - failed = failed || e.failed; - return e.expr; - } else if (external_scope.contains(op->name)) { - Expr e = external_scope.get(op->name); + } else if (const CacheEntry *e = scope.find(op->name)) { + uses_var = uses_var || e->uses_var; + failed = failed || e->failed; + return e->expr; + } else if (const Expr *e = external_scope.find(op->name)) { // Expressions in the external scope haven't been solved // yet. 
This will either pull its solution from the cache, // or solve it and then put it into the cache. - return mutate(e); + return mutate(*e); } else { return op; } @@ -948,13 +946,13 @@ class SolveForInterval : public IRVisitor { void visit(const Variable *op) override { internal_assert(op->type.is_bool()); - if (scope.contains(op->name)) { + if (const Expr *e = scope.find(op->name)) { pair key = {op->name, target}; auto it = solved_vars.find(key); if (it != solved_vars.end()) { result = it->second; } else { - scope.get(op->name).accept(this); + e->accept(this); solved_vars[key] = result; } } else { diff --git a/src/StageStridedLoads.cpp b/src/StageStridedLoads.cpp index feeab56a4122..723fc738ce51 100644 --- a/src/StageStridedLoads.cpp +++ b/src/StageStridedLoads.cpp @@ -103,8 +103,8 @@ class FindStridedLoads : public IRVisitor { if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) { const IRNode *s = scope; const Allocate *a = nullptr; - if (allocation_scope.contains(op->name)) { - a = allocation_scope.get(op->name); + if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { + a = *a_ptr; } found_loads[Key{op->name, base, stride, r->lanes, op->type, a, s}][offset].push_back(op); } @@ -161,8 +161,8 @@ class ReplaceStridedLoads : public IRMutator { protected: Expr visit(const Load *op) override { const Allocate *alloc = nullptr; - if (allocation_scope.contains(op->name)) { - alloc = allocation_scope.get(op->name); + if (const Allocate *const *a_ptr = allocation_scope.find(op->name)) { + alloc = *a_ptr; } auto it = replacements.find({alloc, op}); if (it != replacements.end()) { diff --git a/src/StmtToHTML.cpp b/src/StmtToHTML.cpp index 9c317ba35525..79cf6563551e 100644 --- a/src/StmtToHTML.cpp +++ b/src/StmtToHTML.cpp @@ -1134,8 +1134,8 @@ class HTMLCodePrinter : public IRVisitor { std::string variable(const std::string &x, const std::string &tooltip) { int id; - if (scope.contains(x)) { - id = scope.get(x); + if (const int *i = scope.find(x)) { + id = *i; } else { id = gen_unique_id(); scope.push(x, id); diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index d7e7c50002f6..13d7d6475120 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -31,10 +31,9 @@ class ExpandExpr : public IRMutator { const Scope &scope; Expr visit(const Variable *var) override { - if (scope.contains(var->name)) { - Expr expr = scope.get(var->name); + if (const Expr *e = scope.find(var->name)) { // Mutate the expression, so lets can get replaced recursively. 
- expr = mutate(expr); + Expr expr = mutate(*e); debug(4) << "Fully expanded " << var->name << " -> " << expr << "\n"; return expr; } else { diff --git a/src/UniquifyVariableNames.cpp b/src/UniquifyVariableNames.cpp index 26689ec34633..85a6ba521771 100644 --- a/src/UniquifyVariableNames.cpp +++ b/src/UniquifyVariableNames.cpp @@ -104,10 +104,9 @@ class UniquifyVariableNames : public IRMutator { } Expr visit(const Variable *op) override { - if (renaming.contains(op->name)) { - string new_name = renaming.get(op->name); - if (new_name != op->name) { - return Variable::make(op->type, new_name); + if (const string *new_name = renaming.find(op->name)) { + if (*new_name != op->name) { + return Variable::make(op->type, *new_name); } } return op; diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 6d10d2e9d5f3..0745a34a9d39 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -297,8 +297,8 @@ bool is_interleaved_ramp(const Expr &e, const Scope &scope, InterleavedRam return true; } } else if (const Variable *var = e.as()) { - if (scope.contains(var->name)) { - return is_interleaved_ramp(scope.get(var->name), scope, result); + if (const Expr *e = scope.find(var->name)) { + return is_interleaved_ramp(*e, scope, result); } } return false; From 4399ed819bbc23f6d89a0baece854419587120d2 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Thu, 22 Feb 2024 20:07:47 -0800 Subject: [PATCH 073/186] Add Intel APX and AVX10 target flags and LLVM attribute setting. (#8052) * Add target flag and LLVM enables support for Intel AVX10. * Go ahead and add APX support as well. Correct spelling of APX target attributes. * Implement AVX10 and APX cpu feature detection. (As yet untested.) * Expand target feature flags for AVX10. --------- Co-authored-by: Steven Johnson --- .../src/halide/halide_/PyEnums.cpp | 2 + src/CodeGen_X86.cpp | 43 ++++++++++++++++--- src/Target.cpp | 39 ++++++++++++++++- src/Target.h | 2 + src/runtime/HalideRuntime.h | 2 + test/correctness/simd_op_check_x86.cpp | 2 + 6 files changed, 83 insertions(+), 7 deletions(-) diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp index e6cede6c6edb..4edd8029c340 100644 --- a/python_bindings/src/halide/halide_/PyEnums.cpp +++ b/python_bindings/src/halide/halide_/PyEnums.cpp @@ -192,6 +192,8 @@ void define_enums(py::module &m) { .value("VulkanV12", Target::VulkanV12) .value("VulkanV13", Target::VulkanV13) .value("Semihosting", Target::Feature::Semihosting) + .value("AVX10_1", Target::Feature::AVX10_1) + .value("X86APX", Target::Feature::X86APX) .value("FeatureEnd", Target::Feature::FeatureEnd); py::enum_(m, "TypeCode") diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 0320e64b5ae5..b0df27af0f2f 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -28,6 +28,14 @@ namespace { // existing flags, so that instruction patterns can just check for the // oldest feature flag that supports an instruction. 
Target complete_x86_target(Target t) { + if (t.has_feature(Target::AVX10_1)) { + if (t.vector_bits >= 256) { + t.set_feature(Target::AVX2); + } + if (t.vector_bits >= 512) { + t.set_feature(Target::AVX512_SapphireRapids); + } + } if (t.has_feature(Target::AVX512_SapphireRapids)) { t.set_feature(Target::AVX512_Zen4); } @@ -54,6 +62,7 @@ Target complete_x86_target(Target t) { if (t.has_feature(Target::AVX)) { t.set_feature(Target::SSE41); } + return t; } @@ -1035,9 +1044,31 @@ string CodeGen_X86::mattrs() const { } #if LLVM_VERSION >= 180 if (gather_might_be_slow(target)) { - attrs.push_back("+prefer-no-gather"); + attrs.emplace_back("+prefer-no-gather"); } #endif + + if (target.has_feature(Target::AVX10_1)) { + switch (target.vector_bits) { + case 256: + attrs.emplace_back("+avx10.1-256"); + break; + case 512: + attrs.emplace_back("+avx10.1-512"); + break; + default: + user_error << "AVX10 only supports 256 or 512 bit variants at present.\n"; + break; + } + } + + if (target.has_feature(Target::X86APX)) { + attrs.emplace_back("+egpr"); + attrs.emplace_back("+push2pop2"); + attrs.emplace_back("+ppx"); + attrs.emplace_back("+ndd"); + } + return join_strings(attrs, ","); } @@ -1046,10 +1077,12 @@ bool CodeGen_X86::use_soft_float_abi() const { } int CodeGen_X86::native_vector_bits() const { - if (target.has_feature(Target::AVX512) || - target.has_feature(Target::AVX512_Skylake) || - target.has_feature(Target::AVX512_KNL) || - target.has_feature(Target::AVX512_Cannonlake)) { + if (target.has_feature(Target::AVX10_1)) { + return target.vector_bits; + } else if (target.has_feature(Target::AVX512) || + target.has_feature(Target::AVX512_Skylake) || + target.has_feature(Target::AVX512_KNL) || + target.has_feature(Target::AVX512_Cannonlake)) { return 512; } else if (target.has_feature(Target::AVX) || target.has_feature(Target::AVX2)) { diff --git a/src/Target.cpp b/src/Target.cpp index 082b5103bd0b..ac96ae019065 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -251,6 +251,8 @@ Target calculate_host_target() { // Call cpuid with eax=7, ecx=0 int info2[4]; cpuid(info2, 7, 0); + int info3[4]; + cpuid(info3, 7, 1); const uint32_t avx2 = 1U << 5; const uint32_t avx512f = 1U << 16; const uint32_t avx512dq = 1U << 17; @@ -283,8 +285,6 @@ Target calculate_host_target() { const uint32_t avxvnni = 1U << 4; // avxvnni (note, not avx512vnni) result in eax const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1) - int info3[4]; - cpuid(info3, 7, 1); // TODO: port to family/model -based detection. if ((info3[0] & avxvnni) == avxvnni && (info3[0] & avx512bf16) == avx512bf16) { @@ -292,7 +292,40 @@ Target calculate_host_target() { } } } + + // AVX10 converged vector instructions. + const uint32_t avx10 = 1U << 19; + if (info2[3] & avx10) { + int info_avx10[4]; + cpuid(info_avx10, 0x24, 0x0); + + // This checks that the AVX10 version is greater than zero. + // It isn't really needed as for now only one version exists, but + // the docs indicate bits 0:7 of EBX should be >= 0 so... + if ((info[1] & 0xff) >= 1) { + initial_features.push_back(Target::AVX10_1); + + const uint32_t avx10_128 = 1U << 16; + const uint32_t avx10_256 = 1U << 17; + const uint32_t avx10_512 = 1U << 18; + // Choose the maximum one that is available. + if (info[1] & avx10_512) { + vector_bits = 512; + } else if (info[1] & avx10_256) { + vector_bits = 256; + } else if (info[1] & avx10_128) { // Not clear it is worth turning on AVX10 for this case. + vector_bits = 128; + } + } + } + + // APX register extensions, etc. 
+ const uint32_t apx = 1U << 21; + if (info3[3] & apx) { + initial_features.push_back(Target::X86APX); + } } + #endif #endif #endif @@ -556,6 +589,8 @@ const std::map feature_name_map = { {"vk_v12", Target::VulkanV12}, {"vk_v13", Target::VulkanV13}, {"semihosting", Target::Semihosting}, + {"avx10_1", Target::AVX10_1}, + {"x86apx", Target::X86APX}, // NOTE: When adding features to this map, be sure to update PyEnums.cpp as well. }; diff --git a/src/Target.h b/src/Target.h index 20730a313883..3bc586822f75 100644 --- a/src/Target.h +++ b/src/Target.h @@ -167,6 +167,8 @@ struct Target { VulkanV12 = halide_target_feature_vulkan_version12, VulkanV13 = halide_target_feature_vulkan_version13, Semihosting = halide_target_feature_semihosting, + AVX10_1 = halide_target_feature_avx10_1, + X86APX = halide_target_feature_x86_apx, FeatureEnd = halide_target_feature_end }; Target() = default; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index b235117e9f5e..62fbaeb66d43 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1425,6 +1425,8 @@ typedef enum halide_target_feature_t { halide_target_feature_vulkan_version12, ///< Enable Vulkan v1.2 runtime target support. halide_target_feature_vulkan_version13, ///< Enable Vulkan v1.3 runtime target support. halide_target_feature_semihosting, ///< Used together with Target::NoOS for the baremetal target built with semihosting library and run with semihosting mode where minimum I/O communication with a host PC is available. + halide_target_feature_avx10_1, ///< Intel AVX10 version 1 support. vector_bits is used to indicate width. + halide_target_feature_x86_apx, ///< Intel x86 APX support. Covers initial set of features released as APX: egpr,push2pop2,ppx,ndd . halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. } halide_target_feature_t; diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp index b4c086ce0fc3..8286bc68f9e6 100644 --- a/test/correctness/simd_op_check_x86.cpp +++ b/test/correctness/simd_op_check_x86.cpp @@ -673,5 +673,7 @@ int main(int argc, char **argv) { Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake"), Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4"), Target("x86-64-linux-sse41-avx-f16c-fma-avx2-avx512-avx512_skylake-avx512_cannonlake-avx512_zen4-avx512_sapphirerapids"), + // Can be enabled when AVX10 and APX support are stable in LLVM. + // Target("x86-64-linux-avx10_1-vector_bits_256-x86apx"), }); } From aae84f69ebbffe1689f25ab4bd80a2143b626bf2 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 09:56:17 -0800 Subject: [PATCH 074/186] Use a caching version of stmt_uses_vars in TightenProducerConsumer nodes (#8102) We were making a very large number stmt_uses_vars queries that covered the same sub-stmts. I solved it by adding a cache. Speeds up local laplacian lowering by 10% by basically removing this pass from the profile. 
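To make the caching idea concrete: the `CachingStmtUsesVars` class added below keys a memo table on the sub-`Stmt` being queried, so a shared sub-statement is analyzed at most once no matter how many times the pass asks about it. The following is only a stripped-down, self-contained sketch of that memoization pattern, not Halide code; `Node`, `uses_vars`, and the `cache` map are invented names standing in for the real IR types.

```cpp
// Hypothetical DAG types for illustration only; the real pass caches
// stmt_uses_vars results keyed on the Stmt handle itself.
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Node {
    bool uses_var = false;                        // stand-in for "mentions a queried variable"
    std::vector<std::shared_ptr<Node>> children;  // shared sub-statements
};

// Memoized query: each distinct node is analyzed at most once, even if it is
// reachable along many paths, so repeated queries stay linear in the IR size.
bool uses_vars(const std::shared_ptr<Node> &n, std::map<const Node *, bool> &cache) {
    auto it = cache.find(n.get());
    if (it != cache.end()) {
        return it->second;  // shared subtree: reuse the earlier answer
    }
    bool result = n->uses_var;
    for (const auto &c : n->children) {
        result = uses_vars(c, cache) || result;
    }
    cache.emplace(n.get(), result);
    return result;
}

int main() {
    auto leaf = std::make_shared<Node>();
    leaf->uses_var = true;
    auto a = std::make_shared<Node>();
    auto b = std::make_shared<Node>();
    a->children = {leaf};
    b->children = {leaf, a};  // leaf is shared; it is only visited once
    std::map<const Node *, bool> cache;
    assert(uses_vars(b, cache));
    return 0;
}
```
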
Also a drive-by typo fix in Lower.cpp --- src/AsyncProducers.cpp | 80 +++++++++++++++++++++++++++++++++++------- src/Lower.cpp | 2 +- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/AsyncProducers.cpp b/src/AsyncProducers.cpp index 92012ccfe4c1..352219478923 100644 --- a/src/AsyncProducers.cpp +++ b/src/AsyncProducers.cpp @@ -569,11 +569,67 @@ class InitializeSemaphores : public IRMutator { } }; +// A class to support stmt_uses_vars queries that repeatedly hit the same +// sub-stmts. Used to support TightenProducerConsumerNodes below. +class CachingStmtUsesVars : public IRMutator { + const Scope<> &query; + bool found_use = false; + std::map cache; + + using IRMutator::visit; + Expr visit(const Variable *op) override { + found_use |= query.contains(op->name); + return op; + } + + Expr visit(const Call *op) override { + found_use |= query.contains(op->name); + IRMutator::visit(op); + return op; + } + + Stmt visit(const Provide *op) override { + found_use |= query.contains(op->name); + IRMutator::visit(op); + return op; + } + +public: + CachingStmtUsesVars(const Scope<> &q) + : query(q) { + } + + using IRMutator::mutate; + Stmt mutate(const Stmt &s) override { + auto it = cache.find(s); + if (it != cache.end()) { + found_use |= it->second; + } else { + bool old = found_use; + found_use = false; + Stmt stmt = IRMutator::mutate(s); + if (found_use) { + cache.emplace(s, true); + } else { + cache.emplace(s, false); + } + found_use |= old; + } + return s; + } + + bool check_stmt(const Stmt &s) { + found_use = false; + mutate(s); + return found_use; + } +}; + // Tighten the scope of consume nodes as much as possible to avoid needless synchronization. class TightenProducerConsumerNodes : public IRMutator { using IRMutator::visit; - Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope &scope) { + Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope<> &scope, CachingStmtUsesVars &uses_vars) { if (const LetStmt *let = body.as()) { Stmt orig = body; // 'orig' is only used to keep a reference to the let @@ -595,7 +651,7 @@ class TightenProducerConsumerNodes : public IRMutator { body = ProducerConsumer::make(name, is_producer, body); } else { // Recurse onto a non-let-node - body = make_producer_consumer(name, is_producer, body, scope); + body = make_producer_consumer(name, is_producer, body, scope, uses_vars); } for (auto it = containing_lets.rbegin(); it != containing_lets.rend(); it++) { @@ -611,7 +667,6 @@ class TightenProducerConsumerNodes : public IRMutator { vector sub_stmts; Stmt rest; do { - Stmt first = block->first; sub_stmts.push_back(block->first); rest = block->rest; block = rest.as(); @@ -619,18 +674,18 @@ class TightenProducerConsumerNodes : public IRMutator { sub_stmts.push_back(rest); for (Stmt &s : sub_stmts) { - if (stmt_uses_vars(s, scope)) { - s = make_producer_consumer(name, is_producer, s, scope); + if (uses_vars.check_stmt(s)) { + s = make_producer_consumer(name, is_producer, s, scope, uses_vars); } } return Block::make(sub_stmts); } else if (const ProducerConsumer *pc = body.as()) { - return ProducerConsumer::make(pc->name, pc->is_producer, make_producer_consumer(name, is_producer, pc->body, scope)); + return ProducerConsumer::make(pc->name, pc->is_producer, make_producer_consumer(name, is_producer, pc->body, scope, uses_vars)); } else if (const Realize *r = body.as()) { return Realize::make(r->name, r->types, r->memory_type, r->bounds, r->condition, - make_producer_consumer(name, 
is_producer, r->body, scope)); + make_producer_consumer(name, is_producer, r->body, scope, uses_vars)); } else { return ProducerConsumer::make(name, is_producer, body); } @@ -638,17 +693,18 @@ class TightenProducerConsumerNodes : public IRMutator { Stmt visit(const ProducerConsumer *op) override { Stmt body = mutate(op->body); - Scope scope; - scope.push(op->name, 0); + Scope<> scope; + scope.push(op->name); Function f = env.find(op->name)->second; if (f.outputs() == 1) { - scope.push(op->name + ".buffer", 0); + scope.push(op->name + ".buffer"); } else { for (int i = 0; i < f.outputs(); i++) { - scope.push(op->name + "." + std::to_string(i) + ".buffer", 0); + scope.push(op->name + "." + std::to_string(i) + ".buffer"); } } - return make_producer_consumer(op->name, op->is_producer, body, scope); + CachingStmtUsesVars uses_vars{scope}; + return make_producer_consumer(op->name, op->is_producer, body, scope, uses_vars); } const map &env; diff --git a/src/Lower.cpp b/src/Lower.cpp index 6b56f23fcff9..52c049b63c72 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -331,7 +331,7 @@ void lower_impl(const vector &output_funcs, debug(1) << "Simplifying...\n"; s = simplify(s); s = unify_duplicate_lets(s); - log("Lowering after second simplifcation:", s); + log("Lowering after second simplification:", s); debug(1) << "Reduce prefetch dimension...\n"; s = reduce_prefetch_dimension(s, t); From 2b5beb3dfd2e079d21bce146b09c6645e0ba7df5 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 17:11:47 -0800 Subject: [PATCH 075/186] Fix hoist_storage not handling condition correctly. (#8123) The allocation condition wasn't getting relaxed over the scope and loop vars like the extents were. --- src/StorageFlattening.cpp | 32 +++++++++++++++++++++++--------- test/correctness/skip_stages.cpp | 26 +++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 13d7d6475120..ba4cc9b8acca 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -293,23 +293,37 @@ class FlattenDimensions : public IRMutator { stmt = LetStmt::make(op->name + ".buffer", builder.build(), stmt); if (hoisted_storages_map.count(op->name) > 0) { HoistedStorageData &hoisted_storage_data = hoisted_storages[hoisted_storages_map[op->name]]; - vector bounded_extents; - for (const auto &e : allocation_extents) { - Expr expanded_extent = e; + + auto expand_and_bound = [&](Expr e) { // Iterate from innermost outwards for (auto it = hoisted_storages.rbegin(); it != hoisted_storages.rend(); it++) { - expanded_extent = expand_expr(expanded_extent, it->scope); + e = expand_expr(e, it->scope); if (it->name == op->name) { break; } } - expanded_extent = simplify(common_subexpression_elimination(expanded_extent)); - Interval bounds = bounds_of_expr_in_scope(expanded_extent, hoisted_storage_data.loop_vars); - user_assert(bounds.max.defined()) << "Couldn't infer the upper bound for the storage size of " << op->name << ", consider using bound_storage.\n"; - bounded_extents.push_back(bounds.max); + + e = simplify(common_subexpression_elimination(e)); + Interval bounds = bounds_of_expr_in_scope(e, hoisted_storage_data.loop_vars); + return bounds.max; + }; + + vector bounded_extents; + for (const auto &e : allocation_extents) { + Expr expanded_extent = expand_and_bound(e); + user_assert(expanded_extent.defined() && + !expanded_extent.same_as(Interval::pos_inf())) + << "Couldn't infer the upper bound for the storage size of " << op->name << ", 
consider using bound_storage.\n"; + bounded_extents.push_back(expanded_extent); + } + + Expr expanded_condition = expand_and_bound(condition); + if (!expanded_condition.defined() || + expanded_condition.same_as(Interval::pos_inf())) { + expanded_condition = const_true(); } - HoistedAllocationInfo hoisted_alloc(op->name, op->types[0], op->memory_type, bounded_extents, condition); + HoistedAllocationInfo hoisted_alloc(op->name, op->types[0], op->memory_type, bounded_extents, expanded_condition); hoisted_storage_data.hoisted_allocations.push_back(hoisted_alloc); } else { diff --git a/test/correctness/skip_stages.cpp b/test/correctness/skip_stages.cpp index ea298670b6bf..970966a78e30 100644 --- a/test/correctness/skip_stages.cpp +++ b/test/correctness/skip_stages.cpp @@ -27,7 +27,7 @@ void check_counts(int a = 0, int b = 0, int c = 0, int d = 0) { } int main(int argc, char **argv) { - Var x; + Var x, y; Param toggle1, toggle2; { @@ -201,6 +201,30 @@ int main(int argc, char **argv) { check_counts(11); } + { + // Check the interation with storage hoisting + + // This Func may or may not be loaded, depending on y + Func maybe_loaded("maybe_loaded"); + maybe_loaded(x, y) = x + y; + + // This Func may or may not be used, depending on y + Func maybe_used("maybe_used"); + maybe_used(x, y) = maybe_loaded(x, y); + + Func output("output"); + output(x, y) = select(y % 100 == 37, 0, maybe_used(x, y)); + + // The allocation condition depends on y, but the actual allocation + // happens at the root level. + maybe_loaded.compute_at(output, y).hoist_storage_root(); + maybe_used.compute_at(output, y).hoist_storage_root(); + + // This will fail to compile with an undefined symbol if we haven't + // handled the condition correctly. + output.realize({100, 100}); + } + printf("Success!\n"); return 0; } From 36d74a8cbf9c4129f608cd97d231961f1bd99c4c Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 17:56:59 -0800 Subject: [PATCH 076/186] Rewrite the skip stages lowering pass (#8115) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. 
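A minimal standalone sketch of why a pointer-or-null return works where an iterator does not (this is not the real `Scope` from src/Scope.h; `MiniScope` and its members are invented for illustration): the hit may live in a containing scope, so the result would belong to a different map than the one whose `end()` the caller holds.

```cpp
#include <cassert>
#include <map>
#include <string>
#include <utility>

template<typename T>
class MiniScope {
    std::map<std::string, T> table;
    const MiniScope *containing = nullptr;  // enclosing scope, searched on a miss
public:
    explicit MiniScope(const MiniScope *parent = nullptr) : containing(parent) {}
    void push(const std::string &k, T v) {
        table[k] = std::move(v);
    }
    // Returns a pointer into whichever scope holds the name, or nullptr.
    const T *find(const std::string &k) const {
        auto it = table.find(k);
        if (it != table.end()) {
            return &it->second;
        }
        return containing ? containing->find(k) : nullptr;
    }
};

int main() {
    MiniScope<int> outer;
    outer.push("x", 1);
    MiniScope<int> inner(&outer);
    inner.push("y", 2);
    // "x" resolves in the enclosing scope: an iterator from outer's map could
    // not be compared against inner's end(), but a pointer (or null) is fine.
    assert(inner.find("x") && *inner.find("x") == 1);
    assert(inner.find("z") == nullptr);
    return 0;
}
```
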
Scopes are different enough from maps that the interface really needs to be distinct. * Pacify clang-tidy * Fix unintentional mutation of interval in scope * Fix accidental Scope::get * Rewrite the skip stages lowering pass Skip stages was slow due to crappy computational complexity (quadratic?) I reworked it into a two-pass linear-time algorithm. The first part remembers which pieces of IR are actually relevant to the task, and the second pass performs the task using a bounds-inference-like algorithm. On main resnet50 spends 519 ms in this pass. This commit reduces it to 40 ms. Local laplacian with 100 pyramid levels spends 7.4 seconds in this pass. This commit reduces it to ~3 ms. This commit also moves the cache store for memoized Funcs into the produce node, instead of at the top of the consume node, because it naturally places it inside a condition you inject into the produce node. * clang-tidy fixes * Fix skip stages interaction with compute_with * Unify let visitors, and use fewer stack frames for them * Fix accidental leakage of .used into .loaded * Visit the bodies of uninteresting let chains * Another used -> loaded * Fix hoist_storage not handling condition correctly. --------- Co-authored-by: Steven Johnson --- src/BoundsInference.cpp | 7 +- src/IR.cpp | 1 + src/IR.h | 4 + src/Lower.cpp | 2 +- src/Memoization.cpp | 14 +- src/Scope.h | 5 + src/SkipStages.cpp | 1044 ++++++++++++++++++------------ src/SkipStages.h | 8 +- src/Util.cpp | 2 +- test/correctness/skip_stages.cpp | 43 ++ 10 files changed, 721 insertions(+), 409 deletions(-) diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 31b441ea4251..5965303197bc 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1383,9 +1383,14 @@ Stmt bounds_inference(Stmt s, fused_pairs_in_groups.push_back(pairs); } + // Add a note in the IR for where the outermost dynamic-stage skipping + // checks should go. These are injected in a later pass. + Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic); + s = Block::make(Evaluate::make(marker), s); + // Add a note in the IR for where assertions on input images // should go. Those are handled by a later lowering pass. - Expr marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic); + marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic); s = Block::make(Evaluate::make(marker), s); // Add a synthetic outermost loop to act as 'root'. diff --git a/src/IR.cpp b/src/IR.cpp index 3dcb73281412..c0bdb718291d 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -674,6 +674,7 @@ const char *const intrinsic_op_names[] = { "shift_right", "signed_integer_overflow", "size_of_halide_buffer_t", + "skip_stages_marker", "sliding_window_marker", "sorted_avg", "strict_float", diff --git a/src/IR.h b/src/IR.h index 82722af8173a..252e4588db03 100644 --- a/src/IR.h +++ b/src/IR.h @@ -594,6 +594,10 @@ struct Call : public ExprNode { signed_integer_overflow, size_of_halide_buffer_t, + // Marks the point in lowering where the outermost skip stages checks + // should be introduced. + skip_stages_marker, + // Takes a realization name and a loop variable. 
Declares that values of // the realization that were stored on earlier loop iterations of the // given loop are potentially loaded in this loop iteration somewhere diff --git a/src/Lower.cpp b/src/Lower.cpp index 52c049b63c72..3b357eb3061e 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -269,7 +269,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after discarding safe promises:", s); debug(1) << "Dynamically skipping stages...\n"; - s = skip_stages(s, order); + s = skip_stages(s, outputs, fused_groups, env); log("Lowering after dynamically skipping stages:", s); debug(1) << "Forking asynchronous producers...\n"; diff --git a/src/Memoization.cpp b/src/Memoization.cpp index d07914591cc5..be99c3b8fcba 100644 --- a/src/Memoization.cpp +++ b/src/Memoization.cpp @@ -425,13 +425,10 @@ class InjectMemoization : public IRMutator { Stmt body = mutate(op->body); - std::string cache_miss_name = op->name + ".cache_miss"; - Expr cache_miss = Variable::make(Bool(), cache_miss_name); - if (op->is_producer) { - Stmt mutated_body = IfThenElse::make(cache_miss, body); - return ProducerConsumer::make(op->name, op->is_producer, mutated_body); - } else { + std::string cache_miss_name = op->name + ".cache_miss"; + Expr cache_miss = Variable::make(Bool(), cache_miss_name); + const Function f(iter->second); KeyInfo key_info(f, top_level_name, memoize_instance); @@ -447,9 +444,10 @@ class InjectMemoization : public IRMutator { key_info.store_computation(cache_key_name, computed_bounds_name, eviction_key_name, f.outputs(), op->name)); - Stmt mutated_body = Block::make(cache_store_back, body); - return ProducerConsumer::make(op->name, op->is_producer, mutated_body); + body = Block::make(body, cache_store_back); + body = IfThenElse::make(cache_miss, body); } + return ProducerConsumer::make(op->name, op->is_producer, body); } else { return IRMutator::visit(op); } diff --git a/src/Scope.h b/src/Scope.h index 94d9eb9c165b..f0578874762f 100644 --- a/src/Scope.h +++ b/src/Scope.h @@ -205,6 +205,11 @@ class Scope { } } + /** How many distinct names exist (does not count nested definitions of the same name) */ + size_t size() const { + return table.size(); + } + struct PushToken { typename std::map>::iterator iter; }; diff --git a/src/SkipStages.cpp b/src/SkipStages.cpp index 9da328c6f374..caf292972fbb 100644 --- a/src/SkipStages.cpp +++ b/src/SkipStages.cpp @@ -1,4 +1,5 @@ #include "SkipStages.h" +#include "Bounds.h" #include "CSE.h" #include "Debug.h" #include "ExprUsesVar.h" @@ -9,6 +10,7 @@ #include "Scope.h" #include "Simplify.h" #include "Substitute.h" +#include "UniquifyVariableNames.h" #include #include @@ -16,526 +18,774 @@ namespace Halide { namespace Internal { -using std::set; -using std::string; -using std::vector; +// This lowering pass skips run produce nodes and sometimes allocating for +// stages where the result can't affect the output. It's essentially computation +// and allocation bounds inference but simpler. For each production, instead of +// inferring the bounds to compute, we want to infer a single boolean that tells +// us whether or not to run it. For each allocation, instead of inferring the +// region to allocate, we want to infer a single boolean that tells us whether +// or not to allocate. +// +// Like with bounds inference, if we infer this from scratch for each Func, +// using the conditions under which its consumers are computed, we'd get a +// quadratic blow-up in the size of these conditions as you go along the from +// the output to the inputs. 
Instead, for each stage, we want the condition +// under which it will be computed in terms of symbolic variables that signify +// whether or not its immediate consumers are going to be computed. These +// conditions can depend on loop variables, so we potentially need a fresh set +// of these variables at each loop level that contains produce nodes. namespace { -bool extern_call_uses_buffer(const Call *op, const std::string &func) { - if (op->is_extern()) { - if (starts_with(op->name, "halide_memoization")) { - return false; - } - for (const auto &arg : op->args) { - const Variable *var = arg.as(); - if (var && - starts_with(var->name, func + ".") && - ends_with(var->name, ".buffer")) { - return true; - } - } - } - return false; -} - -class PredicateFinder : public IRVisitor { -public: - Expr predicate; - PredicateFinder(const string &b, bool s) - : predicate(const_false()), - buffer(b), +// A prepass to rule out certain pieces of IR as uninteresting, to speed up the +// main pass. +class SkipStagesAnalysis : public IRVisitor { + using IRVisitor::visit; - treat_selects_as_guards(s) { - } + // Is the visitor currently inside the condition of an IfThenElse or a + // Select (note: *not* one of the branches - the condition itself) + bool in_condition = false; -private: - using IRVisitor::visit; - string buffer; - bool varies = false; - bool treat_selects_as_guards; - bool in_produce = false; - Scope<> varying; - Scope<> in_pipeline; - Scope<> local_buffers; + // What is the nearest enclosing conditional node for the realize node of + // each func. nullptr for outputs, because they don't have realize nodes. + std::map conditional_around_realize_node; - void visit(const Variable *op) override { - bool this_varies = varying.contains(op->name); + // What is the current nearest enclosing conditional node. + const IRNode *enclosing_conditional = nullptr; - varies |= this_varies; + void visit(const Select *op) override { + { + ScopedValue bind(in_condition, true); + op->condition.accept(this); + } + { + ScopedValue bind(enclosing_conditional, op); + op->true_value.accept(this); + op->false_value.accept(this); + } } - void visit(const For *op) override { - op->min.accept(this); - bool min_varies = varies; - op->extent.accept(this); - bool should_pop = false; - if (!is_const_one(op->extent) || min_varies) { - should_pop = true; - varying.push(op->name); + void visit(const IfThenElse *op) override { + { + ScopedValue bind(in_condition, true); + op->condition.accept(this); } - op->body.accept(this); - if (should_pop) { - varying.pop(op->name); - } else if (expr_uses_var(predicate, op->name)) { - predicate = Let::make(op->name, op->min, predicate); + { + ScopedValue bind(enclosing_conditional, op); + op->then_case.accept(this); + if (op->else_case.defined()) { + op->else_case.accept(this); + } } } - template - void visit_let(const T *op) { - struct Frame { - const T *op; - ScopedBinding<> binding; - }; - vector frames; + // Have we encountered a Var or Call used inside a condition. If this + // happens in the value field of a let, then that let name should also be + // marked as interesting, because it could show up in a .used or .loaded + // condition. 
+ bool found_var_used_in_condition = false; - decltype(op->body) body; - do { - bool old_varies = varies; - varies = false; + void visit(const LetStmt *op) override { + op->body.accept(this); + { + ScopedValue bind(in_condition, in_condition || + interesting_vars.count(op->name)); + found_var_used_in_condition = false; op->value.accept(this); - - frames.push_back(Frame{op, ScopedBinding<>(varies, varying, op->name)}); - - varies |= old_varies; - body = op->body; - op = body.template as(); - } while (op); - - body.accept(this); - - for (auto it = frames.rbegin(); it != frames.rend(); it++) { - if (expr_uses_var(predicate, it->op->name)) { - predicate = Let::make(it->op->name, it->op->value, predicate); + if (found_var_used_in_condition) { + // The value referred to a var or call that gets used in a + // condition somewhere, therefore this LetStmt could also get + // hoisted into a condition at some point. + interesting_vars.insert(op->name); } } } - void visit(const LetStmt *op) override { - visit_let(op); + void visit(const Let *op) override { + op->body.accept(this); + { + ScopedValue bind(in_condition, in_condition || + interesting_vars.count(op->name)); + bool old = found_var_used_in_condition; + found_var_used_in_condition = false; + op->value.accept(this); + if (found_var_used_in_condition) { + interesting_vars.insert(op->name); + } + // Is this expression interesting? I.e. might it show up in a .used + // or .loaded? Either the body Expr was interesting in its own right + // (refered to something used in a conditional somewhere), or the + // value was interesting, and presumably the value is used in the + // body. + found_var_used_in_condition = found_var_used_in_condition || old; + } } - void visit(const Let *op) override { - visit_let(op); + void visit(const Block *op) override { + // Reverse order + op->rest.accept(this); + op->first.accept(this); } + Scope<> in_produce; void visit(const ProducerConsumer *op) override { - ScopedBinding<> bind(in_pipeline, op->name); - if (op->is_producer && op->name == buffer) { - ScopedValue sv(in_produce, true); - IRVisitor::visit(op); + size_t id = func_id.at(op->name); + + if (op->is_producer && + !unconditionally_used_funcs.count(id) && + conditional_around_realize_node.count(id)) { + // This node could have an if statement injected here + ScopedValue s(enclosing_conditional, op); + ScopedBinding<> bind(in_produce, op->name); + op->body.accept(this); } else { - IRVisitor::visit(op); + // Either it's a consume, or it's used unconditionally outside of + // and after this produce node (remember we're iterating in reverse + // order), or it's an output (there is no enclosing realize node). + op->body.accept(this); } } - // Logical operators with eager constant folding - Expr make_and(Expr a, Expr b) { - if (is_const_zero(a) || is_const_one(b)) { - return a; - } else if (is_const_zero(b) || is_const_one(a)) { - return b; - } else if (equal(a, b)) { - return a; - } else { - return a && b; + Scope<> in_realize; + void visit(const Realize *op) override { + size_t id = func_id.at(op->name); + + // There may have already been a Realize node for this Func. We need to + // analyze this node from scratch. + unconditionally_used_funcs.erase(id); + + conditional_around_realize_node[id] = enclosing_conditional; + + // Don't consider the realization bounds, which can't contain Func uses, + // or the new or free exprs, which can't access Func data. 
+ { + ScopedBinding<> bind(in_realize, op->name); + op->body.accept(this); + } + + if (conditionally_used_funcs.count(id)) { + // Was used conditionally in a different Realize node, and used + // unconditionally in this one. + unconditionally_used_funcs.erase(id); + } else if (!unconditionally_used_funcs.count(id)) { + // Was used conditionally in this Realize node. + conditionally_used_funcs.insert(id); } } - Expr make_or(Expr a, Expr b) { - if (is_const_zero(a) || is_const_one(b)) { - return b; - } else if (is_const_zero(b) || is_const_one(a)) { - return a; - } else if (equal(a, b)) { - return a; - } else { - return a || b; - } - } - - Expr make_select(const Expr &a, Expr b, Expr c) { - if (is_const_one(a)) { - return b; - } else if (is_const_zero(a)) { - return c; - } else if (is_const_one(b)) { - return make_or(a, c); - } else if (is_const_zero(b)) { - return make_and(make_not(a), c); - } else if (is_const_one(c)) { - return make_or(make_not(a), b); - } else if (is_const_zero(c)) { - return make_and(a, b); - } else { - return select(a, b, c); + void visit(const Call *op) override { + if (op->call_type == Call::Halide) { + if (in_condition) { + interesting_vars.insert(op->name); + found_var_used_in_condition = true; + } + size_t id = func_id.at(op->name); + if (!in_produce.contains(op->name) && + enclosing_conditional == conditional_around_realize_node[id]) { + unconditionally_used_funcs.insert(id); + } } + IRVisitor::visit(op); } - Expr make_not(const Expr &a) { - if (is_const_one(a)) { - return make_zero(a.type()); - } else if (is_const_zero(a)) { - return make_one(a.type()); - } else { - return !a; + void visit(const Variable *op) override { + if (in_condition) { + interesting_vars.insert(op->name); + found_var_used_in_condition = true; + } + if (op->type.is_handle()) { + auto it = func_id.find(op->name); + if (it != func_id.end() && + in_realize.contains(op->name) && + !in_produce.contains(op->name) && + enclosing_conditional == conditional_around_realize_node[it->second]) { + unconditionally_used_funcs.insert(it->second); + } } } - template - void visit_conditional(const Expr &condition, T true_case, T false_case) { - Expr old_predicate = predicate; +public: + SkipStagesAnalysis(const std::map &func_id) + : func_id(func_id) { + } - predicate = const_false(); - true_case.accept(this); - Expr true_predicate = predicate; + const std::map func_id; - predicate = const_false(); - if (false_case.defined()) { - false_case.accept(this); - } - Expr false_predicate = predicate; + // Vars which could conceivably end up in a skip-stages predicate. These are + // the ones that are used (possibly transitively) in conditions in Select or + // IfThenElse nodes. + std::set interesting_vars; - bool old_varies = varies; - predicate = const_false(); - varies = false; - condition.accept(this); + // All Funcs that are unconditionally called within the scope of at least + // one of their Realize nodes (and therefore could never be skipped so we + // don't need to worry about them in the mutator below) + std::set unconditionally_used_funcs; - predicate = make_or(predicate, old_predicate); - if (varies) { - predicate = make_or(predicate, make_or(true_predicate, false_predicate)); - } else { - predicate = make_or(predicate, make_select(condition, true_predicate, false_predicate)); - } + // All Funcs that are conditionally called within the scope of at least one + // of their Realize nodes, and therefore must not be added to + // unconditionally_used_funcs. 
+ std::set conditionally_used_funcs; +}; - varies = varies || old_varies; +class SkipStages : public IRMutator { +public: + SkipStages(const SkipStagesAnalysis &analysis, const std::vector &name_for_id) + : analysis(analysis), name_for_id(name_for_id) { } - void visit(const Select *op) override { - if (treat_selects_as_guards) { - visit_conditional(op->condition, op->true_value, op->false_value); - } else { - IRVisitor::visit(op); - } - } +protected: + const SkipStagesAnalysis &analysis; + const std::vector &name_for_id; - void visit(const IfThenElse *op) override { - visit_conditional(op->condition, op->then_case, op->else_case); - } + using IRMutator::visit; - void visit(const Call *op) override { - varies |= in_pipeline.contains(op->name); + struct FuncInfo { + // Condition under which values are used and need to be correct. + Expr used; - IRVisitor::visit(op); + // Condition under which values are accessed, but don't need to be + // correct. May be distinct from used if the calls to this Func are + // guarded by selects. + Expr loaded; + }; + + // Conditions for each Func that describe how it is used in the Stmt just + // mutated, and any Stmts that come after it in the same enclosing loop + // body. (TODO: worry about fork) + std::map func_info; + + bool found_marker = false; + + // Might there be nested lets with the same name? Set to true if we ever + // stamp down a .used let more than once for the same Func. + bool need_uniquify = false; - if (!in_produce && (op->name == buffer || extern_call_uses_buffer(op, buffer))) { - predicate = const_true(); + // Func ids for which we have ever stamped down a .used or .loaded let. + std::set lets_emitted; + + // Have we made use of .used or .loaded vars that haven't been wrapped in a + // LetStmt yet (while iterating from inside out)? + bool inner_unbound_use_of_used_or_loaded_vars = false; + + Stmt emit_defs(Stmt stmt) { + for (auto &p : func_info) { + stmt = LetStmt::make(used_var_name(p.first), p.second.used, stmt); + stmt = LetStmt::make(loaded_var_name(p.first), p.second.loaded, stmt); + need_uniquify |= !lets_emitted.insert(p.first).second; } + return stmt; } - void visit(const Provide *op) override { - IRVisitor::visit(op); - if (in_produce && op->name != buffer && !local_buffers.contains(op->name)) { - predicate = const_true(); + Stmt visit(const Block *op) override { + // We want to iterate in reverse, which really just requires changing + // the block visitor. + Stmt rest = mutate(op->rest); + found_marker = false; + Stmt first = mutate(op->first); + if (found_marker) { + // This is where the outermost .used definitions go + internal_assert(first.as()); + if (inner_unbound_use_of_used_or_loaded_vars) { + rest = emit_defs(rest); + } + if (need_uniquify) { + rest = uniquify_variable_names(rest); + } + return rest; + } + if (first.same_as(op->first) && + rest.same_as(op->rest)) { + return op; + } else { + return Block::make(std::move(first), std::move(rest)); } } - void visit(const Realize *op) override { - ScopedBinding<> bind(local_buffers, op->name); - IRVisitor::visit(op); - } + Expr visit(const Call *op) override { + if (op->name == "halide_memoization_cache_lookup") { + // The buffer reference in a cache lookup doesn't count as a use - + // it's an out parameter. However, do *do* need to conditionalize + // the lookup on whether or not the buffer needs to be allocated. 
+ Expr last_arg = op->args.back(); + const Call *c = last_arg.as(); + internal_assert(c && + c->is_intrinsic(Call::make_struct) && + !c->args.empty()) + << last_arg; + const Variable *v = c->args[0].as(); + internal_assert(v); + auto it = analysis.func_id.find(v->name); + internal_assert(it != analysis.func_id.end()); + size_t func = it->second; + if (func_info.find(func) != func_info.end()) { + return Call::make(op->type, Call::if_then_else, {loaded_var(func), Expr(op), make_zero(op->type)}, Call::PureIntrinsic); + } else { + // Not in the func info map, so it must be unconditionally used. + return op; + } + } - void visit(const Allocate *op) override { - // This code works to ensure expressions depending on an - // allocation don't get moved outside the allocation and are - // marked as varying if predicate depends on the value of the - // allocation. - ScopedBinding<> - bind_host_ptr(varying, op->name), - bind_buffer(varying, op->name + ".buffer"); - IRVisitor::visit(op); + Expr e = IRMutator::visit(op); + if (op->call_type == Call::Halide) { + size_t id = analysis.func_id.at(op->name); + if (!analysis.unconditionally_used_funcs.count(id)) { + // We're unconditionally used. Clobber any existing info. + func_info[id] = FuncInfo{const_true(), const_true()}; + } + } else if (op->is_intrinsic(Call::skip_stages_marker)) { + found_marker = true; + } + return e; } -}; -class ProductionGuarder : public IRMutator { -public: - ProductionGuarder(const string &b, Expr compute_p, Expr alloc_p) - : buffer(b), compute_predicate(std::move(compute_p)), alloc_predicate(std::move(alloc_p)) { + Expr visit(const Variable *op) override { + if (op->type == halide_type_of()) { + auto it = analysis.func_id.find(op->name); + if (it != analysis.func_id.end() && + !analysis.unconditionally_used_funcs.count(it->second)) { + // Conservatively assume any use of a .buffer symbol depends on + // the Func being allocated and the values being correct. + func_info[it->second] = FuncInfo{const_true(), const_true()}; + } + } + return op; } -private: - string buffer; - Expr compute_predicate; - Expr alloc_predicate; - - using IRMutator::visit; - - bool memoize_call_uses_buffer(const Call *op) { - internal_assert(op->call_type == Call::Extern); - internal_assert(starts_with(op->name, "halide_memoization")); - for (const auto &arg : op->args) { - const Variable *var = arg.as(); - if (var && - starts_with(var->name, buffer + ".") && - ends_with(var->name, ".buffer")) { - return true; + void merge_func_info(std::map *old, + const std::map &new_info, + const Expr &used = Expr{}, + const Expr &evaluated = Expr{}) { + for (const auto &it : new_info) { + FuncInfo fi = it.second; + if (used.defined()) { + fi.used = fi.used && used; + } + if (evaluated.defined()) { + fi.loaded = fi.loaded && evaluated; + } + auto [p, inserted] = old->try_emplace(it.first, fi); + if (!inserted) { + // Merge with any existing info + if (!is_const_one(p->second.used)) { + p->second.used = p->second.used || fi.used; + } + if (!is_const_one(p->second.loaded)) { + p->second.loaded = p->second.loaded || fi.loaded; + } } } - return false; } - Expr visit(const Call *op) override { + // Is an Expr safe to lift into a .used or .loaded condition. 
+ bool may_lift(const Expr &e) { + class MayLift : public IRVisitor { + using IRVisitor::visit; + void visit(const Call *op) override { + if (!op->is_pure() && op->call_type != Call::Halide) { + result = false; + } else { + IRVisitor::visit(op); + } + } - if ((op->name == "halide_memoization_cache_lookup") && - memoize_call_uses_buffer(op)) { - // We need to guard call to halide_memoization_cache_lookup to only - // be executed if the corresponding buffer is allocated. We ignore - // the compute_predicate since in the case that alloc_predicate is - // true but compute_predicate is false, the consumer would still load - // data from the buffer even if it won't actually use the result, - // hence, we need to allocate some scratch memory for the consumer - // to load from. For memoized func, the memory might already be in - // the cache, so we perform the lookup instead of allocating a new one. - return Call::make(op->type, Call::if_then_else, - {alloc_predicate, op, 0}, Call::PureIntrinsic); - } else if ((op->name == "halide_memoization_cache_store") && - memoize_call_uses_buffer(op)) { - // We need to wrap the halide_memoization_cache_store with the - // compute_predicate, since the data to be written is only valid if - // the producer of the buffer is executed. - return Call::make(op->type, Call::if_then_else, - {compute_predicate, op, 0}, Call::PureIntrinsic); + public: + bool result = true; + } v; + e.accept(&v); + return v.result; + } + + // Come up with an upper bound for the truth value of an expression with the + // given var eliminated. + Expr relax_over_var(const Expr &e, const std::string &var) { + Scope domain; + domain.push(var, Interval::everything()); + Interval in = bounds_of_expr_in_scope(e, domain); + if (!in.has_upper_bound()) { + return const_true(); } else { - return IRMutator::visit(op); + return simplify(in.max); } } - Stmt visit(const ProducerConsumer *op) override { - // If the compute_predicate at this stage depends on something - // vectorized we should bail out. - Stmt stmt = IRMutator::visit(op); - - if (op->is_producer) { - op = stmt.as(); - internal_assert(op); - if (op->name == buffer) { - Stmt body = IfThenElse::make(compute_predicate, op->body); - stmt = ProducerConsumer::make(op->name, op->is_producer, body); - } - } - return stmt; - } -}; + // Come up with an upper bound for the truth value of an expression with any + // calls to the given func eliminated. + Expr relax_over_calls(const Expr &e, const std::string &func) { + class ReplaceCalls : public IRMutator { + const std::string &func; -class StageSkipper : public IRMutator { -public: - StageSkipper(const string &f) - : func(f) { - } + using IRMutator::visit; -private: - string func; - using IRMutator::visit; + Expr visit(const Call *op) override { + if (op->call_type == Call::Halide && op->name == func) { + return cast(op->type, var); + } + return IRMutator::visit(op); + } - Scope<> vector_vars; - bool in_vector_loop = false; + public: + const std::string var_name; + const Expr var; - Stmt visit(const For *op) override { - bool old_in_vector_loop = in_vector_loop; + ReplaceCalls(const std::string &func) + : func(func), + var_name(unique_name('t')), + var(Variable::make(Int(32), var_name)) { + } + } replacer(func); - // We want to be sure that the predicate doesn't vectorize. 
- if (op->for_type == ForType::Vectorized) { - vector_vars.push(op->name); - in_vector_loop = true; + return relax_over_var(replacer.mutate(e), replacer.var_name); + } + + Expr visit(const Select *op) override { + if (!may_lift(op->condition)) { + return IRMutator::visit(op); } - Stmt stmt = IRMutator::visit(op); + std::map old; + old.swap(func_info); + mutate(op->true_value); + merge_func_info(&old, func_info, op->condition); + func_info.clear(); + mutate(op->false_value); + merge_func_info(&old, func_info, !op->condition); + old.swap(func_info); + mutate(op->condition); // Check for any calls in the condition + + return op; + } + + Stmt mutate_conditional_stmt(const Stmt &s, const Expr &condition) { + std::map old; + old.swap(func_info); + Stmt stmt = mutate(s); + merge_func_info(&old, func_info, condition, condition); + old.swap(func_info); + return stmt; + } - if (op->for_type == ForType::Vectorized) { - vector_vars.pop(op->name); + Stmt visit(const IfThenElse *op) override { + if (!may_lift(op->condition)) { + // We won't be able to lift the condition + return IRMutator::visit(op); } - in_vector_loop = old_in_vector_loop; - - return stmt; + Stmt then_case = mutate_conditional_stmt(op->then_case, op->condition); + Stmt else_case; + if (op->else_case.defined()) { + else_case = mutate_conditional_stmt(op->else_case, !op->condition); + } + mutate(op->condition); + if (then_case.same_as(op->then_case) && + else_case.same_as(op->else_case)) { + return op; + } else { + return IfThenElse::make(op->condition, then_case, else_case); + } } - Stmt visit(const LetStmt *op) override { - struct Frame { - const LetStmt *op; - bool vector_var; - }; - vector frames; - Stmt result; - - while (op) { - bool vector_var = in_vector_loop && expr_uses_vars(op->value, vector_vars); - frames.push_back({op, vector_var}); - if (vector_var) { - vector_vars.push(op->name); - } - result = op->body; - op = result.as(); + template + auto visit_let(const T *op) -> decltype(op->body) { + const T *orig = op; + + // Peel off any uninteresting lets without wasting stack frames. 
+ std::vector> containing_lets; + decltype(op->body) body; + while (op && !analysis.interesting_vars.count(op->name)) { + containing_lets.emplace_back(op->name, op->value); + body = op->body; + op = body.template as(); } - result = mutate(result); + bool changed = false; + if (op) { + std::map old; + old.swap(func_info); + body = mutate(op->body); + internal_assert(body.defined()); + if (may_lift(op->value)) { + for (auto &it : func_info) { + if (expr_uses_var(it.second.used, op->name)) { + it.second.used = Let::make(op->name, op->value, it.second.used); + } + if (expr_uses_var(it.second.loaded, op->name)) { + it.second.loaded = Let::make(op->name, op->value, it.second.loaded); + } + } + } else { + // Treat the let value as an unknown + for (auto &it : func_info) { + if (expr_uses_var(it.second.used, op->name)) { + it.second.used = relax_over_var(it.second.used, op->name); + } + if (expr_uses_var(it.second.loaded, op->name)) { + it.second.loaded = relax_over_var(it.second.loaded, op->name); + } + } + } + merge_func_info(&old, func_info); + old.swap(func_info); + mutate(op->value); + if (body.same_as(op->body)) { + body = op; + } else { + internal_assert(body.defined()); + body = T::make(op->name, op->value, std::move(body)); + changed = true; + } + } else if (std::is_same::value) { + auto new_body = mutate(body); + changed = !new_body.same_as(body); + body = std::move(new_body); + } else { + // Just visit the body + mutate(body); + } - for (auto it = frames.rbegin(); it != frames.rend(); it++) { - if (it->vector_var) { - vector_vars.pop(it->op->name); + // Rewrap any uninteresting lets + for (auto it = containing_lets.rbegin(); it != containing_lets.rend(); it++) { + mutate(it->second); // Visit the value of each let + if (changed) { + body = T::make(it->first, std::move(it->second), std::move(body)); } - result = LetStmt::make(it->op->name, it->op->value, result); } - return result; + + if (changed) { + internal_assert(body.defined()); + return body; + } else { + return orig; + } } - Stmt visit(const Realize *op) override { - if (op->name == func) { - debug(3) << "Finding compute predicate for " << op->name << "\n"; - PredicateFinder find_compute(op->name, true); - op->body.accept(&find_compute); - - debug(3) << "Simplifying compute predicate for " << op->name << ": " << find_compute.predicate << "\n"; - Expr compute_predicate = simplify(common_subexpression_elimination(find_compute.predicate)); - - debug(3) << "Compute predicate for " << op->name << " : " << compute_predicate << "\n"; - - if (expr_uses_vars(compute_predicate, vector_vars)) { - // Don't try to skip stages if the predicate may vary - // per lane. This will just unvectorize the - // production, which is probably contrary to the - // intent of the user. 
- compute_predicate = const_true(); - } + Expr visit(const Let *op) override { + return visit_let(op); + } - if (!is_const_one(compute_predicate)) { + Stmt visit(const LetStmt *op) override { + return visit_let(op); + } - debug(3) << "Finding allocate predicate for " << op->name << "\n"; - PredicateFinder find_alloc(op->name, false); - op->body.accept(&find_alloc); - debug(3) << "Simplifying allocate predicate for " << op->name << "\n"; - Expr alloc_predicate = simplify(common_subexpression_elimination(find_alloc.predicate)); + std::string used_var_name(size_t id) { + return name_for_id[id] + ".used"; + } - debug(3) << "Allocate predicate for " << op->name << " : " << alloc_predicate << "\n"; + Expr used_var(size_t id) { + return Variable::make(Bool(), used_var_name(id)); + } + + std::string loaded_var_name(size_t id) { + return name_for_id[id] + ".loaded"; + } - ProductionGuarder g(op->name, compute_predicate, alloc_predicate); - Stmt body = g.mutate(op->body); + Expr loaded_var(size_t id) { + return Variable::make(Bool(), loaded_var_name(id)); + } - debug(3) << "Done guarding computation for " << op->name << "\n"; + Scope<> in_realize; + Scope<> in_realize_and_produce_or_consume; - return Realize::make(op->name, op->types, op->memory_type, op->bounds, - alloc_predicate, body); + Stmt visit(const ProducerConsumer *op) override { + size_t id = analysis.func_id.at(op->name); + const bool unconditionally_used = analysis.unconditionally_used_funcs.count(id); + + if (op->is_producer && !unconditionally_used) { + // The body of this is conditional, based on a yet-to-be defined symbolic value. + Expr used = used_var(id); + Stmt body; + + auto it = func_info.try_emplace(id, FuncInfo{const_false(), const_false()}).first; + + // Save the info about how this Func is called. We don't + // care about self-calls in the produce node. + FuncInfo fi = it->second; + ScopedBinding<> bind_if(in_realize.contains(op->name), + in_realize_and_produce_or_consume, op->name); + + body = mutate_conditional_stmt(op->body, used); + // Restore the info about how this Func is called. Calls to + // it in its own producer don't count towards skip stages + // analysis. + it->second = fi; + body = IfThenElse::make(used, body); + inner_unbound_use_of_used_or_loaded_vars = true; + + if (body.same_as(op->body)) { + return op; } else { - return IRMutator::visit(op); + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } } else { - return IRMutator::visit(op); - } - } -}; + ScopedBinding<> bind_if(!unconditionally_used && + in_realize.contains(op->name), + in_realize_and_produce_or_consume, op->name); -// Find Funcs where at least one of the consume nodes only uses the -// Func conditionally. We may want to guard the production of these -// Funcs. 
-class MightBeSkippable : public IRVisitor { + Stmt s = IRMutator::visit(op); - using IRVisitor::visit; - - bool in_conditional_stmt{false}; + if (analysis.interesting_vars.count(op->name)) { + for (auto &p : func_info) { + p.second.used = relax_over_calls(p.second.used, op->name); + p.second.loaded = relax_over_calls(p.second.loaded, op->name); + } + } - void visit(const Call *op) override { - IRVisitor::visit(op); - if (op->call_type == Call::Halide) { - unconditionally_used.insert(op->name); + return s; } } - void visit(const IfThenElse *op) override { - op->condition.accept(this); - - std::set old; - unconditionally_used.swap(old); - - ScopedValue old_in_conditional(in_conditional_stmt, true); - op->then_case.accept(this); - - std::set used_in_true; - used_in_true.swap(unconditionally_used); - if (op->else_case.defined()) { - op->else_case.accept(this); + Stmt visit(const Realize *op) override { + size_t id = analysis.func_id.at(op->name); + if (analysis.unconditionally_used_funcs.count(id)) { + return IRMutator::visit(op); } - // Take the set intersection of the true and false paths, and add them to the set. - std::set_intersection(used_in_true.begin(), used_in_true.end(), - unconditionally_used.begin(), unconditionally_used.end(), - std::inserter(old, old.begin())); + Stmt body; + { + ScopedBinding<> bind(in_realize, op->name); + body = mutate(op->body); + } + Expr condition = mutate(op->condition); + auto it = func_info.find(id); + if (it != func_info.end()) { + if (!is_const_one(it->second.loaded)) { + inner_unbound_use_of_used_or_loaded_vars = true; + condition = condition && loaded_var(id); + } + } - unconditionally_used.swap(old); + // We don't need to visit the bounds, because there can't be call nodes + // in them. + if (body.same_as(op->body) && + condition.same_as(op->condition)) { + return op; + } else { + return Realize::make(op->name, op->types, op->memory_type, + op->bounds, std::move(condition), std::move(body)); + } } - void visit(const Select *op) override { - op->condition.accept(this); - - std::set old; - unconditionally_used.swap(old); + bool in_vector_loop = false; - op->true_value.accept(this); - std::set used_in_true; - used_in_true.swap(unconditionally_used); + Stmt visit(const For *op) override { + ScopedValue s(in_vector_loop, + in_vector_loop || op->for_type == ForType::Vectorized); + bool old_inner_unbound_uses = inner_unbound_use_of_used_or_loaded_vars; + inner_unbound_use_of_used_or_loaded_vars = false; + + std::map old; + old.swap(func_info); + + Stmt body; + body = mutate(op->body); + // There can't be calls in the min and extent, so no need to visit; + // those. + + const bool in_sliding_loop = in_realize_and_produce_or_consume.size() < in_realize.size(); + bool may_emit = + !in_vector_loop && + !in_sliding_loop && + inner_unbound_use_of_used_or_loaded_vars; + + Stmt body_before = body; + if (may_emit) { + body = emit_defs(body); + } - op->false_value.accept(this); + // Now relax all the conditions that depend on this loop variable. 
+ bool anything_depended_on_loop_var = false; + for (auto &p : func_info) { + if (expr_uses_var(p.second.used, op->name)) { + p.second.used = relax_over_var(p.second.used, op->name); + anything_depended_on_loop_var = true; + } + if (expr_uses_var(p.second.loaded, op->name)) { + p.second.loaded = relax_over_var(p.second.loaded, op->name); + anything_depended_on_loop_var = true; + } + } - // Again, take the set intersection - std::set_intersection(used_in_true.begin(), used_in_true.end(), - unconditionally_used.begin(), unconditionally_used.end(), - std::inserter(old, old.begin())); + if (!anything_depended_on_loop_var) { + // Adding definitions for .used and .loaded symbols is unnecessary + // here. We can just use the ones one loop level further out. + body = body_before; + } else if (may_emit) { + inner_unbound_use_of_used_or_loaded_vars = false; + } + inner_unbound_use_of_used_or_loaded_vars |= old_inner_unbound_uses; - unconditionally_used.swap(old); - } + // To consider: Could add that the loop has non-zero extent here. That + // somewhat blurs the lines between bounds inference and skip stages. + merge_func_info(&old, func_info); + old.swap(func_info); - void visit(const ProducerConsumer *op) override { - if (!op->is_producer) { - op->body.accept(this); - if (!unconditionally_used.count(op->name) || in_conditional_stmt) { - // This Func has a least one consume clause in which - // it is only used conditionally. - candidates.insert(op->name); - } + if (body.same_as(op->body)) { + return op; } else { - IRVisitor::visit(op); - // Calls inside the produce don't count - that's the block of code we intend to guard. - unconditionally_used.erase(op->name); + return For::make(op->name, op->min, op->extent, + op->for_type, op->partition_policy, op->device_api, std::move(body)); } } +}; - set unconditionally_used; +// Just drop the skip stages marker in the IR. Used when we deduce that we don't +// need to run the mutator above. +class StripSkipStagesMarker : public IRMutator { + using IRMutator::visit; -public: - set candidates; + Expr visit(const Call *op) override { + if (op->is_intrinsic(Call::skip_stages_marker)) { + return 0; + } else { + return op; + } + } }; } // namespace -Stmt skip_stages(Stmt stmt, const vector &order) { - // Don't consider the last stage, because it's the output, so it's - // never skippable. - MightBeSkippable check; - stmt.accept(&check); - for (size_t i = order.size() - 1; i > 0; i--) { - debug(2) << "skip_stages checking " << order[i - 1] << "\n"; - if (check.candidates.count(order[i - 1])) { - debug(2) << "skip_stages can skip " << order[i - 1] << "\n"; - StageSkipper skipper(order[i - 1]); - Stmt new_stmt = skipper.mutate(stmt); - if (!new_stmt.same_as(stmt)) { - // Might have made earlier stages skippable too - new_stmt.accept(&check); - } - stmt = new_stmt; +Stmt skip_stages(const Stmt &stmt, + const std::vector &outputs, + const std::vector> &order, + const std::map &env) { + + // Each thing we might want to skip gets a unique id, sorted by realization + // order of the corresponding Func. + std::map func_id; + std::vector name_for_id(order.size()); + for (size_t i = 0; i < order.size(); i++) { + // Funcs in a compute_with group get the same id, because you can either + // skip them all or skip none of them. + for (const auto &f : order[i]) { + func_id[f] = i; + } + name_for_id[i] = order[i][0]; + } + + // Map any .buffer symbols back to the id of the Func they refer to. 
+ for (const auto &p : env) { + for (const auto &buf : p.second.output_buffers()) { + func_id[buf.name() + ".buffer"] = func_id[p.first]; } } - return stmt; + + // Make a map from Funcs to the first member of any compute_with group they belong to. + SkipStagesAnalysis analysis(func_id); + stmt.accept(&analysis); + + if (analysis.conditionally_used_funcs.empty()) { + // Nothing to do. No Funcs can be skipped. Just strip the skip stages + // marker. + return StripSkipStagesMarker().mutate(stmt); + } + + // There may be no calls to the output, which means they'll show up in + // neither set. Add them to the unconditionally used set so that the mutator + // knows to skip them. + for (const Function &f : outputs) { + analysis.unconditionally_used_funcs.insert(func_id[f.name()]); + } + + return SkipStages(analysis, name_for_id).mutate(stmt); } } // namespace Internal diff --git a/src/SkipStages.h b/src/SkipStages.h index a50886700485..2f0b86f5e971 100644 --- a/src/SkipStages.h +++ b/src/SkipStages.h @@ -1,6 +1,7 @@ #ifndef HALIDE_SKIP_STAGES #define HALIDE_SKIP_STAGES +#include #include #include @@ -13,12 +14,17 @@ namespace Halide { namespace Internal { +class Function; + /** Avoid computing certain stages if we can infer a runtime condition * to check that tells us they won't be used. Does this by analyzing * all reads of each buffer allocated, and inferring some condition * that tells us if the reads occur. If the condition is non-trivial, * inject ifs that guard the production. */ -Stmt skip_stages(Stmt s, const std::vector &order); +Stmt skip_stages(const Stmt &s, + const std::vector &outputs, + const std::vector> &order, + const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Util.cpp b/src/Util.cpp index e1d1f3307848..d7f3c36a7993 100644 --- a/src/Util.cpp +++ b/src/Util.cpp @@ -619,7 +619,7 @@ struct TickStackEntry { namespace { -vector tick_stack; +thread_local vector tick_stack; } // namespace diff --git a/test/correctness/skip_stages.cpp b/test/correctness/skip_stages.cpp index 970966a78e30..a981e5bf3160 100644 --- a/test/correctness/skip_stages.cpp +++ b/test/correctness/skip_stages.cpp @@ -201,6 +201,49 @@ int main(int argc, char **argv) { check_counts(11); } + for (int test_case = 0; test_case <= 2; test_case++) { + // Test a data-dependent stage skip. Double all values that exist in + // rows that do not contain any negative numbers. + Func input("input"); + input(x, y) = select(y % 3 == 0 && x == 37, -1, x); + + Func any_negative("any_negative"); + const int W = 100, H = 100; + RDom r(0, W); + any_negative(y) = cast(false); + any_negative(y) = any_negative(y) || (input(r, y) < 0); + + Func doubled("doubled"); + doubled(x, y) = call_counter(input(x, y) * 2, 0); + + Func output("output"); + output(x, y) = select(any_negative(y), input(x, y), doubled(x, y)); + + input.compute_root(); + + if (test_case == 0) { + // any_negative(y) is a constant condition over this loop, so 'double' can be skipped. + doubled.compute_at(output, y); + any_negative.compute_root(); + } else if (test_case == 1) { + // any_negative(y) is not constant here, so 'double' can't be skipped. + Var yo, yi; + output.split(y, yo, yi, 10); + doubled.compute_at(output, yo); + any_negative.compute_root(); + } else { + // double is computed outside of the consume node for any_negative, + // so the condition can't be lifted because it contains a call that + // may change in value. 
+ doubled.compute_at(output, y); + any_negative.compute_at(output, y); + } + + reset_counts(); + output.realize({W, H}); + check_counts(test_case == 0 ? 66 * 100 : 100 * 100); + } + { // Check the interation with storage hoisting From 7636c44acc2954a7c20275618093973da6767359 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 26 Feb 2024 18:03:33 -0800 Subject: [PATCH 077/186] Remove two dead vars from the Makefile (#8125) These appear to be unused --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index 72c05619e3ea..17e8a80e1ca4 100644 --- a/Makefile +++ b/Makefile @@ -230,9 +230,6 @@ CXX_FLAGS += $(WEBASSEMBLY_CXX_FLAGS) # On ubuntu, this requires packages flatbuffers-compiler and libflatbuffers-dev ifneq (,$(shell which flatc)) CXX_FLAGS += -DWITH_SERIALIZATION -I $(BUILD_DIR) -I $(shell which flatc | sed 's/bin.flatc/include/') -# Note: if updating here, be sure to update in CMakeLists.txt as well -HALIDE_SERIALIZATION_VERSION_MINOR ?= 1 -HALIDE_SERIALIZATION_VERSION_PATCH ?= 0 endif # This is required on some hosts like powerpc64le-linux-gnu because we may build From 8b3312ce9b343aef10ca7101a3f3f67db5501c71 Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Tue, 5 Mar 2024 17:16:06 +0100 Subject: [PATCH 078/186] Add support for setting the default allocator and deallocator functions in Halide::Runtime::Buffer. (#8132) --- src/runtime/HalideBuffer.h | 26 +++++++++++++++++++---- test/correctness/halide_buffer.cpp | 33 ++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/runtime/HalideBuffer.h b/src/runtime/HalideBuffer.h index 4ac2317278bc..7f914d0a4ff2 100644 --- a/src/runtime/HalideBuffer.h +++ b/src/runtime/HalideBuffer.h @@ -142,8 +142,8 @@ struct AllInts : std::false_type {}; template struct AllInts : std::false_type {}; -// A helper to detect if there are any zeros in a container namespace Internal { +// A helper to detect if there are any zeros in a container template bool any_zero(const Container &c) { for (int i : c) { @@ -153,6 +153,11 @@ bool any_zero(const Container &c) { } return false; } + +struct DefaultAllocatorFns { + static inline void *(*default_allocate_fn)(size_t) = nullptr; + static inline void (*default_deallocate_fn)(void *) = nullptr; +}; } // namespace Internal /** A struct acting as a header for allocations owned by the Buffer @@ -711,6 +716,13 @@ class Buffer { } public: + static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) { + Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn; + } + static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) { + Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn; + } + /** Determine if a Buffer can be constructed from some other Buffer type. * If this can be determined at compile time, fail with a static assert; otherwise * return a boolean based on runtime typing. */ @@ -893,7 +905,7 @@ class Buffer { #if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC // Only use aligned_alloc() if no custom allocators are specified. - if (!allocate_fn && !deallocate_fn) { + if (!allocate_fn && !deallocate_fn && !Internal::DefaultAllocatorFns::default_allocate_fn && !Internal::DefaultAllocatorFns::default_deallocate_fn) { // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes // on any supported platform, so we will just overallocate by 'alignment' // so that the user storage also starts at an aligned point. 
This is a bit @@ -908,10 +920,16 @@ class Buffer { // else fall thru #endif if (!allocate_fn) { - allocate_fn = malloc; + allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn; + if (!allocate_fn) { + allocate_fn = malloc; + } } if (!deallocate_fn) { - deallocate_fn = free; + deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn; + if (!deallocate_fn) { + deallocate_fn = free; + } } static_assert(sizeof(AllocationHeader) <= alignment); diff --git a/test/correctness/halide_buffer.cpp b/test/correctness/halide_buffer.cpp index 6c35f4b7a409..accaf6f6bb3e 100644 --- a/test/correctness/halide_buffer.cpp +++ b/test/correctness/halide_buffer.cpp @@ -6,6 +6,22 @@ using namespace Halide::Runtime; +static void *my_malloced_addr = nullptr; +static int my_malloc_count = 0; +static void *my_freed_addr = nullptr; +static int my_free_count = 0; +void *my_malloc(size_t size) { + void *ptr = malloc(size); + my_malloced_addr = ptr; + my_malloc_count++; + return ptr; +} +void my_free(void *ptr) { + my_freed_addr = ptr; + my_free_count++; + free(ptr); +} + template void check_equal_shape(const Buffer &a, const Buffer &b) { if (a.dimensions() != b.dimensions()) abort(); @@ -515,6 +531,23 @@ int main(int argc, char **argv) { assert(b.dim(3).stride() == b2.dim(3).stride()); } + { + // Test setting default allocate and deallocate functions. + Buffer<>::set_default_allocate_fn(my_malloc); + Buffer<>::set_default_deallocate_fn(my_free); + + assert(my_malloc_count == 0); + assert(my_free_count == 0); + auto b = Buffer(5, 4).fill(1); + assert(my_malloced_addr != nullptr && my_malloced_addr < b.data()); + assert(my_malloc_count == 1); + assert(my_free_count == 0); + b.deallocate(); + assert(my_malloc_count == 1); + assert(my_free_count == 1); + assert(my_malloced_addr == my_freed_addr); + } + printf("Success!\n"); return 0; } From d33ffa20f233224adcf80aa147cadf7f594dda51 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Mar 2024 09:50:07 -0800 Subject: [PATCH 079/186] Make realization order invariant to unique_name suffixes (#8124) * Make realization order invariant to unique_name suffixes * Add test * definition_order -> uint64 everywhere * Use visitation order instead of definition order --------- Co-authored-by: Steven Johnson --- src/FindCalls.cpp | 83 ++++++++++++------- src/FindCalls.h | 5 ++ src/RealizationOrder.cpp | 71 +++++++++++++++- test/correctness/CMakeLists.txt | 1 + test/correctness/stable_realization_order.cpp | 41 +++++++++ 5 files changed, 167 insertions(+), 34 deletions(-) create mode 100644 test/correctness/stable_realization_order.cpp diff --git a/src/FindCalls.cpp b/src/FindCalls.cpp index 77c5ae7645cd..9345c89dcac5 100644 --- a/src/FindCalls.cpp +++ b/src/FindCalls.cpp @@ -8,24 +8,22 @@ namespace Halide { namespace Internal { -using std::map; -using std::string; -using std::vector; - namespace { + /* Find all the internal halide calls in an expr */ class FindCalls : public IRVisitor { public: - map calls; + std::map calls; + std::vector order; using IRVisitor::visit; void include_function(const Function &f) { - map::iterator iter = calls.find(f.name()); - if (iter == calls.end()) { - calls[f.name()] = f; + auto [it, inserted] = calls.emplace(f.name(), f); + if (inserted) { + order.push_back(f); } else { - user_assert(iter->second.same_as(f)) + user_assert(it->second.same_as(f)) << "Can't compile a pipeline using multiple functions with same name: " << f.name() << "\n"; } @@ -41,64 +39,87 @@ class FindCalls : public IRVisitor { } }; -void 
populate_environment_helper(const Function &f, map &env, - bool recursive = true, bool include_wrappers = false) { - map::const_iterator iter = env.find(f.name()); - if (iter != env.end()) { +void populate_environment_helper(const Function &f, + std::map *env, + std::vector *order, + bool recursive = true, + bool include_wrappers = false) { + std::map::const_iterator iter = env->find(f.name()); + if (iter != env->end()) { user_assert(iter->second.same_as(f)) << "Can't compile a pipeline using multiple functions with same name: " << f.name() << "\n"; return; } + auto insert_func = [](const Function &f, + std::map *env, + std::vector *order) { + auto [it, inserted] = env->emplace(f.name(), f); + if (inserted) { + order->push_back(f); + } + }; + FindCalls calls; f.accept(&calls); if (f.has_extern_definition()) { for (const ExternFuncArgument &arg : f.extern_arguments()) { if (arg.is_func()) { - Function g(arg.func); - calls.calls[g.name()] = g; + insert_func(Function{arg.func}, &calls.calls, &calls.order); } } } if (include_wrappers) { for (const auto &it : f.schedule().wrappers()) { - Function g(it.second); - calls.calls[g.name()] = g; + insert_func(Function{it.second}, &calls.calls, &calls.order); } } if (!recursive) { - env.insert(calls.calls.begin(), calls.calls.end()); + for (const Function &g : calls.order) { + insert_func(g, env, order); + } } else { - env[f.name()] = f; - - for (const auto &i : calls.calls) { - populate_environment_helper(i.second, env, recursive, include_wrappers); + insert_func(f, env, order); + for (const Function &g : calls.order) { + populate_environment_helper(g, env, order, recursive, include_wrappers); } } } } // namespace -map build_environment(const vector &funcs) { - map env; +std::map build_environment(const std::vector &funcs) { + std::map env; + std::vector order; for (const Function &f : funcs) { - populate_environment_helper(f, env, true, true); + populate_environment_helper(f, &env, &order, true, true); } return env; } -map find_transitive_calls(const Function &f) { - map res; - populate_environment_helper(f, res, true, false); +std::vector called_funcs_in_order_found(const std::vector &funcs) { + std::map env; + std::vector order; + for (const Function &f : funcs) { + populate_environment_helper(f, &env, &order, true, true); + } + return order; +} + +std::map find_transitive_calls(const Function &f) { + std::map res; + std::vector order; + populate_environment_helper(f, &res, &order, true, false); return res; } -map find_direct_calls(const Function &f) { - map res; - populate_environment_helper(f, res, false, false); +std::map find_direct_calls(const Function &f) { + std::map res; + std::vector order; + populate_environment_helper(f, &res, &order, false, false); return res; } diff --git a/src/FindCalls.h b/src/FindCalls.h index f55140ae9162..40787d922a4f 100644 --- a/src/FindCalls.h +++ b/src/FindCalls.h @@ -36,6 +36,11 @@ std::map find_transitive_calls(const Function &f); * a map of them. */ std::map build_environment(const std::vector &funcs); +/** Returns the same Functions as build_environment, but returns a vector of + * Functions instead, where the order is the order in which the Functions were + * first encountered. This is stable to changes in the names of the Functions. 
*/
+std::vector<Function> called_funcs_in_order_found(const std::vector<Function> &funcs);
+
 } // namespace Internal
 } // namespace Halide
 
diff --git a/src/RealizationOrder.cpp b/src/RealizationOrder.cpp
index 8541c17ea862..af12ba80c228 100644
--- a/src/RealizationOrder.cpp
+++ b/src/RealizationOrder.cpp
@@ -41,6 +41,7 @@ find_fused_groups(const map<string, Function> &env,
     map<string, vector<string>> fused_groups;
     map<string, string> group_name;
 
+    int counter = 0;
     for (const auto &iter : env) {
         const string &fn = iter.first;
         if (visited.find(fn) == visited.end()) {
@@ -48,7 +49,7 @@ find_fused_groups(const map<string, Function> &env,
             find_fused_groups_dfs(fn, fuse_adjacency_list, visited, group);
 
             // Create a unique name for the fused group.
-            string rename = unique_name("_fg");
+            string rename = "_fg" + std::to_string(counter++);
             fused_groups.emplace(rename, group);
             for (const auto &m : group) {
                 group_name.emplace(m, rename);
@@ -69,7 +70,7 @@ void realization_order_dfs(const string &current,
     internal_assert(iter != graph.end());
 
     for (const string &fn : iter->second) {
-        internal_assert(fn != current);
+        internal_assert(fn != current) << fn;
         if (visited.find(fn) == visited.end()) {
             realization_order_dfs(fn, graph, visited, result_set, order);
         } else {
@@ -235,8 +236,63 @@ void check_fused_stages_are_scheduled_in_order(const Function &f) {
     }
 }
 
+// Reorder Funcs in a vector to have an order that's resistant to unique_name
+// calls, so that multitarget builds don't get arbitrary changes to topological
+// ordering, and so that machine-generated schedules (which depend on the
+// topological order) are less likely to be invalidated by things that have
+// happened in the same process earlier.
+//
+// To do this, we break each name into a prefix, the visitation order counter of
+// the Func, and then finally the full original name. The prefix is what you get
+// after stripping off anything after a $ (to handle suffixes introduced by
+// multi-character unique_name calls), and then stripping off any digits (to
+// handle suffixes introduced by single-character unique_name calls). The
+// visitation order is when the Func is first encountered in an IRVisitor
+// traversal of the entire Pipeline.
+//
+// This is gross. The reason we don't just break ties by visitation order alone
+// is because that way it's likely to be consistent with the realization
+// order before this sorting was done.
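A minimal sketch of the prefix rule described in the comment above, for illustration only (prefix_of and the example Func names are not part of the patch; assumes <string> and <cctype>):

    std::string prefix_of(std::string name) {
        name = name.substr(0, name.find('$'));  // drop any multi-character unique_name suffix
        while (!name.empty() && std::isdigit((unsigned char)name.back())) {
            name.pop_back();                    // drop trailing digits from single-character suffixes
        }
        return name;
    }
    // prefix_of("blur_y$2") == "blur_y", prefix_of("f13") == "f", so the sort key
    // (prefix, visitation counter, full name) orders "f" and "f13" by when each
    // Func was first visited rather than by whatever suffix unique_name assigned.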
+void sort_funcs_by_name_and_counter(vector *funcs, + const map &env, + const map &visitation_order) { + vector> items; + items.reserve(funcs->size()); + for (size_t i = 0; i < funcs->size(); i++) { + const string &full_name = (*funcs)[i]; + string prefix = split_string(full_name, "$")[0]; + while (!prefix.empty() && std::isdigit(prefix.back())) { + prefix.pop_back(); + } + auto env_it = env.find(full_name); + uint64_t counter = 0; + if (env_it != env.end()) { + auto v_it = visitation_order.find(full_name); + internal_assert(v_it != visitation_order.end()) + << "Func " << full_name + << " is somehow in the visitation order but not the environment."; + counter = v_it->second; + } + + items.emplace_back(prefix, counter, full_name); + } + std::sort(items.begin(), items.end()); + for (size_t i = 0; i < items.size(); i++) { + (*funcs)[i] = std::move(std::get<2>(items[i])); + } +} + } // anonymous namespace +map compute_visitation_order(const vector &outputs) { + vector funcs = called_funcs_in_order_found(outputs); + map result; + for (uint64_t i = 0; i < funcs.size(); i++) { + result[funcs[i].name()] = i; + } + return result; +} + pair, vector>> realization_order( const vector &outputs, map &env) { @@ -318,6 +374,10 @@ pair, vector>> realization_order( } } } + auto visitation_order = compute_visitation_order(outputs); + for (auto &p : graph) { + sort_funcs_by_name_and_counter(&p.second, env, visitation_order); + } // Compute the realization order of the fused groups (i.e. the dummy nodes) // and also the realization order of the functions within a fused group. @@ -376,7 +436,12 @@ vector topological_order(const vector &outputs, s.push_back(callee.first); } } - graph.emplace(caller.first, s); + graph.emplace(caller.first, std::move(s)); + } + + auto visitation_order = compute_visitation_order(outputs); + for (auto &p : graph) { + sort_funcs_by_name_and_counter(&p.second, env, visitation_order); } vector order; diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index f77393a21114..9b934b768cdd 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -295,6 +295,7 @@ tests(GROUPS correctness split_fuse_rvar.cpp split_reuse_inner_name_bug.cpp split_store_compute.cpp + stable_realization_order.cpp stack_allocations.cpp stage_strided_loads.cpp stencil_chain_in_update_definitions.cpp diff --git a/test/correctness/stable_realization_order.cpp b/test/correctness/stable_realization_order.cpp new file mode 100644 index 000000000000..f62423559327 --- /dev/null +++ b/test/correctness/stable_realization_order.cpp @@ -0,0 +1,41 @@ +#include "Halide.h" + +using namespace Halide; +using namespace Halide::Internal; + +int main(int argc, char **argv) { + // Verify that the realization order is invariant to anything to do with + // unique_name counters. + + std::vector expected; + + for (int i = 0; i < 10; i++) { + std::map env; + Var x, y; + Expr s = 0; + std::vector funcs(8); + for (size_t i = 0; i < funcs.size() - 1; i++) { + funcs[i](x, y) = x + y; + s += funcs[i](x, y); + env[funcs[i].name()] = funcs[i].function(); + } + funcs.back()(x, y) = s; + env[funcs.back().name()] = funcs.back().function(); + + auto r = realization_order({funcs.back().function()}, env).first; + // Ties in the realization order are supposed to be broken by any + // alphabetical prefix of the Func name followed by time of + // definition. All the Funcs in this test have the same name, so it + // should just depend on time of definition. 
+ assert(r.size() == funcs.size()); + for (size_t i = 0; i < funcs.size(); i++) { + if (funcs[i].name() != r[i]) { + debug(0) << "Unexpected realization order: " + << funcs[i].name() << " != " << r[i] << "\n"; + } + } + } + + printf("Success!\n"); + return 0; +} From 05ae15a82983c76fffcc0a2c3f4aecfd7098d4db Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 5 Mar 2024 09:50:19 -0800 Subject: [PATCH 080/186] Make gpu thread and block for loop names opaque (#8133) This is one of our largest remaining type of magic name. These were explicitly constructed in lots of places and then explicitly checked for with ends_with in lots of places. This PR makes the names opaque. Only CanonicalizeGPUVars.cpp knows what they are, and they don't have to be a single fixed thing as long as they're consistent within a process. Also reduced the number of GPU dimensions to three more uniformly. We were already asserting this, but there was lots of dead code in lowering passes after gpu loop validation that allowed for four. Also fixed a bug I found in is_block_uniform. It didn't consider that the dependence on a gpu thread variable in a load index could be because a let variable encountered depends on a gpu thread variable. --- src/CanonicalizeGPUVars.cpp | 43 ++++++++-------- src/CanonicalizeGPUVars.h | 7 +++ src/CodeGen_D3D12Compute_Dev.cpp | 36 +++++-------- src/CodeGen_GPU_Dev.cpp | 74 +++++++++++---------------- src/CodeGen_GPU_Dev.h | 4 -- src/CodeGen_Metal_Dev.cpp | 22 +++----- src/CodeGen_OpenCL_Dev.cpp | 22 +++----- src/CodeGen_PTX_Dev.cpp | 23 ++++----- src/CodeGen_Vulkan_Dev.cpp | 55 ++++++-------------- src/CodeGen_WebGPU_Dev.cpp | 22 +++----- src/DeviceArgument.cpp | 2 +- src/Expr.cpp | 7 +++ src/Expr.h | 3 ++ src/FuseGPUThreadLoops.cpp | 68 ++++++++++++------------ src/OffloadGPULoops.cpp | 44 ++++++---------- src/PartitionLoops.cpp | 8 +-- src/TrimNoOps.cpp | 2 +- test/correctness/fuse_gpu_threads.cpp | 10 ++-- 18 files changed, 195 insertions(+), 257 deletions(-) diff --git a/src/CanonicalizeGPUVars.cpp b/src/CanonicalizeGPUVars.cpp index 7e993d7a72c1..aef1f55c5577 100644 --- a/src/CanonicalizeGPUVars.cpp +++ b/src/CanonicalizeGPUVars.cpp @@ -11,23 +11,26 @@ namespace Halide { namespace Internal { using std::map; -using std::string; using std::vector; -namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z"}; - -string get_thread_name(int index) { +const std::string &gpu_thread_name(int index) { + static std::string gpu_thread_names[3] = {"." + unique_name("thread_id_x"), + "." + unique_name("thread_id_y"), + "." + unique_name("thread_id_z")}; internal_assert(index >= 0 && index < 3); - return thread_names[index]; + return gpu_thread_names[index]; } -string get_block_name(int index) { +const std::string &gpu_block_name(int index) { + static std::string gpu_block_names[3] = {"." + unique_name("block_id_x"), + "." + unique_name("block_id_y"), + "." 
+ unique_name("block_id_z")}; internal_assert(index >= 0 && index < 3); - return block_names[index]; + return gpu_block_names[index]; } +namespace { + class CountGPUBlocksThreads : public IRVisitor { using IRVisitor::visit; @@ -73,12 +76,12 @@ class CountGPUBlocksThreads : public IRVisitor { }; class CanonicalizeGPUVars : public IRMutator { - map gpu_vars; + map gpu_vars; using IRMutator::visit; - string find_replacement(const string &suffix, const string &name) { - vector v = split_string(name, suffix); + std::string find_replacement(const std::string &suffix, const std::string &name) { + vector v = split_string(name, suffix); internal_assert(v.size() == 2); const auto &iter = gpu_vars.find(v[0]); if (iter != gpu_vars.end()) { @@ -87,7 +90,7 @@ class CanonicalizeGPUVars : public IRMutator { return name; } - string canonicalize_let(const string &name) { + std::string canonicalize_let(const std::string &name) { if (ends_with(name, ".loop_max")) { return find_replacement(".loop_max", name); } else if (ends_with(name, ".loop_min")) { @@ -100,7 +103,7 @@ class CanonicalizeGPUVars : public IRMutator { } Stmt visit(const For *op) override { - string name = op->name; + std::string name = op->name; Expr min = mutate(op->min); Expr extent = mutate(op->extent); Stmt body = mutate(op->body); @@ -113,13 +116,13 @@ class CanonicalizeGPUVars : public IRMutator { op->body.accept(&counter); if (op->for_type == ForType::GPUBlock) { - name += "." + get_block_name(counter.nblocks); + name += gpu_block_name(counter.nblocks); debug(5) << "Replacing " << op->name << " with GPU block name " << name << "\n"; } else if (op->for_type == ForType::GPUThread) { - name += "." + get_thread_name(counter.nthreads); + name += gpu_thread_name(counter.nthreads); debug(5) << "Replacing " << op->name << " with GPU thread name " << name << "\n"; } else if (op->for_type == ForType::GPULane) { - name += "." + get_thread_name(0); + name += gpu_thread_name(0); } if (name != op->name) { @@ -143,7 +146,7 @@ class CanonicalizeGPUVars : public IRMutator { } Stmt visit(const LetStmt *op) override { - vector> lets; + vector> lets; Stmt result; do { @@ -154,7 +157,7 @@ class CanonicalizeGPUVars : public IRMutator { result = mutate(result); for (auto it = lets.rbegin(); it != lets.rend(); it++) { - string name = canonicalize_let(it->first); + std::string name = canonicalize_let(it->first); if (name != it->first) { Expr new_var = Variable::make(Int(32), name); result = substitute(it->first, new_var, result); @@ -168,7 +171,7 @@ class CanonicalizeGPUVars : public IRMutator { Stmt visit(const IfThenElse *op) override { Expr condition = mutate(op->condition); - map old_gpu_vars; + map old_gpu_vars; old_gpu_vars.swap(gpu_vars); Stmt then_case = mutate(op->then_case); diff --git a/src/CanonicalizeGPUVars.h b/src/CanonicalizeGPUVars.h index 25d57a52dfc8..573471179a6a 100644 --- a/src/CanonicalizeGPUVars.h +++ b/src/CanonicalizeGPUVars.h @@ -15,6 +15,13 @@ namespace Internal { * by the nesting order: innermost is assigned to x and so on. */ Stmt canonicalize_gpu_vars(Stmt s); +/** Names for the thread and block id variables. Includes the leading + * dot. Indexed from inside out, so 0 gives you the innermost loop. 
*/ +// @{ +const std::string &gpu_thread_name(int index); +const std::string &gpu_block_name(int index); +// @} + } // namespace Internal } // namespace Halide diff --git a/src/CodeGen_D3D12Compute_Dev.cpp b/src/CodeGen_D3D12Compute_Dev.cpp index 4fd614cc0dfc..4b5ea37d8a0e 100644 --- a/src/CodeGen_D3D12Compute_Dev.cpp +++ b/src/CodeGen_D3D12Compute_Dev.cpp @@ -3,6 +3,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" @@ -221,22 +222,18 @@ string CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::print_reinterpret(Type namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "tid_in_tgroup.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "tid_in_tgroup.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "tid_in_tgroup.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "HLSL (SM5.1) does not support more than three dimensions for compute kernel threads.\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "tgroup_index.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "tgroup_index.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "tgroup_index.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "HLSL (SM5.1) does not support more than three dimensions for compute dispatch groups.\n"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -300,15 +297,10 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The D3D12Compute backend does not support the gpu_lanes() scheduling directive."; - if (!is_gpu_var(loop->name)) { - user_assert(loop->for_type != ForType::Parallel) << "Cannot use parallel loops inside D3D12Compute kernel\n"; + if (!is_gpu(loop->for_type)) { CodeGen_GPU_C::visit(loop); return; } - - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) @@ -1153,7 +1145,7 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, struct FindThreadGroupSize : public IRVisitor { using IRVisitor::visit; void visit(const For *loop) override { - if (!is_gpu_var(loop->name)) { + if (!is_gpu(loop->for_type)) { return loop->body.accept(this); } if (loop->for_type != ForType::GPUThread) { @@ -1175,13 +1167,9 @@ void CodeGen_D3D12Compute_Dev::CodeGen_D3D12Compute_C::add_kernel(Stmt s, loop->body.accept(this); } int thread_loop_workgroup_index(const string &name) { - string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z", - ".__thread_id_w"}; - for (auto &id : ids) { - if (ends_with(name, id)) { - return (&id - ids); + for (int i = 0; i < 3; i++) { + if (ends_with(name, gpu_thread_name(i))) { + return i; } } return -1; diff --git a/src/CodeGen_GPU_Dev.cpp b/src/CodeGen_GPU_Dev.cpp index 08e456e78ce2..07148a508144 100644 --- a/src/CodeGen_GPU_Dev.cpp +++ b/src/CodeGen_GPU_Dev.cpp @@ -1,5 +1,7 
@@ #include "CodeGen_GPU_Dev.h" +#include "CanonicalizeGPUVars.h" #include "Deinterleave.h" +#include "ExprUsesVar.h" #include "IRMutator.h" #include "IROperator.h" #include "IRVisitor.h" @@ -9,50 +11,6 @@ namespace Internal { CodeGen_GPU_Dev::~CodeGen_GPU_Dev() = default; -bool CodeGen_GPU_Dev::is_gpu_var(const std::string &name) { - return is_gpu_block_var(name) || is_gpu_thread_var(name); -} - -bool CodeGen_GPU_Dev::is_gpu_block_var(const std::string &name) { - return (ends_with(name, ".__block_id_x") || - ends_with(name, ".__block_id_y") || - ends_with(name, ".__block_id_z") || - ends_with(name, ".__block_id_w")); -} - -bool CodeGen_GPU_Dev::is_gpu_thread_var(const std::string &name) { - return (ends_with(name, ".__thread_id_x") || - ends_with(name, ".__thread_id_y") || - ends_with(name, ".__thread_id_z") || - ends_with(name, ".__thread_id_w")); -} - -namespace { -// Check to see if an expression is uniform within a block. -// This is done by checking to see if the expression depends on any GPU -// thread indices. -class IsBlockUniform : public IRVisitor { - using IRVisitor::visit; - - void visit(const Variable *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { - result = false; - } - } - -public: - bool result = true; - - IsBlockUniform() = default; -}; -} // namespace - -bool CodeGen_GPU_Dev::is_block_uniform(const Expr &expr) { - IsBlockUniform v; - expr.accept(&v); - return v.result; -} - namespace { // Check to see if a buffer is a candidate for constant memory storage. // A buffer is a candidate for constant memory if it is never written to, @@ -71,7 +29,7 @@ class IsBufferConstant : public IRVisitor { void visit(const Load *op) override { if (op->name == buffer && - !CodeGen_GPU_Dev::is_block_uniform(op->index)) { + expr_uses_vars(op->index, depends_on_thread_var)) { result = false; } if (result) { @@ -79,6 +37,32 @@ class IsBufferConstant : public IRVisitor { } } + void visit(const LetStmt *op) override { + op->value.accept(this); + ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var), + depends_on_thread_var, + op->name); + op->body.accept(this); + } + + void visit(const Let *op) override { + op->value.accept(this); + ScopedBinding<> bind_if(expr_uses_vars(op->value, depends_on_thread_var), + depends_on_thread_var, + op->name); + op->body.accept(this); + } + + void visit(const For *op) override { + ScopedBinding<> bind_if(op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane, + depends_on_thread_var, + op->name); + IRVisitor::visit(op); + } + + Scope<> depends_on_thread_var; + public: bool result = true; const std::string &buffer; diff --git a/src/CodeGen_GPU_Dev.h b/src/CodeGen_GPU_Dev.h index f6100116b955..ff80480003bc 100644 --- a/src/CodeGen_GPU_Dev.h +++ b/src/CodeGen_GPU_Dev.h @@ -55,10 +55,6 @@ struct CodeGen_GPU_Dev { return false; } - static bool is_gpu_var(const std::string &name); - static bool is_gpu_block_var(const std::string &name); - static bool is_gpu_thread_var(const std::string &name); - /** Checks if expr is block uniform, i.e. does not depend on a thread * var. 
*/ static bool is_block_uniform(const Expr &expr); diff --git a/src/CodeGen_Metal_Dev.cpp b/src/CodeGen_Metal_Dev.cpp index 79060294798e..35b22058aec1 100644 --- a/src/CodeGen_Metal_Dev.cpp +++ b/src/CodeGen_Metal_Dev.cpp @@ -2,6 +2,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Metal_Dev.h" @@ -187,22 +188,18 @@ string CodeGen_Metal_Dev::CodeGen_Metal_C::print_reinterpret(Type type, const Ex namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "tid_in_tgroup.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "tid_in_tgroup.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "tid_in_tgroup.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "Metal does not support more than three dimensions in a kernel (threads).\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "tgroup_index.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "tgroup_index.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "tgroup_index.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "Metal does not support more than three dimensions in a kernel (groups).\n"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -272,10 +269,7 @@ void CodeGen_Metal_Dev::CodeGen_Metal_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The Metal backend does not support the gpu_lanes() scheduling directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) diff --git a/src/CodeGen_OpenCL_Dev.cpp b/src/CodeGen_OpenCL_Dev.cpp index c86e483cc5a8..d7c7951936f3 100644 --- a/src/CodeGen_OpenCL_Dev.cpp +++ b/src/CodeGen_OpenCL_Dev.cpp @@ -4,6 +4,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_OpenCL_Dev.h" @@ -184,22 +185,18 @@ string CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::print_reinterpret(Type type, const namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "get_local_id(0)"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "get_local_id(1)"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "get_local_id(2)"; - } else if (ends_with(name, ".__thread_id_w")) { - return "get_local_id(3)"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "get_group_id(0)"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "get_group_id(1)"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "get_group_id(2)"; - 
} else if (ends_with(name, ".__block_id_w")) { - return "get_group_id(3)"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return ""; @@ -210,10 +207,7 @@ void CodeGen_OpenCL_Dev::CodeGen_OpenCL_C::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The OpenCL backend does not support the gpu_lanes() scheduling directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() << print_type(Int(32)) << " " << print_name(loop->name) diff --git a/src/CodeGen_PTX_Dev.cpp b/src/CodeGen_PTX_Dev.cpp index 6be2f1b7e988..0d63427b8d83 100644 --- a/src/CodeGen_PTX_Dev.cpp +++ b/src/CodeGen_PTX_Dev.cpp @@ -1,5 +1,6 @@ #include "CodeGen_PTX_Dev.h" #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_LLVM.h" @@ -105,8 +106,8 @@ class CodeGen_PTX_Dev : public CodeGen_LLVM, public CodeGen_GPU_Dev { } Type upgrade_type_for_storage(const Type &t) const override; - /** Map from simt variable names (e.g. foo.__block_id_x) to the llvm - * ptx intrinsic functions to call to get them. */ + /** Map from simt variable names (e.g. foo.block_id_x) to the llvm ptx + * intrinsic functions to call to get them. */ std::string simt_intrinsic(const std::string &name); bool supports_atomic_add(const Type &t) const override; @@ -282,29 +283,25 @@ void CodeGen_PTX_Dev::visit(const Call *op) { } string CodeGen_PTX_Dev::simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "llvm.nvvm.read.ptx.sreg.tid.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "llvm.nvvm.read.ptx.sreg.tid.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "llvm.nvvm.read.ptx.sreg.tid.z"; - } else if (ends_with(name, ".__thread_id_w")) { - return "llvm.nvvm.read.ptx.sreg.tid.w"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "llvm.nvvm.read.ptx.sreg.ctaid.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "llvm.nvvm.read.ptx.sreg.ctaid.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "llvm.nvvm.read.ptx.sreg.ctaid.z"; - } else if (ends_with(name, ".__block_id_w")) { - return "llvm.nvvm.read.ptx.sreg.ctaid.w"; } internal_error << "simt_intrinsic called on bad variable name\n"; return ""; } void CodeGen_PTX_Dev::visit(const For *loop) { - if (is_gpu_var(loop->name)) { + if (is_gpu(loop->for_type)) { Expr simt_idx = Call::make(Int(32), simt_intrinsic(loop->name), std::vector(), Call::Extern); internal_assert(is_const_zero(loop->min)); sym_push(loop->name, codegen(simt_idx)); diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 39dd65b67671..157a3cbdc9ea 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -4,6 +4,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_Vulkan_Dev.h" @@ -381,12 +382,10 @@ class CheckAlignedDenseVectorLoadStore : public IRVisitor { struct 
FindWorkGroupSize : public IRVisitor { using IRVisitor::visit; void visit(const For *loop) override { - if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { - return loop->body.accept(this); - } + user_assert(loop->for_type != ForType::GPULane) + << "The Vulkan backend does not support the gpu_lanes() scheduling directive."; - if ((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) { + if (is_gpu(loop->for_type)) { // This should always be true at this point in codegen internal_assert(is_const_zero(loop->min)); @@ -411,11 +410,8 @@ struct FindWorkGroupSize : public IRVisitor { } int thread_loop_workgroup_index(const std::string &name) { - std::string ids[] = {".__thread_id_x", - ".__thread_id_y", - ".__thread_id_z"}; - for (size_t i = 0; i < sizeof(ids) / sizeof(std::string); i++) { - if (ends_with(name, ids[i])) { + for (size_t i = 0; i < 3; i++) { + if (ends_with(name, gpu_thread_name(i))) { return i; } } @@ -1630,20 +1626,18 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const AssertStmt *stmt) { namespace { std::pair simt_intrinsic(const std::string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return {"LocalInvocationId", 0}; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return {"LocalInvocationId", 1}; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return {"LocalInvocationId", 2}; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return {"WorkgroupId", 0}; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return {"WorkgroupId", 1}; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return {"WorkgroupId", 2}; - } else if (ends_with(name, "id_w")) { - user_error << "Vulkan only supports <=3 dimensions for gpu blocks"; } internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; return {"", -1}; @@ -1654,11 +1648,7 @@ std::pair simt_intrinsic(const std::string &name) { void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const For *op) { debug(2) << "CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(For): name=" << op->name << " min=" << op->min << " extent=" << op->extent << "\n"; - if (is_gpu_var(op->name)) { - internal_assert((op->for_type == ForType::GPUBlock) || - (op->for_type == ForType::GPUThread)) - << "kernel loops must be either gpu block or gpu thread\n"; - + if (is_gpu(op->for_type)) { // This should always be true at this point in codegen internal_assert(is_const_zero(op->min)); auto intrinsic = simt_intrinsic(op->name); @@ -2477,11 +2467,6 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func local_size_y_id, local_size_z_id}; - const char *local_size_names[3] = { - "__thread_id_x", - "__thread_id_y", - "__thread_id_z"}; - debug(1) << "Vulkan: Using dynamic workgroup local size with default of [" << local_size_x << ", " << local_size_y << ", " << local_size_z << "]...\n"; // annotate each local size with a corresponding specialization constant @@ -2489,8 +2474,8 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func SpvId constant_id = (uint32_t)(descriptor_set_table.back().specialization_constants.size() + 1); SpvBuilder::Literals spec_id = {constant_id}; builder.add_annotation(local_size_ids[dim], SpvDecorationSpecId, spec_id); - builder.add_symbol(local_size_names[dim], 
local_size_ids[dim], builder.current_module().id()); - SpecializationBinding spec_binding = {constant_id, (uint32_t)sizeof(uint32_t), local_size_names[dim]}; + builder.add_symbol(gpu_thread_name(dim), local_size_ids[dim], builder.current_module().id()); + SpecializationBinding spec_binding = {constant_id, (uint32_t)sizeof(uint32_t), gpu_thread_name(dim)}; descriptor_set_table.back().specialization_constants.push_back(spec_binding); descriptor_set_table.back().workgroup_size_binding.local_size_constant_id[dim] = constant_id; } @@ -2520,18 +2505,12 @@ namespace { class FindIntrinsicsUsed : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { auto intrinsic = simt_intrinsic(op->name); - intrinsics_used.insert(intrinsic.first); + intrinsics_used.insert(op->name); } op->body.accept(this); } - void visit(const Variable *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { - auto intrinsic = simt_intrinsic(op->name); - intrinsics_used.insert(intrinsic.first); - } - } public: std::unordered_set intrinsics_used; diff --git a/src/CodeGen_WebGPU_Dev.cpp b/src/CodeGen_WebGPU_Dev.cpp index de55113ff695..815013798bb4 100644 --- a/src/CodeGen_WebGPU_Dev.cpp +++ b/src/CodeGen_WebGPU_Dev.cpp @@ -4,6 +4,7 @@ #include #include +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CodeGen_Internal.h" #include "CodeGen_WebGPU_Dev.h" @@ -603,22 +604,18 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const FloatImm *op) { namespace { string simt_intrinsic(const string &name) { - if (ends_with(name, ".__thread_id_x")) { + if (ends_with(name, gpu_thread_name(0))) { return "local_id.x"; - } else if (ends_with(name, ".__thread_id_y")) { + } else if (ends_with(name, gpu_thread_name(1))) { return "local_id.y"; - } else if (ends_with(name, ".__thread_id_z")) { + } else if (ends_with(name, gpu_thread_name(2))) { return "local_id.z"; - } else if (ends_with(name, ".__thread_id_w")) { - user_error << "WebGPU does not support more than three dimensions.\n"; - } else if (ends_with(name, ".__block_id_x")) { + } else if (ends_with(name, gpu_block_name(0))) { return "group_id.x"; - } else if (ends_with(name, ".__block_id_y")) { + } else if (ends_with(name, gpu_block_name(1))) { return "group_id.y"; - } else if (ends_with(name, ".__block_id_z")) { + } else if (ends_with(name, gpu_block_name(2))) { return "group_id.z"; - } else if (ends_with(name, ".__block_id_w")) { - user_error << "WebGPU does not support more than three dimensions.\n"; } internal_error << "invalid simt_intrinsic name: " << name << "\n"; return ""; @@ -646,10 +643,7 @@ void CodeGen_WebGPU_Dev::CodeGen_WGSL::visit(const For *loop) { user_assert(loop->for_type != ForType::GPULane) << "The WebGPU backend does not support the gpu_lanes() directive."; - if (is_gpu_var(loop->name)) { - internal_assert((loop->for_type == ForType::GPUBlock) || - (loop->for_type == ForType::GPUThread)) - << "kernel loop must be either gpu block or gpu thread\n"; + if (is_gpu(loop->for_type)) { internal_assert(is_const_zero(loop->min)); stream << get_indent() diff --git a/src/DeviceArgument.cpp b/src/DeviceArgument.cpp index 82278be273e5..104538611a65 100644 --- a/src/DeviceArgument.cpp +++ b/src/DeviceArgument.cpp @@ -65,7 +65,7 @@ void HostClosure::visit(const Call *op) { } void HostClosure::visit(const For *loop) { - if (CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + if (is_gpu(loop->for_type)) { // The size of the threads and blocks is not part of the 
closure ScopedBinding<> p(ignore, loop->name); loop->body.accept(this); diff --git a/src/Expr.cpp b/src/Expr.cpp index a619661dedf6..c3a7deb483aa 100644 --- a/src/Expr.cpp +++ b/src/Expr.cpp @@ -87,6 +87,13 @@ bool is_parallel(ForType for_type) { for_type == ForType::GPULane); } +/** Returns true if for_type is GPUBlock, GPUThread, or GPULane. */ +bool is_gpu(ForType for_type) { + return (for_type == ForType::GPUBlock || + for_type == ForType::GPUThread || + for_type == ForType::GPULane); +} + } // namespace Internal Range::Range(const Expr &min_in, const Expr &extent_in) diff --git a/src/Expr.h b/src/Expr.h index 327462f973c0..31850fc56001 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -415,6 +415,9 @@ bool is_unordered_parallel(ForType for_type); /** Returns true if for_type executes for loop iterations in parallel. */ bool is_parallel(ForType for_type); +/** Returns true if for_type is GPUBlock, GPUThread, or GPULane. */ +bool is_gpu(ForType for_type); + /** A reference-counted handle to a statement node. */ struct Stmt : public IRHandle { Stmt() = default; diff --git a/src/FuseGPUThreadLoops.cpp b/src/FuseGPUThreadLoops.cpp index abde50d62e1f..4294f2ebc825 100644 --- a/src/FuseGPUThreadLoops.cpp +++ b/src/FuseGPUThreadLoops.cpp @@ -4,6 +4,7 @@ #include "Bounds.h" #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "CompilerLogger.h" #include "ExprUsesVar.h" @@ -29,17 +30,14 @@ using std::vector; namespace { -string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"}; -string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"}; - class ExtractBlockSize : public IRVisitor { - Expr block_extent[4], block_count[4]; - string block_var_name[4]; + Expr block_extent[3], block_count[3]; + string block_var_name[3]; using IRVisitor::visit; void found_thread_for(int dim, const string &name, const Expr &extent) { - internal_assert(dim >= 0 && dim < 4); + internal_assert(dim >= 0 && dim < 3); if (!block_extent[dim].defined()) { block_extent[dim] = extent; } else { @@ -48,17 +46,17 @@ class ExtractBlockSize : public IRVisitor { } void found_block_for(int dim, const string &name, Expr extent) { - internal_assert(dim >= 0 && dim < 4); + internal_assert(dim >= 0 && dim < 3); internal_assert(!block_count[dim].defined()); block_count[dim] = std::move(extent); block_var_name[dim] = name; } void visit(const For *op) override { - for (int i = 0; i < 4; i++) { - if (ends_with(op->name, thread_names[i])) { + for (int i = 0; i < 3; i++) { + if (ends_with(op->name, gpu_thread_name(i))) { found_thread_for(i, op->name, op->extent); - } else if (ends_with(op->name, block_names[i])) { + } else if (ends_with(op->name, gpu_block_name(i))) { found_block_for(i, op->name, op->extent); } } @@ -88,21 +86,21 @@ class ExtractBlockSize : public IRVisitor { public: int blocks_dimensions() const { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { if (!block_count[i].defined()) { return i; } } - return 4; + return 3; } int threads_dimensions() const { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { if (!block_extent[i].defined()) { return i; } } - return 4; + return 3; } Expr num_threads(int d) const { @@ -114,12 +112,13 @@ class ExtractBlockSize : public IRVisitor { } Expr block_var(int d) const { + // The name of the actual for loop return Variable::make(Int(32), block_var_name[d]); } Expr thread_var(int d) const { // Thread variables get canonical names - return Variable::make(Int(32), "." 
+ thread_names[d]); + return Variable::make(Int(32), gpu_thread_name(d)); } }; @@ -142,8 +141,8 @@ class NormalizeDimensionality : public IRMutator { return s; } while (max_depth < block_size.threads_dimensions()) { - string name = thread_names[max_depth]; - s = For::make("." + name, 0, 1, ForType::GPUThread, Partition::Never, device_api, s); + s = For::make(gpu_thread_name(max_depth), 0, 1, ForType::GPUThread, + Partition::Never, device_api, s); max_depth++; } return s; @@ -166,7 +165,8 @@ class NormalizeDimensionality : public IRMutator { } Stmt visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { + if (op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane) { depth++; if (depth > max_depth) { max_depth = depth; @@ -191,10 +191,11 @@ class ReplaceForWithIf : public IRMutator { const ExtractBlockSize &block_size; Stmt visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) { + if (op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane) { int dim; - for (dim = 0; dim < 4; dim++) { - if (ends_with(op->name, thread_names[dim])) { + for (dim = 0; dim < 3; dim++) { + if (ends_with(op->name, gpu_thread_name(dim))) { break; } } @@ -203,7 +204,7 @@ class ReplaceForWithIf : public IRMutator { Stmt body = mutate(op->body); - Expr var = Variable::make(Int(32), "." + thread_names[dim]); + Expr var = Variable::make(Int(32), gpu_thread_name(dim)); body = substitute(op->name, var + op->min, body); if (equal(op->extent, block_size.num_threads(dim))) { @@ -322,7 +323,7 @@ class ExtractSharedAndHeapAllocations : public IRMutator { } Stmt visit(const For *op) override { - bool is_thread_loop = CodeGen_GPU_Dev::is_gpu_thread_var(op->name); + bool is_thread_loop = op->for_type == ForType::GPUThread || op->for_type == ForType::GPULane; ScopedValue old_in_threads(in_threads, in_threads || is_thread_loop); // Set aside the allocations we've found so far. @@ -1366,7 +1367,7 @@ class FuseGPUThreadLoopsSingleKernel : public IRMutator { ExtractSharedAndHeapAllocations &block_allocations; Stmt visit(const For *op) override { - if (ends_with(op->name, ".__block_id_x")) { + if (ends_with(op->name, gpu_block_name(0))) { Stmt body = op->body; // This is the innermost loop over blocks. @@ -1407,17 +1408,17 @@ class FuseGPUThreadLoopsSingleKernel : public IRMutator { debug(3) << "Replaced for with if:\n" << body << "\n\n"; - // There is always a loop over thread_id_x - string thread_id = "." + thread_names[0]; + // There is always a loop over the innermost thread dimension + string thread_id = gpu_thread_name(0); // Add back in any register-level allocations body = register_allocs.rewrap(body, thread_id); body = For::make(thread_id, 0, block_size_x, innermost_loop_type, op->partition_policy, op->device_api, body); // Rewrap the whole thing in other loops over threads for (int i = 1; i < block_size.threads_dimensions(); i++) { - thread_id = "." + thread_names[i]; + thread_id = gpu_thread_name(i); body = register_allocs.rewrap(body, thread_id); - body = For::make("." 
+ thread_names[i], 0, block_size.num_threads(i), + body = For::make(thread_id, 0, block_size.num_threads(i), ForType::GPUThread, op->partition_policy, op->device_api, body); } thread_id.clear(); @@ -1452,14 +1453,15 @@ class FuseGPUThreadLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { - user_assert(!(CodeGen_GPU_Dev::is_gpu_thread_var(op->name))) + user_assert(!(op->for_type == ForType::GPUThread || + op->for_type == ForType::GPULane)) << "Loops over GPU thread variable: \"" << op->name << "\" is outside of any loop over a GPU block variable. " << "This schedule is malformed. There must be a GPU block " << "variable, and it must reordered to be outside all GPU " << "thread variables.\n"; - if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type == ForType::GPUBlock) { // Do the analysis of thread block size and shared memory // usage. ExtractBlockSize block_size; @@ -1498,7 +1500,7 @@ class ZeroGPULoopMins : public IRMutator { (op->device_api == DeviceAPI::Vulkan); Stmt stmt = IRMutator::visit(op); - if (CodeGen_GPU_Dev::is_gpu_var(op->name) && !is_const_zero(op->min)) { + if (is_gpu(op->for_type) && !is_const_zero(op->min)) { op = stmt.as(); internal_assert(op); Expr adjusted = Variable::make(Int(32), op->name) + op->min; @@ -1526,7 +1528,7 @@ class FindInnermostGPUBlock : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type == ForType::GPUBlock) { // Set the last found GPU block to found_gpu_block. found_gpu_block = op; } @@ -1567,7 +1569,7 @@ class NormalizeIfStatements : public IRMutator { bool inside_gpu_blocks = false; Stmt visit(const For *op) override { - if (!CodeGen_GPU_Dev::is_gpu_block_var(op->name)) { + if (op->for_type != ForType::GPUBlock) { return IRMutator::visit(op); } ScopedValue old_inside_gpu_blocks(inside_gpu_blocks, true); diff --git a/src/OffloadGPULoops.cpp b/src/OffloadGPULoops.cpp index 77a57efc1149..4a33c8f1bc00 100644 --- a/src/OffloadGPULoops.cpp +++ b/src/OffloadGPULoops.cpp @@ -1,5 +1,6 @@ #include +#include "CanonicalizeGPUVars.h" #include "Closure.h" #include "CodeGen_D3D12Compute_Dev.h" #include "CodeGen_GPU_Dev.h" @@ -31,13 +32,13 @@ namespace { // amount of shared memory to allocate. 
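A minimal standalone sketch of the dimension-indexed matching that the ExtractBounds visitor below adopts: loop over the three grid dimensions and compare loop names against a per-dimension canonical suffix, instead of branching on hard-coded strings. The suffix literals here are placeholders for illustration only; the real canonical names come from gpu_thread_name()/gpu_block_name() in CanonicalizeGPUVars.h.

#include <array>
#include <cassert>
#include <string>

// Placeholder canonical suffixes (illustrative only, not Halide's actual names).
static const std::array<std::string, 3> kThreadSuffix = {{".thread_id_x", ".thread_id_y", ".thread_id_z"}};

static bool ends_with(const std::string &s, const std::string &suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    std::array<int, 3> num_threads = {{1, 1, 1}};
    // Pretend we visited a loop named "f.s0.y.thread_id_y" with extent 16.
    const std::string loop_name = "f.s0.y.thread_id_y";
    const int extent = 16;
    for (int i = 0; i < 3; i++) {
        if (ends_with(loop_name, kThreadSuffix[i])) {
            num_threads[i] = extent;
        }
    }
    assert(num_threads[1] == 16);
    return 0;
}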
class ExtractBounds : public IRVisitor { public: - Expr num_threads[4]; - Expr num_blocks[4]; + Expr num_threads[3]; + Expr num_blocks[3]; Expr shared_mem_size; ExtractBounds() : shared_mem_size(0) { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 3; i++) { num_threads[i] = num_blocks[i] = 1; } } @@ -48,26 +49,17 @@ class ExtractBounds : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { internal_assert(is_const_zero(op->min)); } - if (ends_with(op->name, ".__thread_id_x")) { - num_threads[0] = op->extent; - } else if (ends_with(op->name, ".__thread_id_y")) { - num_threads[1] = op->extent; - } else if (ends_with(op->name, ".__thread_id_z")) { - num_threads[2] = op->extent; - } else if (ends_with(op->name, ".__thread_id_w")) { - num_threads[3] = op->extent; - } else if (ends_with(op->name, ".__block_id_x")) { - num_blocks[0] = op->extent; - } else if (ends_with(op->name, ".__block_id_y")) { - num_blocks[1] = op->extent; - } else if (ends_with(op->name, ".__block_id_z")) { - num_blocks[2] = op->extent; - } else if (ends_with(op->name, ".__block_id_w")) { - num_blocks[3] = op->extent; + for (int i = 0; i < 3; i++) { + if (ends_with(op->name, gpu_thread_name(i))) { + num_threads[i] = op->extent; + } + if (ends_with(op->name, gpu_block_name(i))) { + num_blocks[i] = op->extent; + } } op->body.accept(this); @@ -127,7 +119,7 @@ class InjectGpuOffload : public IRMutator { using IRMutator::visit; Stmt visit(const For *loop) override { - if (!CodeGen_GPU_Dev::is_gpu_var(loop->name)) { + if (!is_gpu(loop->for_type)) { return IRMutator::visit(loop); } @@ -142,12 +134,10 @@ class InjectGpuOffload : public IRMutator { debug(2) << "Kernel bounds: (" << bounds.num_threads[0] << ", " << bounds.num_threads[1] << ", " - << bounds.num_threads[2] << ", " - << bounds.num_threads[3] << ") threads, (" + << bounds.num_threads[2] << ") threads, (" << bounds.num_blocks[0] << ", " << bounds.num_blocks[1] << ", " - << bounds.num_blocks[2] << ", " - << bounds.num_blocks[3] << ") blocks\n"; + << bounds.num_blocks[2] << ") blocks\n"; // compute a closure over the state passed into the kernel HostClosure c; @@ -222,10 +212,6 @@ class InjectGpuOffload : public IRMutator { } arg_is_buffer.emplace_back(cast(0)); - // TODO: only three dimensions can be passed to - // cuLaunchKernel. How should we handle blkid[3]? - internal_assert(is_const_one(bounds.num_threads[3]) && is_const_one(bounds.num_blocks[3])) - << bounds.num_threads[3] << ", " << bounds.num_blocks[3] << "\n"; debug(3) << "bounds.num_blocks[0] = " << bounds.num_blocks[0] << "\n"; debug(3) << "bounds.num_blocks[1] = " << bounds.num_blocks[1] << "\n"; debug(3) << "bounds.num_blocks[2] = " << bounds.num_blocks[2] << "\n"; diff --git a/src/PartitionLoops.cpp b/src/PartitionLoops.cpp index 99b7a7cc25e1..a17f5db5b7c1 100644 --- a/src/PartitionLoops.cpp +++ b/src/PartitionLoops.cpp @@ -2,6 +2,7 @@ #include #include "CSE.h" +#include "CanonicalizeGPUVars.h" #include "CodeGen_GPU_Dev.h" #include "ExprUsesVar.h" #include "IREquality.h" @@ -566,8 +567,7 @@ class PartitionLoops : public IRMutator { } } mutation_checker{op, op->partition_policy == Partition::Always}; - ScopedValue old_in_gpu_loop(in_gpu_loop, in_gpu_loop || - CodeGen_GPU_Dev::is_gpu_var(op->name)); + ScopedValue old_in_gpu_loop(in_gpu_loop, in_gpu_loop || is_gpu(op->for_type)); // If we're inside GPU kernel, and the body contains thread // barriers or warp shuffles, it's not safe to partition loops. 
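The checks above now key off the loop's ForType rather than its name. A standalone sketch of that classification, using a reduced stand-in for Halide's ForType enum (the real one lives in Expr.h and has more members), mirroring the is_gpu() helper added in Expr.cpp above:

#include <cassert>

// Reduced stand-in for Halide::Internal::ForType (illustrative only).
enum class ForType { Serial, Parallel, Vectorized, Unrolled, GPUBlock, GPUThread, GPULane };

// Mirrors the is_gpu() helper: a loop is a GPU loop if its ForType says so,
// regardless of what the loop variable happens to be named.
bool is_gpu(ForType t) {
    return t == ForType::GPUBlock ||
           t == ForType::GPUThread ||
           t == ForType::GPULane;
}

int main() {
    assert(is_gpu(ForType::GPUThread));
    assert(is_gpu(ForType::GPULane));
    assert(!is_gpu(ForType::Parallel));
    return 0;
}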
@@ -877,12 +877,12 @@ class RenormalizeGPULoops : public IRMutator { bool old_in_gpu_loop = in_gpu_loop; Stmt stmt; - if (in_gpu_loop || CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (in_gpu_loop || is_gpu(op->for_type)) { gpu_vars.push(op->name); in_gpu_loop = true; } - if (ends_with(op->name, "__thread_id_x")) { + if (ends_with(op->name, gpu_thread_name(0))) { internal_assert(!in_thread_loop); in_thread_loop = true; stmt = IRMutator::visit(op); diff --git a/src/TrimNoOps.cpp b/src/TrimNoOps.cpp index 25c164ed44b4..bbcf0dd3fdfb 100644 --- a/src/TrimNoOps.cpp +++ b/src/TrimNoOps.cpp @@ -355,7 +355,7 @@ class TrimNoOps : public IRMutator { Stmt visit(const For *op) override { // Bounds of GPU loops can't depend on outer gpu loop vars - if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + if (is_gpu(op->for_type)) { debug(3) << "TrimNoOps found gpu loop var: " << op->name << "\n"; return IRMutator::visit(op); } diff --git a/test/correctness/fuse_gpu_threads.cpp b/test/correctness/fuse_gpu_threads.cpp index 9ddba37db2b7..63361e76b928 100644 --- a/test/correctness/fuse_gpu_threads.cpp +++ b/test/correctness/fuse_gpu_threads.cpp @@ -6,8 +6,7 @@ using namespace Halide::Internal; class CheckThreadExtent : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { - if ((op->name == ".__thread_id_x") || (op->name == ".__thread_id_y")) { - assert(op->for_type == ForType::GPUThread); + if (op->for_type == ForType::GPUThread) { // Assert the min and extent to be 0 and 16 for this particular test case const int64_t *min = as_const_int(op->min); const int64_t *extent = as_const_int(op->extent); @@ -19,6 +18,11 @@ class CheckThreadExtent : public IRVisitor { }; int main(int argc, char **argv) { + // Canonical GPU for loop names are uniqued to make sure they don't collide + // with user-provided names. We'll test that works by trying for a collision: + unique_name("thread_id_x"); + unique_name("block_id_x"); + Target target = get_jit_target_from_environment(); if (!target.has_gpu_feature()) { printf("[SKIP] No GPU target enabled.\n"); @@ -51,7 +55,7 @@ int main(int argc, char **argv) { .vectorize(x, 4, TailStrategy::RoundUp) .gpu_threads(x, y); - // Lower it and inspect the IR to verify the min/extent of GPU ".__thread_id_x" + // Lower it and inspect the IR to verify the min/extent of GPU thread loops Module m = consumer.compile_to_module({consumer.infer_arguments()}, "fuse_gpu_threads", target); CheckThreadExtent c; m.functions().front().body.accept(&c); From 10e07e647ccc9b1d0e0523b8c110f40722fc7525 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Tue, 5 Mar 2024 09:53:29 -0800 Subject: [PATCH 081/186] Add class template type deduction guides to avoid CTAD warning. (#8135) * Add class template type dedeuction guides to avoid CTAD warning. * Formatting. --- src/Debug.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Debug.h b/src/Debug.h index 9f47a5aebeb6..432ba07dc115 100644 --- a/src/Debug.h +++ b/src/Debug.h @@ -77,6 +77,9 @@ struct PrintSpan { : span(span) { } }; +// Class template argument deduction (CTAD) guide to prevent warnings. +template +PrintSpan(const T &) -> PrintSpan; template inline StreamT &operator<<(StreamT &stream, const PrintSpan &wrapper) { @@ -108,6 +111,9 @@ struct PrintSpanLn { : span(span) { } }; +// Class template argument deduction (CTAD) guide to prevent warnings. 
+template +PrintSpanLn(const T &) -> PrintSpanLn; template inline StreamT &operator<<(StreamT &stream, const PrintSpanLn &wrapper) { From 754e6ec9c076733971895bb7f8fe087e3bde9e11 Mon Sep 17 00:00:00 2001 From: Derek Gerstmann Date: Wed, 6 Mar 2024 11:46:23 -0800 Subject: [PATCH 082/186] [vulkan] Add conform API methods to memory allocator to fix block allocations (#8130) * Add conform API methods to block and region allocator classes Override conform requests for Vulkan memory allocator Cleanup memory requirement constraints for Vulkan Add conform test cases to block_allocator runtime test. * Clang format/tidy pas * Fix unsigned int comparisons * Clang format pass * Fix other unsigned int comparisons * Fix mismatched template types for max() * Fix whitespace for clang format --------- Co-authored-by: Derek Gerstmann --- src/runtime/internal/block_allocator.h | 132 ++++++++----- src/runtime/internal/memory_resources.h | 4 + src/runtime/internal/region_allocator.h | 184 ++++++++++-------- src/runtime/vulkan_memory.h | 241 ++++++++++++++++++------ test/runtime/block_allocator.cpp | 189 ++++++++++++++++++- 5 files changed, 566 insertions(+), 184 deletions(-) diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index feee56a4e531..89b1a929e79b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,10 +55,11 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,13 +87,13 @@ class BlockAllocator { int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure - BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *reserve_block_entry(void *user_context, const MemoryRequest &request); // Locates the "best-fit" block entry for the requested size, or nullptr if none was found - BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *find_block_entry(void *user_context, const MemoryRequest &request); - // Creates a new block entry and int the list - BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + // Creates a new block entry and adds it tos the list + BlockEntry *create_block_entry(void *user_context, const MemoryRequest &request); // Releases the 
block entry from being used, and makes it available for further allocations int release_block_entry(void *user_context, BlockEntry *block_entry); @@ -113,7 +114,7 @@ class BlockAllocator { bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; // Returns true if the given block is suitable for the request allocation - bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const; Config config; LinkedList block_list; @@ -162,7 +163,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "caching=" << halide_memory_caching_name(request.properties.caching) << " " << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + // Reserve a block entry for use + BlockEntry *block_entry = reserve_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" << (int32_t)(request.size) << " bytes)\n"; @@ -173,11 +175,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r halide_abort_if_false(user_context, block != nullptr); halide_abort_if_false(user_context, block->allocator != nullptr); + // Reserve an initial memory region for the block MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { // Unable to reserve region in an existing block ... create a new block and try again. - block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); + block_entry = create_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" << (int32_t)(request.size) << " bytes)\n"; @@ -299,8 +302,8 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { - if (!is_compatible_block(block, properties)) { +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const { + if (!is_compatible_block(block, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" << "block_resource=" << (void *)block << " " @@ -309,16 +312,16 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " - << "request_size=" << (uint32_t)size << " " - << "request_usage=" << halide_memory_usage_name(properties.usage) << " " - << "request_caching=" << halide_memory_caching_name(properties.caching) << " " - << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; } - if (dedicated && (block->reserved > 0)) { + if (request.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" << "block_resource=" << (void *)block << " " @@ -340,7 +343,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } size_t available = (block->memory.size - block->reserved); - if (available >= size) { + if (available >= request.size) { return true; } @@ -348,23 +351,23 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::find_block_entry(void *user_context, const MemoryRequest &request) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); - if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { + if (is_block_suitable_for_request(user_context, block, request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: found suitable block (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)block << " " << "block_size=" << (uint32_t)block->memory.size << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_entry; } @@ -375,37 +378,37 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: couldn't find suitable block! (" << "user_context=" << (void *)(user_context) << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } return block_entry; } BlockAllocator::BlockEntry * -BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::reserve_block_entry(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: reserving block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + BlockEntry *block_entry = find_block_entry(user_context, request); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: creating block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - block_entry = create_block_entry(user_context, properties, size, dedicated); + block_entry = create_block_entry(user_context, request); } if (block_entry) { @@ -449,7 +452,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator } BlockAllocator::BlockEntry * -BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::create_block_entry(void *user_context, const MemoryRequest &request) { if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" << (int32_t)(config.maximum_pool_size) << " bytes or " @@ -476,12 +479,16 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif + // Constrain the request to the a valid block allocation + MemoryRequest block_request = request; + conform(user_context, &block_request); + + // Create the block resource itself BlockResource *block = static_cast(block_entry->value); - block->memory.size = constrain_requested_size(size); + block->memory.size = block_request.size; block->memory.handle = nullptr; - block->memory.properties = properties; - block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); - block->memory.dedicated = dedicated; + block->memory.properties = block_request.properties; + block->memory.dedicated = block_request.dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); alloc_memory_block(user_context, block); @@ -561,6 +568,33 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { return actual_size; } +int BlockAllocator::conform(void *user_context, MemoryRequest *request) const { + + request->properties.nearest_multiple = max(config.nearest_multiple, request->properties.nearest_multiple); + + if (request->properties.nearest_multiple) { + size_t nm = request->properties.nearest_multiple; + request->size = (((request->size + nm - 1) / nm) * nm); // round up to nearest multiple + } + + if (config.minimum_block_size) { + request->size = ((request->size < config.minimum_block_size) ? + config.minimum_block_size : + request->size); + } + if (config.maximum_block_size) { + request->size = ((request->size > config.maximum_block_size) ? 
+ config.maximum_block_size : + request->size); + } + + if (allocators.block.conform) { + return allocators.block.conform(user_context, request); + } + + return 0; +} + bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { if (properties.caching != MemoryCaching::DefaultCaching) { if (properties.caching != block->memory.properties.caching) { diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index d41fa57304fb..0be6041519a1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -202,18 +202,22 @@ struct HalideSystemAllocatorFns { typedef int (*AllocateBlockFn)(void *, MemoryBlock *); typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*ConformBlockRequestFn)(void *, MemoryRequest *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; + ConformBlockRequestFn conform = nullptr; }; typedef int (*AllocateRegionFn)(void *, MemoryRegion *); typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*ConformBlockRegionFn)(void *, MemoryRequest *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; DeallocateRegionFn deallocate = nullptr; + ConformBlockRegionFn conform = nullptr; }; // -- diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 02c2cd7e6aa0..3588389c3747 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -46,10 +46,11 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -73,13 +74,13 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; + bool can_split(const BlockRegion *region, const MemoryRequest &request) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining - BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request); // Creates a new block region and adds it to the region list - BlockRegion *create_block_region(void *user_context, const MemoryProperties 
&properties, size_t offset, size_t size, bool dedicated); + BlockRegion *create_block_region(void *user_context, const MemoryRequest &request); // Creates a new block region and adds it to the region list int destroy_block_region(void *user_context, BlockRegion *region); @@ -137,30 +138,55 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); halide_abort_if_false(user_context, arena != nullptr); + MemoryRequest block_request = {}; + block_request.size = block->memory.size; + block_request.offset = 0; + block_request.alignment = block->memory.properties.alignment; + block_request.properties = block->memory.properties; + block_request.dedicated = block->memory.dedicated; block->allocator = this; - block->regions = create_block_region( - user_context, - block->memory.properties, - 0, block->memory.size, - block->memory.dedicated); + block->regions = create_block_region(user_context, block_request); + return 0; +} + +int RegionAllocator::conform(void *user_context, MemoryRequest *request) const { + if (allocators.region.conform) { + return allocators.region.conform(user_context, request); + } else { + size_t actual_alignment = conform_alignment(request->alignment, block->memory.properties.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, block->memory.properties.nearest_multiple); + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + } return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + + MemoryRequest region_request = request; + + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; +#endif + return nullptr; + } + size_t remaining = block->memory.size - block->reserved; - if (remaining < actual_size) { + if (remaining < region_request.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "-- requested size (" << (int32_t)(region_request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } - BlockRegion *block_region = find_block_region(user_context, request); + BlockRegion *block_region = find_block_region(user_context, region_request); if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" @@ -169,12 +195,12 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return nullptr; } - if (can_split(block_region, request.size, request.alignment)) { + if (can_split(block_region, region_request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; + << "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)"; #endif - split_block_region(user_context, block_region, request.size, request.alignment); + split_block_region(user_context, block_region, region_request); } alloc_block_region(user_context, block_region); @@ -237,8 +263,17 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! Unable to reserve memory ...\n"; +#endif + return false; + } + // skip incompatible block regions for this request - if (!is_compatible_block_region(region, request.properties)) { + if (!is_compatible_block_region(region, region_request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... incompatible properties! (" << " block_region=" << (void *)region @@ -248,16 +283,13 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); - // is the adjusted size larger than the current region? - if (actual_size > region->memory.size) { + if (region_request.size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... not enough space for adjusted size! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -265,12 +297,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } // will the adjusted size fit within the remaining unallocated space? 
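For reference, a standalone sketch of the conform-style rounding behind the "adjusted size" used in this check, with simplified signatures. Halide's actual conform_alignment(), aligned_offset() and conform_size() helpers in memory_resources.h take additional parameters; the arithmetic below only illustrates the idea of aligning the offset and rounding the size up to a nearest multiple.

#include <cassert>
#include <cstddef>

// Round an offset up to the given alignment (alignment must be a power of two).
size_t aligned_offset(size_t offset, size_t alignment) {
    return (offset + alignment - 1) & ~(alignment - 1);
}

// Round a size up to the nearest multiple of nearest_multiple.
size_t round_up(size_t size, size_t nearest_multiple) {
    return ((size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple;
}

int main() {
    // e.g. a 13-byte request at offset 5 with 8-byte alignment and a
    // 32-byte nearest_multiple becomes offset 8, size 32.
    assert(aligned_offset(5, 8) == 8);
    assert(round_up(13, 32) == 32);
    assert(round_up(33, 32) == 64);
    return 0;
}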
- if ((actual_size + block->reserved) <= block->memory.size) { + if ((region_request.size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " found suitable block region! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -411,13 +443,11 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const { + return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)); } -BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -434,33 +464,17 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t split_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); - size_t empty_size = block_region->memory.size - split_size; - -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment (" - << "requested_size=" << (uint32_t)size << " " - << "split_size=" << (uint32_t)split_size << " " - << "split_offset=" << (uint32_t)split_size << " " - << "empty_size=" << (uint32_t)empty_size << " " - << "requested_alignment=" << (uint32_t)alignment << " " - << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << "actual_alignment=" << (uint32_t)actual_alignment << ")"; -#endif + MemoryRequest split_request = request; + split_request.size = block_region->memory.size - request.size; + split_request.offset = block_region->memory.offset + request.size; #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; + << "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)"; #endif - BlockRegion *next_region = block_region->next_ptr; - BlockRegion *empty_region = create_block_region(user_context, - block_region->memory.properties, - split_offset, empty_size, - block_region->memory.dedicated); + 
BlockRegion *empty_region = create_block_region(user_context, split_request); halide_abort_if_false(user_context, empty_region != nullptr); empty_region->next_ptr = next_region; @@ -469,42 +483,52 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size -= empty_size; + block_region->memory.size -= empty_region->memory.size; return empty_region; } -BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; -#endif - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - - if (actual_size == 0) { - error(user_context) << "RegionAllocator: Failed to allocate new block region ... region size was zero!\n"; + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "alignment=" << (uint32_t)request.properties.alignment << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; +#endif + + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform request for new block region!\n"; +#endif + return nullptr; + } + + if (region_request.size == 0) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; +#endif return nullptr; } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { - error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#endif return nullptr; } block_region->memory.handle = nullptr; - block_region->memory.offset = actual_offset; - block_region->memory.size = actual_size; - block_region->memory.properties = properties; - block_region->memory.dedicated = dedicated; + block_region->memory.offset = region_request.offset; + block_region->memory.size = region_request.size; + block_region->memory.properties = region_request.properties; + block_region->memory.dedicated = region_request.dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; block_region->usage_count = 0; @@ -669,6 +693,8 @@ bool RegionAllocator::collect(void *user_context) { uint32_t collected_count = 0; uint32_t remaining_count = 0; + uint64_t available_bytes = 0; + uint64_t scanned_bytes = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " @@ -679,6 +705,8 @@ bool RegionAllocator::collect(void *user_context) { bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + scanned_bytes += block_region->memory.size; debug(user_context) << " checking region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -687,6 +715,7 @@ bool RegionAllocator::collect(void *user_context) { << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")"; +#endif if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -705,6 +734,9 @@ bool RegionAllocator::collect(void *user_context) { remaining_count++; #endif } +#ifdef DEBUG_RUNTIME_INTERNAL + available_bytes += is_available(block_region) ? 
block_region->memory.size : 0; +#endif if (is_last_block_region(user_context, block_region)) { break; } @@ -715,6 +747,8 @@ bool RegionAllocator::collect(void *user_context) { << "block_ptr=" << (void *)block << " " << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " << "block_reserved=" << (uint32_t)(block->reserved) << " " + << "scanned_bytes=" << (uint32_t)(scanned_bytes) << " " + << "available_bytes=" << (uint32_t)(available_bytes) << " " << ")"; #endif diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 96535f3446ba..055fbef72277 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,11 +58,12 @@ class VulkanMemoryAllocator { static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods - MemoryRegion *reserve(void *user_context, MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + int conform(void *user_context, MemoryRequest *request); //< conforms the given memory request into one that can be allocated + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,9 +87,11 @@ class VulkanMemoryAllocator { static int allocate_block(void *instance_ptr, MemoryBlock *block); static int deallocate_block(void *instance_ptr, MemoryBlock *block); + static int conform_block_request(void *instance_ptr, MemoryRequest *request); static int allocate_region(void *instance_ptr, MemoryRegion *region); static int deallocate_region(void *instance_ptr, MemoryRegion *region); + static int conform_region_request(void *instance_ptr, MemoryRequest *request); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -113,6 +116,8 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; + int lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements); + size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -180,8 +185,8 @@ int VulkanMemoryAllocator::initialize(void *user_context, block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; - allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; - allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block, VulkanMemoryAllocator::conform_block_request}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region, VulkanMemoryAllocator::conform_region_request}; BlockAllocator::Config block_allocator_config = {0}; 
block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; @@ -202,7 +207,7 @@ int VulkanMemoryAllocator::initialize(void *user_context, return halide_error_code_success; } -MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, const MemoryRequest &request) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " @@ -272,6 +277,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid memory range !\n"; return nullptr; } +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: MapMemory (" << "user_context=" << user_context << "\n" << " region_size=" << (uint32_t)region->size << "\n" @@ -279,8 +285,8 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n" << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n" << " memory_offset=" << (uint32_t)memory_offset << "\n" - << " memory_size=" << (uint32_t)memory_size << ") ...\n"; - + << " memory_size=" << (uint32_t)memory_size << "\n)\n"; +#endif VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -528,6 +534,79 @@ VulkanMemoryAllocator::default_config() { } // -- +int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements (" + << "user_context=" << user_context << " " + << "size=" << (uint32_t)block->size << ", " + << "usage_flags=" << usage_flags << ") ... 
\n"; +#endif + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + // Create a buffer to determine alignment requirements + VkBuffer buffer = {0}; + VkResult result = vkCreateBuffer(this->device, &create_info, this->alloc_callbacks, &buffer); + if (result != VK_SUCCESS) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer to find requirements!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; +#endif + return halide_error_code_device_malloc_failed; + } + + vkGetBufferMemoryRequirements(this->device, buffer, memory_requirements); + vkDestroyBuffer(this->device, buffer, this->alloc_callbacks); + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_block_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming block request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(request) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform block request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = instance->select_memory_usage(user_context, request->properties); + int error_code = instance->lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)request->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (request->dedicated ? 
"true" : "false") << ")\n"; +#endif + + request->size = memory_requirements.size; + request->properties.alignment = memory_requirements.alignment; + return halide_error_code_success; +} int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -587,53 +666,6 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; #endif - uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); - - VkBufferCreateInfo create_info = { - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type - nullptr, // struct extending this - 0, // create flags - sizeof(uint32_t), // buffer size (in bytes) - usage_flags, // buffer usage flags - VK_SHARING_MODE_EXCLUSIVE, // sharing mode - 0, nullptr}; - - // Create a buffer to determine alignment requirements - VkBuffer buffer = {0}; - result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); - if (result != VK_SUCCESS) { - debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return halide_error_code_device_malloc_failed; - } - - VkMemoryRequirements memory_requirements = {0}; - vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); - vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); - -#if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "required_size=" << (uint32_t)memory_requirements.size << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n"; -#endif - - // Enforce any alignment constrainst reported by the device limits for each usage type - if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; - } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; - } - // Some drivers appear to report a buffer alignment constraint (regardless of usage) that can be larger than either of the above - if (memory_requirements.alignment > block->properties.alignment) { - block->properties.alignment = memory_requirements.alignment; - } - if (memory_requirements.alignment > block->properties.nearest_multiple) { - block->properties.nearest_multiple = memory_requirements.alignment; - } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -814,6 +846,98 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- +int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) { + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. 
+ // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = select_memory_usage(user_context, request->properties); + int error_code = lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + // Enforce any alignment constraints reported by the device limits for each usage type + if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minStorageBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minStorageBufferOffsetAlignment; + } + } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minUniformBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minUniformBufferOffsetAlignment; + } + } + + // Ensure the request ends on an aligned address + if (request->alignment > config.nearest_multiple) { + request->properties.nearest_multiple = request->alignment; + } + + size_t actual_alignment = conform_alignment(request->alignment, memory_requirements.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, memory_requirements.size, actual_alignment, request->properties.nearest_multiple); + +#if defined(HL_VK_DEBUG_MEM) + if ((request->size != actual_size) || (request->alignment != actual_alignment) || (request->offset != actual_offset)) { + debug(nullptr) << "VulkanMemoryAllocator: Adjusting request to match requirements (\n" + << " size = " << (uint64_t)request->size << " => " << (uint64_t)actual_size << ",\n" + << " alignment = " << (uint64_t)request->alignment << " => " << (uint64_t)actual_alignment << ",\n" + << " offset = " << (uint64_t)request->offset << " => " << (uint64_t)actual_offset << ",\n" + << " required.size = " << (uint64_t)memory_requirements.size << ",\n" + << " required.alignment = " << (uint64_t)memory_requirements.alignment << "\n)\n"; + } +#endif + request->size = actual_size; + request->alignment = actual_alignment; + request->offset = actual_offset; + + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming region request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(region) << ") ... 
\n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform region request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanRegionAllocator: Conforming region request (" + << "size=" << (uint32_t)request->size << ", " + << "offset=" << (uint32_t)request->offset << ", " + << "dedicated=" << (request->dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request->properties.usage) << " " + << "caching=" << halide_memory_caching_name(request->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request->properties.visibility) << ")\n"; +#endif + + return instance->conform(user_context, request); +} + int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -890,7 +1014,8 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg if (memory_requirements.size > region->size) { vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); #ifdef DEBUG_RUNTIME - debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" + << (uint64_t)region->size << " => " << (uint64_t)memory_requirements.size << " bytes) ...\n"; #endif create_info.size = memory_requirements.size; VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b2190f63b592..26ce8066e118 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -1,3 +1,7 @@ +// NOTE: Uncomment the following two defines to enable debug output +// #define DEBUG_RUNTIME +// #define DEBUG_RUNTIME_INTERNAL + #include "HalideRuntime.h" #include "common.h" @@ -39,6 +43,17 @@ int deallocate_block(void *user_context, MemoryBlock *block) { return halide_error_code_success; } +int conform_block(void *user_context, MemoryRequest *request) { + + debug(user_context) << "Test : conform_block (" + << "request_size=" << int32_t(request->size) << " " + << "request_offset=" << int32_t(request->offset) << " " + << "request_alignment=" << int32_t(request->alignment) << " " + << ") ..."; + + return halide_error_code_success; +} + int allocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)1; allocated_region_memory += region->size; @@ -65,17 +80,38 @@ int deallocate_region(void *user_context, MemoryRegion *region) { return halide_error_code_success; } +int conform_region(void *user_context, MemoryRequest *request) { + size_t actual_alignment = conform_alignment(request->alignment, 0); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, actual_alignment); + + debug(user_context) << "Test : conform_region (\n " + << "request_size=" << int32_t(request->size) << "\n " + << "request_offset=" << int32_t(request->offset) << "\n " + << "request_alignment=" << int32_t(request->alignment) << "\n " + << "actual_size=" << int32_t(actual_size) << "\n " + << "actual_offset=" << int32_t(actual_offset) << "\n " + << "actual_alignment=" << 
int32_t(actual_alignment) << "\n" + << ") ..."; + + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + return halide_error_code_success; +} + } // end namespace int main(int argc, char **argv) { void *user_context = (void *)1; SystemMemoryAllocatorFns system_allocator = {allocate_system, deallocate_system}; - MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; - MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; // test region allocator class interface { + // Use custom conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; BlockResource block_resource = {}; @@ -164,8 +200,104 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test region allocator conform request + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + // test zero size request + MemoryRequest request = {0}; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=" << int32_t(request.size) << " " + << "request.alignment=" << int32_t(request.alignment) << " " + << ") ..."; + + halide_abort_if_false(user_context, request.size == size_t(0)); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + size_t nm = padded_size; + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + // test round up size and offset to alignment + request.size = 1; + request.offset = 1; + request.alignment = 32; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size == 32); + halide_abort_if_false(user_context, request.offset == 32); + halide_abort_if_false(user_context, request.alignment == 32); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t os = 1; os < sz; ++os) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.offset = os; + 
request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.offset=(" << os << " => " << int32_t(request.offset) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.offset == aligned_offset(os, a)); + halide_abort_if_false(user_context, request.alignment == a); + } + } + } + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // test region allocator nearest_multiple padding { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; size_t padded_size = 32; @@ -245,6 +377,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -296,11 +431,58 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test conform request + { + uint32_t mbs = 1024; // min block size + BlockAllocator::Config config = {0}; + config.minimum_block_size = mbs; + + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 0); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : block_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(mbs, (((sz + a - 1) / a) * a))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + BlockAllocator::destroy(user_context, instance); + 
HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -340,6 +522,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); From 22868a4db5f3a3a142ed7bc457fd9fd9ee6bdd76 Mon Sep 17 00:00:00 2001 From: Prasoon Mishra Date: Thu, 7 Mar 2024 03:10:00 +0530 Subject: [PATCH 083/186] Add sobel in hexagon benchmarks app for CMake builds (#8127) * Add sobel in hexagon_benchmarks app for CMake builds Resolved compilation errors caused by the eliminate interleave pass, which changed the instruction from halide.hexagon.pack_satub.vuh to halide.hexagon.trunc_satub.vuh. The latter is only available in v65 or later. This commit ensures compatibility with v65 and later versions. * Minor fix to address the issue. --------- Co-authored-by: Steven Johnson --- apps/hexagon_benchmarks/CMakeLists.txt | 9 ++-- apps/hexagon_benchmarks/process.cpp | 5 +- src/HexagonOptimize.cpp | 66 ++++++++++++++++---------- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt index 9cbcc541b76a..c01ad22035bd 100644 --- a/apps/hexagon_benchmarks/CMakeLists.txt +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -22,23 +22,24 @@ endmacro() add_generator_and_library(dilate3x3) add_generator_and_library(gaussian5x5) add_generator_and_library(median3x3) +add_generator_and_library(sobel) # Main executable add_executable(process process.cpp) target_compile_options(process PRIVATE $<$:-O2>) if (Halide_TARGET MATCHES "hvx") - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL TARGET_HAS_HVX) else() - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL) endif() target_link_libraries(process PRIVATE Halide::Tools - dilate3x3 gaussian5x5 median3x3) + dilate3x3 gaussian5x5 median3x3 sobel) # Test that the app actually works! add_test(NAME hexagon_benchmarks COMMAND process -n 1) set_tests_properties(hexagon_benchmarks PROPERTIES LABELS hexagon_benchmarks PASS_REGULAR_EXPRESSION "Success!" 
- SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") + SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") \ No newline at end of file diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 87a492c577d1..def519963ad0 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -43,10 +43,11 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); + SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { @@ -85,4 +86,4 @@ int main(int argc, char **argv) { printf("Success!\n"); return 0; -} +} \ No newline at end of file diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index deabd95d1d1b..f11fa3348399 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1685,6 +1685,14 @@ class EliminateInterleaves : public IRMutator { return true; } + // Indicates the minimum Hexagon Vector Extension (HVX) target version required for using these instructions. + enum class HvxTarget { + v62orLater, // Use for Hexagon v62 target or later + v65orLater, // Use for Hexagon v65 target or later + v66orLater, // Use for Hexagon v66 target or later + }; + HvxTarget hvx_target; + Expr visit(const Call *op) override { vector args(op->args); @@ -1702,27 +1710,27 @@ class EliminateInterleaves : public IRMutator { // does not deinterleave, and then opportunistically select // the interleaving alternative when we can cancel out to the // interleave. - static std::map deinterleaving_alts = { - {"halide.hexagon.pack.vh", "halide.hexagon.trunc.vh"}, - {"halide.hexagon.pack.vw", "halide.hexagon.trunc.vw"}, - {"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"}, - {"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"}, - {"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"}, - {"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"}, - {"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"}, - {"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"}, - {"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"}, + static std::map> deinterleaving_alts = { + {"halide.hexagon.pack.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc.vh"}}, + {"halide.hexagon.pack.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc.vw"}}, + {"halide.hexagon.packhi.vh", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vh"}}, + {"halide.hexagon.packhi.vw", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vw"}}, + {"halide.hexagon.pack_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc_satub.vh"}}, + {"halide.hexagon.pack_satub.vuh", {HvxTarget::v65orLater, "halide.hexagon.trunc_satub.vuh"}}, + {"halide.hexagon.pack_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_sath.vw"}}, + {"halide.hexagon.pack_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vw"}}, + {"halide.hexagon.pack_satuh.vuw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vuw"}}, }; // The reverse mapping of the above. 
- static std::map interleaving_alts = { - {"halide.hexagon.trunc.vh", "halide.hexagon.pack.vh"}, - {"halide.hexagon.trunc.vw", "halide.hexagon.pack.vw"}, - {"halide.hexagon.trunclo.vh", "halide.hexagon.packhi.vh"}, - {"halide.hexagon.trunclo.vw", "halide.hexagon.packhi.vw"}, - {"halide.hexagon.trunc_satub.vh", "halide.hexagon.pack_satub.vh"}, - {"halide.hexagon.trunc_sath.vw", "halide.hexagon.pack_sath.vw"}, - {"halide.hexagon.trunc_satuh.vw", "halide.hexagon.pack_satuh.vw"}, + static std::map> interleaving_alts = { + {"halide.hexagon.trunc.vh", {HvxTarget::v62orLater, "halide.hexagon.pack.vh"}}, + {"halide.hexagon.trunc.vw", {HvxTarget::v62orLater, "halide.hexagon.pack.vw"}}, + {"halide.hexagon.trunclo.vh", {HvxTarget::v62orLater, "halide.hexagon.packhi.vh"}}, + {"halide.hexagon.trunclo.vw", {HvxTarget::v62orLater, "halide.hexagon.packhi.vw"}}, + {"halide.hexagon.trunc_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.pack_satub.vh"}}, + {"halide.hexagon.trunc_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_sath.vw"}}, + {"halide.hexagon.trunc_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_satuh.vw"}}, }; if (is_native_deinterleave(op) && yields_interleave(args[0])) { @@ -1738,7 +1746,8 @@ class EliminateInterleaves : public IRMutator { op->func, op->value_index, op->image, op->param); // Add the interleave back to the result of the call. return native_interleave(expr); - } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && + } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && hvx_target >= deinterleaving_alts[op->name].first && + yields_removable_interleave(args)) { // This call has a deinterleaving alternative, and the // arguments are interleaved, so we should use the @@ -1746,14 +1755,14 @@ class EliminateInterleaves : public IRMutator { for (Expr &i : args) { i = remove_interleave(i); } - return Call::make(op->type, deinterleaving_alts[op->name], args, op->call_type); - } else if (interleaving_alts.count(op->name) && is_native_deinterleave(args[0])) { + return Call::make(op->type, deinterleaving_alts[op->name].second, args, op->call_type); + } else if (interleaving_alts.count(op->name) && hvx_target >= interleaving_alts[op->name].first && is_native_deinterleave(args[0])) { // This is an interleaving alternative with a // deinterleave, which can be generated when we // deinterleave storage. Revert back to the interleaving // op so we can remove the deinterleave. Expr arg = args[0].as()->args[0]; - return Call::make(op->type, interleaving_alts[op->name], {arg}, op->call_type, + return Call::make(op->type, interleaving_alts[op->name].second, {arg}, op->call_type, op->func, op->value_index, op->image, op->param); } else if (changed) { return Call::make(op->type, op->name, args, op->call_type, @@ -1896,8 +1905,15 @@ class EliminateInterleaves : public IRMutator { using IRMutator::visit; public: - EliminateInterleaves(int native_vector_bytes) + EliminateInterleaves(const Target &t, int native_vector_bytes) : native_vector_bits(native_vector_bytes * 8), alignment_analyzer(native_vector_bytes) { + if (t.features_any_of({Target::HVX_v65})) { + hvx_target = HvxTarget::v65orLater; + } else if (t.features_any_of({Target::HVX_v66})) { + hvx_target = HvxTarget::v66orLater; + } else { + hvx_target = HvxTarget::v62orLater; + } } }; @@ -2233,7 +2249,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { << s << "\n"; // Try to eliminate any redundant interleave/deinterleave pairs. 
- s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s); + s = EliminateInterleaves(t, t.natural_vector_size(Int(8))).mutate(s); debug(4) << "Hexagon: Lowering after EliminateInterleaves\n" << s << "\n"; @@ -2246,4 +2262,4 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { } } // namespace Internal -} // namespace Halide +} // namespace Halide \ No newline at end of file From 8cc4f02c94184da567dd5b653ca377bd3523c5ae Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 8 Mar 2024 02:13:56 +0000 Subject: [PATCH 084/186] Fix for top-of-tree LLVM (#8145) --- src/CodeGen_Internal.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 78fc4224fb61..697b9200fa33 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -610,7 +610,11 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options options.UseInitArray = true; options.FloatABIType = use_soft_float_abi ? llvm::FloatABI::Soft : llvm::FloatABI::Hard; +#if LLVM_VERSION >= 190 + options.MCOptions.X86RelaxRelocations = false; +#else options.RelaxELFRelocations = false; +#endif options.MCOptions.ABIName = mabi; } From 009fe7a15ffd6707ce15bc380e41ad66968d9bfa Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 8 Mar 2024 08:50:20 -0800 Subject: [PATCH 085/186] Handle loads of broadcasts in FlattenNestedRamps (#8139) With sufficiently perverse schedules, it's possible to end up with a load of a broadcast index (rather than a broadcast of a scalar load). This made FlattenNestedRamps divide by zero. Unfortunately this happened in a complex production pipeline, so I'm not entirely sure how to reproduce it. For that pipeline, this change fixes it and produces correct output. --- src/FlattenNestedRamps.cpp | 42 +++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index f48bd75c37a2..92bcf3870d5d 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -81,19 +81,19 @@ class FlattenRamps : public IRMutator { // If they are, we'll have a full vector of const_indices if ((int)const_indices.size() == lanes) { - // Compute the stride for the underlying strided load - int stride = 0; - for (int c : const_indices) { - stride = (int)gcd(stride, c); - } - for (int &c : const_indices) { - c /= stride; + int stride = 0, extent = 1; + if (max_constant_offset > 0) { + for (int c : const_indices) { + stride = (int)gcd(stride, c); + } + for (int &c : const_indices) { + c /= stride; + } + // Compute the number of elements loaded + extent = (int)((max_constant_offset / stride) + 1); } - // Compute the number of elements loaded - int extent = (int)((max_constant_offset / stride) + 1); - // If we're gathering from a very large range, it // might be better to just do the gather rather than // doing a big dense load and then shuffling. We @@ -105,12 +105,22 @@ class FlattenRamps : public IRMutator { // in the schedule somehow. const int max_unused_lane_factor = 4; if (extent < max_unused_lane_factor * lanes) { - Expr dense_index = Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); - Expr dense_load = - Load::make(op->type.with_lanes(extent), op->name, dense_index, - op->image, op->param, - const_true(extent), ModulusRemainder{}); - return Shuffle::make({dense_load}, const_indices); + if (max_constant_offset == 0) { + // It's a load of a broadcast. 
Convert it to a broadcast of a load + Expr load = Load::make(op->type.element_of(), op->name, min_lane, + op->image, op->param, + const_true(), ModulusRemainder{}); + return Broadcast::make(load, lanes); + } else { + // Turn it into a dense load and a shuffle + Expr dense_index = + Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); + Expr dense_load = + Load::make(op->type.with_lanes(extent), op->name, dense_index, + op->image, op->param, + const_true(extent), ModulusRemainder{}); + return Shuffle::make({dense_load}, const_indices); + } } } } From 3c2d8099451521d9f1e1eb3632b31b2d7bc29310 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Mon, 11 Mar 2024 17:05:44 -0700 Subject: [PATCH 086/186] Use python itself to get the extension suffix, not python-config (#8148) * Use python itself to get the extension suffix, not python-config * Add a comment --- apps/onnx/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/onnx/Makefile b/apps/onnx/Makefile index f714b0254b75..5188c1c85068 100644 --- a/apps/onnx/Makefile +++ b/apps/onnx/Makefile @@ -90,7 +90,12 @@ ifeq ($(UNAME), Darwin) # Keep OS X builds from complaining about missing libpython symbols PYBIND11_CFLAGS += -undefined dynamic_lookup endif -PY_EXT = $(shell $(PYTHON)-config --extension-suffix) +# Get the python extension module suffix from python itself. You can +# also do this with python-config, but that's not resistant to version +# mismatches between python and python-config. This can happen when +# using a virtualenv, because virtualenvs override python, but not +# python-config. +PY_EXT = $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_config_var("EXT_SUFFIX"))') PY_MODEL_EXT = model_cpp$(PY_EXT) PYCXXFLAGS = $(PYBIND11_CFLAGS) $(CXXFLAGS) -Wno-deprecated-register From bf0d61149dde511f39b950689c2a08af7078e88b Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 Mar 2024 09:49:26 -0700 Subject: [PATCH 087/186] Rewrite the pass that adds mutexes for atomic nodes (#8105) * Avoid redundant scope lookups This pattern has been bugging me for a long time: ``` if (scope.contains(key)) { Foo f = scope.get(key); } ``` This redundantly looks up the key in the scope twice. I've finally gotten around to fixing it. I've introduced a find method that either returns a const pointer to the value, if it exists, or null. It also searches any containing scopes, which are held by const pointer, so the method has to return a const pointer. ``` if (const Foo *f = scope.find(key)) { } ``` For cases where you want to get and then mutate, I added shallow_find, which doesn't search enclosing scopes, but returns a mutable pointer. We were also doing redundant scope lookups in ScopedBinding. We stored the key in the helper object, and then did a pop on that key in the ScopedBinding destructor. This commit changes Scope so that Scope::push returns an opaque token that you can pass to Scope::pop to have it remove that element without doing a fresh lookup. ScopedBinding now uses this. Under the hood it's just an iterator on the underlying map (map iterators are not invalidated on inserting or removing other stuff). The net effect is to speed up local laplacian lowering by about 5% I also considered making it look more like an stl class, and having find return an iterator, but it doesn't really work. The iterator it returns might point to an entry in an enclosing scope, in which case you can't compare it to the .end() method of the scope you have. 
Scopes are different enough from maps that the interface really needs to be distinct. * Pacify clang-tidy * Rewrite the pass that injects mutexes to support atomics For O(n) nested allocate nodes, this pass was quadratic in n, even if there was no use of atomics. This commit rewrites it to use a linear-time algorithm, and skips it entirely after the first validation pass if there aren't any atomic nodes. It also needlessly used IRGraphMutators, which slowed things down, didn't handle LargeBuffers (could overflow in the allocation), incorrectly thought every producer/consumer node was associated with an output buffer, and didn't print the realization name when printing the atomic node (the body of an atomic node is only atomic w.r.t. a specific realization). I noticed all this because it stuck out in a profile. For resnet 50, the rewrite that changed to a linear algorithm took this stage from 185ms down to 6.7ms, and then skipping it entirely when it doesn't find any atomic nodes added 1.5 for the single IRVisitor check. For local laplacian with 100 pyramid levels (which contains many nested allocate nodes due to a large number of skip connections), the times are 5846 ms -> 16 ms -> 4.6 ms This is built on top of #8103 * Fix unintentional mutation of interval in scope --------- Co-authored-by: Steven Johnson --- src/AddAtomicMutex.cpp | 216 ++++++++++++++------------- src/AddAtomicMutex.h | 2 +- src/IRPrinter.cpp | 9 +- src/Lower.cpp | 2 +- src/runtime/HalideRuntime.h | 2 +- src/runtime/fake_thread_pool.cpp | 2 +- src/runtime/synchronization_common.h | 2 +- 7 files changed, 119 insertions(+), 116 deletions(-) diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index a2bf990e38f6..cf3b0ae8bb89 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -1,5 +1,4 @@ #include "AddAtomicMutex.h" - #include "ExprUsesVar.h" #include "Func.h" #include "IREquality.h" @@ -11,14 +10,10 @@ namespace Halide { namespace Internal { -using std::map; -using std::set; -using std::string; - namespace { /** Collect names of all stores matching the producer name inside a statement. */ -class CollectProducerStoreNames : public IRGraphVisitor { +class CollectProducerStoreNames : public IRVisitor { public: CollectProducerStoreNames(const std::string &producer_name) : producer_name(producer_name) { @@ -27,12 +22,12 @@ class CollectProducerStoreNames : public IRGraphVisitor { Scope store_names; protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { - // This is a Store for the desginated Producer. + // This is a Store for the designated Producer. store_names.push(op->name); } } @@ -42,7 +37,7 @@ class CollectProducerStoreNames : public IRGraphVisitor { /** Find Store inside of an Atomic node for the designated producer * and return their indices. */ -class FindProducerStoreIndex : public IRGraphVisitor { +class FindProducerStoreIndex : public IRVisitor { public: FindProducerStoreIndex(const std::string &producer_name) : producer_name(producer_name) { @@ -51,11 +46,11 @@ class FindProducerStoreIndex : public IRGraphVisitor { Expr index; // The returned index. protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; // Need to also extract the let bindings of a Store index. void visit(const Let *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. 
+ IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -63,7 +58,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } } void visit(const LetStmt *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -72,7 +67,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { // This is a Store for the designated producer. @@ -94,11 +89,13 @@ class FindProducerStoreIndex : public IRGraphVisitor { /** Throws an assertion for cases where the indexing on left-hand-side of * an atomic update references to itself. * e.g. f(clamp(f(r), 0, 100)) = f(r) + 1 should be rejected. */ -class CheckAtomicValidity : public IRGraphVisitor { +class CheckAtomicValidity : public IRVisitor { protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Atomic *op) override { + any_atomic = true; + // Collect the names of all Store nodes inside. CollectProducerStoreNames collector(op->producer_name); op->body.accept(&collector); @@ -115,13 +112,16 @@ class CheckAtomicValidity : public IRGraphVisitor { } op->body.accept(this); } + +public: + bool any_atomic = false; }; /** Search if the value of a Store node has a variable pointing to a let binding, * where the let binding contains the Store location. Use for checking whether * we need a mutex lock for Atomic since some lowering pass before lifted a let * binding from the Store node (currently only SplitTuple would do this). */ -class FindAtomicLetBindings : public IRGraphVisitor { +class FindAtomicLetBindings : public IRVisitor { public: FindAtomicLetBindings(const Scope &store_names) : store_names(store_names) { @@ -133,18 +133,18 @@ class FindAtomicLetBindings : public IRGraphVisitor { using IRVisitor::visit; void visit(const Let *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } void visit(const LetStmt *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } @@ -159,19 +159,19 @@ class FindAtomicLetBindings : public IRGraphVisitor { } void visit(const Store *op) override { - include(op->predicate); + op->predicate.accept(this); + op->index.accept(this); if (store_names.contains(op->name)) { // If we are in a designated store and op->value has a let binding // that uses one of the store_names, we found a lifted let. - ScopedValue old_inside_store(inside_store, op->name); - include(op->value); + ScopedValue old_inside_store(inside_store, op->name); + op->value.accept(this); } else { - include(op->value); + op->value.accept(this); } - include(op->index); } - string inside_store; + std::string inside_store; const Scope &store_names; Scope let_bindings; }; @@ -179,7 +179,7 @@ class FindAtomicLetBindings : public IRGraphVisitor { /** Clear out the Atomic node's mutex usages if it doesn't need one. 
*/ class RemoveUnnecessaryMutexUse : public IRMutator { public: - set remove_mutex_lock_names; + std::set remove_mutex_lock_names; protected: using IRMutator::visit; @@ -200,30 +200,30 @@ class RemoveUnnecessaryMutexUse : public IRMutator { remove_mutex_lock_names.insert(op->mutex_name); Stmt body = mutate(op->body); return Atomic::make(op->producer_name, - string(), + std::string{}, std::move(body)); } } }; /** Find Store inside an Atomic that matches the provided store_names. */ -class FindStoreInAtomicMutex : public IRGraphVisitor { +class FindStoreInAtomicMutex : public IRVisitor { public: - using IRGraphVisitor::visit; + using IRVisitor::visit; FindStoreInAtomicMutex(const std::set &store_names) : store_names(store_names) { } bool found = false; - string producer_name; - string mutex_name; + std::string producer_name; + std::string mutex_name; protected: void visit(const Atomic *op) override { if (!found && !op->mutex_name.empty()) { ScopedValue old_in_atomic_mutex(in_atomic_mutex, true); - include(op->body); + op->body.accept(this); if (found) { // We found a Store inside Atomic with matching name, // record the mutex information. @@ -231,7 +231,7 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { mutex_name = op->mutex_name; } } else { - include(op->body); + op->body.accept(this); } } @@ -241,11 +241,11 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { found = true; } } - IRGraphVisitor::visit(op); + IRVisitor::visit(op); } bool in_atomic_mutex = false; - const set &store_names; + const std::set &store_names; }; /** Replace the indices in the Store nodes with the specified variable. */ @@ -276,26 +276,32 @@ class ReplaceStoreIndexWithVar : public IRMutator { /** Add mutex allocation & lock & unlock if required. */ class AddAtomicMutex : public IRMutator { public: - AddAtomicMutex(const map &env) - : env(env) { + AddAtomicMutex(const std::vector &o) { + for (const Function &f : o) { + outputs.emplace(f.name(), f); + } } protected: using IRMutator::visit; - const map &env; - // The set of producers that have allocated a mutex buffer - set allocated_mutexes; + // Maps from a producer name to a mutex name, for all encountered atomic + // nodes. + Scope needs_mutex_allocation; - Stmt allocate_mutex(const string &mutex_name, Expr extent, Stmt body) { + // Pipeline outputs + std::map outputs; + + Stmt allocate_mutex(const std::string &mutex_name, Expr extent, Stmt body) { Expr mutex_array = Call::make(type_of(), "halide_mutex_array_create", {std::move(extent)}, Call::Extern); + // Allocate a scalar of halide_mutex_array. // This generates halide_mutex_array mutex[1]; body = Allocate::make(mutex_name, - Handle(), + type_of(), MemoryType::Stack, {}, const_true(), @@ -309,37 +315,44 @@ class AddAtomicMutex : public IRMutator { // If this Allocate node is allocating a buffer for a producer, // and there is a Store node inside of an Atomic node requiring mutex lock // matching the name of the Allocate, allocate a mutex lock. - set store_names{op->name}; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. 
- return IRMutator::visit(op); + Stmt body = mutate(op->body); + + std::string producer_name; + if (ends_with(op->name, ".0")) { + producer_name = op->name.substr(0, op->name.size() - 2); + } else { + producer_name = op->name; } - allocated_mutexes.insert(finder.mutex_name); + if (const std::string *mutex_name = needs_mutex_allocation.find(producer_name)) { + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (const Expr &e : op->extents) { + extent = extent * e; + } - const string &mutex_name = finder.mutex_name; - Stmt body = mutate(op->body); - Expr extent = Expr(1); - for (const Expr &e : op->extents) { - extent = extent * e; + body = allocate_mutex(*mutex_name, extent, body); + + // At this stage in lowering it should be impossible to have an + // allocation that shadows the name of an outer allocation, but may as + // well handle it anyway by using a scope and popping at each allocate + // node. + needs_mutex_allocation.pop(producer_name); + } + + if (body.same_as(op->body)) { + return op; + } else { + return Allocate::make(op->name, + op->type, + op->memory_type, + op->extents, + op->condition, + std::move(body), + op->new_expr, + op->free_function, + op->padding); } - body = allocate_mutex(mutex_name, extent, body); - return Allocate::make(op->name, - op->type, - op->memory_type, - op->extents, - op->condition, - std::move(body), - op->new_expr, - op->free_function, - op->padding); } Stmt visit(const ProducerConsumer *op) override { @@ -348,50 +361,35 @@ class AddAtomicMutex : public IRMutator { // buffer at the producer node. if (!op->is_producer) { - // This is a consumer. + // This is a consumer return IRMutator::visit(op); } - // Find the corresponding output. - auto func_it = env.find(op->name); - if (func_it == env.end()) { - // Not an output. - return IRMutator::visit(op); - } - Func f = Func(func_it->second); - if (f.output_buffers().empty()) { - // Not an output. + auto it = outputs.find(op->name); + if (it == outputs.end()) { + // Not an output return IRMutator::visit(op); } - set store_names; - for (const auto &buffer : f.output_buffers()) { - store_names.insert(buffer.name()); - } + Function f = it->second; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } + Stmt body = mutate(op->body); - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + if (const std::string *mutex_name = needs_mutex_allocation.find(it->first)) { + // All output buffers in a Tuple have the same extent. + OutputImageParam output_buffer = Func(f).output_buffers()[0]; + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (int i = 0; i < output_buffer.dimensions(); i++) { + extent *= output_buffer.dim(i).extent(); + } + body = allocate_mutex(*mutex_name, extent, body); } - allocated_mutexes.insert(finder.mutex_name); - - // We assume all output buffers in a Tuple have the same extent. 
- OutputImageParam output_buffer = f.output_buffers()[0]; - Expr extent = Expr(1); - for (int i = 0; i < output_buffer.dimensions(); i++) { - extent = extent * output_buffer.dim(i).extent(); + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } - Stmt body = mutate(op->body); - body = allocate_mutex(finder.mutex_name, extent, body); - return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } Stmt visit(const Atomic *op) override { @@ -414,7 +412,7 @@ class AddAtomicMutex : public IRMutator { // Lift the index outside of the atomic node. // This is for avoiding side-effects inside those expressions // being evaluated twice. - string name = unique_name('t'); + std::string name = unique_name('t'); index_let = index; index = Variable::make(index.type(), name); body = ReplaceStoreIndexWithVar(op->producer_name, index).mutate(body); @@ -444,17 +442,21 @@ class AddAtomicMutex : public IRMutator { internal_assert(index.as() != nullptr); ret = LetStmt::make(index.as()->name, index_let, ret); } + needs_mutex_allocation.push(op->producer_name, op->mutex_name); + return ret; } }; } // namespace -Stmt add_atomic_mutex(Stmt s, const map &env) { +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs) { CheckAtomicValidity check; s.accept(&check); - s = RemoveUnnecessaryMutexUse().mutate(s); - s = AddAtomicMutex(env).mutate(s); + if (check.any_atomic) { + s = RemoveUnnecessaryMutexUse().mutate(s); + s = AddAtomicMutex(outputs).mutate(s); + } return s; } diff --git a/src/AddAtomicMutex.h b/src/AddAtomicMutex.h index c27b0346f349..5b11de621e97 100644 --- a/src/AddAtomicMutex.h +++ b/src/AddAtomicMutex.h @@ -23,7 +23,7 @@ namespace Internal { class Function; -Stmt add_atomic_mutex(Stmt s, const std::map &env); +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs); } // namespace Internal } // namespace Halide diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index bc03dd124d9a..a186be1874d7 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1112,11 +1112,12 @@ void IRPrinter::visit(const VectorReduce *op) { void IRPrinter::visit(const Atomic *op) { if (op->mutex_name.empty()) { - stream << get_indent() << "atomic {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ") {\n"; } else { - stream << get_indent() << "atomic ("; - stream << op->mutex_name; - stream << ") {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ", " + << op->mutex_name << ") {\n"; } indent += 2; print(op->body); diff --git a/src/Lower.cpp b/src/Lower.cpp index 3b357eb3061e..e39d55a65b9f 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -299,7 +299,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after storage flattening:", s); debug(1) << "Adding atomic mutex allocation...\n"; - s = add_atomic_mutex(s, env); + s = add_atomic_mutex(s, outputs); log("Lowering after adding atomic mutex allocation:", s); debug(1) << "Unpacking buffer arguments...\n"; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 62fbaeb66d43..1a19202745bb 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -195,7 +195,7 @@ extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mute /** Functions for constructing/destroying/locking/unlocking arrays of mutexes. 
*/ struct halide_mutex_array; //@{ -extern struct halide_mutex_array *halide_mutex_array_create(int sz); +extern struct halide_mutex_array *halide_mutex_array_create(uint64_t sz); extern void halide_mutex_array_destroy(void *user_context, void *array); extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); diff --git a/src/runtime/fake_thread_pool.cpp b/src/runtime/fake_thread_pool.cpp index 9c3cfddc5a47..531a16d1312e 100644 --- a/src/runtime/fake_thread_pool.cpp +++ b/src/runtime/fake_thread_pool.cpp @@ -96,7 +96,7 @@ WEAK void halide_mutex_unlock(halide_mutex *mutex) { // (e.g. correctness/multiple_scatter). Since we don't have threads, we don't // need to mutex to do anything, but returning a null would trigger an error // condition that would be misrepoted as out-of-memory. -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { return &halide_fake_mutex_array; } diff --git a/src/runtime/synchronization_common.h b/src/runtime/synchronization_common.h index cb244f360eeb..778c423e4046 100644 --- a/src/runtime/synchronization_common.h +++ b/src/runtime/synchronization_common.h @@ -908,7 +908,7 @@ struct halide_mutex_array { struct halide_mutex *array; }; -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { // TODO: If sz is huge, we should probably hash it down to something smaller // in the accessors below. Check for deadlocks before doing so. halide_mutex_array *array = (halide_mutex_array *)halide_malloc( From 4988ab5467b612bb6ce29914e5baf8bf70596ccb Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Wed, 13 Mar 2024 00:58:14 +0100 Subject: [PATCH 088/186] Feature: mark a Func as no_profiling, to prevent injection of profiling. (2nd implementation) (#8143) * Small feature to allow you to specify that a (typically small inner loop) Func should not be profiled. * Simplified the tuple name handling. * Optimize tuple name normalization in Profiling.cpp * Clang-format * Feedback on Function already being a pointer. Bump the Patch version of the serialization. 
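For illustration only, here is a minimal sketch of how a schedule might use the
new method (the ImageParam, Funcs and Vars below are hypothetical examples and
are not part of this change):

```
// Sketch, assuming Halide.h is included and we are using namespace Halide.
// "offset" does so little work per invocation that sampling it separately
// would mostly measure profiler overhead, so we opt it out; its time is then
// attributed to the enclosing profiled Func ("blur_y").
ImageParam input(UInt(8), 2);
Func offset("offset"), blur_y("blur_y");
Var x("x"), y("y");
offset(x, y) = input(x, y) + cast<uint8_t>(1);
blur_y(x, y) = cast<uint8_t>((cast<uint16_t>(offset(x, y)) + offset(x, y + 1)) / 2);
offset.compute_at(blur_y, y).no_profiling();  // exclude this Func from profiling
blur_y.compile_jit(get_jit_target_from_environment().with_feature(Target::Profile));
```

The intended effect is only that "offset" gets no entry of its own in the
profiler report; lowering is otherwise unchanged.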
--- src/Deserialization.cpp | 4 ++- src/Func.cpp | 5 +++ src/Func.h | 9 +++++ src/Function.cpp | 19 ++++++++--- src/Function.h | 7 ++++ src/Lower.cpp | 2 +- src/Profiling.cpp | 74 +++++++++++++++++++++++++++++++---------- src/Profiling.h | 5 ++- src/Serialization.cpp | 5 ++- src/halide_ir.fbs | 3 +- 10 files changed, 107 insertions(+), 26 deletions(-) diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 551acfcdebf2..0a1403362621 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -504,12 +504,14 @@ void Deserializer::deserialize_function(const Serialize::Func *function, Functio const std::vector trace_tags = deserialize_vector(function->trace_tags(), &Deserializer::deserialize_string); + const bool no_profiling = function->no_profiling(); const bool frozen = function->frozen(); hl_function.update_with_deserialization(name, origin_name, output_types, required_types, required_dim, args, func_schedule, init_def, updates, debug_file, output_buffers, extern_arguments, extern_function_name, name_mangling, extern_function_device_api, extern_proxy_expr, - trace_loads, trace_stores, trace_realizations, trace_tags, frozen); + trace_loads, trace_stores, trace_realizations, trace_tags, + no_profiling, frozen); } Stmt Deserializer::deserialize_stmt(Serialize::Stmt type_code, const void *stmt) { diff --git a/src/Func.cpp b/src/Func.cpp index 7e0995cc33b5..1f480c99983c 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -3037,6 +3037,11 @@ Func &Func::add_trace_tag(const std::string &trace_tag) { return *this; } +Func &Func::no_profiling() { + func.do_not_profile(); + return *this; +} + void Func::debug_to_file(const string &filename) { invalidate_cache(); func.debug_file() = filename; diff --git a/src/Func.h b/src/Func.h index d4074ee18cc6..bced13f79481 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2559,6 +2559,15 @@ class Func { */ Func &add_trace_tag(const std::string &trace_tag); + /** Marks this function as a function that should not be profiled + * when using the target feature Profile or ProfileByTimer. + * This is useful when this function is does too little work at once + * such that the overhead of setting the profiling token might + * become significant, or that the measured time is not representative + * due to modern processors (instruction level parallelism, out-of-order + * execution). */ + Func &no_profiling(); + /** Get a handle on the internal halide function that this Func * represents. 
Useful if you want to do introspection on Halide * functions */ diff --git a/src/Function.cpp b/src/Function.cpp index 795d18136843..cbb4b61574d4 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -110,6 +110,8 @@ struct FunctionContents { bool trace_loads = false, trace_stores = false, trace_realizations = false; std::vector trace_tags; + bool no_profiling = false; + bool frozen = false; void accept(IRVisitor *visitor) const { @@ -352,6 +354,7 @@ void Function::update_with_deserialization(const std::string &name, bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen) { contents->name = name; contents->origin_name = origin_name; @@ -373,6 +376,7 @@ void Function::update_with_deserialization(const std::string &name, contents->trace_stores = trace_stores; contents->trace_realizations = trace_realizations; contents->trace_tags = trace_tags; + contents->no_profiling = no_profiling; contents->frozen = frozen; } @@ -509,6 +513,7 @@ void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const copy->trace_stores = contents->trace_stores; copy->trace_realizations = contents->trace_realizations; copy->trace_tags = contents->trace_tags; + copy->no_profiling = contents->no_profiling; copy->frozen = contents->frozen; copy->output_buffers = contents->output_buffers; copy->func_schedule = contents->func_schedule.deep_copy(copied_map); @@ -1139,10 +1144,6 @@ const std::vector &Function::get_trace_tags() const { return contents->trace_tags; } -void Function::freeze() { - contents->frozen = true; -} - void Function::lock_loop_levels() { auto &schedule = contents->func_schedule; schedule.compute_level().lock(); @@ -1166,6 +1167,16 @@ void Function::lock_loop_levels() { } } +void Function::do_not_profile() { + contents->no_profiling = true; +} +bool Function::should_not_profile() const { + return contents->no_profiling; +} + +void Function::freeze() { + contents->frozen = true; +} bool Function::frozen() const { return contents->frozen; } diff --git a/src/Function.h b/src/Function.h index 66b62a01f66b..49f68805d61e 100644 --- a/src/Function.h +++ b/src/Function.h @@ -88,6 +88,7 @@ class Function { bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen); /** Get a handle on the halide function contents that this Function @@ -290,6 +291,12 @@ class Function { * cannot be mutated further. */ void lock_loop_levels(); + /** Mark the function as too small for meaningful profiling. */ + void do_not_profile(); + + /** Check if the function is marked as one that should not be profiled. */ + bool should_not_profile() const; + /** Mark function as frozen, which means it cannot accept new * definitions. 
*/ void freeze(); diff --git a/src/Lower.cpp b/src/Lower.cpp index e39d55a65b9f..79d02323b3bf 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -408,7 +408,7 @@ void lower_impl(const vector &output_funcs, if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { debug(1) << "Injecting profiling...\n"; - s = inject_profiling(s, pipeline_name); + s = inject_profiling(s, pipeline_name, env); log("Lowering after injecting profiling:", s); } diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 2be058b3c8a6..414578299df6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -3,7 +3,7 @@ #include #include "CodeGen_Internal.h" -#include "ExprUsesVar.h" +#include "Function.h" #include "IRMutator.h" #include "IROperator.h" #include "InjectHostDevBufferCopies.h" @@ -71,13 +71,14 @@ class InjectProfiling : public IRMutator { vector stack; // What produce nodes are we currently inside of. string pipeline_name; + const map &env; bool in_fork = false; bool in_parallel = false; bool in_leaf_task = false; - InjectProfiling(const string &pipeline_name) - : pipeline_name(pipeline_name) { + InjectProfiling(const string &pipeline_name, const map &env) + : pipeline_name(pipeline_name), env(env) { stack.push_back(get_func_id("overhead")); // ID 0 is treated specially in the runtime as overhead internal_assert(stack.back() == 0); @@ -119,10 +120,28 @@ class InjectProfiling : public IRMutator { bool profiling_memory = true; // Strip down the tuple name, e.g. f.0 into f - string normalize_name(const string &name) { - vector v = split_string(name, "."); - internal_assert(!v.empty()); - return v[0]; + string normalize_name(const string &name) const { + size_t idx = name.find('.'); + if (idx != std::string::npos) { + internal_assert(idx != 0); + return name.substr(0, idx); + } else { + return name; + } + } + + Function lookup_function(const string &name) const { + auto it = env.find(name); + if (it != env.end()) { + return it->second; + } + string norm_name = normalize_name(name); + it = env.find(norm_name); + if (it != env.end()) { + return it->second; + } + internal_error << "No function in the environment found for name '" << name << "'.\n"; + return {}; } int get_func_id(const string &name) { @@ -185,7 +204,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Allocate *op) override { - int idx = get_func_id(op->name); auto [new_extents, changed] = mutate_with_changes(op->extents); Expr condition = mutate(op->condition); @@ -199,6 +217,13 @@ class InjectProfiling : public IRMutator { // always conditionally false. remove_dead_allocations() is called after // inject_profiling() so this is a possible scenario. if (!is_const_zero(size) && on_stack) { + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. 
+ } else { + idx = get_func_id(op->name); + } const uint64_t *int_size = as_const_uint(size); internal_assert(int_size != nullptr); // Stack size is always a const int func_stack_current[idx] += *int_size; @@ -212,6 +237,7 @@ class InjectProfiling : public IRMutator { vector tasks; bool track_heap_allocation = !is_const_zero(size) && !on_stack && profiling_memory; if (track_heap_allocation) { + int idx = get_func_id(op->name); debug(3) << " Allocation on heap: " << op->name << "(" << size << ") in pipeline " << pipeline_name << "\n"; @@ -245,8 +271,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Free *op) override { - int idx = get_func_id(op->name); - AllocSize alloc = func_alloc_sizes.get(op->name); internal_assert(alloc.size.type() == UInt(64)); func_alloc_sizes.pop(op->name); @@ -256,6 +280,7 @@ class InjectProfiling : public IRMutator { if (!is_const_zero(alloc.size)) { if (!alloc.on_stack) { if (profiling_memory) { + int idx = get_func_id(op->name); debug(3) << " Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n"; vector tasks{ @@ -271,6 +296,13 @@ class InjectProfiling : public IRMutator { const uint64_t *int_size = as_const_uint(alloc.size); internal_assert(int_size != nullptr); + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. + } else { + idx = get_func_id(op->name); + } func_stack_current[idx] -= *int_size; debug(3) << " Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n"; @@ -283,11 +315,19 @@ class InjectProfiling : public IRMutator { int idx; Stmt body; if (op->is_producer) { - idx = get_func_id(op->name); - stack.push_back(idx); - Stmt set_current = set_current_func(idx); - body = Block::make(set_current, mutate(op->body)); - stack.pop_back(); + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + body = mutate(op->body); + if (body.same_as(op->body)) { + return op; + } + } else { + idx = get_func_id(op->name); + stack.push_back(idx); + Stmt set_current = set_current_func(idx); + body = Block::make(set_current, mutate(op->body)); + stack.pop_back(); + } } else { // At the beginning of the consume step, set the current task // back to the outer one. @@ -498,8 +538,8 @@ class InjectProfiling : public IRMutator { } // namespace -Stmt inject_profiling(Stmt s, const string &pipeline_name) { - InjectProfiling profiling(pipeline_name); +Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map &env) { + InjectProfiling profiling(pipeline_name, env); s = profiling.mutate(s); int num_funcs = (int)(profiling.indices.size()); diff --git a/src/Profiling.h b/src/Profiling.h index a6040b9160af..afaa47fe6d6e 100644 --- a/src/Profiling.h +++ b/src/Profiling.h @@ -23,6 +23,7 @@ * mandelbrot: 0.006444ms (10%) peak: 505344 num: 104000 avg: 5376 * argmin: 0.027715ms (46%) stack: 20 */ +#include #include #include "Expr.h" @@ -30,6 +31,8 @@ namespace Halide { namespace Internal { +class Function; + /** Take a statement representing a halide pipeline insert * high-resolution timing into the generated code (via spawning a * thread that acts as a sampling profiler); summaries of execution @@ -37,7 +40,7 @@ namespace Internal { * storage flattening, but after all bounds inference. 
* */ -Stmt inject_profiling(Stmt, const std::string &); +Stmt inject_profiling(Stmt, const std::string &, const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 144d79af7e5e..c1cb3a6d1193 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1029,6 +1029,7 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde for (const auto &tag : function.get_trace_tags()) { trace_tags_serialized.push_back(serialize_string(builder, tag)); } + const bool no_profiling = function.should_not_profile(); const bool frozen = function.frozen(); auto func = Serialize::CreateFunc(builder, name_serialized, @@ -1050,7 +1051,9 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde trace_loads, trace_stores, trace_realizations, - builder.CreateVector(trace_tags_serialized), frozen); + builder.CreateVector(trace_tags_serialized), + no_profiling, + frozen); return func; } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 01a987b6f430..efc465cbee82 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -15,7 +15,7 @@ enum SerializationVersionMinor: int { Value = 0 } enum SerializationVersionPatch: int { - Value = 0 + Value = 1 } // from src/IR.cpp @@ -713,6 +713,7 @@ table Func { trace_stores: bool = false; trace_realizations: bool = false; trace_tags: [string]; + no_profiling: bool = false; frozen: bool = false; } From 83616f20c49c6f8e97403acd0add3df41753adeb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 Mar 2024 17:00:49 -0700 Subject: [PATCH 089/186] Fix three nits (#8137) 1) has_gpu_feature already includes Vulkan, so there's no need to check for it. 2) Use emplace(...) instead of insert(make_pair(...)) 3) Fixed a place that should be using a ScopedValue --- src/BoundsInference.cpp | 6 +++--- src/Lower.cpp | 5 +---- src/StorageFlattening.cpp | 8 ++------ src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 5965303197bc..21ca06e07285 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1152,7 +1152,7 @@ class BoundsInference : public IRMutator { map stage_name_to_func; if (producing >= 0) { - fused_group.insert(make_pair(f.name(), stage_index)); + fused_group.emplace(f.name(), stage_index); } if (!no_pipelines && producing >= 0 && !f.has_extern_definition()) { @@ -1164,12 +1164,12 @@ class BoundsInference : public IRMutator { if (!((pair.func_1 == stages[producing].name) && ((int)pair.stage_1 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_1, pair.stage_1, var)) { - fused_group.insert(make_pair(pair.func_1, pair.stage_1)); + fused_group.emplace(pair.func_1, pair.stage_1); } if (!((pair.func_2 == stages[producing].name) && ((int)pair.stage_2 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_2, pair.stage_2, var)) { - fused_group.insert(make_pair(pair.func_2, pair.stage_2)); + fused_group.emplace(pair.func_2, pair.stage_2); } } diff --git a/src/Lower.cpp b/src/Lower.cpp index 79d02323b3bf..f092e2e711ef 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -280,10 +280,7 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // Vulkan relies on GPU var canonicalization occurring before - // storage flattening. 
- if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan)) { + if (t.has_gpu_feature()) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index ba4cc9b8acca..5860a7e50d0f 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -535,13 +535,9 @@ class FlattenDimensions : public IRMutator { Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); it->loop_vars.push(op->name, loop_bounds); } - bool old_in_gpu = in_gpu; - if (op->for_type == ForType::GPUBlock || - op->for_type == ForType::GPUThread) { - in_gpu = true; - } + + ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); - in_gpu = old_in_gpu; for (auto &p : hoisted_storages) { p.loop_vars.pop(op->name); diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 2ce325538a86..e3cc2ec5e825 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -1359,7 +1359,7 @@ Partitioner::Partitioner(const map &_pipeline_bounds, for (int s = 0; s < num_stages; s++) { FStage stg(f.second, s); Group g(stg, {stg}); - groups.insert(make_pair(stg, g)); + groups.emplace(stg, g); } } From f841a27b0c3f0b2b45c756908773c530d47f482c Mon Sep 17 00:00:00 2001 From: Volodymyr Kysenko Date: Thu, 14 Mar 2024 12:53:17 -0700 Subject: [PATCH 090/186] Bound allocation extents for hoist_storage using loop variables one-by-one (#8154) * Bound allocation extents using loop variable one-by-one * Use emplace_back --- src/StorageFlattening.cpp | 19 ++++++++++++++----- test/correctness/hoist_storage.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index 5860a7e50d0f..59278d50fe69 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -88,7 +88,7 @@ class FlattenDimensions : public IRMutator { struct HoistedStorageData { string name; vector hoisted_allocations; - Scope loop_vars; + vector> loop_vars; Scope scope; HoistedStorageData(const string &n) @@ -304,8 +304,17 @@ class FlattenDimensions : public IRMutator { } e = simplify(common_subexpression_elimination(e)); - Interval bounds = bounds_of_expr_in_scope(e, hoisted_storage_data.loop_vars); - return bounds.max; + // Find bounds of expression using the intervals of the loop variables. The loop variables may depend on + // the other loop variables, so we just call bounds_of_expr_in_scope for each loop variable separately + // in a reverse order. 
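+        // For example, with an outer loop y over [0, 9] and an inner loop x over
+        // [y, y + 3], bounding (x + 1) over x alone gives y + 4, and bounding
+        // that result over y gives the loop-independent upper bound 13.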
+ for (auto it = hoisted_storage_data.loop_vars.rbegin(); it != hoisted_storage_data.loop_vars.rend(); ++it) { + Scope one_loop_var; + one_loop_var.push(it->first, it->second); + Interval bounds = bounds_of_expr_in_scope(e, one_loop_var); + e = bounds.max; + } + + return e; }; vector bounded_extents; @@ -533,14 +542,14 @@ class FlattenDimensions : public IRMutator { expanded_min = simplify(expand_expr(expanded_min, it->scope)); expanded_extent = expand_expr(expanded_extent, it->scope); Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); - it->loop_vars.push(op->name, loop_bounds); + it->loop_vars.emplace_back(op->name, loop_bounds); } ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); for (auto &p : hoisted_storages) { - p.loop_vars.pop(op->name); + p.loop_vars.pop_back(); } return stmt; diff --git a/test/correctness/hoist_storage.cpp b/test/correctness/hoist_storage.cpp index ce98b421bc54..4e96dfee9f2d 100644 --- a/test/correctness/hoist_storage.cpp +++ b/test/correctness/hoist_storage.cpp @@ -604,6 +604,20 @@ int main(int argc, char **argv) { }); } + { + ImageParam input(UInt(8), 2); + Var x{"x"}, y{"y"}, yo{"yo"}, yi{"yi"}; + Func f[3]; + f[0] = BoundaryConditions::repeat_edge(input); + f[1](x, y) = ((f[0]((x / 2) + 2, (y / 2) + 2)) + (f[0](x + 1, y))); + f[2](x, y) = ((f[1](x * 2, (y * 2) + -2)) + (f[1](x + -1, y + -1))); + f[2].split(y, yo, yi, 16); + f[0].hoist_storage(f[2], yo).compute_at(f[1], x); + f[1].hoist_storage_root().compute_at(f[2], yi); + + f[2].compile_jit(); + } + printf("Success!\n"); return 0; } From 76a7dd4f7fb538deaf7c2ade56c02bc84e5221e8 Mon Sep 17 00:00:00 2001 From: Zalman Stern Date: Fri, 15 Mar 2024 13:01:51 -0700 Subject: [PATCH 091/186] Support for ARM SVE2. (#8051) * Checkpoint SVE2 restart. * Remove dead code. Add new test. * Update cmake for new file. * Checkpoint progress on SVE2. * Checkpoint ARM SVE2 support. Passes correctness_simd_op_check_sve2 test at 128 and 256 bits. * Remove an opportunity for RISC V codegen to change due to SVE2 support. * Ensure SVE intrinsics get vscale vectors and non-SVE ones get fixed vectors. Use proper prefix for neon intrinsics. Comment cleanups. * Checkpoint SVE2 work. Generally passes test, though using both NEON and SVE2 with simd_op_check_sve2 fails as both posibilities need to be allowed for 128-bit or smaller operations. * Remove an unfavored implementation possibility. * Fix opcode recognition in test to handle some cases that show up. Change name of test class to avoid confusion. * Formatting fixes. Replace internal_error with nop return for CodeGen_LLVM::match_vector_type_scalable called on scalar. * Formatting fix. * Limit SVE2 test to LLVM 19. Remove dead code. * Fix a degenerate case asking for zero sized vectors via a HAlide type with lanes of zero, which is not correct. * Fix confusion about Neon64/Neon128 and make it clear this is just the width multiplier applied to intrinsics. * REmove extraneous commented out line. * Address some review feedback. Mostly comment fixes. * Fix missed conflict resolution. * Fix some TODOs in SVE code. Move utility function to Util.h and common code the other obvious use. * Formatting. * Add missed refactor change. * Add issue to TODO comment. * Remove TODOs that don't seem necessary. * Add issue for TODO. * Add issue for TODO. * Remove dubious looking FP to int code that was ifdef'ed out. Doesn't look like a TODO is needed anymore. * Add issues for TODOs. 
* Update simd_op_check_sve2.cpp * Make a deep copy of each piece of test IR so that we can parallelize * Fix two clang-tidy warnings * Remove try/catch block from simd-op-check-sve2 * Don't try to run SVE2 code if vector_bits doesn't match host. * Add support for fcvtm/p, make scalars go through pattern matching too (#8151) * Don't do arm neon instruction selection on scalars This revealed a bug. FindIntrinsics was not enabled for scalars anyway, so it was semi-pointless. --------- Co-authored-by: Zalman Stern Co-authored-by: Steven Johnson Co-authored-by: Andrew Adams --- src/CodeGen_ARM.cpp | 1388 ++++++++++++++++++----- src/CodeGen_LLVM.cpp | 230 +++- src/CodeGen_LLVM.h | 7 + src/Function.cpp | 6 +- src/IR.cpp | 1 + src/IR.h | 2 + src/IRMatch.cpp | 3 + src/LLVM_Output.cpp | 6 + src/StorageFolding.cpp | 5 +- src/Util.h | 11 + src/WasmExecutor.cpp | 11 +- src/runtime/HalideRuntime.h | 6 +- src/runtime/aarch64.ll | 76 +- src/runtime/errors.cpp | 8 + src/runtime/posix_math.ll | 28 +- src/runtime/runtime_api.cpp | 1 + test/correctness/CMakeLists.txt | 1 + test/correctness/simd_op_check_arm.cpp | 7 + test/correctness/simd_op_check_sve2.cpp | 1387 ++++++++++++++++++++++ 19 files changed, 2836 insertions(+), 348 deletions(-) create mode 100644 test/correctness/simd_op_check_sve2.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 7852532183bf..d0538d6ccca8 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -105,17 +105,30 @@ class CodeGen_ARM : public CodeGen_Posix { CodeGen_ARM(const Target &); protected: + using codegen_func_t = std::function &)>; using CodeGen_Posix::visit; - /** Assuming 'inner' is a function that takes two vector arguments, define a wrapper that - * takes one vector argument and splits it into two to call inner. */ - llvm::Function *define_concat_args_wrapper(llvm::Function *inner, const string &name); + /** Similar to llvm_type_of, but allows providing a VectorTypeConstraint to + * force Fixed or VScale vector results. */ + llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint); + + /** Define a wrapper LLVM func that takes some arguments which Halide defines + * and call inner LLVM intrinsic with an additional argument which LLVM requires. 
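+     * For SVE intrinsics this is typically the governing predicate (see the
+     * SveNoPredicate flag), or a fallback value for the inactive lanes
+     * (SveInactiveArg).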
*/ + llvm::Function *define_intrin_wrapper(const std::string &inner_name, + const Type &ret_type, + const std::string &mangled_name, + const std::vector &arg_types, + int intrinsic_flags, + bool sve_intrinsic); void init_module() override; void compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) override; - /** Nodes for which we want to emit specific neon intrinsics */ + void begin_func(LinkageType linkage, const std::string &simple_name, + const std::string &extern_name, const std::vector &args) override; + + /** Nodes for which we want to emit specific ARM vector intrinsics */ // @{ void visit(const Cast *) override; void visit(const Add *) override; @@ -125,15 +138,25 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Store *) override; void visit(const Load *) override; void visit(const Shuffle *) override; + void visit(const Ramp *) override; void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; void codegen_vector_reduce(const VectorReduce *, const Expr &) override; + bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &); + bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &); + bool codegen_across_vector_reduce(const VectorReduce *, const Expr &); // @} Type upgrade_type_for_arithmetic(const Type &t) const override; Type upgrade_type_for_argument_passing(const Type &t) const override; Type upgrade_type_for_storage(const Type &t) const override; + /** Helper function to perform codegen of vector operation in a way that + * total_lanes are divided into slices, codegen is performed for each slice + * and results are concatenated into total_lanes. + */ + Value *codegen_with_lanes(int slice_lanes, int total_lanes, const std::vector &args, codegen_func_t &cg_func); + /** Various patterns to peephole match against */ struct Pattern { string intrin; ///< Name of the intrinsic @@ -150,10 +173,12 @@ class CodeGen_ARM : public CodeGen_Posix { string mattrs() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; + int target_vscale() const override; // NEON can be disabled for older processors. - bool neon_intrinsics_disabled() { - return target.has_feature(Target::NoNEON); + bool simd_intrinsics_disabled() { + return target.has_feature(Target::NoNEON) && + !target.has_feature(Target::SVE2); } bool is_float16_and_has_feature(const Type &t) const { @@ -161,11 +186,28 @@ class CodeGen_ARM : public CodeGen_Posix { return t.code() == Type::Float && t.bits() == 16 && target.has_feature(Target::ARMFp16); } bool supports_call_as_float16(const Call *op) const override; + + /** Make predicate vector which starts with consecutive true followed by consecutive false */ + Expr make_vector_predicate_1s_0s(int true_lanes, int false_lanes) { + internal_assert((true_lanes + false_lanes) != 0) << "CodeGen_ARM::make_vector_predicate_1s_0s called with total of 0 lanes.\n"; + if (true_lanes == 0) { + return const_false(false_lanes); + } else if (false_lanes == 0) { + return const_true(true_lanes); + } else { + return Shuffle::make_concat({const_true(true_lanes), const_false(false_lanes)}); + } + } }; CodeGen_ARM::CodeGen_ARM(const Target &target) : CodeGen_Posix(target) { + // TODO(https://github.com/halide/Halide/issues/8088): See if + // use_llvm_vp_intrinsics can replace architecture specific code in this + // file, specifically in Load and Store visitors. 
Depends on quality of + // LLVM aarch64 backend lowering for these intrinsics on SVE2. + // RADDHN - Add and narrow with rounding // These must come before other narrowing rounding shift patterns casts.emplace_back("rounding_add_narrow", i8(rounding_shift_right(wild_i16x_ + wild_i16x_, 8))); @@ -211,6 +253,12 @@ CodeGen_ARM::CodeGen_ARM(const Target &target) casts.emplace_back("shift_right_narrow", i32(wild_i64x_ >> wild_u64_)); casts.emplace_back("shift_right_narrow", u32(wild_u64x_ >> wild_u64_)); + // VCVTP/M + casts.emplace_back("fp_to_int_floor", i32(floor(wild_f32x_))); + casts.emplace_back("fp_to_int_floor", u32(floor(wild_f32x_))); + casts.emplace_back("fp_to_int_ceil", i32(ceil(wild_f32x_))); + casts.emplace_back("fp_to_int_ceil", u32(ceil(wild_f32x_))); + // SQRSHL, UQRSHL - Saturating rounding shift left (by signed vector) // TODO: We need to match rounding shift right, and negate the RHS. @@ -299,26 +347,66 @@ struct ArmIntrinsic { SplitArg0 = 1 << 6, // This intrinsic requires splitting the argument into the low and high halves. NoPrefix = 1 << 7, // Don't prefix the intrinsic with llvm.* RequireFp16 = 1 << 8, // Available only if Target has ARMFp16 feature + Neon64Unavailable = 1 << 9, // Unavailable for 64 bit NEON + SveUnavailable = 1 << 10, // Unavailable for SVE + SveNoPredicate = 1 << 11, // In SVE intrinsics, additional predicate argument is required as default, unless this flag is set. + SveInactiveArg = 1 << 12, // This intrinsic needs the additional argument for fallback value for the lanes inactivated by predicate. + SveRequired = 1 << 13, // This intrinsic requires SVE. }; }; // clang-format off const ArmIntrinsic intrinsic_defs[] = { - {"vabs", "abs", UInt(8, 8), "abs", {Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vabs", "abs", UInt(16, 4), "abs", {Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, - - {"llvm.sqrt", "llvm.sqrt", Float(32, 2), "sqrt_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth}, - {"llvm.sqrt", "llvm.sqrt", Float(64, 2), "sqrt_f64", {Float(64, 2)}}, - - {"llvm.roundeven", "llvm.roundeven", Float(16, 8), "round", {Float(16, 8)}, ArmIntrinsic::RequireFp16}, - {"llvm.roundeven", "llvm.roundeven", Float(32, 4), "round", {Float(32, 4)}}, - {"llvm.roundeven", "llvm.roundeven", Float(64, 2), "round", {Float(64, 2)}}, - {"llvm.roundeven.f16", "llvm.roundeven.f16", Float(16), "round", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle}, - {"llvm.roundeven.f32", "llvm.roundeven.f32", Float(32), "round", {Float(32)}, ArmIntrinsic::NoMangle}, - {"llvm.roundeven.f64", "llvm.roundeven.f64", Float(64), "round", {Float(64)}, ArmIntrinsic::NoMangle}, + // TODO(https://github.com/halide/Halide/issues/8093): + // Some of the Arm intrinsic have the same name between Neon and SVE2 but with different behavior. For example, + // widening, narrowing and pair-wise operations which are performed in even (top) and odd (bottom) lanes basis in SVE, + // while in high and low lanes in Neon. Therefore, peep-hole code-gen with those SVE2 intrinsic is not enabled for now, + // because additional interleaving/deinterleaveing would be required to restore the element order in a vector. 
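+    // For example, when widening an 8 x i16 vector {e0, e1, ..., e7}, NEON's
+    // smull/smull2 read the low half {e0..e3} and the high half {e4..e7},
+    // whereas SVE2's smullb/smullt read the even lanes {e0, e2, e4, e6} and
+    // the odd lanes {e1, e3, e5, e7}.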
+ + {"vabs", "abs", UInt(8, 8), "abs", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vabs", "abs", UInt(16, 4), "abs", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vabs", "abs", UInt(32, 2), "abs", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"llvm.fabs", "llvm.fabs", Float(16, 4), "abs", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs", "llvm.fabs", Float(32, 2), "abs", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs", "llvm.fabs", Float(64, 2), "abs", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f16", "llvm.fabs.f16", Float(16), "abs", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f32", "llvm.fabs.f32", Float(32), "abs", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.fabs.f64", "llvm.fabs.f64", Float(64), "abs", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.sqrt", "llvm.sqrt", Float(16, 4), "sqrt_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt", "llvm.sqrt", Float(32, 2), "sqrt_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt", "llvm.sqrt", Float(64, 2), "sqrt_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f16", "llvm.sqrt.f16", Float(16), "sqrt_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f32", "llvm.sqrt.f32", Float(32), "sqrt_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.sqrt.f64", "llvm.sqrt.f64", Float(64), "sqrt_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.floor", "llvm.floor", Float(16, 4), "floor_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor", "llvm.floor", Float(32, 2), "floor_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor", "llvm.floor", Float(64, 2), "floor_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f16", "llvm.floor.f16", Float(16), "floor_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f32", "llvm.floor.f32", Float(32), "floor_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.floor.f64", "llvm.floor.f64", Float(64), "floor_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.ceil", "llvm.ceil", Float(16, 4), "ceil_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil", "llvm.ceil", Float(32, 2), "ceil_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil", "llvm.ceil", Float(64, 2), "ceil_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f16", "llvm.ceil.f16", Float(16), "ceil_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f32", "llvm.ceil.f32", Float(32), "ceil_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.ceil.f64", "llvm.ceil.f64", Float(64), "ceil_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.trunc", 
"llvm.trunc", Float(16, 4), "trunc_f16", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc", "llvm.trunc", Float(32, 2), "trunc_f32", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc", "llvm.trunc", Float(64, 2), "trunc_f64", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f16", "llvm.trunc.f16", Float(16), "trunc_f16", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f32", "llvm.trunc.f32", Float(32), "trunc_f32", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.trunc.f64", "llvm.trunc.f64", Float(64), "trunc_f64", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + + {"llvm.roundeven", "llvm.roundeven", Float(16, 4), "round", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven", "llvm.roundeven", Float(32, 2), "round", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven", "llvm.roundeven", Float(64, 2), "round", {Float(64, 2)}, ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f16", "llvm.roundeven.f16", Float(16), "round", {Float(16)}, ArmIntrinsic::RequireFp16 | ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f32", "llvm.roundeven.f32", Float(32), "round", {Float(32)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, + {"llvm.roundeven.f64", "llvm.roundeven.f64", Float(64), "round", {Float(64)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate}, // SABD, UABD - Absolute difference {"vabds", "sabd", UInt(8, 8), "absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -329,12 +417,12 @@ const ArmIntrinsic intrinsic_defs[] = { {"vabdu", "uabd", UInt(32, 2), "absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, // SMULL, UMULL - Widening multiply - {"vmulls", "smull", Int(16, 8), "widening_mul", {Int(8, 8), Int(8, 8)}}, - {"vmullu", "umull", UInt(16, 8), "widening_mul", {UInt(8, 8), UInt(8, 8)}}, - {"vmulls", "smull", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}}, - {"vmullu", "umull", UInt(32, 4), "widening_mul", {UInt(16, 4), UInt(16, 4)}}, - {"vmulls", "smull", Int(64, 2), "widening_mul", {Int(32, 2), Int(32, 2)}}, - {"vmullu", "umull", UInt(64, 2), "widening_mul", {UInt(32, 2), UInt(32, 2)}}, + {"vmulls", "smull", Int(16, 8), "widening_mul", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(16, 8), "widening_mul", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vmulls", "smull", Int(32, 4), "widening_mul", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(32, 4), "widening_mul", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vmulls", "smull", Int(64, 2), "widening_mul", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vmullu", "umull", UInt(64, 2), "widening_mul", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::SveUnavailable}, // SQADD, UQADD - Saturating add // On arm32, the ARM version of this seems to be missing on some configurations. 
@@ -385,12 +473,30 @@ const ArmIntrinsic intrinsic_defs[] = { {"vminu", "umin", UInt(16, 4), "min", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::HalfWidth}, {"vmins", "smin", Int(32, 2), "min", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, {"vminu", "umin", UInt(32, 2), "min", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vmins", "fmin", Float(32, 2), "min", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "smin", Int(64, 2), "min", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::Neon64Unavailable}, + {nullptr, "umin", UInt(64, 2), "min", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::Neon64Unavailable}, {"vmins", "fmin", Float(16, 4), "min", {Float(16, 4), Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {"vmins", "fmin", Float(32, 2), "min", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "fmin", Float(64, 2), "min", {Float(64, 2), Float(64, 2)}}, // FCVTZS, FCVTZU - {nullptr, "fcvtzs", Int(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16}, - {nullptr, "fcvtzu", UInt(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16}, + {nullptr, "fcvtzs", Int(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(16, 4), "fp_to_int", {Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzs", Int(32, 2), "fp_to_int", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(32, 2), "fp_to_int", {Float(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzs", Int(64, 2), "fp_to_int", {Float(64, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + {nullptr, "fcvtzu", UInt(64, 2), "fp_to_int", {Float(64, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveInactiveArg}, + + // FCVTP/M. These only exist in armv8 and onwards, so we just skip them for + // arm-32. LLVM doesn't seem to have intrinsics for them for SVE. 
+ {nullptr, "fcvtpu", UInt(32, 4), "fp_to_int_ceil", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtmu", UInt(32, 4), "fp_to_int_floor", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtps", Int(32, 4), "fp_to_int_ceil", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtms", Int(32, 4), "fp_to_int_floor", {Float(32, 4)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtpu", UInt(32, 2), "fp_to_int_ceil", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtmu", UInt(32, 2), "fp_to_int_floor", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtps", Int(32, 2), "fp_to_int_ceil", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {nullptr, "fcvtms", Int(32, 2), "fp_to_int_floor", {Float(32, 2)}, ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, // SMAX, UMAX, FMAX - Max {"vmaxs", "smax", Int(8, 8), "max", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -399,25 +505,34 @@ const ArmIntrinsic intrinsic_defs[] = { {"vmaxu", "umax", UInt(16, 4), "max", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::HalfWidth}, {"vmaxs", "smax", Int(32, 2), "max", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, {"vmaxu", "umax", UInt(32, 2), "max", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vmaxs", "fmax", Float(32, 2), "max", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "smax", Int(64, 2), "max", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::Neon64Unavailable}, + {nullptr, "umax", UInt(64, 2), "max", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::Neon64Unavailable}, {"vmaxs", "fmax", Float(16, 4), "max", {Float(16, 4), Float(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {"vmaxs", "fmax", Float(32, 2), "max", {Float(32, 2), Float(32, 2)}, ArmIntrinsic::HalfWidth}, + {nullptr, "fmax", Float(64, 2), "max", {Float(64, 2), Float(64, 2)}}, + + // NEG, FNEG + {nullptr, "neg", Int(8, 16), "negate", {Int(8, 16)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(16, 8), "negate", {Int(16, 8)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(32, 4), "negate", {Int(32, 4)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, + {nullptr, "neg", Int(64, 2), "negate", {Int(64, 2)}, ArmIntrinsic::SveInactiveArg | ArmIntrinsic::Neon64Unavailable}, // SQNEG, UQNEG - Saturating negation - {"vqneg", "sqneg", Int(8, 8), "saturating_negate", {Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(16, 4), "saturating_negate", {Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(32, 2), "saturating_negate", {Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqneg", "sqneg", Int(64, 2), "saturating_negate", {Int(64, 2)}}, + {"vqneg", "sqneg", Int(8, 8), "saturating_negate", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(16, 4), "saturating_negate", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(32, 2), "saturating_negate", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveInactiveArg}, + {"vqneg", "sqneg", Int(64, 2), "saturating_negate", {Int(64, 2)}, ArmIntrinsic::SveInactiveArg}, // SQXTN, UQXTN, SQXTUN - Saturating narrowing - {"vqmovns", "sqxtn", Int(8, 8), "saturating_narrow", {Int(16, 8)}}, - 
{"vqmovnu", "uqxtn", UInt(8, 8), "saturating_narrow", {UInt(16, 8)}}, - {"vqmovnsu", "sqxtun", UInt(8, 8), "saturating_narrow", {Int(16, 8)}}, - {"vqmovns", "sqxtn", Int(16, 4), "saturating_narrow", {Int(32, 4)}}, - {"vqmovnu", "uqxtn", UInt(16, 4), "saturating_narrow", {UInt(32, 4)}}, - {"vqmovnsu", "sqxtun", UInt(16, 4), "saturating_narrow", {Int(32, 4)}}, - {"vqmovns", "sqxtn", Int(32, 2), "saturating_narrow", {Int(64, 2)}}, - {"vqmovnu", "uqxtn", UInt(32, 2), "saturating_narrow", {UInt(64, 2)}}, - {"vqmovnsu", "sqxtun", UInt(32, 2), "saturating_narrow", {Int(64, 2)}}, + {"vqmovns", "sqxtn", Int(8, 8), "saturating_narrow", {Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(8, 8), "saturating_narrow", {UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(8, 8), "saturating_narrow", {Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqmovns", "sqxtn", Int(16, 4), "saturating_narrow", {Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(16, 4), "saturating_narrow", {UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(16, 4), "saturating_narrow", {Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqmovns", "sqxtn", Int(32, 2), "saturating_narrow", {Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnu", "uqxtn", UInt(32, 2), "saturating_narrow", {UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqmovnsu", "sqxtun", UInt(32, 2), "saturating_narrow", {Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // RSHRN - Rounding shift right narrow (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS except signed. @@ -440,52 +555,52 @@ const ArmIntrinsic intrinsic_defs[] = { // LLVM pattern matches these. // SQRSHL, UQRSHL - Saturating rounding shift left (by signed vector) - {"vqrshifts", "sqrshl", Int(8, 8), "saturating_rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(8, 8), "saturating_rounding_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(16, 4), "saturating_rounding_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(16, 4), "saturating_rounding_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(32, 2), "saturating_rounding_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqrshiftu", "uqrshl", UInt(32, 2), "saturating_rounding_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vqrshifts", "sqrshl", Int(64, 2), "saturating_rounding_shift_left", {Int(64, 2), Int(64, 2)}}, - {"vqrshiftu", "uqrshl", UInt(64, 2), "saturating_rounding_shift_left", {UInt(64, 2), Int(64, 2)}}, + {"vqrshifts", "sqrshl", Int(8, 8), "saturating_rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(8, 8), "saturating_rounding_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", Int(16, 4), "saturating_rounding_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(16, 4), "saturating_rounding_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", Int(32, 2), "saturating_rounding_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(32, 2), "saturating_rounding_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshifts", "sqrshl", 
Int(64, 2), "saturating_rounding_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftu", "uqrshl", UInt(64, 2), "saturating_rounding_shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // SQRSHRN, UQRSHRN, SQRSHRUN - Saturating rounding narrowing shift right (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS except signed. - {"vqrshiftns", nullptr, Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}}, - {"vqrshiftnu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), Int(16, 8)}}, - {"vqrshiftnsu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}}, - {"vqrshiftns", nullptr, Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}}, - {"vqrshiftnu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), Int(32, 4)}}, - {"vqrshiftnsu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}}, - {"vqrshiftns", nullptr, Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, - {"vqrshiftnu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), Int(64, 2)}}, - {"vqrshiftnsu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, + {"vqrshiftns", nullptr, Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftns", nullptr, Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftns", nullptr, Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vqrshiftnsu", nullptr, UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // arm64 expects a 32-bit constant. 
- {nullptr, "sqrshrn", Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "uqrshrn", UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), UInt(32)}}, - {nullptr, "sqrshrun", UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "sqrshrn", Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "uqrshrn", UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), UInt(32)}}, - {nullptr, "sqrshrun", UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "sqrshrn", Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}}, - {nullptr, "uqrshrn", UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), UInt(32)}}, - {nullptr, "sqrshrun", UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}}, + {nullptr, "sqrshrn", Int(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(8, 8), "saturating_rounding_shift_right_narrow", {UInt(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(8, 8), "saturating_rounding_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrn", Int(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(16, 4), "saturating_rounding_shift_right_narrow", {UInt(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(16, 4), "saturating_rounding_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrn", Int(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqrshrn", UInt(32, 2), "saturating_rounding_shift_right_narrow", {UInt(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqrshrun", UInt(32, 2), "saturating_rounding_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, // SQSHL, UQSHL, SQSHLU - Saturating shift left by signed register. // There is also an immediate version of this - hopefully LLVM does this matching when appropriate. 
{"vqshifts", "sqshl", Int(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(8, 8), "saturating_shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(8, 8), "saturating_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(16, 4), "saturating_shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(16, 4), "saturating_shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, {"vqshiftu", "uqshl", UInt(32, 2), "saturating_shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, - {"vqshiftsu", "sqshlu", UInt(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth}, + {"vqshiftsu", "sqshlu", UInt(32, 2), "saturating_shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, {"vqshifts", "sqshl", Int(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, {"vqshiftu", "uqshl", UInt(64, 2), "saturating_shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, - {"vqshiftsu", "sqshlu", UInt(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1}, + {"vqshiftsu", "sqshlu", UInt(64, 2), "saturating_shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::AllowUnsignedOp1 | ArmIntrinsic::SveUnavailable}, // SQSHRN, UQSHRN, SQRSHRUN Saturating narrowing shift right by an (by immediate in [1, output bits]) // arm32 expects a vector RHS of the same type as the LHS. @@ -500,15 +615,15 @@ const ArmIntrinsic intrinsic_defs[] = { {"vqshiftnsu", nullptr, UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), Int(64, 2)}}, // arm64 expects a 32-bit constant. 
- {nullptr, "sqshrn", Int(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "uqshrn", UInt(8, 8), "saturating_shift_right_narrow", {UInt(16, 8), UInt(32)}}, - {nullptr, "sqshrn", Int(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "uqshrn", UInt(16, 4), "saturating_shift_right_narrow", {UInt(32, 4), UInt(32)}}, - {nullptr, "sqshrn", Int(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}}, - {nullptr, "uqshrn", UInt(32, 2), "saturating_shift_right_narrow", {UInt(64, 2), UInt(32)}}, - {nullptr, "sqshrun", UInt(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}}, - {nullptr, "sqshrun", UInt(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}}, - {nullptr, "sqshrun", UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}}, + {nullptr, "sqshrn", Int(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(8, 8), "saturating_shift_right_narrow", {UInt(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrn", Int(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(16, 4), "saturating_shift_right_narrow", {UInt(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrn", Int(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "uqshrn", UInt(32, 2), "saturating_shift_right_narrow", {UInt(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(8, 8), "saturating_shift_right_narrow", {Int(16, 8), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(16, 4), "saturating_shift_right_narrow", {Int(32, 4), UInt(32)}, ArmIntrinsic::SveUnavailable}, + {nullptr, "sqshrun", UInt(32, 2), "saturating_shift_right_narrow", {Int(64, 2), UInt(32)}, ArmIntrinsic::SveUnavailable}, // SRSHL, URSHL - Rounding shift left (by signed vector) {"vrshifts", "srshl", Int(8, 8), "rounding_shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, @@ -521,14 +636,15 @@ const ArmIntrinsic intrinsic_defs[] = { {"vrshiftu", "urshl", UInt(64, 2), "rounding_shift_left", {UInt(64, 2), Int(64, 2)}}, // SSHL, USHL - Shift left (by signed vector) - {"vshifts", "sshl", Int(8, 8), "shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(8, 8), "shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(16, 4), "shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(16, 4), "shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(32, 2), "shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vshiftu", "ushl", UInt(32, 2), "shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, - {"vshifts", "sshl", Int(64, 2), "shift_left", {Int(64, 2), Int(64, 2)}}, - {"vshiftu", "ushl", UInt(64, 2), "shift_left", {UInt(64, 2), Int(64, 2)}}, + // In SVE, no equivalent is found, though there are rounding, saturating, or widening versions. 
+ {"vshifts", "sshl", Int(8, 8), "shift_left", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(8, 8), "shift_left", {UInt(8, 8), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(16, 4), "shift_left", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(16, 4), "shift_left", {UInt(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(32, 2), "shift_left", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(32, 2), "shift_left", {UInt(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {"vshifts", "sshl", Int(64, 2), "shift_left", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vshiftu", "ushl", UInt(64, 2), "shift_left", {UInt(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, // SRSHR, URSHR - Rounding shift right (by immediate in [1, output bits]) // LLVM wants these expressed as SRSHL by negative amounts. @@ -537,28 +653,28 @@ const ArmIntrinsic intrinsic_defs[] = { // LLVM pattern matches these for us. // RADDHN - Add and narrow with rounding. - {"vraddhn", "raddhn", Int(8, 8), "rounding_add_narrow", {Int(16, 8), Int(16, 8)}}, - {"vraddhn", "raddhn", UInt(8, 8), "rounding_add_narrow", {UInt(16, 8), UInt(16, 8)}}, - {"vraddhn", "raddhn", Int(16, 4), "rounding_add_narrow", {Int(32, 4), Int(32, 4)}}, - {"vraddhn", "raddhn", UInt(16, 4), "rounding_add_narrow", {UInt(32, 4), UInt(32, 4)}}, - {"vraddhn", "raddhn", Int(32, 2), "rounding_add_narrow", {Int(64, 2), Int(64, 2)}}, - {"vraddhn", "raddhn", UInt(32, 2), "rounding_add_narrow", {UInt(64, 2), UInt(64, 2)}}, + {"vraddhn", "raddhn", Int(8, 8), "rounding_add_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(8, 8), "rounding_add_narrow", {UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", Int(16, 4), "rounding_add_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(16, 4), "rounding_add_narrow", {UInt(32, 4), UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", Int(32, 2), "rounding_add_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vraddhn", "raddhn", UInt(32, 2), "rounding_add_narrow", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, // RSUBHN - Sub and narrow with rounding. 
- {"vrsubhn", "rsubhn", Int(8, 8), "rounding_sub_narrow", {Int(16, 8), Int(16, 8)}}, - {"vrsubhn", "rsubhn", UInt(8, 8), "rounding_sub_narrow", {UInt(16, 8), UInt(16, 8)}}, - {"vrsubhn", "rsubhn", Int(16, 4), "rounding_sub_narrow", {Int(32, 4), Int(32, 4)}}, - {"vrsubhn", "rsubhn", UInt(16, 4), "rounding_sub_narrow", {UInt(32, 4), UInt(32, 4)}}, - {"vrsubhn", "rsubhn", Int(32, 2), "rounding_sub_narrow", {Int(64, 2), Int(64, 2)}}, - {"vrsubhn", "rsubhn", UInt(32, 2), "rounding_sub_narrow", {UInt(64, 2), UInt(64, 2)}}, + {"vrsubhn", "rsubhn", Int(8, 8), "rounding_sub_narrow", {Int(16, 8), Int(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(8, 8), "rounding_sub_narrow", {UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", Int(16, 4), "rounding_sub_narrow", {Int(32, 4), Int(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(16, 4), "rounding_sub_narrow", {UInt(32, 4), UInt(32, 4)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", Int(32, 2), "rounding_sub_narrow", {Int(64, 2), Int(64, 2)}, ArmIntrinsic::SveUnavailable}, + {"vrsubhn", "rsubhn", UInt(32, 2), "rounding_sub_narrow", {UInt(64, 2), UInt(64, 2)}, ArmIntrinsic::SveUnavailable}, // SQDMULH - Saturating doubling multiply keep high half. - {"vqdmulh", "sqdmulh", Int(16, 4), "qdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqdmulh", "sqdmulh", Int(32, 2), "qdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, + {"vqdmulh", "sqdmulh", Int(16, 4), "qdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"vqdmulh", "sqdmulh", Int(32, 2), "qdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, // SQRDMULH - Saturating doubling multiply keep high half with rounding. - {"vqrdmulh", "sqrdmulh", Int(16, 4), "qrdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth}, - {"vqrdmulh", "sqrdmulh", Int(32, 2), "qrdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth}, + {"vqrdmulh", "sqrdmulh", Int(16, 4), "qrdmulh", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, + {"vqrdmulh", "sqrdmulh", Int(32, 2), "qrdmulh", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::SveNoPredicate}, // PADD - Pairwise add. // 32-bit only has half-width versions. 
@@ -571,47 +687,49 @@ const ArmIntrinsic intrinsic_defs[] = { {"vpadd", nullptr, Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0}, {"vpadd", nullptr, Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::RequireFp16}, - {nullptr, "addp", Int(8, 8), "pairwise_add", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(8, 8), "pairwise_add", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", Int(16, 4), "pairwise_add", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(16, 4), "pairwise_add", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", Int(32, 2), "pairwise_add", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "addp", UInt(32, 2), "pairwise_add", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "faddp", Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "faddp", Float(64, 2), "pairwise_add", {Float(64, 4)}, ArmIntrinsic::SplitArg0}, - {nullptr, "faddp", Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "addp", Int(8, 8), "pairwise_add", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(8, 8), "pairwise_add", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(16, 4), "pairwise_add", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(16, 4), "pairwise_add", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(32, 2), "pairwise_add", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(32, 2), "pairwise_add", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", Int(64, 2), "pairwise_add", {Int(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "addp", UInt(64, 2), "pairwise_add", {UInt(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(32, 2), "pairwise_add", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(64, 2), "pairwise_add", {Float(64, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::SveUnavailable}, + {nullptr, "faddp", Float(16, 4), "pairwise_add", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // SADDLP, UADDLP - Pairwise add long. 
- {"vpaddls", "saddlp", Int(16, 4), "pairwise_widening_add", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", UInt(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", Int(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddls", "saddlp", Int(32, 2), "pairwise_widening_add", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", UInt(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddlu", "uaddlp", Int(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs}, - {"vpaddls", "saddlp", Int(64, 1), "pairwise_widening_add", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpaddlu", "uaddlp", UInt(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpaddlu", "uaddlp", Int(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors}, + {"vpaddls", "saddlp", Int(16, 4), "pairwise_widening_add", {Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(16, 4), "pairwise_widening_add", {UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddls", "saddlp", Int(32, 2), "pairwise_widening_add", {Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(32, 2), "pairwise_widening_add", {UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::SveUnavailable}, + {"vpaddls", "saddlp", Int(64, 1), "pairwise_widening_add", {Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", UInt(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, + {"vpaddlu", "uaddlp", Int(64, 1), "pairwise_widening_add", {UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleRetArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::SveUnavailable}, // SPADAL, UPADAL - Pairwise add and accumulate long. 
- {"vpadals", nullptr, Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, UInt(16, 4), "pairwise_widening_add_accumulate", {UInt(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadals", nullptr, Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, UInt(32, 2), "pairwise_widening_add_accumulate", {UInt(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadalu", nullptr, Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs}, - {"vpadals", nullptr, Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpadalu", nullptr, UInt(64, 1), "pairwise_widening_add_accumulate", {UInt(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, - {"vpadalu", nullptr, Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors}, + {"vpadals", "sadalp", Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), Int(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(16, 4), "pairwise_widening_add_accumulate", {UInt(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(16, 4), "pairwise_widening_add_accumulate", {Int(16, 4), UInt(8, 8)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadals", "sadalp", Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), Int(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(32, 2), "pairwise_widening_add_accumulate", {UInt(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(32, 2), "pairwise_widening_add_accumulate", {Int(32, 2), UInt(16, 4)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::Neon64Unavailable}, + {"vpadals", "sadalp", Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), Int(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", UInt(64, 1), "pairwise_widening_add_accumulate", {UInt(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, + {"vpadalu", "uadalp", Int(64, 1), "pairwise_widening_add_accumulate", {Int(64, 1), UInt(32, 2)}, ArmIntrinsic::HalfWidth | ArmIntrinsic::MangleArgs | ArmIntrinsic::ScalarsAreVectors | ArmIntrinsic::Neon64Unavailable}, // SMAXP, UMAXP, FMAXP - Pairwise max. 
- {nullptr, "smaxp", Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(8, 8), "pairwise_max", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "smaxp", Int(16, 4), "pairwise_max", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(16, 4), "pairwise_max", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "smaxp", Int(32, 2), "pairwise_max", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "umaxp", UInt(32, 2), "pairwise_max", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fmaxp", Float(32, 2), "pairwise_max", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fmaxp", Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "smaxp", Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(8, 8), "pairwise_max", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "smaxp", Int(16, 4), "pairwise_max", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(16, 4), "pairwise_max", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "smaxp", Int(32, 2), "pairwise_max", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "umaxp", UInt(32, 2), "pairwise_max", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fmaxp", Float(32, 2), "pairwise_max", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fmaxp", Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // On arm32, we only have half-width versions of these. {"vpmaxs", nullptr, Int(8, 8), "pairwise_max", {Int(8, 16)}, ArmIntrinsic::SplitArg0}, @@ -624,14 +742,14 @@ const ArmIntrinsic intrinsic_defs[] = { {"vpmaxs", nullptr, Float(16, 4), "pairwise_max", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::RequireFp16}, // SMINP, UMINP, FMINP - Pairwise min. 
- {nullptr, "sminp", Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(8, 8), "pairwise_min", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "sminp", Int(16, 4), "pairwise_min", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(16, 4), "pairwise_min", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "sminp", Int(32, 2), "pairwise_min", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "uminp", UInt(32, 2), "pairwise_min", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fminp", Float(32, 2), "pairwise_min", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth}, - {nullptr, "fminp", Float(16, 4), "pairwise_min", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16}, + {nullptr, "sminp", Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(8, 8), "pairwise_min", {UInt(8, 16)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "sminp", Int(16, 4), "pairwise_min", {Int(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(16, 4), "pairwise_min", {UInt(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "sminp", Int(32, 2), "pairwise_min", {Int(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "uminp", UInt(32, 2), "pairwise_min", {UInt(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fminp", Float(32, 2), "pairwise_min", {Float(32, 4)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::SveUnavailable}, + {nullptr, "fminp", Float(16, 4), "pairwise_min", {Float(16, 8)}, ArmIntrinsic::SplitArg0 | ArmIntrinsic::HalfWidth | ArmIntrinsic::RequireFp16 | ArmIntrinsic::SveUnavailable}, // On arm32, we only have half-width versions of these. {"vpmins", nullptr, Int(8, 8), "pairwise_min", {Int(8, 16)}, ArmIntrinsic::SplitArg0}, @@ -645,28 +763,35 @@ const ArmIntrinsic intrinsic_defs[] = { // SDOT, UDOT - Dot products. // Mangle this one manually, there aren't that many and it is a special case. 
- {nullptr, "sdot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v2i32.v8i8", UInt(32, 2), "dot_product", {UInt(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle}, - {nullptr, "sdot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, - {nullptr, "udot.v4i32.v16i8", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle}, + {nullptr, "sdot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v2i32.v8i8", Int(32, 2), "dot_product", {Int(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v2i32.v8i8", UInt(32, 2), "dot_product", {UInt(32, 2), UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "sdot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v4i32.v16i8", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + {nullptr, "udot.v4i32.v16i8", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveUnavailable}, + // SVE versions. + {nullptr, "sdot.nxv4i32", Int(32, 4), "dot_product", {Int(32, 4), Int(8, 16), Int(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv4i32", Int(32, 4), "dot_product", {Int(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv4i32", UInt(32, 4), "dot_product", {UInt(32, 4), UInt(8, 16), UInt(8, 16)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::SveRequired}, + {nullptr, "sdot.nxv2i64", Int(64, 2), "dot_product", {Int(64, 2), Int(16, 8), Int(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv2i64", Int(64, 2), "dot_product", {Int(64, 2), UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, + {nullptr, "udot.nxv2i64", UInt(64, 2), "dot_product", {UInt(64, 2), UInt(16, 8), UInt(16, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::SveNoPredicate | ArmIntrinsic::Neon64Unavailable | ArmIntrinsic::SveRequired}, // ABDL - Widening absolute difference // The ARM backend folds both signed and unsigned widening casts of absd to a widening_absd, so we need to handle both signed and // unsigned input and return types. 
- {"vabdl_i8x8", "vabdl_i8x8", Int(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i8x8", "vabdl_i8x8", UInt(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u8x8", "vabdl_u8x8", Int(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u8x8", "vabdl_u8x8", UInt(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i16x4", "vabdl_i16x4", Int(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i16x4", "vabdl_i16x4", UInt(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u16x4", "vabdl_u16x4", Int(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u16x4", "vabdl_u16x4", UInt(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i32x2", "vabdl_i32x2", Int(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_i32x2", "vabdl_i32x2", UInt(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u32x2", "vabdl_u32x2", Int(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, - {"vabdl_u32x2", "vabdl_u32x2", UInt(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix}, + {"vabdl_i8x8", "vabdl_i8x8", Int(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i8x8", "vabdl_i8x8", UInt(16, 8), "widening_absd", {Int(8, 8), Int(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u8x8", "vabdl_u8x8", Int(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u8x8", "vabdl_u8x8", UInt(16, 8), "widening_absd", {UInt(8, 8), UInt(8, 8)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i16x4", "vabdl_i16x4", Int(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i16x4", "vabdl_i16x4", UInt(32, 4), "widening_absd", {Int(16, 4), Int(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u16x4", "vabdl_u16x4", Int(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u16x4", "vabdl_u16x4", UInt(32, 4), "widening_absd", {UInt(16, 4), UInt(16, 4)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i32x2", "vabdl_i32x2", Int(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_i32x2", "vabdl_i32x2", UInt(64, 2), "widening_absd", {Int(32, 2), Int(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u32x2", "vabdl_u32x2", Int(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, + {"vabdl_u32x2", "vabdl_u32x2", 
UInt(64, 2), "widening_absd", {UInt(32, 2), UInt(32, 2)}, ArmIntrinsic::NoMangle | ArmIntrinsic::NoPrefix | ArmIntrinsic::SveUnavailable}, }; // List of fp16 math functions which we can avoid "emulated" equivalent code generation. @@ -706,32 +831,103 @@ const std::map float16_transcendental_remapping = { }; // clang-format on -llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, const string &name) { - llvm::FunctionType *inner_ty = inner->getFunctionType(); +llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, + VectorTypeConstraint constraint) { + llvm::Type *ret = llvm_type_of(t.element_of()); + if (!t.is_scalar() || scalars_are_vectors) { + int lanes = t.lanes(); + if (constraint == VectorTypeConstraint::VScale) { + lanes /= target_vscale(); + } + ret = get_vector_type(ret, lanes, constraint); + } + return ret; +} + +llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name, + const Type &ret_type, + const std::string &mangled_name, + const std::vector &arg_types, + int intrinsic_flags, + bool sve_intrinsic) { + + auto to_llvm_type = [&](const Type &t) { + return llvm_type_with_constraint(t, (intrinsic_flags & ArmIntrinsic::ScalarsAreVectors), + !sve_intrinsic ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + }; + + llvm::Type *llvm_ret_type = to_llvm_type(ret_type); + std::vector llvm_arg_types; + std::transform(arg_types.begin(), arg_types.end(), std::back_inserter(llvm_arg_types), to_llvm_type); + + const bool add_predicate = sve_intrinsic && !(intrinsic_flags & ArmIntrinsic::SveNoPredicate); + bool add_inactive_arg = sve_intrinsic && (intrinsic_flags & ArmIntrinsic::SveInactiveArg); + bool split_arg0 = intrinsic_flags & ArmIntrinsic::SplitArg0; + + if (!(add_inactive_arg || add_predicate || split_arg0)) { + // No need to wrap + return get_llvm_intrin(llvm_ret_type, mangled_name, llvm_arg_types); + } + + std::vector inner_llvm_arg_types; + std::vector inner_args; + internal_assert(!arg_types.empty()); + const int inner_lanes = split_arg0 ? arg_types[0].lanes() / 2 : arg_types[0].lanes(); + + if (add_inactive_arg) { + // The fallback value has the same type as ret value. + // We don't use this, so just pad it with 0. + inner_llvm_arg_types.push_back(llvm_ret_type); + + Value *zero = Constant::getNullValue(llvm_ret_type); + inner_args.push_back(zero); + } + if (add_predicate) { + llvm::Type *pred_type = to_llvm_type(Int(1, inner_lanes)); + inner_llvm_arg_types.push_back(pred_type); + // Halide does not have general support for predication so use + // constant true for all lanes. 
+ Value *ptrue = Constant::getAllOnesValue(pred_type); + inner_args.push_back(ptrue); + } + if (split_arg0) { + llvm::Type *split_arg_type = to_llvm_type(arg_types[0].with_lanes(inner_lanes)); + inner_llvm_arg_types.push_back(split_arg_type); + inner_llvm_arg_types.push_back(split_arg_type); + internal_assert(arg_types.size() == 1); + } else { + // Push back all argument typs which Halide defines + std::copy(llvm_arg_types.begin(), llvm_arg_types.end(), std::back_inserter(inner_llvm_arg_types)); + } - internal_assert(inner_ty->getNumParams() == 2); - llvm::Type *inner_arg0_ty = inner_ty->getParamType(0); - llvm::Type *inner_arg1_ty = inner_ty->getParamType(1); - int inner_arg0_lanes = get_vector_num_elements(inner_arg0_ty); - int inner_arg1_lanes = get_vector_num_elements(inner_arg1_ty); + llvm::Function *inner = get_llvm_intrin(llvm_ret_type, mangled_name, inner_llvm_arg_types); + llvm::FunctionType *inner_ty = inner->getFunctionType(); - llvm::Type *concat_arg_ty = - get_vector_type(inner_arg0_ty->getScalarType(), inner_arg0_lanes + inner_arg1_lanes); + llvm::FunctionType *wrapper_ty = llvm::FunctionType::get(inner_ty->getReturnType(), llvm_arg_types, false); - // Make a wrapper. - llvm::FunctionType *wrapper_ty = - llvm::FunctionType::get(inner_ty->getReturnType(), {concat_arg_ty}, false); + string wrapper_name = inner_name + unique_name("_wrapper"); llvm::Function *wrapper = - llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage, name, module.get()); + llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage, wrapper_name, module.get()); llvm::BasicBlock *block = llvm::BasicBlock::Create(module->getContext(), "entry", wrapper); IRBuilderBase::InsertPoint here = builder->saveIP(); builder->SetInsertPoint(block); + if (split_arg0) { + // Call the real intrinsic. + Value *low = slice_vector(wrapper->getArg(0), 0, inner_lanes); + Value *high = slice_vector(wrapper->getArg(0), inner_lanes, inner_lanes); + inner_args.push_back(low); + inner_args.push_back(high); + internal_assert(inner_llvm_arg_types.size() == 2); + } else { + for (auto *itr = wrapper->arg_begin(); itr != wrapper->arg_end(); ++itr) { + inner_args.push_back(itr); + } + } + // Call the real intrinsic. - Value *low = slice_vector(wrapper->getArg(0), 0, inner_arg0_lanes); - Value *high = slice_vector(wrapper->getArg(0), inner_arg0_lanes, inner_arg1_lanes); - Value *ret = builder->CreateCall(inner, {low, high}); + Value *ret = builder->CreateCall(inner, inner_args); builder->CreateRet(ret); // Always inline these wrappers. @@ -746,15 +942,32 @@ llvm::Function *CodeGen_ARM::define_concat_args_wrapper(llvm::Function *inner, c void CodeGen_ARM::init_module() { CodeGen_Posix::init_module(); - if (neon_intrinsics_disabled()) { + const bool has_neon = !target.has_feature(Target::NoNEON); + const bool has_sve = target.has_feature(Target::SVE2); + if (!(has_neon || has_sve)) { return; } - string prefix = target.bits == 32 ? "llvm.arm.neon." : "llvm.aarch64.neon."; + enum class SIMDFlavors { + NeonWidthX1, + NeonWidthX2, + SVE, + }; + + std::vector flavors; + if (has_neon) { + flavors.push_back(SIMDFlavors::NeonWidthX1); + flavors.push_back(SIMDFlavors::NeonWidthX2); + } + if (has_sve) { + flavors.push_back(SIMDFlavors::SVE); + } + for (const ArmIntrinsic &intrin : intrinsic_defs) { if (intrin.flags & ArmIntrinsic::RequireFp16 && !target.has_feature(Target::ARMFp16)) { continue; } + // Get the name of the intrinsic with the appropriate prefix. 
const char *intrin_name = nullptr; if (target.bits == 32) { @@ -765,21 +978,66 @@ void CodeGen_ARM::init_module() { if (!intrin_name) { continue; } - string full_name = intrin_name; - if (!starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoPrefix) == 0) { - full_name = prefix + full_name; - } - // We might have to generate versions of this intrinsic with multiple widths. - vector width_factors = {1}; - if (intrin.flags & ArmIntrinsic::HalfWidth) { - width_factors.push_back(2); - } + // This makes up to three passes defining intrinsics for 64-bit, + // 128-bit, and, if SVE is avaailable, whatever the SVE target width + // is. Some variants will not result in a definition getting added based + // on the target and the intrinsic flags. The intrinsic width may be + // scaled and one of two opcodes may be selected by different + // interations of this loop. + for (const auto flavor : flavors) { + const bool is_sve = (flavor == SIMDFlavors::SVE); + + // Skip intrinsics that are NEON or SVE only depending on whether compiling for SVE. + if (is_sve) { + if (intrin.flags & ArmIntrinsic::SveUnavailable) { + continue; + } + } else { + if (intrin.flags & ArmIntrinsic::SveRequired) { + continue; + } + } + if ((target.bits == 64) && + (intrin.flags & ArmIntrinsic::Neon64Unavailable) && + !is_sve) { + continue; + } + // Already declared in the x1 pass. + if ((flavor == SIMDFlavors::NeonWidthX2) && + !(intrin.flags & ArmIntrinsic::HalfWidth)) { + continue; + } + + string full_name = intrin_name; + const bool is_vanilla_intrinsic = starts_with(full_name, "llvm."); + if (!is_vanilla_intrinsic && (intrin.flags & ArmIntrinsic::NoPrefix) == 0) { + if (target.bits == 32) { + full_name = "llvm.arm.neon." + full_name; + } else { + full_name = (is_sve ? "llvm.aarch64.sve." : "llvm.aarch64.neon.") + full_name; + } + } + + int width_factor = 1; + if (!((intrin.ret_type.lanes <= 1) && (intrin.flags & ArmIntrinsic::NoMangle))) { + switch (flavor) { + case SIMDFlavors::NeonWidthX1: + width_factor = 1; + break; + case SIMDFlavors::NeonWidthX2: + width_factor = 2; + break; + case SIMDFlavors::SVE: + width_factor = (intrin.flags & ArmIntrinsic::HalfWidth) ? 2 : 1; + width_factor *= target_vscale(); + break; + } + } - for (int width_factor : width_factors) { Type ret_type = intrin.ret_type; ret_type = ret_type.with_lanes(ret_type.lanes() * width_factor); - internal_assert(ret_type.bits() * ret_type.lanes() <= 128) << full_name << "\n"; + internal_assert(ret_type.bits() * ret_type.lanes() <= 128 * width_factor) << full_name << "\n"; vector arg_types; arg_types.reserve(4); for (halide_type_t i : intrin.arg_types) { @@ -787,9 +1045,7 @@ void CodeGen_ARM::init_module() { break; } Type arg_type = i; - if (arg_type.is_vector()) { - arg_type = arg_type.with_lanes(arg_type.lanes() * width_factor); - } + arg_type = arg_type.with_lanes(arg_type.lanes() * width_factor); arg_types.emplace_back(arg_type); } @@ -799,7 +1055,7 @@ void CodeGen_ARM::init_module() { if (starts_with(full_name, "llvm.") && (intrin.flags & ArmIntrinsic::NoMangle) == 0) { // Append LLVM name mangling for either the return type or the arguments, or both. 
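The mangling built here follows LLVM's overloaded-intrinsic naming: fixed-width vector types become ".v<lanes><i|f><bits>", scalable SVE vector types become ".nxv<lanes/vscale><i|f><bits>". A minimal sketch of the rule for reference (not the patch's code; the function and parameter names are made up):

    #include <sstream>
    #include <string>

    std::string mangle_vector_type(int lanes, int bits, bool is_float,
                                   bool scalable, int vscale) {
        std::ostringstream s;
        s << (scalable ? ".nxv" : ".v") << (scalable ? lanes / vscale : lanes)
          << (is_float ? "f" : "i") << bits;
        return s.str();
    }

    // e.g. mangle_vector_type(8, 16, false, false, 1) -> ".v8i16"    (NEON)
    //      mangle_vector_type(16, 32, false, true, 4) -> ".nxv4i32"  (SVE, vscale = 4)

For example, the saddlp entry above (MangleRetArgs, x2 width pass) should produce a name like "llvm.aarch64.neon.saddlp.v8i16.v16i8", while the SVE dot-product entries carry their ".nxv4i32" suffix explicitly in the table and are only prefixed.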
vector types; - if (intrin.flags & ArmIntrinsic::MangleArgs) { + if (intrin.flags & ArmIntrinsic::MangleArgs && !is_sve) { types = arg_types; } else if (intrin.flags & ArmIntrinsic::MangleRetArgs) { types = {ret_type}; @@ -808,7 +1064,9 @@ void CodeGen_ARM::init_module() { types = {ret_type}; } for (const Type &t : types) { - mangled_name_builder << ".v" << t.lanes(); + std::string llvm_vector_prefix = is_sve ? ".nxv" : ".v"; + int mangle_lanes = t.lanes() / (is_sve ? target_vscale() : 1); + mangled_name_builder << llvm_vector_prefix << mangle_lanes; if (t.is_int() || t.is_uint()) { mangled_name_builder << "i"; } else if (t.is_float()) { @@ -819,17 +1077,9 @@ void CodeGen_ARM::init_module() { } string mangled_name = mangled_name_builder.str(); - llvm::Function *intrin_impl = nullptr; - if (intrin.flags & ArmIntrinsic::SplitArg0) { - // This intrinsic needs a wrapper to split the argument. - string wrapper_name = intrin.name + unique_name("_wrapper"); - Type split_arg_type = arg_types[0].with_lanes(arg_types[0].lanes() / 2); - llvm::Function *to_wrap = get_llvm_intrin(ret_type, mangled_name, {split_arg_type, split_arg_type}); - intrin_impl = define_concat_args_wrapper(to_wrap, wrapper_name); - } else { - bool scalars_are_vectors = intrin.flags & ArmIntrinsic::ScalarsAreVectors; - intrin_impl = get_llvm_intrin(ret_type, mangled_name, arg_types, scalars_are_vectors); - } + llvm::Function *intrin_impl = define_intrin_wrapper( + intrin.name, ret_type, mangled_name, arg_types, + intrin.flags, is_sve); function_does_not_access_memory(intrin_impl); intrin_impl->addFnAttr(llvm::Attribute::NoUnwind); @@ -862,8 +1112,31 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, CodeGen_Posix::compile_func(func, simple_name, extern_name); } +void CodeGen_ARM::begin_func(LinkageType linkage, const std::string &simple_name, + const std::string &extern_name, const std::vector &args) { + CodeGen_Posix::begin_func(linkage, simple_name, extern_name, args); + + // TODO(https://github.com/halide/Halide/issues/8092): There is likely a + // better way to ensure this is only generated for the outermost function + // that is being compiled. Avoiding the assert on inner functions is both an + // efficiency and a correctness issue as the assertion code may not compile + // in all contexts. + if (linkage != LinkageType::Internal) { + int effective_vscale = target_vscale(); + if (effective_vscale != 0 && !target.has_feature(Target::NoAsserts)) { + // Make sure run-time vscale is equal to compile-time vscale + Expr runtime_vscale = Call::make(Int(32), Call::get_runtime_vscale, {}, Call::PureIntrinsic); + Value *val_runtime_vscale = codegen(runtime_vscale); + Value *val_compiletime_vscale = ConstantInt::get(i32_t, effective_vscale); + Value *cond = builder->CreateICmpEQ(val_runtime_vscale, val_compiletime_vscale); + create_assertion(cond, Call::make(Int(32), "halide_error_vscale_invalid", + {simple_name, runtime_vscale, Expr(effective_vscale)}, Call::Extern)); + } + } +} + void CodeGen_ARM::visit(const Cast *op) { - if (!neon_intrinsics_disabled() && op->type.is_vector()) { + if (!simd_intrinsics_disabled() && op->type.is_vector()) { vector matches; for (const Pattern &pattern : casts) { if (expr_match(pattern.pattern, op, matches)) { @@ -898,14 +1171,11 @@ void CodeGen_ARM::visit(const Cast *op) { } } - // LLVM fptoui generates fcvtzs if src is fp16 scalar else fcvtzu. - // To avoid that, we use neon intrinsic explicitly. 
- if (is_float16_and_has_feature(op->value.type())) { - if (op->type.is_int_or_uint() && op->type.bits() == 16) { - value = call_overloaded_intrin(op->type, "fp_to_int", {op->value}); - if (value) { - return; - } + // LLVM fptoui generates fcvtzs or fcvtzu in inconsistent way + if (op->value.type().is_float() && op->type.is_int_or_uint()) { + if (Value *v = call_overloaded_intrin(op->type, "fp_to_int", {op->value})) { + value = v; + return; } } @@ -913,7 +1183,7 @@ void CodeGen_ARM::visit(const Cast *op) { } void CodeGen_ARM::visit(const Add *op) { - if (neon_intrinsics_disabled() || + if (simd_intrinsics_disabled() || !op->type.is_vector() || !target.has_feature(Target::ARMDotProd) || !op->type.is_int_or_uint() || @@ -997,7 +1267,7 @@ void CodeGen_ARM::visit(const Add *op) { } void CodeGen_ARM::visit(const Sub *op) { - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1012,6 +1282,46 @@ void CodeGen_ARM::visit(const Sub *op) { } } + // Peep-hole (0 - b) pattern to generate "negate" instruction + if (is_const_zero(op->a)) { + if (target_vscale() != 0) { + if ((op->type.bits() >= 8 && op->type.is_int())) { + if (Value *v = call_overloaded_intrin(op->type, "negate", {op->b})) { + value = v; + return; + } + } else if (op->type.bits() >= 16 && op->type.is_float()) { + value = builder->CreateFNeg(codegen(op->b)); + return; + } + } else { + // llvm.neon.neg/fneg intrinsic doesn't seem to exist. Instead, + // llvm will generate floating point negate instructions if we ask for (-0.0f)-x + if (op->type.is_float() && + (op->type.bits() >= 32 || is_float16_and_has_feature(op->type))) { + Constant *a; + if (op->type.bits() == 16) { + a = ConstantFP::getNegativeZero(f16_t); + } else if (op->type.bits() == 32) { + a = ConstantFP::getNegativeZero(f32_t); + } else if (op->type.bits() == 64) { + a = ConstantFP::getNegativeZero(f64_t); + } else { + a = nullptr; + internal_error << "Unknown bit width for floating point type: " << op->type << "\n"; + } + + Value *b = codegen(op->b); + + if (op->type.lanes() > 1) { + a = get_splat(op->type.lanes(), a); + } + value = builder->CreateFSub(a, b); + return; + } + } + } + // llvm will generate floating point negate instructions if we ask for (-0.0f)-x if (op->type.is_float() && (op->type.bits() >= 32 || is_float16_and_has_feature(op->type)) && @@ -1042,7 +1352,7 @@ void CodeGen_ARM::visit(const Sub *op) { void CodeGen_ARM::visit(const Min *op) { // Use a 2-wide vector for scalar floats. - if (!neon_intrinsics_disabled() && (op->type == Float(32) || op->type.is_vector())) { + if (!simd_intrinsics_disabled() && (op->type.is_float() || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "min", {op->a, op->b}); if (value) { return; @@ -1054,7 +1364,7 @@ void CodeGen_ARM::visit(const Min *op) { void CodeGen_ARM::visit(const Max *op) { // Use a 2-wide vector for scalar floats. 
- if (!neon_intrinsics_disabled() && (op->type == Float(32) || op->type.is_vector())) { + if (!simd_intrinsics_disabled() && (op->type.is_float() || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "max", {op->a, op->b}); if (value) { return; @@ -1066,12 +1376,13 @@ void CodeGen_ARM::visit(const Max *op) { void CodeGen_ARM::visit(const Store *op) { // Predicated store - if (!is_const_one(op->predicate)) { + const bool is_predicated_store = !is_const_one(op->predicate); + if (is_predicated_store && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1079,8 +1390,8 @@ void CodeGen_ARM::visit(const Store *op) { // A dense store of an interleaving can be done using a vst2 intrinsic const Ramp *ramp = op->index.as(); - // We only deal with ramps here - if (!ramp) { + // We only deal with ramps here except for SVE2 + if (!ramp && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } @@ -1102,21 +1413,27 @@ void CodeGen_ARM::visit(const Store *op) { intrin_type = t; Type elt = t.element_of(); int vec_bits = t.bits() * t.lanes(); - if (elt == Float(32) || + if (elt == Float(32) || elt == Float(64) || is_float16_and_has_feature(elt) || - elt == Int(8) || elt == Int(16) || elt == Int(32) || - elt == UInt(8) || elt == UInt(16) || elt == UInt(32)) { + elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || + elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { + // TODO(zvookin): Handle vector_bits_*. if (vec_bits % 128 == 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(128 / t.bits()); + int target_vector_bits = target.vector_bits; + if (target_vector_bits == 0) { + target_vector_bits = 128; + } + intrin_type = intrin_type.with_lanes(target_vector_bits / t.bits()); } else if (vec_bits % 64 == 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(64 / t.bits()); + auto intrin_bits = (vec_bits % 128 == 0 || target.has_feature(Target::SVE2)) ? 128 : 64; + intrin_type = intrin_type.with_lanes(intrin_bits / t.bits()); } } } - if (is_const_one(ramp->stride) && + if (ramp && is_const_one(ramp->stride) && shuffle && shuffle->is_interleave() && type_ok_for_vst && 2 <= shuffle->vectors.size() && shuffle->vectors.size() <= 4) { @@ -1138,11 +1455,14 @@ void CodeGen_ARM::visit(const Store *op) { for (int i = 0; i < num_vecs; ++i) { args[i] = codegen(shuffle->vectors[i]); } + Value *store_pred_val = codegen(op->predicate); + + bool is_sve = target.has_feature(Target::SVE2); // Declare the function std::ostringstream instr; vector arg_types; - llvm::Type *intrin_llvm_type = llvm_type_of(intrin_type); + llvm::Type *intrin_llvm_type = llvm_type_with_constraint(intrin_type, false, is_sve ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed); #if LLVM_VERSION >= 170 const bool is_opaque = true; #else @@ -1160,27 +1480,38 @@ void CodeGen_ARM::visit(const Store *op) { arg_types.front() = i8_t->getPointerTo(); arg_types.back() = i32_t; } else { - instr << "llvm.aarch64.neon.st" - << num_vecs - << ".v" - << intrin_type.lanes() - << (t.is_float() ? 'f' : 'i') - << t.bits() - << ".p0"; - if (!is_opaque) { - instr << (t.is_float() ? 'f' : 'i') << t.bits(); + if (is_sve) { + instr << "llvm.aarch64.sve.st" + << num_vecs + << ".nxv" + << (intrin_type.lanes() / target_vscale()) + << (t.is_float() ? 
'f' : 'i') + << t.bits(); + arg_types = vector(num_vecs, intrin_llvm_type); + arg_types.emplace_back(get_vector_type(i1_t, intrin_type.lanes() / target_vscale(), VectorTypeConstraint::VScale)); // predicate + arg_types.emplace_back(llvm_type_of(intrin_type.element_of())->getPointerTo()); + } else { + instr << "llvm.aarch64.neon.st" + << num_vecs + << ".v" + << intrin_type.lanes() + << (t.is_float() ? 'f' : 'i') + << t.bits() + << ".p0"; + if (!is_opaque) { + instr << (t.is_float() ? 'f' : 'i') << t.bits(); + } + arg_types = vector(num_vecs + 1, intrin_llvm_type); + arg_types.back() = llvm_type_of(intrin_type.element_of())->getPointerTo(); } - arg_types = vector(num_vecs + 1, intrin_llvm_type); - arg_types.back() = llvm_type_of(intrin_type.element_of())->getPointerTo(); } llvm::FunctionType *fn_type = FunctionType::get(llvm::Type::getVoidTy(*context), arg_types, false); llvm::FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); internal_assert(fn); - // How many vst instructions do we need to generate? - int slices = t.lanes() / intrin_type.lanes(); + // SVE2 supports predication for smaller than whole vector size. + internal_assert(target.has_feature(Target::SVE2) || (t.lanes() >= intrin_type.lanes())); - internal_assert(slices >= 1); for (int i = 0; i < t.lanes(); i += intrin_type.lanes()) { Expr slice_base = simplify(ramp->base + i * num_vecs); Expr slice_ramp = Ramp::make(slice_base, ramp->stride, intrin_type.lanes() * num_vecs); @@ -1190,6 +1521,7 @@ void CodeGen_ARM::visit(const Store *op) { // Take a slice of each arg for (int j = 0; j < num_vecs; j++) { slice_args[j] = slice_vector(slice_args[j], i, intrin_type.lanes()); + slice_args[j] = convert_fixed_or_scalable_vector_type(slice_args[j], get_vector_type(slice_args[j]->getType()->getScalarType(), intrin_type.lanes())); } if (target.bits == 32) { @@ -1200,10 +1532,30 @@ void CodeGen_ARM::visit(const Store *op) { // Set the alignment argument slice_args.push_back(ConstantInt::get(i32_t, alignment)); } else { + if (is_sve) { + // Set the predicate argument + auto active_lanes = std::min(t.lanes() - i, intrin_type.lanes()); + Value *vpred_val; + if (is_predicated_store) { + vpred_val = slice_vector(store_pred_val, i, intrin_type.lanes()); + } else { + Expr vpred = make_vector_predicate_1s_0s(active_lanes, intrin_type.lanes() - active_lanes); + vpred_val = codegen(vpred); + } + slice_args.push_back(vpred_val); + } // Set the pointer argument slice_args.push_back(ptr); } + if (is_sve) { + for (auto &arg : slice_args) { + if (arg->getType()->isVectorTy()) { + arg = match_vector_type_scalable(arg, VectorTypeConstraint::VScale); + } + } + } + CallInst *store = builder->CreateCall(fn, slice_args); add_tbaa_metadata(store, op->name, slice_ramp); } @@ -1216,8 +1568,95 @@ void CodeGen_ARM::visit(const Store *op) { return; } + if (target.has_feature(Target::SVE2)) { + const IntImm *stride = ramp ? 
ramp->stride.as() : nullptr; + if (stride && stride->value == 1) { + // Basically we can deal with vanilla codegen, + // but to avoid LLVM error, process with the multiple of natural_lanes + const int natural_lanes = target.natural_vector_size(op->value.type()); + if (ramp->lanes % natural_lanes) { + int aligned_lanes = align_up(ramp->lanes, natural_lanes); + // Use predicate to prevent overrun + Expr vpred; + if (is_predicated_store) { + vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)}); + } else { + vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes); + } + auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes); + Expr padding = make_zero(op->value.type().with_lanes(aligned_lanes - ramp->lanes)); + Expr aligned_value = Shuffle::make_concat({op->value, padding}); + codegen(Store::make(op->name, aligned_value, aligned_index, op->param, vpred, op->alignment)); + return; + } + } else if (op->index.type().is_vector()) { + // Scatter + Type elt = op->value.type().element_of(); + + // Rewrite float16 case into reinterpret and Store in uint16, as it is unsupported in LLVM + if (is_float16_and_has_feature(elt)) { + Type u16_type = op->value.type().with_code(halide_type_uint); + Expr v = reinterpret(u16_type, op->value); + codegen(Store::make(op->name, v, op->index, op->param, op->predicate, op->alignment)); + return; + } + + const int store_lanes = op->value.type().lanes(); + const int index_bits = 32; + Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); + // The number of lanes is constrained by index vector type + const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int vscale_natural_lanes = natural_lanes / target_vscale(); + + Expr base = 0; + Value *elt_ptr = codegen_buffer_pointer(op->name, elt, base); + Value *val = codegen(op->value); + Value *index = codegen(op->index); + Value *store_pred_val = codegen(op->predicate); + + llvm::Type *slice_type = get_vector_type(llvm_type_of(elt), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *slice_index_type = get_vector_type(llvm_type_of(op->index.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *pred_type = get_vector_type(llvm_type_of(op->predicate.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + + std::ostringstream instr; + instr << "llvm.aarch64.sve.st1.scatter.uxtw." + << (elt.bits() != 8 ? "index." : "") // index is scaled into bytes + << "nxv" + << vscale_natural_lanes + << (elt == Float(32) || elt == Float(64) ? 
'f' : 'i') + << elt.bits(); + + vector arg_types{slice_type, pred_type, elt_ptr->getType(), slice_index_type}; + llvm::FunctionType *fn_type = FunctionType::get(void_t, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // We need to slice the result into native vector lanes to use intrinsic + for (int i = 0; i < store_lanes; i += natural_lanes) { + Value *slice_value = slice_vector(val, i, natural_lanes); + Value *slice_index = slice_vector(index, i, natural_lanes); + const int active_lanes = std::min(store_lanes - i, natural_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, natural_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type); + if (is_predicated_store) { + Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val); + } + + slice_value = match_vector_type_scalable(slice_value, VectorTypeConstraint::VScale); + vpred_val = match_vector_type_scalable(vpred_val, VectorTypeConstraint::VScale); + slice_index = match_vector_type_scalable(slice_index, VectorTypeConstraint::VScale); + CallInst *store = builder->CreateCall(fn, {slice_value, vpred_val, elt_ptr, slice_index}); + add_tbaa_metadata(store, op->name, op->index); + } + + return; + } + } + // If the stride is one or minus one, we can deal with that using vanilla codegen - const IntImm *stride = ramp->stride.as(); + const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && (stride->value == 1 || stride->value == -1)) { CodeGen_Posix::visit(op); return; @@ -1250,12 +1689,13 @@ void CodeGen_ARM::visit(const Store *op) { void CodeGen_ARM::visit(const Load *op) { // Predicated load - if (!is_const_one(op->predicate)) { + const bool is_predicated_load = !is_const_one(op->predicate); + if (is_predicated_load && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } - if (neon_intrinsics_disabled()) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::visit(op); return; } @@ -1263,14 +1703,15 @@ void CodeGen_ARM::visit(const Load *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here - if (!ramp) { + if (!ramp && !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? 
ramp->stride.as() : nullptr; - if (stride && (-1 <= stride->value && stride->value <= 1)) { + if (stride && (-1 <= stride->value && stride->value <= 1) && + !target.has_feature(Target::SVE2)) { CodeGen_Posix::visit(op); return; } @@ -1296,6 +1737,168 @@ void CodeGen_ARM::visit(const Load *op) { } } + if (target.has_feature(Target::SVE2)) { + if (stride && stride->value < 1) { + CodeGen_Posix::visit(op); + return; + } else if (stride && stride->value == 1) { + const int natural_lanes = target.natural_vector_size(op->type); + if (ramp->lanes % natural_lanes) { + // Load with lanes multiple of natural_lanes + int aligned_lanes = align_up(ramp->lanes, natural_lanes); + // Use predicate to prevent from overrun + Expr vpred; + if (is_predicated_load) { + vpred = Shuffle::make_concat({op->predicate, const_false(aligned_lanes - ramp->lanes)}); + } else { + vpred = make_vector_predicate_1s_0s(ramp->lanes, aligned_lanes - ramp->lanes); + } + auto aligned_index = Ramp::make(ramp->base, stride, aligned_lanes); + auto aligned_type = op->type.with_lanes(aligned_lanes); + value = codegen(Load::make(aligned_type, op->name, aligned_index, op->image, op->param, vpred, op->alignment)); + value = slice_vector(value, 0, ramp->lanes); + return; + } else { + CodeGen_Posix::visit(op); + return; + } + } else if (stride && (2 <= stride->value && stride->value <= 4)) { + // Structured load ST2/ST3/ST4 of SVE + + Expr base = ramp->base; + ModulusRemainder align = op->alignment; + + int aligned_stride = gcd(stride->value, align.modulus); + int offset = 0; + if (aligned_stride == stride->value) { + offset = mod_imp((int)align.remainder, aligned_stride); + } else { + const Add *add = base.as(); + if (const IntImm *add_c = add ? add->b.as() : base.as()) { + offset = mod_imp(add_c->value, stride->value); + } + } + + if (offset) { + base = simplify(base - offset); + } + + Value *load_pred_val = codegen(op->predicate); + + // We need to slice the result in to native vector lanes to use sve intrin. + // LLVM will optimize redundant ld instructions afterwards + const int slice_lanes = target.natural_vector_size(op->type); + vector results; + for (int i = 0; i < op->type.lanes(); i += slice_lanes) { + int load_base_i = i * stride->value; + Expr slice_base = simplify(base + load_base_i); + Expr slice_index = Ramp::make(slice_base, stride, slice_lanes); + std::ostringstream instr; + instr << "llvm.aarch64.sve.ld" + << stride->value + << ".sret.nxv" + << slice_lanes + << (op->type.is_float() ? 
'f' : 'i') + << op->type.bits(); + llvm::Type *elt = llvm_type_of(op->type.element_of()); + llvm::Type *slice_type = get_vector_type(elt, slice_lanes); + StructType *sret_type = StructType::get(module->getContext(), std::vector(stride->value, slice_type)); + std::vector arg_types{get_vector_type(i1_t, slice_lanes), PointerType::get(elt, 0)}; + llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // Set the predicate argument + int active_lanes = std::min(op->type.lanes() - i, slice_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, slice_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, get_vector_type(vpred_val->getType()->getScalarType(), slice_lanes)); + if (is_predicated_load) { + Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, slice_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val); + } + + Value *elt_ptr = codegen_buffer_pointer(op->name, op->type.element_of(), slice_base); + CallInst *load_i = builder->CreateCall(fn, {vpred_val, elt_ptr}); + add_tbaa_metadata(load_i, op->name, slice_index); + // extract one element out of returned struct + Value *extracted = builder->CreateExtractValue(load_i, offset); + results.push_back(extracted); + } + + // Retrieve original lanes + value = concat_vectors(results); + value = slice_vector(value, 0, op->type.lanes()); + return; + } else if (op->index.type().is_vector()) { + // General Gather Load + + // Rewrite float16 case into load in uint16 and reinterpret, as it is unsupported in LLVM + if (is_float16_and_has_feature(op->type)) { + Type u16_type = op->type.with_code(halide_type_uint); + Expr equiv = Load::make(u16_type, op->name, op->index, op->image, op->param, op->predicate, op->alignment); + equiv = reinterpret(op->type, equiv); + equiv = common_subexpression_elimination(equiv); + value = codegen(equiv); + return; + } + + Type elt = op->type.element_of(); + const int load_lanes = op->type.lanes(); + const int index_bits = 32; + Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); + // The number of lanes is constrained by index vector type + const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int vscale_natural_lanes = natural_lanes / target_vscale(); + + Expr base = 0; + Value *elt_ptr = codegen_buffer_pointer(op->name, elt, base); + Value *index = codegen(op->index); + Value *load_pred_val = codegen(op->predicate); + + llvm::Type *slice_type = get_vector_type(llvm_type_of(elt), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *slice_index_type = get_vector_type(llvm_type_of(op->index.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + llvm::Type *pred_type = get_vector_type(llvm_type_of(op->predicate.type().element_of()), vscale_natural_lanes, VectorTypeConstraint::VScale); + + std::ostringstream instr; + instr << "llvm.aarch64.sve.ld1.gather.uxtw." + << (elt.bits() != 8 ? "index." : "") // index is scaled into bytes + << "nxv" + << vscale_natural_lanes + << (elt == Float(32) || elt == Float(64) ? 
'f' : 'i') + << elt.bits(); + + llvm::FunctionType *fn_type = FunctionType::get(slice_type, {pred_type, elt_ptr->getType(), slice_index_type}, false); + FunctionCallee fn = module->getOrInsertFunction(instr.str(), fn_type); + + // We need to slice the result in to native vector lanes to use intrinsic + vector results; + for (int i = 0; i < load_lanes; i += natural_lanes) { + Value *slice_index = slice_vector(index, i, natural_lanes); + + const int active_lanes = std::min(load_lanes - i, natural_lanes); + + Expr vpred = make_vector_predicate_1s_0s(active_lanes, natural_lanes - active_lanes); + Value *vpred_val = codegen(vpred); + if (is_predicated_load) { + Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes); + vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val); + } + + vpred_val = match_vector_type_scalable(vpred_val, VectorTypeConstraint::VScale); + slice_index = match_vector_type_scalable(slice_index, VectorTypeConstraint::VScale); + CallInst *gather = builder->CreateCall(fn, {vpred_val, elt_ptr, slice_index}); + add_tbaa_metadata(gather, op->name, op->index); + results.push_back(gather); + } + + // Retrieve original lanes + value = concat_vectors(results); + value = slice_vector(value, 0, load_lanes); + return; + } + } + CodeGen_Posix::visit(op); } @@ -1322,6 +1925,33 @@ void CodeGen_ARM::visit(const Shuffle *op) { } } +void CodeGen_ARM::visit(const Ramp *op) { + if (target_vscale() != 0 && op->type.is_int_or_uint()) { + if (is_const_zero(op->base) && is_const_one(op->stride)) { + codegen_func_t cg_func = [&](int lanes, const std::vector &args) { + internal_assert(args.empty()); + // Generate stepvector intrinsic for ScalableVector + return builder->CreateStepVector(llvm_type_of(op->type.with_lanes(lanes))); + }; + + // codgen with next-power-of-two lanes, because if we sliced into natural_lanes(e.g. 
4), + // it would produce {0,1,2,3,0,1,..} instead of {0,1,2,3,4,5,..} + const int ret_lanes = op->type.lanes(); + const int aligned_lanes = next_power_of_two(ret_lanes); + value = codegen_with_lanes(aligned_lanes, ret_lanes, {}, cg_func); + return; + } else { + Expr broadcast_base = Broadcast::make(op->base, op->lanes); + Expr broadcast_stride = Broadcast::make(op->stride, op->lanes); + Expr step_ramp = Ramp::make(make_zero(op->base.type()), make_one(op->base.type()), op->lanes); + value = codegen(broadcast_base + broadcast_stride * step_ramp); + return; + } + } + + CodeGen_Posix::visit(op); +} + void CodeGen_ARM::visit(const Call *op) { if (op->is_intrinsic(Call::sorted_avg)) { value = codegen(halving_add(op->args[0], op->args[1])); @@ -1407,7 +2037,6 @@ void CodeGen_ARM::visit(const Call *op) { for (const auto &i : cast_rewrites) { if (expr_match(i.first, op, matches)) { Expr replacement = substitute("*", matches[0], with_lanes(i.second, op->type.lanes())); - debug(3) << "rewriting cast to: " << replacement << " from " << Expr(op) << "\n"; value = codegen(replacement); return; } @@ -1464,14 +2093,28 @@ void CodeGen_ARM::visit(const LE *op) { } void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { - if (neon_intrinsics_disabled() || - op->op == VectorReduce::Or || - op->op == VectorReduce::And || - op->op == VectorReduce::Mul) { + if (simd_intrinsics_disabled()) { CodeGen_Posix::codegen_vector_reduce(op, init); return; } + if (codegen_dot_product_vector_reduce(op, init)) { + return; + } + if (codegen_pairwise_vector_reduce(op, init)) { + return; + } + if (codegen_across_vector_reduce(op, init)) { + return; + } + CodeGen_Posix::codegen_vector_reduce(op, init); +} + +bool CodeGen_ARM::codegen_dot_product_vector_reduce(const VectorReduce *op, const Expr &init) { + if (op->op != VectorReduce::Add) { + return false; + } + struct Pattern { VectorReduce::Operator reduce_op; int factor; @@ -1485,11 +2128,23 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd}, {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd}, {VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd}, + {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i64(widening_mul(wild_i16x_, wild_i16x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, i64(widening_mul(wild_u16x_, wild_u16x_)), "dot_product", Target::SVE2}, + {VectorReduce::Add, 4, u64(widening_mul(wild_u16x_, wild_u16x_)), "dot_product", Target::SVE2}, // A sum is the same as a dot product with a vector of ones, and this appears to // be a bit faster. 
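To make the comment above concrete, a small worked example of the identity being exploited:

    sum({1, 2, 3, 4})               = 1 + 2 + 3 + 4          = 10
    dot({1, 2, 3, 4}, {1, 1, 1, 1}) = 1*1 + 2*1 + 3*1 + 4*1  = 10

so the {1} in extra_operands below substitutes a broadcast one for the missing multiplicand and lets a plain 4-way sum reuse the SDOT/UDOT (or SVE dot) path.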
{VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::ARMDotProd, {1}}, {VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}}, {VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}}, + {VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i64(wild_i16x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, i64(wild_u16x_), "dot_product", Target::SVE2, {1}}, + {VectorReduce::Add, 4, u64(wild_u16x_), "dot_product", Target::SVE2, {1}}, }; // clang-format on @@ -1507,7 +2162,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / p.factor); equiv = VectorReduce::make(op->op, equiv, op->type.lanes()); codegen_vector_reduce(equiv.as(), init); - return; + return true; } for (int i : p.extra_operands) { @@ -1518,6 +2173,7 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init if (!i.defined()) { i = make_zero(op->type); } + if (const Shuffle *s = matches[0].as()) { if (s->is_broadcast()) { // LLVM wants the broadcast as the second operand for the broadcasting @@ -1525,15 +2181,27 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init std::swap(matches[0], matches[1]); } } - value = call_overloaded_intrin(op->type, p.intrin, {i, matches[0], matches[1]}); - if (value) { - return; + + if (Value *v = call_overloaded_intrin(op->type, p.intrin, {i, matches[0], matches[1]})) { + value = v; + return true; } } } + return false; +} + +bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const Expr &init) { + if (op->op != VectorReduce::Add && + op->op != VectorReduce::Max && + op->op != VectorReduce::Min) { + return false; + } + // TODO: Move this to be patterns? The patterns are pretty trivial, but some // of the other logic is tricky. + int factor = op->value.type().lanes() / op->type.lanes(); const char *intrin = nullptr; vector intrin_args; Expr accumulator = init; @@ -1547,33 +2215,38 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init narrow = lossless_cast(narrow_type.with_code(Type::UInt), op->value); } if (narrow.defined()) { - if (init.defined() && target.bits == 32) { - // On 32-bit, we have an intrinsic for widening add-accumulate. + if (init.defined() && (target.bits == 32 || target.has_feature(Target::SVE2))) { + // On 32-bit or SVE2, we have an intrinsic for widening add-accumulate. // TODO: this could be written as a pattern with widen_right_add (#6951). intrin = "pairwise_widening_add_accumulate"; intrin_args = {accumulator, narrow}; accumulator = Expr(); + } else if (target.has_feature(Target::SVE2)) { + intrin = "pairwise_widening_add_accumulate"; + intrin_args = {Expr(0), narrow}; + accumulator = Expr(); } else { // On 64-bit, LLVM pattern matches widening add-accumulate if // we give it the widening add. 
intrin = "pairwise_widening_add"; intrin_args = {narrow}; } - } else { + } else if (!target.has_feature(Target::SVE2)) { + // Exclude SVE, as it process lanes in different order (even/odd wise) than NEON intrin = "pairwise_add"; intrin_args = {op->value}; } - } else if (op->op == VectorReduce::Min && factor == 2) { + } else if (op->op == VectorReduce::Min && factor == 2 && !target.has_feature(Target::SVE2)) { intrin = "pairwise_min"; intrin_args = {op->value}; - } else if (op->op == VectorReduce::Max && factor == 2) { + } else if (op->op == VectorReduce::Max && factor == 2 && !target.has_feature(Target::SVE2)) { intrin = "pairwise_max"; intrin_args = {op->value}; } if (intrin) { - value = call_overloaded_intrin(op->type, intrin, intrin_args); - if (value) { + if (Value *v = call_overloaded_intrin(op->type, intrin, intrin_args)) { + value = v; if (accumulator.defined()) { // We still have an initial value to take care of string n = unique_name('t'); @@ -1595,11 +2268,126 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init codegen(accumulator); sym_pop(n); } - return; + return true; } } - CodeGen_Posix::codegen_vector_reduce(op, init); + return false; +} + +bool CodeGen_ARM::codegen_across_vector_reduce(const VectorReduce *op, const Expr &init) { + if (target_vscale() == 0) { + // Leave this to vanilla codegen to emit "llvm.vector.reduce." intrinsic, + // which doesn't support scalable vector in LLVM 14 + return false; + } + + if (op->op != VectorReduce::Add && + op->op != VectorReduce::Max && + op->op != VectorReduce::Min) { + return false; + } + + Expr val = op->value; + const int output_lanes = op->type.lanes(); + const int native_lanes = target.natural_vector_size(op->type); + const int input_lanes = val.type().lanes(); + const int input_bits = op->type.bits(); + Type elt = op->type.element_of(); + + if (output_lanes != 1 || input_lanes < 2) { + return false; + } + + Expr (*binop)(Expr, Expr) = nullptr; + std::string op_name; + switch (op->op) { + case VectorReduce::Add: + binop = Add::make; + op_name = "add"; + break; + case VectorReduce::Min: + binop = Min::make; + op_name = "min"; + break; + case VectorReduce::Max: + binop = Max::make; + op_name = "max"; + break; + default: + internal_error << "unreachable"; + } + + if (input_lanes == native_lanes) { + std::stringstream name; // e.g. llvm.aarch64.sve.sminv.nxv4i32 + name << "llvm.aarch64.sve." + << (op->type.is_float() ? "f" : op->type.is_int() ? "s" : + "u") + << op_name << "v" + << ".nxv" << (native_lanes / target_vscale()) << (op->type.is_float() ? "f" : "i") << input_bits; + + // Integer add accumulation output is 64 bit only + const bool type_upgraded = op->op == VectorReduce::Add && op->type.is_int_or_uint(); + const int output_bits = type_upgraded ? 64 : input_bits; + Type intrin_ret_type = op->type.with_bits(output_bits); + + const string intrin_name = name.str(); + + Expr pred = const_true(native_lanes); + vector args{pred, op->value}; + + // Make sure the declaration exists, or the codegen for + // call will assume that the args should scalarize. 
+ if (!module->getFunction(intrin_name)) { + vector arg_types; + for (const Expr &e : args) { + arg_types.push_back(llvm_type_with_constraint(e.type(), false, VectorTypeConstraint::VScale)); + } + FunctionType *func_t = FunctionType::get(llvm_type_with_constraint(intrin_ret_type, false, VectorTypeConstraint::VScale), + arg_types, false); + llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); + } + + Expr equiv = Call::make(intrin_ret_type, intrin_name, args, Call::PureExtern); + if (type_upgraded) { + equiv = Cast::make(op->type, equiv); + } + if (init.defined()) { + equiv = binop(init, equiv); + } + equiv = common_subexpression_elimination(equiv); + equiv.accept(this); + return true; + + } else if (input_lanes < native_lanes) { + // Create equivalent where lanes==native_lanes by padding data which doesn't affect the result + Expr padding; + const int inactive_lanes = native_lanes - input_lanes; + + switch (op->op) { + case VectorReduce::Add: + padding = make_zero(elt.with_lanes(inactive_lanes)); + break; + case VectorReduce::Min: + padding = elt.with_lanes(inactive_lanes).min(); + break; + case VectorReduce::Max: + padding = elt.with_lanes(inactive_lanes).max(); + break; + default: + internal_error << "unreachable"; + } + + Expr equiv = VectorReduce::make(op->op, Shuffle::make_concat({val, padding}), 1); + if (init.defined()) { + equiv = binop(equiv, init); + } + equiv = common_subexpression_elimination(equiv); + equiv.accept(this); + return true; + } + + return false; } Type CodeGen_ARM::upgrade_type_for_arithmetic(const Type &t) const { @@ -1623,6 +2411,39 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const { return CodeGen_Posix::upgrade_type_for_storage(t); } +Value *CodeGen_ARM::codegen_with_lanes(int slice_lanes, int total_lanes, + const std::vector &args, codegen_func_t &cg_func) { + std::vector llvm_args; + // codegen args + for (const auto &arg : args) { + llvm_args.push_back(codegen(arg)); + } + + if (slice_lanes == total_lanes) { + // codegen op + return cg_func(slice_lanes, llvm_args); + } + + std::vector results; + for (int start = 0; start < total_lanes; start += slice_lanes) { + std::vector sliced_args; + for (auto &llvm_arg : llvm_args) { + Value *v = llvm_arg; + if (get_vector_num_elements(llvm_arg->getType()) == total_lanes) { + // Except for scalar argument which some ops have, arguments are sliced + v = slice_vector(llvm_arg, start, slice_lanes); + } + sliced_args.push_back(v); + } + // codegen op + value = cg_func(slice_lanes, sliced_args); + results.push_back(value); + } + // Restore the results into vector with total_lanes + value = concat_vectors(results); + return slice_vector(value, 0, total_lanes); +} + string CodeGen_ARM::mcpu_target() const { if (target.bits == 32) { if (target.has_feature(Target::ARMv7s)) { @@ -1635,6 +2456,8 @@ string CodeGen_ARM::mcpu_target() const { return "cyclone"; } else if (target.os == Target::OSX) { return "apple-a12"; + } else if (target.has_feature(Target::SVE2)) { + return "cortex-x1"; } else { return "generic"; } @@ -1667,6 +2490,7 @@ string CodeGen_ARM::mattrs() const { } } else { // TODO: Should Halide's SVE flags be 64-bit only? + // TODO: Sound we ass "-neon" if NoNEON is set? Does this make any sense? 
if (target.has_feature(Target::SVE2)) { attrs.emplace_back("+sve2"); } else if (target.has_feature(Target::SVE)) { @@ -1689,7 +2513,21 @@ bool CodeGen_ARM::use_soft_float_abi() const { } int CodeGen_ARM::native_vector_bits() const { - return 128; + if (target.has_feature(Target::SVE) || target.has_feature(Target::SVE2)) { + return std::max(target.vector_bits, 128); + } else { + return 128; + } +} + +int CodeGen_ARM::target_vscale() const { + if (target.features_any_of({Target::SVE, Target::SVE2})) { + user_assert(target.vector_bits != 0) << "For SVE/SVE2 support, target_vector_bits= must be set in target.\n"; + user_assert((target.vector_bits % 128) == 0) << "For SVE/SVE2 support, target_vector_bits must be a multiple of 128.\n"; + return target.vector_bits / 128; + } + + return 0; } bool CodeGen_ARM::supports_call_as_float16(const Call *op) const { diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 8922461524c5..1871460569c3 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -657,7 +657,11 @@ void CodeGen_LLVM::end_func(const std::vector &args) { } } - internal_assert(!verifyFunction(*function, &llvm::errs())); + bool valid = !verifyFunction(*function, &llvm::errs()); + if (!valid) { + function->print(dbgs()); + } + internal_assert(valid) << "Generated function does not pass LLVM's verifyFunction.\n"; current_function_args.clear(); } @@ -1348,10 +1352,6 @@ bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } -int next_power_of_two(int x) { - return static_cast(1) << static_cast(std::ceil(std::log2(x))); -} - } // namespace Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const { @@ -1449,16 +1449,16 @@ void CodeGen_LLVM::visit(const Cast *op) { } value = codegen(op->value); - llvm::Type *llvm_dst = llvm_type_of(dst); + llvm::Type *llvm_dst = llvm_type_of(dst.element_of()); + if (value->getType()->isVectorTy()) { + llvm_dst = VectorType::get(llvm_dst, dyn_cast(value->getType())->getElementCount()); + } if (dst.is_handle() && src.is_handle()) { value = builder->CreateBitCast(value, llvm_dst); } else if (dst.is_handle() || src.is_handle()) { internal_error << "Can't cast from " << src << " to " << dst << "\n"; } else if (!src.is_float() && !dst.is_float()) { - // Widening integer casts either zero extend or sign extend, - // depending on the source type. Narrowing integer casts - // always truncate. 
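// Worked example of the destination-type construction above (types here are
// illustrative): llvm_dst starts as the scalar element type of dst, and if the
// operand is a vector it is re-wrapped with the operand's own ElementCount, so
// scalable operands stay scalable. Casting a value of LLVM type
// <vscale x 4 x i16> to a Halide i32 vector yields llvm_dst == <vscale x 4 x i32>,
// whereas a fixed <8 x i16> operand yields <8 x i32>.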
value = builder->CreateIntCast(value, llvm_dst, src.is_int()); } else if (src.is_float() && dst.is_int()) { value = builder->CreateFPToSI(value, llvm_dst); @@ -1879,6 +1879,11 @@ void CodeGen_LLVM::visit(const Select *op) { Value *a = codegen(op->true_value); Value *b = codegen(op->false_value); + if (a->getType()->isVectorTy()) { + cmp = match_vector_type_scalable(cmp, a); + b = match_vector_type_scalable(b, a); + } + if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(), NoMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) { value = builder->CreateSelect(cmp, a, b); @@ -2266,6 +2271,7 @@ void CodeGen_LLVM::codegen_predicated_store(const Store *op) { Value *vpred = codegen(op->predicate); Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); + vpred = match_vector_type_scalable(vpred, value); int alignment = value_type.bytes(); int native_bytes = native_vector_bits() / 8; @@ -2357,7 +2363,6 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri llvm::Value *vpred, bool slice_to_native, llvm::Value *stride) { debug(4) << "Vectorize predicated dense vector load:\n\t" << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n"; - int align_bytes = type.bytes(); // The size of a single element int native_bits = native_vector_bits(); @@ -2402,7 +2407,7 @@ llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::stri Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo()); - Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr; + Value *slice_mask = (vpred != nullptr) ? match_vector_type_scalable(slice_vector(vpred, i, slice_lanes), slice_type) : nullptr; MaskVariant vp_slice_mask = slice_mask ? MaskVariant(slice_mask) : AllEnabledMask(); Instruction *load_inst = nullptr; @@ -3304,6 +3309,8 @@ void CodeGen_LLVM::visit(const Call *op) { value = codegen(lower_extract_bits(op)); } else if (op->is_intrinsic(Call::concat_bits)) { value = codegen(lower_concat_bits(op)); + } else if (op->is_intrinsic(Call::get_runtime_vscale)) { + value = builder->CreateVScale(ConstantInt::get(i32_t, 1)); } else if (op->is_intrinsic()) { Expr lowered = lower_intrinsic(op); if (!lowered.defined()) { @@ -3478,6 +3485,11 @@ void CodeGen_LLVM::visit(const Call *op) { << halide_arg << "\n"; args[i] = builder->CreatePointerCast(args[i], t); } + } else if (args[i]->getType()->isVectorTy()) { + llvm::Type *t = func_t->getParamType(i); + if (t->isVectorTy()) { + args[i] = match_vector_type_scalable(args[i], t); + } } } } @@ -4274,14 +4286,14 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini break; case VectorReduce::Min: name = "fmin"; - // TODO(zvookin): Not correct for stricT_float. See: https://github.com/halide/Halide/issues/7118 + // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.max(); } break; case VectorReduce::Max: name = "fmax"; - // TODO(zvookin): Not correct for stricT_float. See: https://github.com/halide/Halide/issues/7118 + // TODO(zvookin): Not correct for strict_float. 
See: https://github.com/halide/Halide/issues/7118 if (takes_initial_value && !initial_value.defined()) { initial_value = op->type.min(); } @@ -4752,16 +4764,45 @@ Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes llvm::FunctionType *intrin_type = intrin->getFunctionType(); for (int i = 0; i < (int)arg_values.size(); i++) { - if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - // TODO: Change this to call convert_fixed_or_scalable_vector_type and - // remove normalize_fixed_scalable_vector_type, fixed_to_scalable_vector_type, - // and scalable_to_fixed_vector_type - arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]); - } - if (arg_values[i]->getType() != intrin_type->getParamType(i)) { - // There can be some mismatches in types, such as when passing scalar Halide type T - // to LLVM vector type <1 x T>. - arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i)); + llvm::Type *arg_type = arg_values[i]->getType(); + llvm::Type *formal_param_type = intrin_type->getParamType(i); + if (arg_type != formal_param_type) { + bool both_vectors = isa(arg_type) && isa(formal_param_type); + bool arg_is_fixed = isa(arg_type); + bool formal_is_fixed = isa(formal_param_type); + + // Apparently the bitcast in the else branch below can + // change the scalar type and vector length together so + // long as the total bits are the same. E.g. on HVX, + // <128 x i16> to <64 x i32>. This is probably a bug, but + // it seems to be allowed so it is also supported in the + // fixed/vscale matching path. + if (both_vectors && (arg_is_fixed != formal_is_fixed) && (effective_vscale != 0)) { + bool scalar_types_match = arg_type->getScalarType() == formal_param_type->getScalarType(); + if (arg_is_fixed && !scalar_types_match) { + unsigned fixed_count = dyn_cast(formal_param_type)->getElementCount().getKnownMinValue() * effective_vscale; + llvm::Type *match_scalar_type = llvm::VectorType::get(formal_param_type->getScalarType(), fixed_count, false); + arg_values[i] = builder->CreateBitCast(arg_values[i], match_scalar_type); + } + llvm::ElementCount ec = dyn_cast(arg_values[i]->getType())->getElementCount(); + int mid_count = formal_is_fixed ? (ec.getKnownMinValue() * effective_vscale) : (ec.getFixedValue() / effective_vscale); + llvm::Type *match_vector_flavor_type = llvm::VectorType::get(arg_values[i]->getType()->getScalarType(), mid_count, !formal_is_fixed); + arg_values[i] = convert_fixed_or_scalable_vector_type(arg_values[i], match_vector_flavor_type); + if (formal_is_fixed && !scalar_types_match) { + arg_values[i] = builder->CreateBitCast(arg_values[i], formal_param_type); + } + } else { + // TODO(https://github.com/halide/Halide/issues/8117): That this + // can happen is probably a bug. It will crash in module + // validation for anything LLVM doesn't support. Better to + // regularize the Halide IR by inserting an intentional cast or + // to add extra intrinsics patterns. At the very least, some + // extra validation should be added here. + + // There can be some mismatches in types, such as when passing + // scalar Halide type T to LLVM vector type <1 x T>. 
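// Worked example of the fixed/scalable matching path above, assuming
// effective_vscale == 2 (a 256-bit SVE target); the concrete types are
// illustrative:
//   actual argument:   <8 x i32>           (fixed vector from Halide codegen)
//   formal parameter:  <vscale x 4 x i32>  (scalable type of the SVE intrinsic)
// The scalar types already match, so no bitcast is needed; mid_count becomes
// 8 / effective_vscale == 4, and convert_fixed_or_scalable_vector_type turns
// the fixed <8 x i32> into <vscale x 4 x i32>, which at vscale == 2 describes
// exactly the same 256 bits of data.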
+ arg_values[i] = builder->CreateBitCast(arg_values[i], formal_param_type); + } } } @@ -4785,16 +4826,45 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) { return builder->CreateExtractElement(vec, (uint64_t)start); } - vector indices(size); - for (int i = 0; i < size; i++) { - int idx = start + i; - if (idx >= 0 && idx < vec_lanes) { - indices[i] = idx; - } else { - indices[i] = -1; + bool is_fixed = isa(vec->getType()); + + // TODO(https://github.com/halide/Halide/issues/8118): It is likely worth + // looking into using llvm.vector.{extract,insert} for this case + // too. However that would need to be validated performance wise for all + // architectures. + if (is_fixed) { + vector indices(size); + for (int i = 0; i < size; i++) { + int idx = start + i; + if (idx >= 0 && idx < vec_lanes) { + indices[i] = idx; + } else { + indices[i] = -1; + } } + return shuffle_vectors(vec, indices); + } else { + // Extract a fixed vector with all the values in the source. + // Then insert back into a vector extended to size. This will + // be a scalable vector if size can be scalable, fixed + // otherwise. + llvm::Type *scalar_type = vec->getType()->getScalarType(); + + int intermediate_lanes = std::min(size, vec_lanes - start); + llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed); + + vec = builder->CreateExtractVector(intermediate_type, vec, ConstantInt::get(i64_t, start)); + + // Insert vector into a poison vector and return. + int effective_size = is_fixed ? size : (size / effective_vscale); + llvm::VectorType *result_type = dyn_cast(get_vector_type(scalar_type, effective_size, + is_fixed ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale)); + Constant *poison = PoisonValue::get(scalar_type); + llvm::Value *result_vec = ConstantVector::getSplat(result_type->getElementCount(), poison); + vec = builder->CreateInsertVector(result_type, result_vec, vec, ConstantInt::get(i64_t, 0)); + + return vec; } - return shuffle_vectors(vec, indices); } Value *CodeGen_LLVM::concat_vectors(const vector &v) { @@ -4831,6 +4901,11 @@ Value *CodeGen_LLVM::concat_vectors(const vector &v) { } int w_matched = std::max(w1, w2); + if (v1->getType() != v2->getType()) { + // arbitrary decision here to convert v2 to type of v1 rather than + // target fixed or scalable. + v2 = convert_fixed_or_scalable_vector_type(v2, v1->getType()); + } internal_assert(v1->getType() == v2->getType()); vector indices(w1 + w2); @@ -4903,8 +4978,11 @@ std::pair CodeGen_LLVM::find_vector_runtime_function(cons while (l < lanes) { l *= 2; } - for (int i = l; i > 1; i /= 2) { - sizes_to_try.push_back(i); + + // This will be 1 for non-vscale architectures. + int vscale_divisor = std::max(effective_vscale, 1); + for (int i = l; i > vscale_divisor; i /= 2) { + sizes_to_try.push_back(i / vscale_divisor); } // If none of those match, we'll also try doubling @@ -4913,10 +4991,11 @@ std::pair CodeGen_LLVM::find_vector_runtime_function(cons // vector implementation). sizes_to_try.push_back(l * 2); + std::string vec_prefix = effective_vscale != 0 ? 
"nx" : "x"; for (int l : sizes_to_try) { - llvm::Function *vec_fn = module->getFunction(name + "x" + std::to_string(l)); + llvm::Function *vec_fn = module->getFunction(name + vec_prefix + std::to_string(l)); if (vec_fn) { - return {vec_fn, l}; + return {vec_fn, l * vscale_divisor}; } } @@ -4982,6 +5061,42 @@ llvm::Value *CodeGen_LLVM::normalize_fixed_scalable_vector_type(llvm::Type *desi return result; } +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint) { + if (constraint == VectorTypeConstraint::None) { + return value; + } + + llvm::Type *value_type = value->getType(); + if (!isa(value_type)) { + return value; + } + + bool value_fixed = isa(value_type); + bool guide_fixed = (constraint == VectorTypeConstraint::Fixed); + if (value_fixed != guide_fixed) { + int value_scaled_elements = get_vector_num_elements(value_type); + if (!guide_fixed) { + value_scaled_elements /= effective_vscale; + } + llvm::Type *desired_type = get_vector_type(value_type->getScalarType(), value_scaled_elements, + guide_fixed ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + value = convert_fixed_or_scalable_vector_type(value, desired_type); + } + + return value; +} + +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::Type *guide_type) { + if (!isa(guide_type)) { + return value; + } + return match_vector_type_scalable(value, isa(guide_type) ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); +} + +llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::Value *guide) { + return match_vector_type_scalable(value, guide->getType()); +} + llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg, llvm::Type *desired_type) { llvm::Type *arg_type = arg->getType(); @@ -5007,13 +5122,21 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar if (isa(arg_type) && isa(result_type)) { use_insert = true; + if (arg_elements > result_elements) { + arg = slice_vector(arg, 0, result_elements); + } + arg_elements = result_elements; } else if (isa(result_type) && isa(arg_type)) { use_insert = false; + if (arg_elements < result_elements) { + arg = slice_vector(arg, 0, result_elements); + } + arg_elements = result_elements; } else { // Use extract to make smaller, insert to make bigger. // A somewhat arbitary decision. - use_insert = (arg_elements > result_elements); + use_insert = (arg_elements < result_elements); } std::string intrin_name = "llvm.vector."; @@ -5165,10 +5288,27 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n, bool scalable = false; switch (type_constraint) { case VectorTypeConstraint::None: - scalable = effective_vscale != 0 && - ((n % effective_vscale) == 0); - if (scalable) { - n = n / effective_vscale; + if (effective_vscale > 0) { + bool wide_enough = true; + // TODO(https://github.com/halide/Halide/issues/8119): Architecture + // specific code should not go here. Ideally part of this can go + // away via LLVM fixes and modifying intrinsic selection to handle + // scalable vs. fixed vectors. Making this method virtual is + // possibly expensive. + if (target.arch == Target::ARM) { + if (!target.has_feature(Target::NoNEON)) { + // force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this. 
+ int bit_size = std::max((int)t->getScalarSizeInBits(), 8); + wide_enough = (bit_size * n) > 128; + } else { + // TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1. + wide_enough = (n / effective_vscale) > 1; + } + } + scalable = wide_enough && ((n % effective_vscale) == 0); + if (scalable) { + n = n / effective_vscale; + } } break; case VectorTypeConstraint::Fixed: @@ -5190,10 +5330,12 @@ llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value, bool scalable = false; switch (type_constraint) { case VectorTypeConstraint::None: - scalable = effective_vscale != 0 && - ((lanes % effective_vscale) == 0); - if (scalable) { - lanes = lanes / effective_vscale; + if (effective_vscale > 0) { + bool wide_enough = (lanes / effective_vscale) > 1; + scalable = wide_enough && ((lanes % effective_vscale) == 0); + if (scalable) { + lanes = lanes / effective_vscale; + } } break; case VectorTypeConstraint::Fixed: diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index b3e9cdabd498..908929e54373 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -579,6 +579,13 @@ class CodeGen_LLVM : public IRVisitor { llvm::Constant *get_splat(int lanes, llvm::Constant *value, VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const; + /** Make sure a value type has the same scalable/fixed vector type as a guide. */ + // @{ + llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint); + llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide); + llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide); + // @} + /** Support for generating LLVM vector predication intrinsics * ("@llvm.vp.*" and "@llvm.experimental.vp.*") */ diff --git a/src/Function.cpp b/src/Function.cpp index cbb4b61574d4..b72a39e1c90a 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -491,8 +491,10 @@ ExternFuncArgument deep_copy_extern_func_argument_helper(const ExternFuncArgumen } // namespace void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const { - internal_assert(copy.defined() && contents.defined()) - << "Cannot deep-copy undefined Function\n"; + internal_assert(copy.defined()) + << "Cannot deep-copy to undefined Function\n"; + internal_assert(contents.defined()) + << "Cannot deep-copy from undefined Function\n"; // Add reference to this Function's deep-copy to the map in case of // self-reference, e.g. self-reference in an Definition. diff --git a/src/IR.cpp b/src/IR.cpp index c0bdb718291d..81cf0a0f41ff 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -690,6 +690,7 @@ const char *const intrinsic_op_names[] = { "widening_shift_left", "widening_shift_right", "widening_sub", + "get_runtime_vscale", }; static_assert(sizeof(intrinsic_op_names) / sizeof(intrinsic_op_names[0]) == Call::IntrinsicOpCount, diff --git a/src/IR.h b/src/IR.h index 252e4588db03..31aa3f195e43 100644 --- a/src/IR.h +++ b/src/IR.h @@ -629,6 +629,8 @@ struct Call : public ExprNode { widening_shift_right, widening_sub, + get_runtime_vscale, + IntrinsicOpCount // Sentinel: keep last. 
}; diff --git a/src/IRMatch.cpp b/src/IRMatch.cpp index 3e5d95d787e6..10521f82ac03 100644 --- a/src/IRMatch.cpp +++ b/src/IRMatch.cpp @@ -262,6 +262,9 @@ class IRMatch : public IRVisitor { if (result && e && types_match(op->type, e->type)) { expr = e->value; op->value.accept(this); + } else if (op->lanes == 0 && types_match(op->value.type(), expr.type())) { + // zero lanes means any number of lanes, so match scalars too. + op->value.accept(this); } else { result = false; } diff --git a/src/LLVM_Output.cpp b/src/LLVM_Output.cpp index 6b54aeef0e97..e40441b388f0 100644 --- a/src/LLVM_Output.cpp +++ b/src/LLVM_Output.cpp @@ -331,6 +331,12 @@ std::unique_ptr clone_module(const llvm::Module &module_in) { // Read it back in. llvm::MemoryBufferRef buffer_ref(llvm::StringRef(clone_buffer.data(), clone_buffer.size()), "clone_buffer"); auto cloned_module = llvm::parseBitcodeFile(buffer_ref, module_in.getContext()); + + // TODO(): Add support for returning the error. + if (!cloned_module) { + llvm::dbgs() << cloned_module.takeError(); + module_in.print(llvm::dbgs(), nullptr, false, true); + } internal_assert(cloned_module); return std::move(cloned_module.get()); diff --git a/src/StorageFolding.cpp b/src/StorageFolding.cpp index fd7a12d66995..a207b3ce63f5 100644 --- a/src/StorageFolding.cpp +++ b/src/StorageFolding.cpp @@ -10,6 +10,7 @@ #include "Monotonic.h" #include "Simplify.h" #include "Substitute.h" +#include "Util.h" #include namespace Halide { @@ -17,10 +18,6 @@ namespace Internal { namespace { -int64_t next_power_of_two(int64_t x) { - return static_cast(1) << static_cast(std::ceil(std::log2(x))); -} - using std::map; using std::string; using std::vector; diff --git a/src/Util.h b/src/Util.h index 15c297796911..bce0a7f1d015 100644 --- a/src/Util.h +++ b/src/Util.h @@ -13,6 +13,7 @@ /** \file * Various utility functions used internally Halide. */ +#include #include #include #include @@ -532,6 +533,16 @@ int clz64(uint64_t x); int ctz64(uint64_t x); // @} +/** Return an integer 2^n, for some n, which is >= x. Argument x must be > 0. 
*/ +inline int64_t next_power_of_two(int64_t x) { + return static_cast(1) << static_cast(std::ceil(std::log2(x))); +} + +template +inline T align_up(T x, int n) { + return (x + n - 1) / n * n; +} + } // namespace Internal } // namespace Halide diff --git a/src/WasmExecutor.cpp b/src/WasmExecutor.cpp index b99efdc6d67e..bfe66213f44f 100644 --- a/src/WasmExecutor.cpp +++ b/src/WasmExecutor.cpp @@ -101,11 +101,6 @@ struct debug_sink { // BDMalloc // --------------------- -template -inline T align_up(T p, int alignment = 32) { - return (p + alignment - 1) & ~(alignment - 1); -} - // Debugging our Malloc is extremely noisy and usually undesired #define BDMALLOC_DEBUG_LEVEL 0 @@ -318,7 +313,7 @@ std::vector compile_to_wasm(const Module &module, const std::string &fn_na stack_size += cg->get_requested_alloca_total(); } - stack_size = align_up(stack_size); + stack_size = align_up(stack_size, 32); wdebug(1) << "Requesting stack size of " << stack_size << "\n"; std::unique_ptr llvm_module = @@ -708,7 +703,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(WabtContext &wabt_context, const halide_buffer_t const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; - const size_t host_offset = align_up(mem_needed_base); + const size_t host_offset = align_up(mem_needed_base, 32); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; @@ -1613,7 +1608,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(const Local &context, const halide_buff const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions; const size_t dims_offset = sizeof(wasm_halide_buffer_t); const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes; - const size_t host_offset = align_up(mem_needed_base); + const size_t host_offset = align_up(mem_needed_base, 32); const size_t host_size_in_bytes = src->size_in_bytes(); const size_t mem_needed = host_offset + host_size_in_bytes; diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 1a19202745bb..1d0843be0329 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -1246,6 +1246,10 @@ enum halide_error_code_t { /** A factor used to split a loop was discovered to be zero or negative at * runtime. */ halide_error_code_split_factor_not_positive = -46, + + /** "vscale" value of Scalable Vector detected in runtime does not match + * the vscale value used in compilation. */ + halide_error_code_vscale_invalid = -47, }; /** Halide calls the functions below on various error conditions. The @@ -1321,7 +1325,7 @@ extern int halide_error_storage_bound_too_small(void *user_context, const char * int provided_size, int required_size); extern int halide_error_device_crop_failed(void *user_context); extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor); - +extern int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale); // @} /** Optional features a compilation Target can have. 
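The Util.h helpers above replace the WasmExecutor-local align_up, which relied on bit masking and therefore only worked for power-of-two alignments, with a division-based version that rounds up to any positive modulus. A minimal standalone sketch of their semantics; the main() harness and the example values below are mine, not part of the patch:

#include <cassert>
#include <cmath>
#include <cstdint>

// Illustrative copies of the helpers added to src/Util.h above.
inline int64_t next_power_of_two(int64_t x) {
    return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

template<typename T>
inline T align_up(T x, int n) {
    return (x + n - 1) / n * n;
}

int main() {
    assert(next_power_of_two(1) == 1);
    assert(next_power_of_two(100) == 128);
    // Division-based rounding works for any positive n, not just powers of two.
    assert(align_up(37, 32) == 64);  // matches the old bit-masking behaviour
    assert(align_up(30, 24) == 48);  // a bit mask would have produced 32 here
    return 0;
}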
diff --git a/src/runtime/aarch64.ll b/src/runtime/aarch64.ll index 9ae3b8e46ac2..c68a4f05fb42 100644 --- a/src/runtime/aarch64.ll +++ b/src/runtime/aarch64.ll @@ -48,25 +48,34 @@ define weak_odr <2 x i64> @vabdl_u32x2(<2 x i32> %a, <2 x i32> %b) nounwind alwa declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %x) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %x) nounwind readnone; +declare float @llvm.aarch64.neon.frecpe.f32(float) declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %x) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %x) nounwind readnone; +declare float @llvm.aarch64.neon.frsqrte.f32(float) declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone; +declare float @llvm.aarch64.neon.frecps.f32(float, float) declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone; declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone; +declare float @llvm.aarch64.neon.frsqrts.f32(float, float) + declare <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %x) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %x) nounwind readnone; +declare half @llvm.aarch64.neon.frecpe.f16(half) declare <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %x) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %x) nounwind readnone; +declare half @llvm.aarch64.neon.frsqrte.f16(half) declare <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone; +declare half @llvm.aarch64.neon.frecps.f16(half, half) declare <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone; declare <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone; +declare half @llvm.aarch64.neon.frsqrts.f16(half, half) define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline { - %vec = insertelement <2 x float> poison, float %x, i32 0 - %approx = tail call <2 x float> @fast_inverse_f32x2(<2 x float> %vec) - %result = extractelement <2 x float> %approx, i32 0 + %approx = tail call float @llvm.aarch64.neon.frecpe.f32(float %x) + %correction = tail call float @llvm.aarch64.neon.frecps.f32(float %approx, float %x) + %result = fmul float %approx, %correction ret float %result } @@ -85,9 +94,9 @@ define weak_odr <4 x float> @fast_inverse_f32x4(<4 x float> %x) nounwind alwaysi } define weak_odr half @fast_inverse_f16(half %x) nounwind alwaysinline { - %vec = insertelement <4 x half> poison, half %x, i32 0 - %approx = tail call <4 x half> @fast_inverse_f16x4(<4 x half> %vec) - %result = extractelement <4 x half> %approx, i32 0 + %approx = tail call half @llvm.aarch64.neon.frecpe.f16(half %x) + %correction = tail call half @llvm.aarch64.neon.frecps.f16(half %approx, half %x) + %result = fmul half %approx, %correction ret half %result } @@ -106,9 +115,10 @@ define weak_odr <8 x half> @fast_inverse_f16x8(<8 x half> %x) nounwind alwaysinl } define weak_odr float @fast_inverse_sqrt_f32(float %x) nounwind alwaysinline { - %vec = insertelement <2 x float> poison, float %x, i32 0 - %approx = tail call <2 x float> @fast_inverse_sqrt_f32x2(<2 
x float> %vec) - %result = extractelement <2 x float> %approx, i32 0 + %approx = tail call float @llvm.aarch64.neon.frsqrte.f32(float %x) + %approx2 = fmul float %approx, %approx + %correction = tail call float @llvm.aarch64.neon.frsqrts.f32(float %approx2, float %x) + %result = fmul float %approx, %correction ret float %result } @@ -129,9 +139,10 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind al } define weak_odr half @fast_inverse_sqrt_f16(half %x) nounwind alwaysinline { - %vec = insertelement <4 x half> poison, half %x, i32 0 - %approx = tail call <4 x half> @fast_inverse_sqrt_f16x4(<4 x half> %vec) - %result = extractelement <4 x half> %approx, i32 0 + %approx = tail call half @llvm.aarch64.neon.frsqrte.f16(half %x) + %approx2 = fmul half %approx, %approx + %correction = tail call half @llvm.aarch64.neon.frsqrts.f16(half %approx2, half %x) + %result = fmul half %approx, %correction ret half %result } @@ -149,4 +160,43 @@ define weak_odr <8 x half> @fast_inverse_sqrt_f16x8(<8 x half> %x) nounwind alwa %correction = tail call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %approx2, <8 x half> %x) %result = fmul <8 x half> %approx, %correction ret <8 x half> %result -} \ No newline at end of file +} + +declare @llvm.aarch64.sve.frecpe.x.nxv4f32( %x) nounwind readnone; +declare @llvm.aarch64.sve.frsqrte.x.nxv4f32( %x) nounwind readnone; +declare @llvm.aarch64.sve.frecps.x.nxv4f32( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frsqrts.x.nxv4f32( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frecpe.x.nxv8f16( %x) nounwind readnone; +declare @llvm.aarch64.sve.frsqrte.x.nxv8f16( %x) nounwind readnone; +declare @llvm.aarch64.sve.frecps.x.nxv8f16( %x, %y) nounwind readnone; +declare @llvm.aarch64.sve.frsqrts.x.nxv8f16( %x, %y) nounwind readnone; + +define weak_odr @fast_inverse_f32nx4( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frecpe.x.nxv4f32( %x) + %correction = tail call @llvm.aarch64.sve.frecps.x.nxv4f32( %approx, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_f16nx8( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frecpe.x.nxv8f16( %x) + %correction = tail call @llvm.aarch64.sve.frecps.x.nxv8f16( %approx, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_sqrt_f32nx4( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frsqrte.x.nxv4f32( %x) + %approx2 = fmul %approx, %approx + %correction = tail call @llvm.aarch64.sve.frsqrts.x.nxv4f32( %approx2, %x) + %result = fmul %approx, %correction + ret %result +} + +define weak_odr @fast_inverse_sqrt_f16nx8( %x) nounwind alwaysinline { + %approx = tail call @llvm.aarch64.sve.frsqrte.x.nxv8f16( %x) + %approx2 = fmul %approx, %approx + %correction = tail call @llvm.aarch64.sve.frsqrts.x.nxv8f16( %approx2, %x) + %result = fmul %approx, %correction + ret %result +} diff --git a/src/runtime/errors.cpp b/src/runtime/errors.cpp index 0879cc4a7c60..acb640c44b52 100644 --- a/src/runtime/errors.cpp +++ b/src/runtime/errors.cpp @@ -300,4 +300,12 @@ WEAK int halide_error_split_factor_not_positive(void *user_context, const char * return halide_error_code_split_factor_not_positive; } +WEAK int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale) { + error(user_context) + << "The function " << func_name + << " is compiled with the assumption that vscale of Scalable Vector is " << compiletime_vscale 
+ << ". However, the detected runtime vscale is " << runtime_vscale << "."; + return halide_error_code_vscale_invalid; +} + } // extern "C" diff --git a/src/runtime/posix_math.ll b/src/runtime/posix_math.ll index 236652279615..ee6c2571f4eb 100644 --- a/src/runtime/posix_math.ll +++ b/src/runtime/posix_math.ll @@ -322,4 +322,30 @@ define weak_odr double @neg_inf_f64() nounwind uwtable readnone alwaysinline { define weak_odr double @nan_f64() nounwind uwtable readnone alwaysinline { ret double 0x7FF8000000000000 -} \ No newline at end of file +} + +; In case scalable vector with un-natural vector size, LLVM doesn't auto-vectorize the above scalar version +define weak_odr @inf_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0x7FF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @neg_inf_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0xFFF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @nan_f32nx4() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, float 0x7FF8000000000000, i32 0), undef, zeroinitializer) +} + + +define weak_odr @inf_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0x7FF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @neg_inf_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0xFFF0000000000000, i32 0), undef, zeroinitializer) +} + +define weak_odr @nan_f64nx2() nounwind uwtable readnone alwaysinline { + ret shufflevector ( insertelement ( undef, double 0x7FF8000000000000, i32 0), undef, zeroinitializer) +} diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp index db8ada2f4b8e..7955e8749df7 100644 --- a/src/runtime/runtime_api.cpp +++ b/src/runtime/runtime_api.cpp @@ -89,6 +89,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = { (void *)&halide_error_unaligned_host_ptr, (void *)&halide_error_storage_bound_too_small, (void *)&halide_error_device_crop_failed, + (void *)&halide_error_vscale_invalid, (void *)&halide_float16_bits_to_double, (void *)&halide_float16_bits_to_float, (void *)&halide_free, diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 9b934b768cdd..604ceda468f5 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -277,6 +277,7 @@ tests(GROUPS correctness simd_op_check_hvx.cpp simd_op_check_powerpc.cpp simd_op_check_riscv.cpp + simd_op_check_sve2.cpp simd_op_check_wasm.cpp simd_op_check_x86.cpp simplified_away_embedded_image.cpp diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index e8762a6ea2d8..3ebf5071569e 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -230,6 +230,13 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vcvt.s32.f32" : "fcvtzs", 2 * w, i32(f32_1)); // skip the fixed point conversions for now + if (!arm32) { + check("fcvtmu *v", 2 * w, u32(floor(f32_1))); + check("fcvtpu *v", 2 * w, u32(ceil(f32_1))); + check("fcvtms *v", 2 * w, i32(floor(f32_1))); + check("fcvtps *v", 2 * w, i32(ceil(f32_1))); + } + // VDIV - F, D Divide // This doesn't actually get vectorized in 32-bit. Not sure cortex processors can do vectorized division. check(arm32 ? 
"vdiv.f32" : "fdiv", 2 * w, f32_1 / f32_2); diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp new file mode 100644 index 000000000000..1a176dbccecd --- /dev/null +++ b/test/correctness/simd_op_check_sve2.cpp @@ -0,0 +1,1387 @@ +#include "simd_op_check.h" + +#include "Halide.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Halide; +using namespace Halide::ConciseCasts; +using namespace std; + +namespace { + +using CastFuncTy = function; + +class SimdOpCheckArmSve : public SimdOpCheckTest { +public: + SimdOpCheckArmSve(Target t, int w = 384, int h = 32) + : SimdOpCheckTest(t, w, h), debug_mode(Internal::get_env_variable("HL_DEBUG_SIMDOPCHECK")) { + + // Determine and hold can_run_the_code + // TODO: Since features of Arm CPU cannot be obtained automatically from get_host_target(), + // it is necessary to set some feature (e.g. "arm_fp16") explicitly to HL_JIT_TARGET. + // Halide throws error if there is unacceptable mismatch between jit_target and host_target. + + Target host = get_host_target(); + Target jit_target = get_jit_target_from_environment(); + cout << "host is: " << host.to_string() << endl; + cout << "HL_TARGET is: " << target.to_string() << endl; + cout << "HL_JIT_TARGET is: " << jit_target.to_string() << endl; + + auto is_same_triple = [](const Target &t1, const Target &t2) -> bool { + return t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os && t1.vector_bits == t2.vector_bits; + }; + + can_run_the_code = is_same_triple(host, target) && is_same_triple(jit_target, target); + + // A bunch of feature flags also need to match between the + // compiled code and the host in order to run the code. + for (Target::Feature f : {Target::ARMv7s, Target::ARMFp16, Target::NoNEON, Target::SVE2}) { + if (target.has_feature(f) != jit_target.has_feature(f)) { + can_run_the_code = false; + } + } + if (!can_run_the_code) { + cout << "[WARN] To perform verification of realization, " + << R"(the target triple "arm--" and key feature "arm_fp16")" + << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl; + } + } + + bool can_run_code() const override { + // If we can meet the condition about target, run the error checking Halide::Func. 
+ return can_run_the_code; + } + + void add_tests() override { + check_arm_integer(); + check_arm_float(); + check_arm_load_store(); + check_arm_pairwise(); + } + +private: + void check_arm_integer() { + // clang-format off + vector> test_params{ + {8, in_i8, in_u8, in_f16, in_i16, in_u16, i8, i8_sat, i16, i8, i8_sat, u8, u8_sat, u16, u8, u8_sat}, + {16, in_i16, in_u16, in_f16, in_i32, in_u32, i16, i16_sat, i32, i8, i8_sat, u16, u16_sat, u32, u8, u8_sat}, + {32, in_i32, in_u32, in_f32, in_i64, in_u64, i32, i32_sat, i64, i16, i16_sat, u32, u32_sat, u64, u16, u16_sat}, + {64, in_i64, in_u64, in_f64, in_i64, in_u64, i64, i64_sat, i64, i32, i32_sat, u64, u64_sat, u64, u32, u32_sat}, + }; + // clang-format on + + for (const auto &[bits, in_i, in_u, in_f, in_i_wide, in_u_wide, + cast_i, satcast_i, widen_i, narrow_i, satnarrow_i, + cast_u, satcast_u, widen_u, narrow_u, satnarrow_u] : test_params) { + + Expr i_1 = in_i(x), i_2 = in_i(x + 16), i_3 = in_i(x + 32); + Expr u_1 = in_u(x), u_2 = in_u(x + 16), u_3 = in_u(x + 32); + Expr i_wide_1 = in_i_wide(x), i_wide_2 = in_i_wide(x + 16); + Expr u_wide_1 = in_u_wide(x), u_wide_2 = in_u_wide(x + 16); + Expr f_1 = in_f(x); + + // TODO: reconcile this comment and logic and figure out + // whether we're test 192 and 256 for NEON and which bit + // widths other that the target one for SVE2. + // + // In general neon ops have the 64-bit version, the 128-bit + // version (ending in q), and the widening version that takes + // 64-bit args and produces a 128-bit result (ending in l). We try + // to peephole match any with vector, so we just try 64-bits, 128 + // bits, 192 bits, and 256 bits for everything. + std::vector simd_bit_widths; + if (has_neon()) { + simd_bit_widths.push_back(64); + simd_bit_widths.push_back(128); + } + if (has_sve() && ((target.vector_bits > 128) || !has_neon())) { + simd_bit_widths.push_back(target.vector_bits); + } + for (auto &total_bits : simd_bit_widths) { + const int vf = total_bits / bits; + + // Due to workaround for SVE LLVM issues, in case of vector of half length of natural_lanes, + // there is some inconsistency in generated SVE insturction about the number of lanes. + // So the verification of lanes is skipped for this specific case. + const int instr_lanes = (total_bits == 64 && has_sve()) ? 
+ Instruction::ANY_LANES : + Instruction::get_instr_lanes(bits, vf, target); + const int widen_lanes = Instruction::get_instr_lanes(bits * 2, vf, target); + const int narrow_lanes = Instruction::get_instr_lanes(bits, vf * 2, target); + + AddTestFunctor add_all(*this, bits, instr_lanes, vf); + AddTestFunctor add_all_vec(*this, bits, instr_lanes, vf, vf != 1); + AddTestFunctor add_8_16_32(*this, bits, instr_lanes, vf, bits != 64); + AddTestFunctor add_16_32_64(*this, bits, instr_lanes, vf, bits != 8); + AddTestFunctor add_16_32(*this, bits, instr_lanes, vf, bits == 16 || bits == 32); + AddTestFunctor add_32(*this, bits, instr_lanes, vf, bits == 32); + + AddTestFunctor add_8_16_32_widen(*this, bits, widen_lanes, vf, bits != 64 && !has_sve()); + + AddTestFunctor add_16_32_64_narrow(*this, bits, narrow_lanes, vf * 2, bits != 8 && !has_sve()); + AddTestFunctor add_16_32_narrow(*this, bits, narrow_lanes, vf * 2, (bits == 16 || bits == 32) && !has_sve()); + AddTestFunctor add_16_narrow(*this, bits, narrow_lanes, vf * 2, bits == 16 && !has_sve()); + + // VABA I - Absolute Difference and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_8_16_32(sel_op("vaba.s", "saba"), i_1 + absd(i_2, i_3)); + add_8_16_32(sel_op("vaba.u", "uaba"), u_1 + absd(u_2, u_3)); + } + + // VABAL I - Absolute Difference and Accumulate Long + add_8_16_32_widen(sel_op("vabal.s", "sabal"), i_wide_1 + absd(i_2, i_3)); + add_8_16_32_widen(sel_op("vabal.u", "uabal"), u_wide_1 + absd(u_2, u_3)); + + // VABD I, F - Absolute Difference + add_8_16_32(sel_op("vabd.s", "sabd"), absd(i_2, i_3)); + add_8_16_32(sel_op("vabd.u", "uabd"), absd(u_2, u_3)); + + // Via widening, taking abs, then narrowing + add_8_16_32(sel_op("vabd.s", "sabd"), cast_u(abs(widen_i(i_2) - i_3))); + add_8_16_32(sel_op("vabd.u", "uabd"), cast_u(abs(widen_i(u_2) - u_3))); + + // VABDL I - Absolute Difference Long + add_8_16_32_widen(sel_op("vabdl.s", "sabdl"), widen_i(absd(i_2, i_3))); + add_8_16_32_widen(sel_op("vabdl.u", "uabdl"), widen_u(absd(u_2, u_3))); + + // Via widening then taking an abs + add_8_16_32_widen(sel_op("vabdl.s", "sabdl"), abs(widen_i(i_2) - widen_i(i_3))); + add_8_16_32_widen(sel_op("vabdl.u", "uabdl"), abs(widen_i(u_2) - widen_i(u_3))); + + // VABS I, F F, D Absolute + add_8_16_32(sel_op("vabs.s", "abs"), abs(i_1)); + + // VADD I, F F, D Add + add_all_vec(sel_op("vadd.i", "add"), i_1 + i_2); + add_all_vec(sel_op("vadd.i", "add"), u_1 + u_2); + + // VADDHN I - Add and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vaddhn.i", "addhn"), narrow_i((i_1 + i_2) >> (bits / 2))); + add_16_32_64_narrow(sel_op("vaddhn.i", "addhn"), narrow_u((u_1 + u_2) >> (bits / 2))); + + // VADDL I - Add Long + add_8_16_32_widen(sel_op("vaddl.s", "saddl"), widen_i(i_1) + widen_i(i_2)); + add_8_16_32_widen(sel_op("vaddl.u", "uaddl"), widen_u(u_1) + widen_u(u_2)); + + // VADDW I - Add Wide + add_8_16_32_widen(sel_op("vaddw.s", "saddw"), i_1 + i_wide_1); + add_8_16_32_widen(sel_op("vaddw.u", "uaddw"), u_1 + u_wide_1); + + // VAND X - Bitwise AND + // Not implemented in front-end yet + // VBIC I - Bitwise Clear + // VBIF X - Bitwise Insert if False + // VBIT X - Bitwise Insert if True + // skip these ones + + // VCEQ I, F - Compare Equal + add_8_16_32(sel_op("vceq.i", "cmeq", "cmpeq"), select(i_1 == i_2, cast_i(1), cast_i(2))); + add_8_16_32(sel_op("vceq.i", "cmeq", "cmpeq"), select(u_1 == u_2, cast_u(1), cast_u(2))); +#if 0 + // VCGE I, F - Compare Greater Than or Equal + // Halide flips these to less than instead + 
check("vcge.s8", 16, select(i8_1 >= i8_2, i8(1), i8(2))); + check("vcge.u8", 16, select(u8_1 >= u8_2, u8(1), u8(2))); + check("vcge.s16", 8, select(i16_1 >= i16_2, i16(1), i16(2))); + check("vcge.u16", 8, select(u16_1 >= u16_2, u16(1), u16(2))); + check("vcge.s32", 4, select(i32_1 >= i32_2, i32(1), i32(2))); + check("vcge.u32", 4, select(u32_1 >= u32_2, u32(1), u32(2))); + check("vcge.f32", 4, select(f32_1 >= f32_2, 1.0f, 2.0f)); + check("vcge.s8", 8, select(i8_1 >= i8_2, i8(1), i8(2))); + check("vcge.u8", 8, select(u8_1 >= u8_2, u8(1), u8(2))); + check("vcge.s16", 4, select(i16_1 >= i16_2, i16(1), i16(2))); + check("vcge.u16", 4, select(u16_1 >= u16_2, u16(1), u16(2))); + check("vcge.s32", 2, select(i32_1 >= i32_2, i32(1), i32(2))); + check("vcge.u32", 2, select(u32_1 >= u32_2, u32(1), u32(2))); + check("vcge.f32", 2, select(f32_1 >= f32_2, 1.0f, 2.0f)); +#endif + // VCGT I, F - Compare Greater Than + add_8_16_32(sel_op("vcgt.s", "cmgt", "cmpgt"), select(i_1 > i_2, cast_i(1), cast_i(2))); + add_8_16_32(sel_op("vcgt.u", "cmhi", "cmphi"), select(u_1 > u_2, cast_u(1), cast_u(2))); +#if 0 + // VCLS I - Count Leading Sign Bits + // We don't currently match these, but it wouldn't be hard to do. + check(arm32 ? "vcls.s8" : "cls", 8 * w, max(count_leading_zeros(i8_1), count_leading_zeros(~i8_1))); + check(arm32 ? "vcls.s16" : "cls", 8 * w, max(count_leading_zeros(i16_1), count_leading_zeros(~i16_1))); + check(arm32 ? "vcls.s32" : "cls", 8 * w, max(count_leading_zeros(i32_1), count_leading_zeros(~i32_1))); +#endif + // VCLZ I - Count Leading Zeros + add_8_16_32(sel_op("vclz.i", "clz"), count_leading_zeros(i_1)); + add_8_16_32(sel_op("vclz.i", "clz"), count_leading_zeros(u_1)); + + // VCMP - F, D Compare Setting Flags + // We skip this + + // VCNT I - Count Number of Set Bits + if (!has_sve()) { + // In NEON, there is only cnt for bytes, and then horizontal adds. + add_8_16_32({{sel_op("vcnt.", "cnt"), 8, total_bits == 64 ? 8 : 16}}, vf, popcount(i_1)); + add_8_16_32({{sel_op("vcnt.", "cnt"), 8, total_bits == 64 ? 8 : 16}}, vf, popcount(u_1)); + } else { + add_8_16_32("cnt", popcount(i_1)); + add_8_16_32("cnt", popcount(u_1)); + } + + // VDUP X - Duplicate + add_8_16_32(sel_op("vdup.", "dup", "mov"), cast_i(y)); + add_8_16_32(sel_op("vdup.", "dup", "mov"), cast_u(y)); + + // VEOR X - Bitwise Exclusive OR + // check("veor", 4, bool1 ^ bool2); + + // VEXT I - Extract Elements and Concatenate + // unaligned loads with known offsets should use vext +#if 0 + // We currently don't do this. + check("vext.8", 16, in_i8(x+1)); + check("vext.16", 8, in_i16(x+1)); + check("vext.32", 4, in_i32(x+1)); +#endif + // VHADD I - Halving Add + add_8_16_32(sel_op("vhadd.s", "shadd"), cast_i((widen_i(i_1) + widen_i(i_2)) / 2)); + add_8_16_32(sel_op("vhadd.u", "uhadd"), cast_u((widen_u(u_1) + widen_u(u_2)) / 2)); + + // Halide doesn't define overflow behavior for i32 so we + // can use vhadd instruction. We can't use it for unsigned u8,i16,u16,u32. 
+ add_32(sel_op("vhadd.s", "shadd"), (i_1 + i_2) / 2); + + // VHSUB I - Halving Subtract + add_8_16_32(sel_op("vhsub.s", "shsub"), cast_i((widen_i(i_1) - widen_i(i_2)) / 2)); + add_8_16_32(sel_op("vhsub.u", "uhsub"), cast_u((widen_u(u_1) - widen_u(u_2)) / 2)); + + add_32(sel_op("vhsub.s", "shsub"), (i_1 - i_2) / 2); + + // VMAX I, F - Maximum + add_8_16_32(sel_op("vmax.s", "smax"), max(i_1, i_2)); + add_8_16_32(sel_op("vmax.u", "umax"), max(u_1, u_2)); + + // VMIN I, F - Minimum + add_8_16_32(sel_op("vmin.s", "smin"), min(i_1, i_2)); + add_8_16_32(sel_op("vmin.u", "umin"), min(u_1, u_2)); + + // VMLA I, F F, D Multiply Accumulate + add_8_16_32("mla signed", sel_op("vmla.i", "mla", "(mad|mla)"), i_1 + i_2 * i_3); + add_8_16_32("mla unsigned", sel_op("vmla.i", "mla", "(mad|mla)"), u_1 + u_2 * u_3); + // VMLS I, F F, D Multiply Subtract + add_8_16_32("mls signed", sel_op("vmls.i", "mls", "(mls|msb)"), i_1 - i_2 * i_3); + add_8_16_32("mls unsigned", sel_op("vmls.i", "mls", "(mls|msb)"), u_1 - u_2 * u_3); + + // VMLAL I - Multiply Accumulate Long + // Try to trick LLVM into generating a zext instead of a sext by making + // LLVM think the operand never has a leading 1 bit. zext breaks LLVM's + // pattern matching of mlal. + add_8_16_32_widen(sel_op("vmlal.s", "smlal"), i_wide_1 + widen_i(i_2 & 0x3) * i_3); + add_8_16_32_widen(sel_op("vmlal.u", "umlal"), u_wide_1 + widen_u(u_2) * u_3); + + // VMLSL I - Multiply Subtract Long + add_8_16_32_widen(sel_op("vmlsl.s", "smlsl"), i_wide_1 - widen_i(i_2 & 0x3) * i_3); + add_8_16_32_widen(sel_op("vmlsl.u", "umlsl"), u_wide_1 - widen_u(u_2) * u_3); + + // VMOV X F, D Move Register or Immediate + // This is for loading immediates, which we won't do in the inner loop anyway + + // VMOVL I - Move Long + // For aarch64, llvm does a widening shift by 0 instead of using the sxtl instruction. + add_8_16_32_widen(sel_op("vmovl.s", "sshll"), widen_i(i_1)); + add_8_16_32_widen(sel_op("vmovl.u", "ushll"), widen_u(u_1)); + add_8_16_32_widen(sel_op("vmovl.u", "ushll"), widen_i(u_1)); + + // VMOVN I - Move and Narrow + if (Halide::Internal::get_llvm_version() >= 140 && total_bits >= 128) { + if (is_arm32()) { + add_16_32_64_narrow("vmovn.i", narrow_i(i_1)); + add_16_32_64_narrow("vmovn.i", narrow_u(u_1)); + } else { + add_16_32_64({{"uzp1", bits / 2, narrow_lanes * 2}}, vf * 2, narrow_i(i_1)); + add_16_32_64({{"uzp1", bits / 2, narrow_lanes * 2}}, vf * 2, narrow_u(u_1)); + } + } else { + add_16_32_64_narrow(sel_op("vmovn.i", "xtn"), narrow_i(i_1)); + add_16_32_64_narrow(sel_op("vmovn.i", "xtn"), narrow_u(u_1)); + } + + // VMRS X F, D Move Advanced SIMD or VFP Register to ARM compute Engine + // VMSR X F, D Move ARM Core Register to Advanced SIMD or VFP + // trust llvm to use this correctly + + // VMUL I, F, P F, D Multiply + add_8_16_32(sel_op("vmul.i", "mul"), i_2 * i_1); + add_8_16_32(sel_op("vmul.i", "mul"), u_2 * u_1); + + // VMULL I, F, P - Multiply Long + add_8_16_32_widen(sel_op("vmull.s", "smull"), widen_i(i_1) * i_2); + add_8_16_32_widen(sel_op("vmull.u", "umull"), widen_u(u_1) * u_2); + + // integer division by a constant should use fixed point unsigned + // multiplication, which is done by using a widening multiply + // followed by a narrowing + add_8_16_32_widen(sel_op("vmull.u", "umull"), i_1 / 37); + add_8_16_32_widen(sel_op("vmull.u", "umull"), u_1 / 37); + + // VMVN X - Bitwise NOT + // check("vmvn", ~bool1); + + // VNEG I, F F, D Negate + add_8_16_32(sel_op("vneg.s", "neg"), -i_1); + +#if 0 + // These are vfp, not neon. 
They only work on scalars + check("vnmla.f32", 4, -(f32_1 + f32_2*f32_3)); + check("vnmla.f64", 2, -(f64_1 + f64_2*f64_3)); + check("vnmls.f32", 4, -(f32_1 - f32_2*f32_3)); + check("vnmls.f64", 2, -(f64_1 - f64_2*f64_3)); + check("vnmul.f32", 4, -(f32_1*f32_2)); + check("vnmul.f64", 2, -(f64_1*f64_2)); + + // Of questionable value. Catching abs calls is annoying, and the + // slow path is only one more op (for the max). + check("vqabs.s8", 16, abs(max(i8_1, -max_i8))); + check("vqabs.s8", 8, abs(max(i8_1, -max_i8))); + check("vqabs.s16", 8, abs(max(i16_1, -max_i16))); + check("vqabs.s16", 4, abs(max(i16_1, -max_i16))); + check("vqabs.s32", 4, abs(max(i32_1, -max_i32))); + check("vqabs.s32", 2, abs(max(i32_1, -max_i32))); +#endif + // VQADD I - Saturating Add + add_8_16_32(sel_op("vqadd.s", "sqadd"), satcast_i(widen_i(i_1) + widen_i(i_2))); + const Expr max_u = UInt(bits).max(); + add_8_16_32(sel_op("vqadd.u", "uqadd"), cast_u(min(widen_u(u_1) + widen_u(u_2), max_u))); + + // Check the case where we add a constant that could be narrowed + add_8_16_32(sel_op("vqadd.u", "uqadd"), cast_u(min(widen_u(u_1) + 17, max_u))); + + // Can't do larger ones because we can't represent the intermediate 128-bit wide ops. + + // VQDMLAL I - Saturating Double Multiply Accumulate Long + // VQDMLSL I - Saturating Double Multiply Subtract Long + // We don't do these, but it would be possible. + + // VQDMULH I - Saturating Doubling Multiply Returning High Half + // VQDMULL I - Saturating Doubling Multiply Long + add_16_32(sel_op("vqdmulh.s", "sqdmulh"), satcast_i((widen_i(i_1) * widen_i(i_2)) >> (bits - 1))); + + // VQMOVN I - Saturating Move and Narrow + // VQMOVUN I - Saturating Move and Unsigned Narrow + add_16_32_64_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_1)); + add_16_32_64_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(i_1)); + const Expr max_u_narrow = UInt(bits / 2).max(); + add_16_32_64_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u_1, max_u_narrow))); + // Double saturating narrow + add_16_32_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_wide_1)); + add_16_32_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u_wide_1, max_u_narrow))); + add_16_32_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i_wide_1)); + add_16_32_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(i_wide_1)); + // Triple saturating narrow + Expr i64_1 = in_i64(x), u64_1 = in_u64(x), f32_1 = in_f32(x), f64_1 = in_f64(x); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(i64_1)); + add_16_narrow(sel_op("vqmovn.u", "uqxtn"), narrow_u(min(u64_1, max_u_narrow))); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(f32_1)); + add_16_narrow(sel_op("vqmovn.s", "sqxtn"), satnarrow_i(f64_1)); + add_16_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(f32_1)); + add_16_narrow(sel_op("vqmovun.s", "sqxtun"), satnarrow_u(f64_1)); + + // VQNEG I - Saturating Negate + const Expr max_i = Int(bits).max(); + add_8_16_32(sel_op("vqneg.s", "sqneg"), -max(i_1, -max_i)); + + // VQRDMULH I - Saturating Rounding Doubling Multiply Returning High Half + // Note: division in Halide always rounds down (not towards + // zero). Otherwise these patterns would be more complicated. 
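// A worked instance of the pattern below for bits == 16: the expression is
// sat16((i32(a) * i32(b) + (1 << 14)) / (1 << 15)), i.e. a doubling multiply
// that rounds and keeps the high half. For a == b == 0x4000 (0.5 in Q15):
// 0x4000 * 0x4000 == 0x10000000; adding 0x4000 and dividing by 0x8000 gives
// 0x2000 (0.25 in Q15), which matches sqrdmulh. The only saturating case is
// a == b == -0x8000, where the exact result 0x8000 clamps to 0x7FFF.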
+ add_16_32(sel_op("vqrdmulh.s", "sqrdmulh"), satcast_i((widen_i(i_1) * widen_i(i_2) + (1 << (bits - 2))) / (widen_i(1) << (bits - 1)))); + + // VQRSHRN I - Saturating Rounding Shift Right Narrow + // VQRSHRUN I - Saturating Rounding Shift Right Unsigned Narrow + add_16_32_64_narrow(sel_op("vqrshrn.s", "sqrshrn"), satnarrow_i((widen_i(i_1) + 8) / 16)); + add_16_32_64_narrow(sel_op("vqrshrun.s", "sqrshrun"), satnarrow_u((widen_i(i_1) + 8) / 16)); + add_16_32_narrow(sel_op("vqrshrn.u", "uqrshrn"), narrow_u(min((widen_u(u_1) + 8) / 16, max_u_narrow))); + + // VQSHL I - Saturating Shift Left + add_8_16_32(sel_op("vqshl.s", "sqshl"), satcast_i(widen_i(i_1) * 16)); + add_8_16_32(sel_op("vqshl.u", "uqshl"), cast_u(min(widen_u(u_1) * 16, max_u))); + + // VQSHLU I - Saturating Shift Left Unsigned + if (!has_sve()) { + add_8_16_32(sel_op("vqshlu.s", "sqshlu"), satcast_u(widen_i(i_1) * 16)); + } + + // VQSHRN I - Saturating Shift Right Narrow + // VQSHRUN I - Saturating Shift Right Unsigned Narrow + add_16_32_64_narrow(sel_op("vqshrn.s", "sqshrn"), satnarrow_i(i_1 / 16)); + add_16_32_64_narrow(sel_op("vqshrun.s", "sqshrun"), satnarrow_u(i_1 / 16)); + add_16_32_narrow(sel_op("vqshrn.u", "uqshrn"), narrow_u(min(u_1 / 16, max_u_narrow))); + + // VQSUB I - Saturating Subtract + add_8_16_32(sel_op("vqsub.s", "sqsub"), satcast_i(widen_i(i_1) - widen_i(i_2))); + + // N.B. Saturating subtracts are expressed by widening to a igned* type + add_8_16_32(sel_op("vqsub.u", "uqsub"), satcast_u(widen_i(u_1) - widen_i(u_2))); + + // VRADDHN I - Rounding Add and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vraddhn.i", "raddhn"), narrow_i((widen_i(i_1 + i_2) + (Expr(cast_i(1)) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vraddhn.i", "raddhn"), narrow_u((widen_u(u_1 + u_2) + (Expr(cast_u(1)) << (bits / 2 - 1))) >> (bits / 2))); + + // VREV16 X - Reverse in Halfwords + // VREV32 X - Reverse in Words + // VREV64 X - Reverse in Doublewords + + // These reverse within each halfword, word, and doubleword + // respectively. Sometimes llvm generates them, and sometimes + // it generates vtbl instructions. 
+ + // VRHADD I - Rounding Halving Add + add_8_16_32(sel_op("vrhadd.s", "srhadd"), cast_i((widen_i(i_1) + widen_i(i_2) + 1) / 2)); + add_8_16_32(sel_op("vrhadd.u", "urhadd"), cast_u((widen_u(u_1) + widen_u(u_2) + 1) / 2)); + + // VRSHL I - Rounding Shift Left + Expr shift = (i_2 % bits) - (bits / 2); + Expr round_s = (cast_i(1) >> min(shift, 0)) / 2; + Expr round_u = (cast_u(1) >> min(shift, 0)) / 2; + add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) << shift)); + add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) << shift)); + + round_s = (cast_i(1) << max(shift, 0)) / 2; + round_u = (cast_u(1) << max(shift, 0)) / 2; + add_8_16_32(sel_op("vrshl.s", "srshl", "srshlr"), cast_i((widen_i(i_1) + round_s) >> shift)); + add_8_16_32(sel_op("vrshl.u", "urshl", "urshlr"), cast_u((widen_u(u_1) + round_u) >> shift)); + + // VRSHR I - Rounding Shift Right + add_8_16_32(sel_op("vrshr.s", "srshr", "srshl"), cast_i((widen_i(i_1) + 1) >> 1)); + add_8_16_32(sel_op("vrshr.u", "urshr", "urshl"), cast_u((widen_u(u_1) + 1) >> 1)); + + // VRSHRN I - Rounding Shift Right Narrow + if (Halide::Internal::get_llvm_version() >= 140) { + // LLVM14 converts RSHRN/RSHRN2 to RADDHN/RADDHN2 when the shift amount is half the width of the vector element + // See https://reviews.llvm.org/D116166 + add_16_32_narrow(sel_op("vrshrn.i", "raddhn"), narrow_i((widen_i(i_1) + (cast_i(1) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vrshrn.i", "raddhn"), narrow_u((widen_u(u_1) + (cast_u(1) << (bits / 2 - 1))) >> (bits / 2))); + } + add_16_32_64_narrow(sel_op("vrshrn.i", "rshrn"), narrow_i((widen_i(i_1) + (1 << (bits / 4))) >> (bits / 4 + 1))); + add_16_32_narrow(sel_op("vrshrn.i", "rshrn"), narrow_u((widen_u(u_1) + (1 << (bits / 4))) >> (bits / 4 + 1))); + + // VRSRA I - Rounding Shift Right and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_8_16_32(sel_op("vrsra.s", "srsra"), i_2 + cast_i((widen_i(i_1) + 1) >> 1)); + add_8_16_32(sel_op("vrsra.u", "ursra"), i_2 + cast_u((widen_u(u_1) + 1) >> 1)); + } + + // VRSUBHN I - Rounding Subtract and Narrow Returning High Half + add_16_32_64_narrow(sel_op("vrsubhn.i", "rsubhn"), narrow_i((widen_i(i_1 - i_2) + (Expr(cast_i(1)) << (bits / 2 - 1))) >> (bits / 2))); + add_16_32_narrow(sel_op("vrsubhn.i", "rsubhn"), narrow_u((widen_u(u_1 - u_2) + (Expr(cast_u(1)) << (bits / 2 - 1))) >> (bits / 2))); + + // VSHL I - Shift Left + add_all_vec(sel_op("vshl.i", "shl", "lsl"), i_1 * 16); + add_all_vec(sel_op("vshl.i", "shl", "lsl"), u_1 * 16); + + if (!has_sve()) { // No equivalent instruction in SVE. 
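// For context on the four checks below: shift == (i_2 % bits) - (bits / 2),
// and Halide's % always yields a value in [0, bits), so shift ranges over
// [-bits/2, bits/2 - 1]. Halide defines shifting by a negative amount as a
// shift in the opposite direction, and NEON's sshl/ushl take a per-lane signed
// shift count with the same convention (negative means shift right), so both
// i_1 << shift and i_1 >> shift can map onto the single sshl/ushl instruction.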
+ add_all_vec(sel_op("vshl.s", "sshl"), i_1 << shift); + add_all_vec(sel_op("vshl.s", "sshl"), i_1 >> shift); + add_all_vec(sel_op("vshl.u", "ushl"), u_1 << shift); + add_all_vec(sel_op("vshl.u", "ushl"), u_1 >> shift); + } + + // VSHLL I - Shift Left Long + add_8_16_32_widen(sel_op("vshll.s", "sshll"), widen_i(i_1) * 16); + add_8_16_32_widen(sel_op("vshll.u", "ushll"), widen_u(u_1) * 16); + + // VSHR I - Shift Right + add_all_vec(sel_op("vshr.s", "sshr", "asr"), i_1 / 16); + add_all_vec(sel_op("vshr.u", "ushr", "lsr"), u_1 / 16); + + // VSHRN I - Shift Right Narrow + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_i(i_1 >> (bits / 2))); + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_u(u_1 >> (bits / 2))); + + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_i(i_1 / 16)); + add_16_32_64_narrow(sel_op("vshrn.i", "shrn"), narrow_u(u_1 / 16)); + + // VSLI X - Shift Left and Insert + // I guess this could be used for (x*256) | (y & 255)? We don't do bitwise ops on integers, so skip it. + + // VSRA I - Shift Right and Accumulate + if (!has_sve()) { + // Relying on LLVM to detect accumulation + add_all_vec(sel_op("vsra.s", "ssra"), i_2 + i_1 / 16); + add_all_vec(sel_op("vsra.u", "usra"), u_2 + u_1 / 16); + } + + // VSRI X - Shift Right and Insert + // See VSLI + + // VSUB I, F F, D Subtract + add_all_vec(sel_op("vsub.i", "sub"), i_1 - i_2); + add_all_vec(sel_op("vsub.i", "sub"), u_1 - u_2); + + // VSUBHN I - Subtract and Narrow + add_16_32_64_narrow(sel_op("vsubhn.i", "subhn"), narrow_i((i_1 - i_2) >> (bits / 2))); + add_16_32_64_narrow(sel_op("vsubhn.i", "subhn"), narrow_u((u_1 - u_2) >> (bits / 2))); + + // VSUBL I - Subtract Long + add_8_16_32_widen(sel_op("vsubl.s", "ssubl"), widen_i(i_1) - widen_i(i_2)); + add_8_16_32_widen(sel_op("vsubl.u", "usubl"), widen_u(u_1) - widen_u(u_2)); + + add_8_16_32_widen(sel_op("vsubl.s", "ssubl"), widen_i(i_1) - widen_i(in_i(0))); + add_8_16_32_widen(sel_op("vsubl.u", "usubl"), widen_u(u_1) - widen_u(in_u(0))); + + // VSUBW I - Subtract Wide + add_8_16_32_widen(sel_op("vsubw.s", "ssubw"), i_wide_1 - i_1); + add_8_16_32_widen(sel_op("vsubw.u", "usubw"), u_wide_1 - u_1); + } + } + } + + void check_arm_float() { + vector> test_params{ + {16, in_f16, in_u16, in_i16, f16}, + {32, in_f32, in_u32, in_i32, f32}, + {64, in_f64, in_u64, in_i64, f64}, + }; + + for (const auto &[bits, in_f, in_u, in_i, cast_f] : test_params) { + Expr f_1 = in_f(x), f_2 = in_f(x + 16), f_3 = in_f(x + 32); + Expr u_1 = in_u(x); + Expr i_1 = in_i(x); + + // Arithmetic which could throw FP exception could return NaN, which results in output mismatch. 
+ // To avoid that, we need a positive value within certain range + Func in_f_clamped; + in_f_clamped(x) = clamp(in_f(x), cast_f(1e-3f), cast_f(1.0f)); + in_f_clamped.compute_root(); // To prevent LLVM optimization which results in a different instruction + Expr f_1_clamped = in_f_clamped(x); + Expr f_2_clamped = in_f_clamped(x + 16); + + if (bits == 16 && !is_float16_supported()) { + continue; + } + + vector total_bits_params = {256}; // {64, 128, 192, 256}; + if (bits != 64) { + // Add scalar case to verify float16 native operation + total_bits_params.push_back(bits); + } + + for (auto total_bits : total_bits_params) { + const int vf = total_bits / bits; + const bool is_vector = vf > 1; + + const int instr_lanes = Instruction::get_instr_lanes(bits, vf, target); + const int force_vectorized_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target); + + AddTestFunctor add(*this, bits, instr_lanes, vf); + AddTestFunctor add_arm32_f32(*this, bits, vf, is_arm32() && bits == 32); + AddTestFunctor add_arm64(*this, bits, instr_lanes, vf, !is_arm32()); + + add({{sel_op("vabs.f", "fabs"), bits, force_vectorized_lanes}}, vf, abs(f_1)); + add(sel_op("vadd.f", "fadd"), f_1 + f_2); + add(sel_op("vsub.f", "fsub"), f_1 - f_2); + add(sel_op("vmul.f", "fmul"), f_1 * f_2); + add("fdiv", sel_op("vdiv.f", "fdiv", "(fdiv|fdivr)"), f_1 / f_2_clamped); + auto fneg_lanes = has_sve() ? force_vectorized_lanes : instr_lanes; + add({{sel_op("vneg.f", "fneg"), bits, fneg_lanes}}, vf, -f_1); + add({{sel_op("vsqrt.f", "fsqrt"), bits, force_vectorized_lanes}}, vf, sqrt(f_1_clamped)); + + add_arm32_f32(is_vector ? "vceq.f" : "vcmp.f", select(f_1 == f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm32_f32(is_vector ? "vcgt.f" : "vcmp.f", select(f_1 > f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm64(is_vector ? "fcmeq" : "fcmp", select(f_1 == f_2, cast_f(1.0f), cast_f(2.0f))); + add_arm64(is_vector ? "fcmgt" : "fcmp", select(f_1 > f_2, cast_f(1.0f), cast_f(2.0f))); + + add_arm32_f32("vcvt.f32.u", cast_f(u_1)); + add_arm32_f32("vcvt.f32.s", cast_f(i_1)); + add_arm32_f32("vcvt.u32.f", cast(UInt(bits), f_1)); + add_arm32_f32("vcvt.s32.f", cast(Int(bits), f_1)); + // The max of Float(16) is less than that of UInt(16), which generates "nan" in emulator + Expr float_max = Float(bits).max(); + add_arm64("ucvtf", cast_f(min(float_max, u_1))); + add_arm64("scvtf", cast_f(i_1)); + add_arm64({{"fcvtzu", bits, force_vectorized_lanes}}, vf, cast(UInt(bits), f_1)); + add_arm64({{"fcvtzs", bits, force_vectorized_lanes}}, vf, cast(Int(bits), f_1)); + add_arm64({{"frintn", bits, force_vectorized_lanes}}, vf, round(f_1)); + add_arm64({{"frintm", bits, force_vectorized_lanes}}, vf, floor(f_1)); + add_arm64({{"frintp", bits, force_vectorized_lanes}}, vf, ceil(f_1)); + add_arm64({{"frintz", bits, force_vectorized_lanes}}, vf, trunc(f_1)); + + add_arm32_f32({{"vmax.f", bits, force_vectorized_lanes}}, vf, max(f_1, f_2)); + add_arm32_f32({{"vmin.f", bits, force_vectorized_lanes}}, vf, min(f_1, f_2)); + + add_arm64({{"fmax", bits, force_vectorized_lanes}}, vf, max(f_1, f_2)); + add_arm64({{"fmin", bits, force_vectorized_lanes}}, vf, min(f_1, f_2)); + if (bits != 64 && total_bits != 192) { + // Halide relies on LLVM optimization for this pattern, and in some case it doesn't work + add_arm64("fmla", is_vector ? (has_sve() ? "(fmla|fmad)" : "fmla") : "fmadd", f_1 + f_2 * f_3); + add_arm64("fmls", is_vector ? (has_sve() ? 
"(fmls|fmsb)" : "fmls") : "fmsub", f_1 - f_2 * f_3); + } + if (bits != 64) { + add_arm64(vector{"frecpe", "frecps"}, fast_inverse(f_1_clamped)); + add_arm64(vector{"frsqrte", "frsqrts"}, fast_inverse_sqrt(f_1_clamped)); + } + + if (bits == 16) { + // Some of the math ops (exp,log,pow) for fp16 are converted into "xxx_fp32" call + // and then lowered to Internal::halide_xxx() function. + // In case the target has FP16 feature, native type conversion between fp16 and fp32 should be generated + // instead of emulated equivalent code with other types. + if (is_vector && !has_sve()) { + add_arm64("exp", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, exp(f_1_clamped)); + add_arm64("log", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, log(f_1_clamped)); + add_arm64("pow", {{"fcvtl", 16, 4}, {"fcvtn", 16, 4}}, vf, pow(f_1_clamped, f_2_clamped)); + } else { + add_arm64("exp", "fcvt", exp(f_1_clamped)); + add_arm64("log", "fcvt", log(f_1_clamped)); + add_arm64("pow", "fcvt", pow(f_1_clamped, f_2_clamped)); + } + } + + // No corresponding instructions exists for is_nan, is_inf, is_finite. + // The instructions expected to be generated depends on CodeGen_LLVM::visit(const Call *op) + add_arm64("nan", is_vector ? sel_op("", "fcmge", "fcmuo") : "fcmp", is_nan(f_1)); + add_arm64("inf", {{"fabs", bits, force_vectorized_lanes}}, vf, is_inf(f_1)); + add_arm64("finite", {{"fabs", bits, force_vectorized_lanes}}, vf, is_inf(f_1)); + } + + if (bits == 16) { + // Actually, the following ops are not vectorized because SIMD instruction is unavailable. + // The purpose of the test is just to confirm no error. + // In case the target has FP16 feature, native type conversion between fp16 and fp32 should be generated + // instead of emulated equivalent code with other types. + AddTestFunctor add_f16(*this, 16, 1); + + add_f16("sinf", {{"bl", "sinf"}, {"fcvt", 16, 1}}, 1, sin(f_1_clamped)); + add_f16("asinf", {{"bl", "asinf"}, {"fcvt", 16, 1}}, 1, asin(f_1_clamped)); + add_f16("cosf", {{"bl", "cosf"}, {"fcvt", 16, 1}}, 1, cos(f_1_clamped)); + add_f16("acosf", {{"bl", "acosf"}, {"fcvt", 16, 1}}, 1, acos(f_1_clamped)); + add_f16("tanf", {{"bl", "tanf"}, {"fcvt", 16, 1}}, 1, tan(f_1_clamped)); + add_f16("atanf", {{"bl", "atanf"}, {"fcvt", 16, 1}}, 1, atan(f_1_clamped)); + add_f16("atan2f", {{"bl", "atan2f"}, {"fcvt", 16, 1}}, 1, atan2(f_1_clamped, f_2_clamped)); + add_f16("sinhf", {{"bl", "sinhf"}, {"fcvt", 16, 1}}, 1, sinh(f_1_clamped)); + add_f16("asinhf", {{"bl", "asinhf"}, {"fcvt", 16, 1}}, 1, asinh(f_1_clamped)); + add_f16("coshf", {{"bl", "coshf"}, {"fcvt", 16, 1}}, 1, cosh(f_1_clamped)); + add_f16("acoshf", {{"bl", "acoshf"}, {"fcvt", 16, 1}}, 1, acosh(max(f_1, cast_f(1.0f)))); + add_f16("tanhf", {{"bl", "tanhf"}, {"fcvt", 16, 1}}, 1, tanh(f_1_clamped)); + add_f16("atanhf", {{"bl", "atanhf"}, {"fcvt", 16, 1}}, 1, atanh(clamp(f_1, cast_f(-0.5f), cast_f(0.5f)))); + } + } + } + + void check_arm_load_store() { + vector> test_params = { + {Int(8), in_i8}, {Int(16), in_i16}, {Int(32), in_i32}, {Int(64), in_i64}, {UInt(8), in_u8}, {UInt(16), in_u16}, {UInt(32), in_u32}, {UInt(64), in_u64}, {Float(16), in_f16}, {Float(32), in_f32}, {Float(64), in_f64}}; + + for (const auto &[elt, in_im] : test_params) { + const int bits = elt.bits(); + if ((elt == Float(16) && !is_float16_supported()) || + (is_arm32() && bits == 64)) { + continue; + } + + // LD/ST - Load/Store + for (int width = 64; width <= 64 * 4; width *= 2) { + const int total_lanes = width / bits; + const int instr_lanes = min(total_lanes, 128 / bits); + if (instr_lanes < 2) 
continue; // bail out scalar op
+
+ // In case of arm32, instruction selection looks inconsistent due to optimization by LLVM
+ AddTestFunctor add(*this, bits, total_lanes, target.bits == 64);
+ // NOTE: if the expr is too simple, LLVM might generate "bl memcpy"
+ Expr load_store_1 = in_im(x) * 3;
+
+ if (has_sve()) {
+ // in native width, ld1b/st1b is used regardless of data type
+ const bool allow_byte_ls = (width == target.vector_bits);
+ add({get_sve_ls_instr("ld1", bits, bits, "", allow_byte_ls ? "b" : "")}, total_lanes, load_store_1);
+ add({get_sve_ls_instr("st1", bits, bits, "", allow_byte_ls ? "b" : "")}, total_lanes, load_store_1);
+ } else {
+ // vector register is not used for simple load/store
+ string reg_prefix = (width <= 64) ? "d" : "q";
+ add({{"st[rp]", reg_prefix + R"(\d\d?)"}}, total_lanes, load_store_1);
+ add({{"ld[rp]", reg_prefix + R"(\d\d?)"}}, total_lanes, load_store_1);
+ }
+ }
+
+ // LD2/ST2 - Load/Store two-element structures
+ int base_vec_bits = has_sve() ? target.vector_bits : 128;
+ for (int width = base_vec_bits; width <= base_vec_bits * 4; width *= 2) {
+ const int total_lanes = width / bits;
+ const int vector_lanes = total_lanes / 2;
+ const int instr_lanes = min(vector_lanes, base_vec_bits / bits);
+ if (instr_lanes < 2) continue; // bail out scalar op
+
+ AddTestFunctor add_ldn(*this, bits, vector_lanes);
+ AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes);
+
+ Func tmp1, tmp2;
+ tmp1(x) = cast(elt, x);
+ tmp1.compute_root();
+ tmp2(x, y) = select(x % 2 == 0, tmp1(x / 2), tmp1(x / 2 + 16));
+ tmp2.compute_root().vectorize(x, total_lanes);
+ Expr load_2 = in_im(x * 2) + in_im(x * 2 + 1);
+ Expr store_2 = tmp2(0, 0) + tmp2(0, 127);
+
+ if (has_sve()) {
+ // TODO(issue needed): Added strided load support.
+#if 0
+ add_ldn({get_sve_ls_instr("ld2", bits)}, vector_lanes, load_2);
+#endif
+ add_stn({get_sve_ls_instr("st2", bits)}, total_lanes, store_2);
+ } else {
+ add_ldn(sel_op("vld2.", "ld2"), load_2);
+ add_stn(sel_op("vst2.", "st2"), store_2);
+ }
+ }
+
+ // Also check when the two expressions interleaved have a common
+ // subexpression, which results in a vector var being lifted out.
+ for (int width = base_vec_bits; width <= base_vec_bits * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 2; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + Expr e = (tmp1(x / 2) * 2 + 7) / 4; + tmp2(x, y) = select(x % 2 == 0, e * 3, e + 17); + tmp2.compute_root().vectorize(x, total_lanes); + Expr store_2 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + add_stn({get_sve_ls_instr("st2", bits)}, total_lanes, store_2); + } else { + add_stn(sel_op("vst2.", "st2"), store_2); + } + } + + // LD3/ST3 - Store three-element structures + for (int width = 192; width <= 192 * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 3; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_ldn(*this, bits, vector_lanes); + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + tmp2(x, y) = select(x % 3 == 0, tmp1(x / 3), + x % 3 == 1, tmp1(x / 3 + 16), + tmp1(x / 3 + 32)); + tmp2.compute_root().vectorize(x, total_lanes); + Expr load_3 = in_im(x * 3) + in_im(x * 3 + 1) + in_im(x * 3 + 2); + Expr store_3 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + // TODO(issue needed): Added strided load support. +#if 0 + add_ldn({get_sve_ls_instr("ld3", bits)}, vector_lanes, load_3); + add_stn({get_sve_ls_instr("st3", bits)}, total_lanes, store_3); +#endif + } else { + add_ldn(sel_op("vld3.", "ld3"), load_3); + add_stn(sel_op("vst3.", "st3"), store_3); + } + } + + // LD4/ST4 - Store four-element structures + for (int width = 256; width <= 256 * 4; width *= 2) { + const int total_lanes = width / bits; + const int vector_lanes = total_lanes / 4; + const int instr_lanes = Instruction::get_instr_lanes(bits, vector_lanes, target); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add_ldn(*this, bits, vector_lanes); + AddTestFunctor add_stn(*this, bits, instr_lanes, total_lanes); + + Func tmp1, tmp2; + tmp1(x) = cast(elt, x); + tmp1.compute_root(); + tmp2(x, y) = select(x % 4 == 0, tmp1(x / 4), + x % 4 == 1, tmp1(x / 4 + 16), + x % 4 == 2, tmp1(x / 4 + 32), + tmp1(x / 4 + 48)); + tmp2.compute_root().vectorize(x, total_lanes); + Expr load_4 = in_im(x * 4) + in_im(x * 4 + 1) + in_im(x * 4 + 2) + in_im(x * 4 + 3); + Expr store_4 = tmp2(0, 0) + tmp2(0, 127); + + if (has_sve()) { + // TODO(issue needed): Added strided load support. 
+#if 0 + add_ldn({get_sve_ls_instr("ld4", bits)}, vector_lanes, load_4); + add_stn({get_sve_ls_instr("st4", bits)}, total_lanes, store_4); +#endif + } else { + add_ldn(sel_op("vld4.", "ld4"), load_4); + add_stn(sel_op("vst4.", "st4"), store_4); + } + } + + // SVE Gather/Scatter + if (has_sve()) { + for (int width = 64; width <= 64 * 4; width *= 2) { + const int total_lanes = width / bits; + const int instr_lanes = min(total_lanes, 128 / bits); + if (instr_lanes < 2) continue; // bail out scalar op + + AddTestFunctor add(*this, bits, total_lanes); + Expr index = clamp(cast(in_im(x)), 0, W - 1); + Func tmp; + tmp(x, y) = cast(elt, y); + tmp(x, index) = cast(elt, 1); + tmp.compute_root().update().vectorize(x, total_lanes); + Expr gather = in_im(index); + Expr scatter = tmp(0, 0) + tmp(0, 127); + + const int index_bits = std::max(32, bits); + add({get_sve_ls_instr("ld1", bits, index_bits, "uxtw")}, total_lanes, gather); + add({get_sve_ls_instr("st1", bits, index_bits, "uxtw")}, total_lanes, scatter); + } + } + } + } + + void check_arm_pairwise() { + // A summation reduction that starts at something + // non-trivial, to avoid llvm simplifying accumulating + // widening summations into just widening summations. + auto sum_ = [&](Expr e) { + Func f; + f(x) = cast(e.type(), 123); + f(x) += e; + return f(x); + }; + + // Tests for integer type + { + vector> test_params{ + {8, in_i8, in_u8, i16, i32, u16, u32}, + {16, in_i16, in_u16, i32, i64, u32, u64}, + {32, in_i32, in_u32, i64, i64, u64, u64}, + {64, in_i64, in_u64, i64, i64, u64, u64}, + }; + // clang-format on + + for (const auto &[bits, in_i, in_u, widen_i, widenx4_i, widen_u, widenx4_u] : test_params) { + + for (auto &total_bits : {64, 128}) { + const int vf = total_bits / bits; + const int instr_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target); + AddTestFunctor add(*this, bits, instr_lanes, vf, !(is_arm32() && bits == 64)); // 64 bit is unavailable in neon 32 bit + AddTestFunctor add_8_16_32(*this, bits, instr_lanes, vf, bits != 64); + const int widen_lanes = Instruction::get_instr_lanes(bits, vf * 2, target); + AddTestFunctor add_widen(*this, bits, widen_lanes, vf, bits != 64); + + if (!has_sve()) { + // VPADD I, F - Pairwise Add + // VPMAX I, F - Pairwise Maximum + // VPMIN I, F - Pairwise Minimum + for (int f : {2, 4}) { + RDom r(0, f); + + add(sel_op("vpadd.i", "addp"), sum_(in_i(f * x + r))); + add(sel_op("vpadd.i", "addp"), sum_(in_u(f * x + r))); + add_8_16_32(sel_op("vpmax.s", "smaxp"), maximum(in_i(f * x + r))); + add_8_16_32(sel_op("vpmax.u", "umaxp"), maximum(in_u(f * x + r))); + add_8_16_32(sel_op("vpmin.s", "sminp"), minimum(in_i(f * x + r))); + add_8_16_32(sel_op("vpmin.u", "uminp"), minimum(in_u(f * x + r))); + } + } + + // VPADAL I - Pairwise Add and Accumulate Long + // VPADDL I - Pairwise Add Long + { + int f = 2; + RDom r(0, f); + + // If we're reducing by a factor of two, we can + // use the forms with an accumulator + add_widen(sel_op("vpadal.s", "sadalp"), sum_(widen_i(in_i(f * x + r)))); + add_widen(sel_op("vpadal.u", "uadalp"), sum_(widen_i(in_u(f * x + r)))); + add_widen(sel_op("vpadal.u", "uadalp"), sum_(widen_u(in_u(f * x + r)))); + } + { + int f = 4; + RDom r(0, f); + + // If we're reducing by more than that, that's not + // possible. + // In case of SVE, addlp is unavailable, so adalp is used with accumulator=0 instead. 
+ add_widen(sel_op("vpaddl.s", "saddlp", "sadalp"), sum_(widen_i(in_i(f * x + r)))); + add_widen(sel_op("vpaddl.u", "uaddlp", "uadalp"), sum_(widen_i(in_u(f * x + r)))); + add_widen(sel_op("vpaddl.u", "uaddlp", "uadalp"), sum_(widen_u(in_u(f * x + r)))); + } + + const bool is_arm_dot_prod_available = (!is_arm32() && target.has_feature(Target::ARMDotProd) && bits == 8) || + (has_sve() && (bits == 8 || bits == 16)); + if ((bits == 8 || bits == 16) && !is_arm_dot_prod_available) { // udot/sdot is applied if available + int f = 4; + RDom r(0, f); + // If we're widening the type by a factor of four + // as well as reducing by a factor of four, we + // expect vpaddl followed by vpadal + // Note that when going from u8 to i32 like this, + // the vpaddl is unsigned and the vpadal is a + // signed, because the intermediate type is u16 + const int widenx4_lanes = Instruction::get_instr_lanes(bits * 2, vf, target); + string op_addl, op_adal; + op_addl = sel_op("vpaddl.s", "saddlp"); + op_adal = sel_op("vpadal.s", "sadalp"); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_i(in_i(f * x + r)))); + op_addl = sel_op("vpaddl.u", "uaddlp"); + op_adal = sel_op("vpadal.u", "uadalp"); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_i(in_u(f * x + r)))); + add({{op_addl, bits, widen_lanes}, {op_adal, bits * 2, widenx4_lanes}}, vf, sum_(widenx4_u(in_u(f * x + r)))); + } + + // UDOT/SDOT + if (is_arm_dot_prod_available) { + const int factor_32bit = vf / 4; + for (int f : {4, 8}) { + // checks vector register for narrow src data type (i.e. 8 or 16 bit) + const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_32bit, target); + AddTestFunctor add_dot(*this, bits, lanes_src, factor_32bit); + RDom r(0, f); + + add_dot("udot", sum(widenx4_u(in_u(f * x + r)) * in_u(f * x + r + 32))); + add_dot("sdot", sum(widenx4_i(in_i(f * x + r)) * in_i(f * x + r + 32))); + if (f == 4) { + // This doesn't generate for higher reduction factors because the + // intermediate is 16-bit instead of 32-bit. It seems like it would + // be slower to fix this (because the intermediate sum would be + // 32-bit instead of 16-bit). 
+ add_dot("udot", sum(widenx4_u(in_u(f * x + r)))); + add_dot("sdot", sum(widenx4_i(in_i(f * x + r)))); + } + } + } + } + } + } + + // Tests for Float type + { + // clang-format off + vector> test_params{ + {16, in_f16}, + {32, in_f32}, + {64, in_f64}, + }; + // clang-format on + if (!has_sve()) { + for (const auto &[bits, in_f] : test_params) { + for (auto &total_bits : {64, 128}) { + const int vf = total_bits / bits; + if (vf < 2) continue; + AddTestFunctor add(*this, bits, vf); + AddTestFunctor add_16_32(*this, bits, vf, bits != 64); + + if (bits == 16 && !is_float16_supported()) { + continue; + } + + for (int f : {2, 4}) { + RDom r(0, f); + + add(sel_op("vadd.f", "faddp"), sum_(in_f(f * x + r))); + add_16_32(sel_op("vmax.f", "fmaxp"), maximum(in_f(f * x + r))); + add_16_32(sel_op("vmin.f", "fminp"), minimum(in_f(f * x + r))); + } + } + } + } + } + } + + struct ArmTask { + vector instrs; + }; + + struct Instruction { + string opcode; + optional operand; + optional bits; + optional pattern_lanes; + static inline const int ANY_LANES = -1; + + // matching pattern for opcode/operand is directly set + Instruction(const string &opcode, const string &operand) + : opcode(opcode), operand(operand), bits(nullopt), pattern_lanes(nullopt) { + } + + // matching pattern for opcode/operand is generated from bits/lanes + Instruction(const string &opcode, int bits, int lanes) + : opcode(opcode), operand(nullopt), bits(bits), pattern_lanes(lanes) { + } + + string generate_pattern(const Target &target) const { + bool is_arm32 = target.bits == 32; + bool has_sve = target.has_feature(Target::SVE2); + + string opcode_pattern; + string operand_pattern; + if (bits && pattern_lanes) { + if (is_arm32) { + opcode_pattern = get_opcode_neon32(); + operand_pattern = get_reg_neon32(); + } else if (!has_sve) { + opcode_pattern = opcode; + operand_pattern = get_reg_neon64(); + } else { + opcode_pattern = opcode; + operand_pattern = get_reg_sve(); + } + } else { + opcode_pattern = opcode; + operand_pattern = operand.value_or(""); + } + // e.g "add v15.h " -> "\s*add\s.*\bv\d\d?\.h\b.*" + return opcode_pattern + R"(\s.*\b)" + operand_pattern + R"(\b.*)"; + } + + // TODO Fix this for SVE2 + static int natural_lanes(int bits) { + return 128 / bits; + } + + static int get_instr_lanes(int bits, int vec_factor, const Target &target) { + return min(natural_lanes(bits), vec_factor); + } + + static int get_force_vectorized_instr_lanes(int bits, int vec_factor, const Target &target) { + // For some cases, where scalar operation is forced to vectorize + if (target.has_feature(Target::SVE2)) { + if (vec_factor == 1) { + return 1; + } else { + return natural_lanes(bits); + } + } else { + int min_lanes = std::max(2, natural_lanes(bits) / 2); // 64 bit wide VL + return max(min_lanes, get_instr_lanes(bits, vec_factor, target)); + } + } + + string get_opcode_neon32() const { + return opcode + to_string(bits.value()); + } + + const char *get_bits_designator() const { + static const map designators{ + // NOTE: vector or float only + {8, "b"}, + {16, "h"}, + {32, "s"}, + {64, "d"}, + }; + auto iter = designators.find(bits.value()); + assert(iter != designators.end()); + return iter->second; + } + + string get_reg_sve() const { + if (pattern_lanes == ANY_LANES) { + return R"((z\d\d?\.[bhsd])|(s\d\d?))"; + } else { + const char *bits_designator = get_bits_designator(); + // TODO(need issue): This should only match the scalar register, and likely a NEON instruction opcode. 
+ // Generating a full SVE vector instruction for a scalar operation is inefficient. However this is + // happening and fixing it involves changing intrinsic selection. Likely to use NEON intrinsics where + // applicable. For now, accept both a scalar operation and a vector one. + std::string scalar_reg_pattern = (pattern_lanes > 1) ? "" : std::string("|(") + bits_designator + R"(\d\d?))"; // e.g. "h15" + + return std::string(R"(((z\d\d?\.)") + bits_designator + ")|(" + + R"(v\d\d?\.)" + to_string(pattern_lanes.value()) + bits_designator + ")" + scalar_reg_pattern + ")"; + } + } + + string get_reg_neon32() const { + return ""; + } + + string get_reg_neon64() const { + const char *bits_designator = get_bits_designator(); + if (pattern_lanes == 1) { + return std::string(bits_designator) + R"(\d\d?)"; // e.g. "h15" + } else if (pattern_lanes == ANY_LANES) { + return R"(v\d\d?\.[bhsd])"; + } else { + return R"(v\d\d?\.)" + to_string(pattern_lanes.value()) + bits_designator; // e.g. "v15.4h" + } + } + }; + + Instruction get_sve_ls_instr(const string &base_opcode, int opcode_bits, int operand_bits, const string &additional = "", const string &optional_type = "") { + static const map opcode_suffix_map = {{8, "b"}, {16, "h"}, {32, "w"}, {64, "d"}}; + static const map operand_suffix_map = {{8, "b"}, {16, "h"}, {32, "s"}, {64, "d"}}; + string opcode_size_specifier; + string operand_size_specifier; + if (!optional_type.empty()) { + opcode_size_specifier = "["; + operand_size_specifier = "["; + } + opcode_size_specifier += opcode_suffix_map.at(opcode_bits); + operand_size_specifier += operand_suffix_map.at(operand_bits); + if (!optional_type.empty()) { + opcode_size_specifier += optional_type; + opcode_size_specifier += "]"; + operand_size_specifier += optional_type; + operand_size_specifier += "]"; + } + const string opcode = base_opcode + opcode_size_specifier; + string operand = R"(z\d\d?\.)" + operand_size_specifier; + if (!additional.empty()) { + operand += ", " + additional; + } + return Instruction(opcode, operand); + } + + Instruction get_sve_ls_instr(const string &base_opcode, int bits) { + return get_sve_ls_instr(base_opcode, bits, bits, ""); + } + + // Helper functor to add test case + class AddTestFunctor { + public: + AddTestFunctor(SimdOpCheckArmSve &p, + int default_bits, + int default_instr_lanes, + int default_vec_factor, + bool is_enabled = true /* false to skip testing */) + : parent(p), default_bits(default_bits), default_instr_lanes(default_instr_lanes), + default_vec_factor(default_vec_factor), is_enabled(is_enabled){}; + + AddTestFunctor(SimdOpCheckArmSve &p, + int default_bits, + // default_instr_lanes is inferred from bits and vec_factor + int default_vec_factor, + bool is_enabled = true /* false to skip testing */) + : parent(p), default_bits(default_bits), + default_instr_lanes(Instruction::get_instr_lanes(default_bits, default_vec_factor, p.target)), + default_vec_factor(default_vec_factor), is_enabled(is_enabled){}; + + // Constructs single Instruction with default parameters + void operator()(const string &opcode, Expr e) { + // Use opcode for name + (*this)(opcode, opcode, e); + } + + // Constructs single Instruction with default parameters except for custom name + void operator()(const string &op_name, const string &opcode, Expr e) { + create_and_register(op_name, {Instruction{opcode, default_bits, default_instr_lanes}}, default_vec_factor, e); + } + + // Constructs multiple Instruction with default parameters + void operator()(const vector &opcodes, Expr e) { + 
assert(!opcodes.empty()); + (*this)(opcodes[0], opcodes, e); + } + + // Constructs multiple Instruction with default parameters except for custom name + void operator()(const string &op_name, const vector &opcodes, Expr e) { + vector instrs; + for (const auto &opcode : opcodes) { + instrs.emplace_back(opcode, default_bits, default_instr_lanes); + } + create_and_register(op_name, instrs, default_vec_factor, e); + } + + // Set single or multiple Instructions of custom parameters + void operator()(const vector &instructions, int vec_factor, Expr e) { + // Use the 1st opcode for name + assert(!instructions.empty()); + string op_name = instructions[0].opcode; + (*this)(op_name, instructions, vec_factor, e); + } + + // Set single or multiple Instructions of custom parameters, with custom name + void operator()(const string &op_name, const vector &instructions, int vec_factor, Expr e) { + create_and_register(op_name, instructions, vec_factor, e); + } + + private: + void create_and_register(const string &op_name, const vector &instructions, int vec_factor, Expr e) { + if (!is_enabled) return; + + // Generate regular expression for the instruction we check + vector instr_patterns; + transform(instructions.begin(), instructions.end(), back_inserter(instr_patterns), + [t = parent.target](const Instruction &instr) { return instr.generate_pattern(t); }); + + std::stringstream type_name_stream; + type_name_stream << e.type(); + std::string decorated_op_name = op_name + "_" + type_name_stream.str() + "_x" + std::to_string(vec_factor); + auto unique_name = "op_" + decorated_op_name + "_" + std::to_string(parent.tasks.size()); + + // Bail out after generating the unique_name, so that names are + // unique across different processes and don't depend on filter + // settings. + if (!parent.wildcard_match(parent.filter, decorated_op_name)) return; + + // Create a deep copy of the expr and all Funcs referenced by it, so + // that no IR is shared between tests. This is required by the base + // class, and is why we can parallelize. + { + using namespace Halide::Internal; + class FindOutputs : public IRVisitor { + using IRVisitor::visit; + void visit(const Call *op) override { + if (op->func.defined()) { + outputs.insert(op->func); + } + IRVisitor::visit(op); + } + + public: + std::set outputs; + } finder; + e.accept(&finder); + std::vector outputs(finder.outputs.begin(), finder.outputs.end()); + auto env = deep_copy(outputs, build_environment(outputs)).second; + class DeepCopy : public IRMutator { + std::map copied; + using IRMutator::visit; + Expr visit(const Call *op) override { + if (op->func.defined()) { + auto it = env.find(op->name); + if (it != env.end()) { + return Func(it->second)(mutate(op->args)); + } + } + return IRMutator::visit(op); + } + const std::map &env; + + public: + DeepCopy(const std::map &env) + : env(env) { + } + } copier(env); + e = copier.mutate(e); + } + + // Create Task and register + parent.tasks.emplace_back(Task{decorated_op_name, unique_name, vec_factor, e}); + parent.arm_tasks.emplace(unique_name, ArmTask{std::move(instr_patterns)}); + } + + SimdOpCheckArmSve &parent; + int default_bits; + int default_instr_lanes; + int default_vec_factor; + bool is_enabled; + }; + + void compile_and_check(Func error, const string &op, const string &name, int vector_width, const std::vector &arg_types, ostringstream &error_msg) override { + // This is necessary as LLVM validation errors, crashes, etc. don't tell which op crashed. 
+ cout << "Starting op " << op << "\n"; + string fn_name = "test_" + name; + string file_name = output_directory + fn_name; + + auto ext = Internal::get_output_info(target); + std::map outputs = { + {OutputFileType::llvm_assembly, file_name + ext.at(OutputFileType::llvm_assembly).extension}, + {OutputFileType::c_header, file_name + ext.at(OutputFileType::c_header).extension}, + {OutputFileType::object, file_name + ext.at(OutputFileType::object).extension}, + {OutputFileType::assembly, file_name + ".s"}, + }; + + error.compile_to(outputs, arg_types, fn_name, target); + + std::ifstream asm_file; + asm_file.open(file_name + ".s"); + + auto arm_task = arm_tasks.find(name); + assert(arm_task != arm_tasks.end()); + + std::ostringstream msg; + msg << op << " did not generate for target=" << target.to_string() + << " vector_width=" << vector_width << ". Instead we got:\n"; + + string line; + vector matched_lines; + vector &patterns = arm_task->second.instrs; + while (getline(asm_file, line) && !patterns.empty()) { + msg << line << "\n"; + auto pattern = patterns.begin(); + while (pattern != patterns.end()) { + smatch match; + if (regex_search(line, match, regex(*pattern))) { + pattern = patterns.erase(pattern); + matched_lines.emplace_back(match[0]); + } else { + ++pattern; + } + } + } + + if (!patterns.empty()) { + error_msg << "Failed: " << msg.str() << "\n"; + error_msg << "The following instruction patterns were not found:\n"; + for (auto &p : patterns) { + error_msg << p << "\n"; + } + } else if (debug_mode == "1") { + for (auto &l : matched_lines) { + error_msg << " " << setw(20) << name << ", vf=" << setw(2) << vector_width << ", "; + error_msg << l << endl; + } + } + } + + inline const string &sel_op(const string &neon32, const string &neon64) { + return is_arm32() ? neon32 : neon64; + } + + inline const string &sel_op(const string &neon32, const string &neon64, const string &sve) { + return is_arm32() ? neon32 : + target.has_feature(Target::SVE) || target.has_feature(Target::SVE2) ? sve : + neon64; + } + + inline bool is_arm32() const { + return target.bits == 32; + }; + inline bool has_neon() const { + return !target.has_feature(Target::NoNEON); + }; + inline bool has_sve() const { + return target.has_feature(Target::SVE2); + }; + + bool is_float16_supported() const { + return (target.bits == 64) && target.has_feature(Target::ARMFp16); + } + + bool can_run_the_code; + string debug_mode; + std::unordered_map arm_tasks; + const Var x{"x"}, y{"y"}; +}; +} // namespace + +int main(int argc, char **argv) { + if (Halide::Internal::get_llvm_version() < 190) { + std::cout << "[SKIP] simd_op_check_sve2 requires LLVM 19 or later.\n"; + return 0; + } + + return SimdOpCheckTest::main( + argc, argv, + { + Target("arm-64-linux-sve2-no_neon-vector_bits_128"), + Target("arm-64-linux-sve2-no_neon-vector_bits_256"), + }); +} From a132246ced07adc59c7b3631009464e5a14e0abb Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 15 Mar 2024 14:04:44 -0700 Subject: [PATCH 092/186] Fix two compute_with bugs. (#8152) * Fix two compute_with bugs. This PR fixes a bug in compute_with, and another bug I found while fixing it (we could really use a compute_with fuzzer). The first bug is that you can get into situations where the bounds of a producer func will refer directly to the loop variable of a consumer func, where the consumer is in a compute_with fused group. In main, that loop variable may not be defined because fused loop names have been rewritten to include the token ".fused.". 
This PR adds let stmts to define it just inside the fused loop body. The second bug is that not all parent loops in compute_with fused groups were having their bounds expanded to cover the region to be computed of all children, because the logic for deciding which loops to expand only considered the non-specialized pure definition. So e.g. compute_with applied to an update stage would fail to compute values of the child Func where they do not overlap with the parent Func. This PR visits all definitions of the parent Func of the fused group, instead of just the unspecialized pure definition of the parent Func. Fixes #8149 * clang-tidy --- src/ScheduleFunctions.cpp | 224 ++++++++++++++++++++---------- test/correctness/compute_with.cpp | 87 +++++++++++- 2 files changed, 236 insertions(+), 75 deletions(-) diff --git a/src/ScheduleFunctions.cpp b/src/ScheduleFunctions.cpp index aa45841253b7..8fa2fd71a7a2 100644 --- a/src/ScheduleFunctions.cpp +++ b/src/ScheduleFunctions.cpp @@ -1021,81 +1021,126 @@ class CollectBounds : public IRVisitor { } }; -class SubstituteFusedBounds : public IRMutator { -public: - const map &replacements; - explicit SubstituteFusedBounds(const map &r) - : replacements(r) { +// Rename a loop var in a compute_with cluster to include '.fused.', to +// disambiguate its bounds from the original loop bounds. The '.fused.' token is +// injected somewhere that's not going to change the results of var_name_match, +// so that it's unchanged as a scheduling point. +string fused_name(const string &var) { + size_t last_dot = var.rfind('.'); + internal_assert(last_dot != string::npos); + return var.substr(0, last_dot) + ".fused." + var.substr(last_dot + 1); +} + +// The bounds of every loop exist in 'replacements' should be replaced. The +// loop is also renamed by adding '.fused' in the original name before the +// variable name. +Stmt substitute_fused_bounds(Stmt s, const map &replacements) { + if (!s.defined() || replacements.empty()) { + return s; } -private: - using IRMutator::visit; + class SubstituteFusedBounds : public IRMutator { + const map &replacements; - Stmt visit(const For *op) override { - const auto *min_var = op->min.as(); - const auto *extent_var = op->extent.as(); - if (min_var && extent_var) { - Expr min_val, extent_val; - { - const auto &it = replacements.find(min_var->name); - if (it != replacements.end()) { - min_val = it->second; + using IRMutator::visit; + + Stmt visit(const For *op) override { + const auto *min_var = op->min.as(); + const auto *extent_var = op->extent.as(); + if (min_var && extent_var) { + Expr min_val, extent_val; + { + const auto &it = replacements.find(min_var->name); + if (it != replacements.end()) { + min_val = it->second; + } } - } - { - const auto &it = replacements.find(extent_var->name); - if (it != replacements.end()) { - extent_val = it->second; + { + const auto &it = replacements.find(extent_var->name); + if (it != replacements.end()) { + extent_val = it->second; + } + } + if (!min_val.defined() || !extent_val.defined()) { + return IRMutator::visit(op); + } + + Stmt body = mutate(op->body); + + string new_var = fused_name(op->name); + + ForType for_type = op->for_type; + DeviceAPI device_api = op->device_api; + if (is_const_one(extent_val)) { + // This is the child loop of a fused group. The real loop of the + // fused group is the loop of the parent function of the fused + // group. 
This child loop is just a scheduling point, and should + // never be a device transition, so we rewrite it to be a simple + // serial loop of extent 1." + for_type = ForType::Serial; + device_api = DeviceAPI::None; } + + Stmt stmt = For::make(new_var, Variable::make(Int(32), new_var + ".loop_min"), + Variable::make(Int(32), new_var + ".loop_extent"), + for_type, op->partition_policy, device_api, body); + + // Add let stmts defining the bound of the renamed for-loop. + stmt = LetStmt::make(new_var + ".loop_min", min_val, stmt); + stmt = LetStmt::make(new_var + ".loop_max", simplify(min_val + extent_val - 1), stmt); + stmt = LetStmt::make(new_var + ".loop_extent", extent_val, stmt); + // Replace any reference to the old loop name with the new one. + stmt = substitute(op->name, Variable::make(Int(32), new_var), stmt); + return stmt; + } else { + return IRMutator::visit(op); } - if (!min_val.defined() || !extent_val.defined()) { + } + + public: + explicit SubstituteFusedBounds(const map &r) + : replacements(r) { + } + } subs(replacements); + + return subs.mutate(s); +} + +// Add letstmts inside each parent loop that define the corresponding child loop +// vars as equal to it. Bounds inference might need a child loop var. +Stmt add_loop_var_aliases(Stmt s, const map> &loop_var_aliases) { + if (!s.defined() || loop_var_aliases.empty()) { + return s; + } + + class AddLoopVarAliases : public IRMutator { + const map> &loop_var_aliases; + + using IRMutator::visit; + + Stmt visit(const For *op) override { + auto it = loop_var_aliases.find(op->name); + if (it == loop_var_aliases.end()) { return IRMutator::visit(op); } + Expr var = Variable::make(Int(32), op->name); Stmt body = mutate(op->body); - - size_t last_dot = op->name.rfind('.'); - internal_assert(last_dot != string::npos); - string new_var = op->name.substr(0, last_dot) + ".fused." + op->name.substr(last_dot + 1); - - ForType for_type = op->for_type; - DeviceAPI device_api = op->device_api; - if (is_const_one(extent_val)) { - // This is the child loop of a fused group. The real loop of the - // fused group is the loop of the parent function of the fused - // group. This child loop is just a scheduling point, and should - // never be a device transition, so we rewrite it to be a simple - // serial loop of extent 1." - for_type = ForType::Serial; - device_api = DeviceAPI::None; + for (const string &alias : it->second) { + body = LetStmt::make(alias, var, body); } - Stmt stmt = For::make(new_var, Variable::make(Int(32), new_var + ".loop_min"), - Variable::make(Int(32), new_var + ".loop_extent"), - for_type, op->partition_policy, device_api, body); + return For::make(op->name, op->min, op->extent, op->for_type, + op->partition_policy, op->device_api, std::move(body)); + } - // Add let stmts defining the bound of the renamed for-loop. - stmt = LetStmt::make(new_var + ".loop_min", min_val, stmt); - stmt = LetStmt::make(new_var + ".loop_max", simplify(min_val + extent_val - 1), stmt); - stmt = LetStmt::make(new_var + ".loop_extent", extent_val, stmt); - // Replace any reference to the old loop name with the new one. - stmt = substitute(op->name, Variable::make(Int(32), new_var), stmt); - return stmt; - } else { - return IRMutator::visit(op); + public: + explicit AddLoopVarAliases(const map> &a) + : loop_var_aliases(a) { } - } -}; + } add_aliases(loop_var_aliases); -// The bounds of every loop exist in 'replacements' should be replaced. The -// loop is also renamed by adding '.fused' in the original name before the -// variable name. 
-Stmt substitute_fused_bounds(Stmt s, const map &replacements) { - if (!s.defined() || replacements.empty()) { - return s; - } else { - return SubstituteFusedBounds(replacements).mutate(s); - } + return add_aliases.mutate(s); } // Shift the iteration domain of a loop nest by some factor. @@ -1460,7 +1505,9 @@ class InjectFunctionRealization : public IRMutator { } Stmt build_produce_definition(const Function &f, const string &prefix, const Definition &def, bool is_update, - map &replacements, vector> &add_lets) { + map &replacements, + vector> &add_lets, + map> &aliases) { const vector &dims = def.schedule().dims(); // From inner to outer const LoopLevel &fuse_level = def.schedule().fuse_level().level; @@ -1499,6 +1546,10 @@ class InjectFunctionRealization : public IRMutator { replacements.emplace(var + ".loop_extent", make_const(Int(32), 1)); replacements.emplace(var + ".loop_min", val); replacements.emplace(var + ".loop_max", val); + + string var_fused = fused_name(var_orig); + aliases[var_fused].emplace(std::move(var_orig)); + aliases[var_fused].emplace(std::move(var)); } } @@ -1550,18 +1601,17 @@ class InjectFunctionRealization : public IRMutator { // Replace the bounds of the parent fused loop (i.e. the first one to be // realized in the group) with union of the bounds of the fused group. - Stmt replace_parent_bound_with_union_bound(const Function &f, Stmt produce, const map &bounds) { - string prefix = f.name() + ".s0"; - const Definition &def = f.definition(); + Stmt replace_parent_bound_with_union_bound(const string &func, int stage, + const Definition &def, Stmt produce, + const map &bounds, + map &replacements) { - if (!def.defined()) { + if (def.schedule().fused_pairs().empty()) { return produce; } const vector &dims = def.schedule().dims(); // From inner to outer - map replacements; - vector dependence = collect_all_dependence(def); // Compute the union of the bounds of the fused loops. @@ -1582,6 +1632,8 @@ class InjectFunctionRealization : public IRMutator { // the parent, e.g. y.yi and yi. int dim2_idx = (int)(dims_2.size() - (dims.size() - i)); internal_assert(dim2_idx < (int)dims_2.size()); + string var_1 = func + ".s" + std::to_string(stage) + + "." + dims[i].var; string var_2 = pair.func_2 + ".s" + std::to_string(pair.stage_2) + "." + dims_2[dim2_idx].var; @@ -1592,7 +1644,6 @@ class InjectFunctionRealization : public IRMutator { Expr max_2 = bounds.find(var_2 + ".loop_max")->second; Expr extent_2 = bounds.find(var_2 + ".loop_extent")->second; - string var_1 = prefix + "." + dims[i].var; internal_assert(bounds.count(var_1 + ".loop_min")); internal_assert(bounds.count(var_1 + ".loop_max")); internal_assert(bounds.count(var_1 + ".loop_extent")); @@ -1616,8 +1667,26 @@ class InjectFunctionRealization : public IRMutator { } } - // Now, replace the bounds of the parent fused loops with the union bounds. + // Now, replace the bounds of the parent fused loops with the union + // bounds. 
+ for (const auto &spec : def.specializations()) { + produce = replace_parent_bound_with_union_bound(func, stage, spec.definition, produce, bounds, replacements); + } + + return produce; + } + + Stmt replace_parent_bound_with_union_bound(const Function &f, Stmt produce, + const map &bounds) { + map replacements; + + int stage = 0; + produce = replace_parent_bound_with_union_bound(f.name(), stage++, f.definition(), produce, bounds, replacements); + for (const Definition &def : f.updates()) { + produce = replace_parent_bound_with_union_bound(f.name(), stage++, def, produce, bounds, replacements); + } produce = substitute_fused_bounds(produce, replacements); + return produce; } @@ -1748,22 +1817,23 @@ class InjectFunctionRealization : public IRMutator { Stmt producer; map replacements; vector> add_lets; + map> aliases; for (const auto &func_stage : stage_order) { const auto &f = func_stage.first; if (f.has_extern_definition() && (func_stage.second == 0)) { - const Stmt &produceDef = Internal::build_extern_produce(env, f, target); - producer = inject_stmt(producer, produceDef, LoopLevel::inlined().lock()); + const Stmt &produce_def = Internal::build_extern_produce(env, f, target); + producer = inject_stmt(producer, produce_def, LoopLevel::inlined().lock()); continue; } string def_prefix = f.name() + ".s" + std::to_string(func_stage.second) + "."; const auto &def = (func_stage.second == 0) ? f.definition() : f.updates()[func_stage.second - 1]; - const Stmt &produceDef = build_produce_definition(f, def_prefix, def, func_stage.second > 0, - replacements, add_lets); - producer = inject_stmt(producer, produceDef, def.schedule().fuse_level().level); + const Stmt &produce_def = build_produce_definition(f, def_prefix, def, func_stage.second > 0, + replacements, add_lets, aliases); + producer = inject_stmt(producer, produce_def, def.schedule().fuse_level().level); } internal_assert(producer.defined()); @@ -1799,8 +1869,14 @@ class InjectFunctionRealization : public IRMutator { // Replace the bounds of parent fused loop with union of bounds of // the fused loops. + Function group_parent = funcs.back(); producer = replace_parent_bound_with_union_bound(funcs.back(), producer, bounds); + // Define the old loop var names as equal to the corresponding parent + // fused loop var. Bounds inference might refer directly to the original + // loop vars. + producer = add_loop_var_aliases(producer, aliases); + // Add the producer nodes. for (const auto &i : funcs) { producer = ProducerConsumer::make_produce(i.name(), producer); diff --git a/test/correctness/compute_with.cpp b/test/correctness/compute_with.cpp index 053570a2f5c0..0152642028eb 100644 --- a/test/correctness/compute_with.cpp +++ b/test/correctness/compute_with.cpp @@ -2204,6 +2204,89 @@ int two_compute_at_test() { return 0; } +// Test for the issue described in https://github.com/halide/Halide/issues/8149. 
+int child_var_dependent_bounds_test() { + Func f{"f"}, g{"g"}; + Var x{"x"}, y{"y"}; + RDom r(0, 10, "r"); + + Func f_inter{"f_inter"}, g_inter{"g_inter"}; + + f_inter(x, y) = x; + f_inter(x, y) += 1; + f(x) = x; + f(x) += f_inter(x, r); + + g_inter(x, y) = x; + g_inter(x, y) += 1; + g(x) = x; + g(x) += g_inter(x, r); + + f_inter.compute_at(f, r); + g_inter.compute_at(f, r); + g.update().compute_with(f.update(), r); + f.update().unscheduled(); + + Pipeline p({f, g}); + + p.compile_jit(); + Buffer f_buf(10), g_buf(10); + + f_buf.set_min(2); + p.realize({f_buf, g_buf}); + f_buf.set_min(0); + + for (int i = 0; i < 10; i++) { + int correct_f = 10 + 11 * (i + 2); + int correct_g = 10 + 11 * i; + if (f_buf(i) != correct_f) { + printf("f(%d) = %d instead of %d\n", i, f_buf(i), correct_f); + } + if (g_buf(i) != correct_g) { + printf("g(%d) = %d instead of %d\n", i, g_buf(i), correct_f); + } + } + + return 0; +} + +int overlapping_updates_test() { + Func f{"f"}, g{"g"}; + Var x{"x"}; + + f(x) = 0; + f(x) += x; + g(x) = 0; + g(x) += x; + + g.update().compute_with(f.update(), x); + f.update().unscheduled(); + + Pipeline p({f, g}); + + p.compile_jit(); + Buffer f_buf(10), g_buf(10); + + f_buf.set_min(2); + p.realize({f_buf, g_buf}); + f_buf.set_min(0); + + for (int i = 0; i < 10; i++) { + int correct_f = i + 2; + int correct_g = i; + if (f_buf(i) != correct_f) { + printf("f(%d) = %d instead of %d\n", i, f_buf(i), correct_f); + return 1; + } + if (g_buf(i) != correct_g) { + printf("g(%d) = %d instead of %d\n", i, g_buf(i), correct_f); + return 1; + } + } + + return 0; +} + } // namespace int main(int argc, char **argv) { @@ -2247,7 +2330,9 @@ int main(int argc, char **argv) { {"different arg number compute_at test", different_arg_num_compute_at_test}, {"store_at different levels test", store_at_different_levels_test}, {"rvar bounds test", rvar_bounds_test}, - {"two_compute_at test", two_compute_at_test}, + {"two compute at test", two_compute_at_test}, + {"overlapping updates test", overlapping_updates_test}, + {"child var dependent bounds test", child_var_dependent_bounds_test}, }; using Sharder = Halide::Internal::Test::Sharder; From 8864e8ac1c0bb460f0034e9c46f7f944afad3a19 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Tue, 19 Mar 2024 02:09:09 +0300 Subject: [PATCH 093/186] Python bindings: `add_python_test()`: do set `HL_JIT_TARGET` too (#8156) This one took quite a bit of digging. I wanted to enable opencl tests on debian package, and `boundary_conditions.py`+`division.py` were failing when run with `HL_TARGET=host OCL_ICD_VENDORS=no-opencl-please.missing` env variables with `clGetPlatformIDs failed`, which made no sense to me. Empty `HL_JIT_TARGET` results in `opencl` being detected, unsurprisingly. 
--- python_bindings/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/CMakeLists.txt b/python_bindings/CMakeLists.txt index 590ecc432e10..25f61fe7dcdd 100644 --- a/python_bindings/CMakeLists.txt +++ b/python_bindings/CMakeLists.txt @@ -68,7 +68,7 @@ function(add_python_test) list(PREPEND ARG_PYTHONPATH "$/..") list(TRANSFORM ARG_PYTHONPATH PREPEND "PYTHONPATH=path_list_prepend:") - list(PREPEND ARG_ENVIRONMENT "HL_TARGET=${Halide_TARGET}") + list(PREPEND ARG_ENVIRONMENT "HL_TARGET=${Halide_TARGET};HL_JIT_TARGET=${Halide_TARGET}") cmake_path(GET ARG_FILE STEM test_name) set(test_name "${ARG_LABEL}_${test_name}") From a4158c0bf062440e91cbd0b2d5690bc7d82ea568 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Wed, 3 Apr 2024 12:28:25 -0700 Subject: [PATCH 094/186] fix ub in lower rounding shift right (#8173) * Avoid out-of-range shifts in lower_rounding_shift_left/right Consider `lower_rounding_shift_right(a, (uint8)0)` The term b - 1 becomes 255, and now you have an out-of-range shift, which causes the simplifier to inject a signed_integer_overflow intrinsic, and compilation to fail. This is a little annoying because if b == 0, b_positive is a zero mask, so the result isn't used anyway (this is also why this change is legal). In llvm, it's a poison value, not UB, so masking it off works. If the simplifier were smarter, it might just drop the signed_integer_overflow intrinsic on detecting that it was being bitwise-and-ed with zero. But the safest thing to do is not overflow. saturating_add/sub are typically as cheap as add/sub. 99.9% of the time b is some positive constant anyway, so it's going to get constant-folded. * Add test --- src/FindIntrinsics.cpp | 14 ++++++++------ test/correctness/intrinsics.cpp | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index d453d0134c29..d7b053981ac8 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -1274,10 +1274,11 @@ Expr lower_widening_shift_right(const Expr &a, const Expr &b) { } Expr lower_rounding_shift_left(const Expr &a, const Expr &b) { - // Shift left, then add one to the result if bits were dropped - // (because b < 0) and the most significant dropped bit was a one. + // Shift left, then add one to the result if bits were dropped (because b < 0) + // and the most significant dropped bit was a one. We must take care not + // to introduce UB in the shifts, even if the result would be masked off. Expr b_negative = select(b < 0, make_one(a.type()), make_zero(a.type())); - return simplify((a << b) + (b_negative & (a << (b + 1)))); + return simplify((a << b) + (b_negative & (a << saturating_add(b, make_one(b.type()))))); } Expr lower_rounding_shift_right(const Expr &a, const Expr &b) { @@ -1289,10 +1290,11 @@ Expr lower_rounding_shift_right(const Expr &a, const Expr &b) { Expr round = simplify(cast(a.type(), (1 << shift) - 1)); return rounding_halving_add(a, round) >> shift; } - // Shift right, then add one to the result if bits were dropped - // (because b > 0) and the most significant dropped bit was a one. + // Shift right, then add one to the result if bits were dropped (because b > 0) + // and the most significant dropped bit was a one. We must take care not to + // introduce UB in the shifts, even if the result would be masked off. 
Expr b_positive = select(b > 0, make_one(a.type()), make_zero(a.type())); - return simplify((a >> b) + (b_positive & (a >> (b - 1)))); + return simplify((a >> b) + (b_positive & (a >> saturating_sub(b, make_one(b.type()))))); } Expr lower_saturating_add(const Expr &a, const Expr &b) { diff --git a/test/correctness/intrinsics.cpp b/test/correctness/intrinsics.cpp index 339a5c2525e5..e5119bd5e1be 100644 --- a/test/correctness/intrinsics.cpp +++ b/test/correctness/intrinsics.cpp @@ -361,6 +361,22 @@ int main(int argc, char **argv) { g.compile_jit(); } + // Rounding shifts by extreme values, when lowered, used to have the + // potential to overflow and turn into out-of-range shifts. The simplifier + // detected this and injected a signed_integer_overflow intrinsic, which + // then threw an error in codegen, even though the rounding shift calls are + // well-defined. + { + Func f, g; + + f(x) = cast(x); + f.compute_root(); + + g(x) = rounding_shift_right(x, 0) + rounding_shift_left(x, 8); + + g.compile_jit(); + } + printf("Success!\n"); return 0; } From 3b8a532538ab8f4fa81b0d74ac7ab5449826e099 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 4 Apr 2024 10:19:13 -0700 Subject: [PATCH 095/186] Add some missing _Float16 support (#8174) (Changes extracted from https://github.com/halide/Halide/pull/8169, which may or may not land in its current form) Some missing support for _Float16 that will likely be handy: - Allow _Float16 to be detected for Clang 15 (since my local XCode Clang 15 definitely supports it) - Expr(_Float16) - HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(_Float16); - Add _Float16 to the convert matrix in halide_image_io.h --- src/Expr.h | 5 ++ src/Type.h | 3 + src/runtime/HalideRuntime.h | 2 +- tools/halide_image_io.h | 118 ++++++++++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/Expr.h b/src/Expr.h index 31850fc56001..b9832c104de8 100644 --- a/src/Expr.h +++ b/src/Expr.h @@ -298,6 +298,11 @@ struct Expr : public Internal::IRHandle { Expr(bfloat16_t x) : IRHandle(Internal::FloatImm::make(BFloat(16), (double)x)) { } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + explicit Expr(_Float16 x) + : IRHandle(Internal::FloatImm::make(Float(16), (double)x)) { + } +#endif Expr(float x) : IRHandle(Internal::FloatImm::make(Float(32), x)) { } diff --git a/src/Type.h b/src/Type.h index af5447350810..c8a397b3f0a7 100644 --- a/src/Type.h +++ b/src/Type.h @@ -166,6 +166,9 @@ HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::float16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::bfloat16_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_task_t); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_loop_task_t); +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(_Float16); +#endif HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(float); HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(double); HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_buffer_t); diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 1d0843be0329..0379c1f9ab47 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -91,7 +91,7 @@ extern "C" { // Ideally there would be a better way to detect if the type // is supported, even in a compiler independent fashion, but // coming up with one has proven elusive. 
-#if defined(__clang__) && (__clang_major__ >= 16) && !defined(__EMSCRIPTEN__) && !defined(__i386__) +#if defined(__clang__) && (__clang_major__ >= 15) && !defined(__EMSCRIPTEN__) && !defined(__i386__) #if defined(__is_identifier) #if !__is_identifier(_Float16) #define HALIDE_CPP_COMPILER_HAS_FLOAT16 diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index e039f7c2e798..1e0cbff01897 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -116,6 +116,12 @@ template<> inline bool convert(const int64_t &in) { return in != 0; } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline bool convert(const _Float16 &in) { + return (float)in != 0; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline bool convert(const float &in) { return in != 0; @@ -165,6 +171,12 @@ template<> inline uint8_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint8_t convert(const _Float16 &in) { + return (uint8_t)std::lround((float)in * 255.0f); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint8_t convert(const float &in) { return (uint8_t)std::lround(in * 255.0f); @@ -211,6 +223,12 @@ template<> inline uint16_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint16_t convert(const _Float16 &in) { + return (uint16_t)std::lround((float)in * 65535.0f); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint16_t convert(const float &in) { return (uint16_t)std::lround(in * 65535.0f); @@ -257,6 +275,12 @@ template<> inline uint32_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint32_t convert(const _Float16 &in) { + return (uint32_t)std::llround((float)in * 4294967295.0); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint32_t convert(const float &in) { return (uint32_t)std::llround(in * 4294967295.0); @@ -303,6 +327,12 @@ template<> inline uint64_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline uint64_t convert(const _Float16 &in) { + return convert((uint32_t)std::llround((float)in * 4294967295.0)); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline uint64_t convert(const float &in) { return convert((uint32_t)std::llround(in * 4294967295.0)); @@ -349,6 +379,12 @@ template<> inline int8_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int8_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int8_t convert(const float &in) { return convert(in); @@ -395,6 +431,12 @@ template<> inline int16_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int16_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int16_t convert(const float &in) { return convert(in); @@ -441,6 +483,12 @@ template<> inline int32_t convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int32_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int32_t convert(const float &in) { return convert(in); @@ -487,6 +535,12 @@ template<> inline int64_t convert(const int64_t &in) { return convert(in); } +#ifdef 
HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline int64_t convert(const _Float16 &in) { + return convert((float)in); +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline int64_t convert(const float &in) { return convert(in); @@ -496,6 +550,58 @@ inline int64_t convert(const double &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +// Convert to f16 +template<> +inline _Float16 convert(const bool &in) { + return in; +} +template<> +inline _Float16 convert(const uint8_t &in) { + return (_Float16)(in / 255.0f); +} +template<> +inline _Float16 convert(const uint16_t &in) { + return (_Float16)(in / 65535.0f); +} +template<> +inline _Float16 convert(const uint32_t &in) { + return (_Float16)(in / 4294967295.0); +} +template<> +inline _Float16 convert(const uint64_t &in) { + return convert<_Float16, uint32_t>(uint32_t(in >> 32)); +} +template<> +inline _Float16 convert(const int8_t &in) { + return convert<_Float16, uint8_t>(in); +} +template<> +inline _Float16 convert(const int16_t &in) { + return convert<_Float16, uint16_t>(in); +} +template<> +inline _Float16 convert(const int32_t &in) { + return convert<_Float16, uint64_t>(in); +} +template<> +inline _Float16 convert(const int64_t &in) { + return convert<_Float16, uint64_t>(in); +} +template<> +inline _Float16 convert(const _Float16 &in) { + return in; +} +template<> +inline _Float16 convert(const float &in) { + return (_Float16)in; +} +template<> +inline _Float16 convert(const double &in) { + return (_Float16)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 + // Convert to f32 template<> inline float convert(const bool &in) { @@ -533,6 +639,12 @@ template<> inline float convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline float convert(const _Float16 &in) { + return (float)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline float convert(const float &in) { return in; @@ -579,6 +691,12 @@ template<> inline double convert(const int64_t &in) { return convert(in); } +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 +template<> +inline double convert(const _Float16 &in) { + return (double)in; +} +#endif // HALIDE_CPP_COMPILER_HAS_FLOAT16 template<> inline double convert(const float &in) { return (double)in; From 7d9935740ca1c8790b494c670a79f163f4a4c168 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Fri, 5 Apr 2024 09:07:05 -0700 Subject: [PATCH 096/186] Add conversion code for Float16 that was missed in #8174 (#8178) * Add conversion code for Float16 that was missed in #8174 * Don't sniff for _Float16 when building ASAN * Update HalideRuntime.h --- src/runtime/HalideRuntime.h | 16 ++++++++++++++++ tools/halide_image_io.h | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 0379c1f9ab47..1d66ab02b368 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -86,6 +86,20 @@ extern "C" { #ifndef COMPILING_HALIDE_RUNTIME +// ASAN builds can cause linker errors for Float16, so sniff for that and +// don't enable it by default. +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define HALIDE_RUNTIME_ASAN_DETECTED +#endif +#endif + +#if defined(__SANITIZE_ADDRESS__) && !defined(HALIDE_RUNTIME_ASAN_DETECTED) +#define HALIDE_RUNTIME_ASAN_DETECTED +#endif + +#if !defined(HALIDE_RUNTIME_ASAN_DETECTED) + // clang had _Float16 added as a reserved name in clang 8, but // doesn't actually support it on most platforms until clang 15. 
// Ideally there would be a better way to detect if the type @@ -108,6 +122,8 @@ extern "C" { #endif #endif +#endif // !HALIDE_RUNTIME_ASAN_DETECTED + #endif // !COMPILING_HALIDE_RUNTIME /** \file diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index 1e0cbff01897..ff23c30aa995 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -2227,6 +2227,10 @@ struct ImageTypeConversion { const halide_type_t src_type = src.type(); switch (src_type.element_of().as_u32()) { +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + case halide_type_t(halide_type_float, 16).as_u32(): + return convert_image(src.template as<_Float16, AnyDims>()); +#endif case halide_type_t(halide_type_float, 32).as_u32(): return convert_image(src.template as()); case halide_type_t(halide_type_float, 64).as_u32(): @@ -2272,6 +2276,10 @@ struct ImageTypeConversion { // Call the appropriate static-to-static conversion routine // based on the desired dst type. switch (dst_type.element_of().as_u32()) { +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + case halide_type_t(halide_type_float, 16).as_u32(): + return convert_image<_Float16>(src); +#endif case halide_type_t(halide_type_float, 32).as_u32(): return convert_image(src); case halide_type_t(halide_type_float, 64).as_u32(): From a46204408f0762479473f0c478327c0a5b7553f1 Mon Sep 17 00:00:00 2001 From: Alexander Root <32245479+rootjalex@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:38:46 -0700 Subject: [PATCH 097/186] Tighten bounds of abs() (#8168) * Tighten bounds of abs() * make abs bounds tight for non-int32 too * make int32 min expression match non-int32 min expression --- dependencies/llvm/CMakeLists.txt | 2 +- src/Bounds.cpp | 30 ++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/dependencies/llvm/CMakeLists.txt b/dependencies/llvm/CMakeLists.txt index a4aef94b08de..d070caf53b19 100644 --- a/dependencies/llvm/CMakeLists.txt +++ b/dependencies/llvm/CMakeLists.txt @@ -21,7 +21,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") message(STATUS "Using ClangConfig.cmake in: ${Clang_DIR}") if (LLVM_PACKAGE_VERSION VERSION_LESS 16.0) - message(FATAL_ERROR "LLVM version must be 15.0 or newer") + message(FATAL_ERROR "LLVM version must be 16.0 or newer") endif () if (LLVM_PACKAGE_VERSION VERSION_GREATER 19.0) diff --git a/src/Bounds.cpp b/src/Bounds.cpp index 16fd69f3e8fb..d7d337dacfdf 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -1237,18 +1237,29 @@ class Bounds : public IRVisitor { if (op->is_intrinsic(Call::abs)) { Interval a = arg_bounds.get(0); - interval.min = make_zero(t); + if (a.is_bounded()) { if (equal(a.min, a.max)) { interval = Interval::single_point(Call::make(t, Call::abs, {a.max}, Call::PureIntrinsic)); } else if (op->args[0].type().is_int() && op->args[0].type().bits() >= 32) { - interval.max = Max::make(Cast::make(t, -a.min), Cast::make(t, a.max)); + interval.min = Cast::make(t, Max::make(a.min, -Min::make(make_zero(a.min.type()), a.max))); + interval.max = Cast::make(t, Max::make(-a.min, a.max)); } else { + interval.min = Cast::make(t, Max::make(a.min, -Min::make(make_zero(a.min.type()), a.max))); a.min = Call::make(t, Call::abs, {a.min}, Call::PureIntrinsic); a.max = Call::make(t, Call::abs, {a.max}, Call::PureIntrinsic); interval.max = Max::make(a.min, a.max); } } else { + if (a.has_lower_bound()) { + // If a is strictly positive, then abs(a) is strictly positive. 
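Spelled out (editorial aside, not in the patch), the tightened interval for abs(a) when the argument has known bounds [lo, hi] is, before the cast to the result type:

```
//   abs(a).min = max(lo, -min(0, hi))
//   abs(a).max = max(-lo, hi)
// e.g. a in [2, 7]  gives abs(a) in [2, 7]   (previously [0, 7])
//      a in [-3, 7] gives abs(a) in [0, 7]
```

This matches the checks added to the bounds test further down.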
+ interval.min = Cast::make(t, Max::make(make_zero(a.min.type()), a.min)); + } else if (a.has_upper_bound()) { + // If a is strictly negative, then abs(a) is strictly positive. + interval.min = Cast::make(t, -Min::make(make_zero(a.max.type()), a.max)); + } else { + interval.min = make_zero(t); + } // If the argument is unbounded on one side, then the max is unbounded. interval.max = Interval::pos_inf(); } @@ -3651,6 +3662,21 @@ void bounds_test() { check(scope, cast(x), 0.0f, 10.0f); check(scope, cast(abs(cast(x))), 0, 10); + check(scope, abs(2 + x), u32(2), u32(12)); + check(scope, abs(x - 11), u32(1), u32(11)); + check(scope, abs(x - 5), u32(0), u32(5)); + check(scope, abs(2 + cast(x)), 2.f, 12.f); + check(scope, abs(cast(x) - 11), 1.f, 11.f); + check(scope, abs(cast(x) - 5), 0.f, 5.f); + check(scope, abs(2 + cast(x)), u8(2), u8(12)); + check(scope, abs(cast(x) - 11), u8(1), u8(11)); + check(scope, abs(cast(x) - 5), u8(0), u8(5)); + scope.push("x", Interval(123, Interval::pos_inf())); + check(scope, abs(x), u32(123), Interval::pos_inf()); + scope.pop("x"); + scope.push("x", Interval(Interval::neg_inf(), -123)); + check(scope, abs(x), u32(123), Interval::pos_inf()); + scope.pop("x"); // Check some vectors check(scope, Ramp::make(x * 2, 5, 5), 0, 40); From 14ae0826dc93f0dcc40465f0bcd6b742fda3f656 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 5 Apr 2024 09:39:07 -0700 Subject: [PATCH 098/186] Clarify the meaning of Shuffle::is_broadcast() (#8158) * Fix horrifying bug in lossless_cast of a subtract * A 'broadcast' shuffle is more complex than it seems I was poking at the Shuffle node, and checking its usage, and it seems that despite the comment, Shuffles that return true for is_broadcast are not the same as a Broadcast node. Instead of repeating the input vector some number of times, it repeats a shuffle of the input vector. This means IRPrinter was incorrect. None of the other usages were bad. This PR makes this clearer in the comment, and fixes IRPrinter. * Revert accidental change --- src/IR.h | 9 ++++----- src/IRPrinter.cpp | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/IR.h b/src/IR.h index 31aa3f195e43..d3f6af596f31 100644 --- a/src/IR.h +++ b/src/IR.h @@ -879,11 +879,10 @@ struct Shuffle : public ExprNode { * arguments. */ bool is_interleave() const; - /** Check if this shuffle can be represented as a broadcast. - * For example: - * A uint8 shuffle of with 4*n lanes and indices: - * 0, 1, 2, 3, 0, 1, 2, 3, ....., 0, 1, 2, 3 - * can be represented as a uint32 broadcast with n lanes (factor = 4). */ + /** Check if this shuffle can be represented as a repeating pattern that + * repeats the same shuffle of the single input vector some number of times. 
+ * For example: 0, 3, 1, 1, 0, 3, 1, 1, ....., 0, 3, 1, 1 + */ bool is_broadcast() const; int broadcast_factor() const; diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index a186be1874d7..fb40de78f14a 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1084,10 +1084,6 @@ void IRPrinter::visit(const Shuffle *op) { << ", " << op->slice_stride() << ", " << op->indices.size() << ")"; - } else if (op->is_broadcast()) { - stream << "broadcast("; - print_list(op->vectors); - stream << ", " << op->broadcast_factor() << ")"; } else { stream << "shuffle("; print_list(op->vectors); From 35f0c29a1930b118edab98b6d22ccad12fe6b3c6 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Sat, 6 Apr 2024 08:17:25 -0700 Subject: [PATCH 099/186] Add .npy support to halide_image_io (#8175) * Add .npy support to halide_image_io The .npy format is NumPy's native format for storing multidimensional arrays (aka tensors/buffers). Being able to load/save in this format makes it (potentially) a lot easier to interchange data with the Python ecosystem, as well as providing a file format that support floating-point data more robustly than any of the others that we current support. This adds load/save support for a useful subset: - We support the int/uint/float types common in Halide (except for f16/bf16 for now) - We don't support reading or writing files that are in `fortran_order` - We don't support any object/struct/etc files, only numeric primitives - We only support loading files that are in the host's endianness (typically little-endian) Note that at present this doesn't support f16 / bf16 formats, but that could likely be added with minimal difficulty. The tricky bit of this is that the reading code has to parse a (limited) Python dict in text form. Please review that part carefully. TODO: we could probably add this as an option for `debug_to_file()` without too much pain in a followup PR. * clang-tidy * clang-tidy * Address review comments * Allow for "keys" as well as 'keys' * Add float16 support * Use old-school parser * clang-tidy --- test/correctness/image_io.cpp | 62 ++++-- tools/halide_image_io.h | 359 +++++++++++++++++++++++++++++----- 2 files changed, 357 insertions(+), 64 deletions(-) diff --git a/test/correctness/image_io.cpp b/test/correctness/image_io.cpp index 132dac492f82..4921aa6f8a02 100644 --- a/test/correctness/image_io.cpp +++ b/test/correctness/image_io.cpp @@ -25,7 +25,10 @@ void test_round_trip(Buffer buf, std::string format) { reloaded.translate(d, buf.dim(d).min() - reloaded.dim(d).min()); } - Tools::save_image(reloaded, Internal::get_test_tmp_dir() + "test_reloaded." + format); + o = std::ostringstream(); + o << Internal::get_test_tmp_dir() << "test_" << halide_type_of() << "x" << buf.channels() << ".reloaded." << format; + filename = o.str(); + Tools::save_image(reloaded, filename); // Check they're not too different. 
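A round-trip sketch from user code (editorial note, not part of the patch; the file name is made up), using the existing halide_image_io entry points, which now dispatch on the .npy extension:

```
#include "HalideBuffer.h"
#include "halide_image_io.h"

int main() {
    Halide::Runtime::Buffer<float> buf(640, 480, 3);
    buf.fill(0.5f);
    // Writes a float32 array (descr "<f4" on a little-endian host).
    Halide::Tools::save_image(buf, "example.npy");
    // Loads it back; the static element type must match what was stored.
    Halide::Runtime::Buffer<float> reloaded =
        Halide::Tools::load_image("example.npy");
    return 0;
}
```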
RDom r(reloaded); @@ -33,15 +36,15 @@ void test_round_trip(Buffer buf, std::string format) { for (int i = 0; i < r.dimensions(); ++i) { args.push_back(r[i]); } - uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(reloaded(args))))); + double diff = evaluate(maximum(abs(cast(buf(args)) - cast(reloaded(args))))); - uint32_t max_diff = 0; + double max_diff = 0.00001; if (format == "jpg") { max_diff = 32; } if (diff > max_diff) { - printf("test_round_trip: Difference of %d when saved and loaded as %s\n", diff, format.c_str()); - abort(); + printf("test_round_trip: Difference of %f when saved and loaded as %s\n", diff, format.c_str()); + exit(1); } } @@ -62,7 +65,7 @@ void test_convert_image_s2s(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_s2s: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -85,7 +88,7 @@ void test_convert_image_d2s(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_d2s: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -110,7 +113,7 @@ void test_convert_image_s2d(Buffer buf) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_s2d: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -135,7 +138,7 @@ void test_convert_image_d2d(Buffer<> buf_d) { uint32_t diff = evaluate(maximum(abs(cast(buf(args)) - cast(buf2(args))))); if (diff > 0) { printf("test_convert_image_d2d: Difference of %d when converted\n", diff); - abort(); + exit(1); } } @@ -166,8 +169,8 @@ void do_test() { // Make some colored noise Func f; Var x, y, c, w; - const float one = std::numeric_limits::max(); - f(x, y, c) = cast(clamp(make_noise(10)(x, y, c), 0.0f, 1.0f) * one); + const Expr one = std::is_floating_point::value ? 
Expr(1.0) : Expr(std::numeric_limits::max()); + f(x, y, c) = cast(clamp(make_noise(10)(x, y, c), Expr(0.0), Expr(1.0)) * one); Buffer color_buf = f.realize({width, height, 3}); @@ -176,16 +179,19 @@ void do_test() { color_buf.crop(0, inset, width - inset * 2); color_buf.crop(1, inset, height - inset * 2); - test_convert_image_s2s(color_buf); - test_convert_image_s2d(color_buf); - test_convert_image_d2s(color_buf); - test_convert_image_d2d(color_buf); + const auto ht = halide_type_of(); + if (ht == halide_type_t(halide_type_uint, 8) || ht == halide_type_t(halide_type_uint, 16)) { + test_convert_image_s2s(color_buf); + test_convert_image_s2d(color_buf); + test_convert_image_d2s(color_buf); + test_convert_image_d2d(color_buf); + } Buffer luma_buf(width, height, 1); luma_buf.copy_from(color_buf); luma_buf.slice(2); - std::vector formats = {"ppm", "pgm", "tmp", "mat", "tiff"}; + std::vector formats = {"npy", "ppm", "pgm", "tmp", "mat", "tiff"}; #ifndef HALIDE_NO_JPEG formats.push_back("jpg"); #endif @@ -193,7 +199,14 @@ void do_test() { formats.push_back("png"); #endif for (std::string format : formats) { - if (format == "jpg" && halide_type_of() != halide_type_t(halide_type_uint, 8)) { + // .npy is the only format here that supports float16 + if (halide_type_of() == halide_type_t(halide_type_float, 16) && format != "npy") { + continue; + } + if ((format == "jpg" || format == "pgm" || format == "ppm") && ht != halide_type_t(halide_type_uint, 8)) { + continue; + } + if (format == "png" && ht != halide_type_t(halide_type_uint, 8) && ht != halide_type_t(halide_type_uint, 16)) { continue; } if (format == "tmp") { @@ -238,7 +251,7 @@ void test_mat_header() { std::ifstream fs(filename.c_str(), std::ifstream::binary); if (!fs) { std::cout << "Cannot read " << filename << "\n"; - abort(); + exit(1); } fs.seekg(0, fs.end); // .mat file begins with a 128 bytes header and a 8 bytes @@ -251,13 +264,24 @@ void test_mat_header() { fs.close(); if (file_size != stored_file_size) { std::cout << "Wrong file size written for " << filename << ". Expected " << file_size << ", got" << stored_file_size << "\n"; - abort(); + exit(1); } } int main(int argc, char **argv) { + do_test(); + do_test(); + do_test(); + do_test(); do_test(); do_test(); + do_test(); + do_test(); + do_test(); +#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16 + do_test<_Float16>(); +#endif + do_test(); test_mat_header(); printf("Success!\n"); return 0; diff --git a/tools/halide_image_io.h b/tools/halide_image_io.h index ff23c30aa995..1a0d250b746f 100644 --- a/tools/halide_image_io.h +++ b/tools/halide_image_io.h @@ -1166,6 +1166,317 @@ bool save_ppm(ImageType &im, const std::string &filename) { return Internal::save_pnm(im, 3, filename); } +// -------------- .npy file format +// Based on documentation at https://numpy.org/devdocs/reference/generated/numpy.lib.format.html +// and elsewhere + +#if (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN) || defined(HALIDE_FORCE_BIG_ENDIAN) +constexpr bool host_is_big_endian = true; +#else +constexpr bool host_is_big_endian = false; +#endif + +constexpr char little_endian_char = '<'; +constexpr char big_endian_char = '>'; +constexpr char no_endian_char = '|'; +constexpr char host_endian_char = (host_is_big_endian ? 
big_endian_char : little_endian_char); + +struct npy_dtype_info_t { + char byte_order; + char type_code; + char type_bytes; + + std::string descr() const { + return std::string(1, byte_order) + std::string(1, type_code) + std::to_string((int)type_bytes); + } +}; + +inline static const std::array, 11> npy_dtypes = {{ + {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(float)}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(double)}}, + {halide_type_of(), {no_endian_char, 'i', sizeof(int8_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int16_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int32_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int64_t)}}, + {halide_type_of(), {no_endian_char, 'u', sizeof(uint8_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint16_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint32_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint64_t)}}, +}}; + +inline static const std::array npy_magic_string = {'\x93', 'N', 'U', 'M', 'P', 'Y'}; +inline static const std::array npy_v1_bytes = {'\x01', '\x00'}; + +inline std::string trim_whitespace(const std::string &s) { + const size_t first = s.find_first_not_of(" \t\n"); + if (first == std::string::npos) { + return ""; + } + const size_t last = s.find_last_not_of(" \t\n"); + return s.substr(first, (last - first + 1)); +} + +struct NpyHeader { + char type_code; + int type_bytes; + std::vector extents; + + bool parse(const std::string &header) { + const char *ptr = &header[0]; + if (*ptr++ != '{') { + return false; + } + while (true) { + char endian; + int consumed; + if (std::sscanf(ptr, "'descr': '%c%c%d'%n", &endian, &type_code, &type_bytes, &consumed) == 3) { + if (endian != '<' && endian != '|') { + return false; + } + ptr += consumed; + } else if (std::strncmp(ptr, "'fortran_order': False", 22) == 0) { + ptr += 22; + } else if (std::strncmp(ptr, "'shape': (", 10) == 0) { + ptr += 10; + int n; + while (std::sscanf(ptr, "%d%n", &n, &consumed) == 1) { + extents.push_back(n); + ptr += consumed; + if (*ptr == ',') { + ptr++; + } + if (*ptr == ' ') { + ptr++; + } + } + if (*ptr++ != ')') { + return false; + } + } else if (*ptr == '}') { + return true; + } else { + return false; + } + if (*ptr == ',') { + ptr++; + } + if (*ptr == ' ') { + ptr++; + } + assert(ptr <= &header.back()); + } + } +}; + +// return true iff the buffer storage has no padding between +// any elements, and is in strictly planar order. +template +bool buffer_is_compact_planar(ImageType &im) { + const halide_type_t im_type = im.type(); + const size_t elem_size = (im_type.bits / 8); + if (((const uint8_t *)im.begin() + (im.number_of_elements() * elem_size)) != (const uint8_t *)im.end()) { + return false; + } + for (int d = 1; d < im.dimensions(); ++d) { + if (im.dim(d - 1).stride() > im.dim(d).stride()) { + return false; + } + // Strides can only match if the previous dimension has extent 1 + // (this can happen when artificially adding dimension(s), e.g. 
+ // to write a .tmp file) + if (im.dim(d - 1).stride() == im.dim(d).stride() && im.dim(d - 1).extent() != 1) { + return false; + } + } + return true; +} + +template +bool load_npy(const std::string &filename, ImageType *im) { + static_assert(!ImageType::has_static_halide_type, ""); + + FileOpener f(filename, "rb"); + if (!check(f.f != nullptr, "File could not be opened for reading")) { + return false; + } + + char magic_and_version[8]; + if (!check(f.read_bytes(magic_and_version, 8), "Could not read .npy header")) { + return false; + } + if (memcmp(magic_and_version, npy_magic_string.data(), npy_magic_string.size()) != 0) { + return check(false, "Bad .npy magic string"); + } + if ((magic_and_version[6] != 1 && magic_and_version[6] != 2 && magic_and_version[6] != 3) || magic_and_version[7] != 0) { + return check(false, "Bad .npy version"); + } + size_t header_len; + uint8_t header_len_le[4]; + if (magic_and_version[6] == 1) { + if (!check(f.read_bytes(header_len_le, 2), "Could not read .npy header")) { + return false; + } + header_len = (header_len_le[0] << 0) | (header_len_le[1] << 8); + if (!check((6 + 2 + 2 + header_len) % 64 == 0, ".npy header is not aligned properly")) { + return false; + } + } else { + if (!check(f.read_bytes(header_len_le, 4), "Could not read .npy header")) { + return false; + } + header_len = (header_len_le[0] << 0) | (header_len_le[1] << 8) | (header_len_le[2] << 16) | (header_len_le[3] << 24); + if (!check((6 + 2 + 4 + header_len) % 64 == 0, ".npy header is not aligned properly")) { + return false; + } + } + + std::string header(header_len + 1, ' '); + if (!check(f.read_bytes(header.data(), header_len), "Could not read .npy header string")) { + return false; + } + + NpyHeader h; + if (!check(h.parse(header), "Could not parse .npy header dict")) { + return false; + } + + halide_type_t im_type((halide_type_code_t)0, 0, 0); + for (const auto &d : npy_dtypes) { + if (h.type_code == d.second.type_code && h.type_bytes == d.second.type_bytes) { + im_type = d.first; + break; + } + } + if (!check(im_type.bits != 0, "Unsupported type in load_npy")) { + return false; + } + + *im = ImageType(im_type, h.extents); + + // This should never fail unless the default Buffer<> constructor behavior changes. + if (!check(buffer_is_compact_planar(*im), "load_npy() requires compact planar images")) { + return false; + } + + if (!check(f.read_bytes(im->begin(), im->size_in_bytes()), "Count not read .npy payload")) { + return false; + } + + im->set_host_dirty(); + return true; +} + +template +bool write_planar_payload(ImageType &im, FileOpener &f) { + if (im.dimensions() == 0 || buffer_is_compact_planar(im)) { + // Contiguous buffer! Write it all in one swell foop. + if (!check(f.write_bytes(im.begin(), im.size_in_bytes()), "Count not write planar payload")) { + return false; + } + } else { + // We have to do this the hard way. 
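For orientation (editorial note, not from the patch), the version-1 header emitted by the save path below looks roughly like this for a 640x480x3 float32 buffer; the concrete extents are hypothetical:

```
// offset 0:  "\x93NUMPY"   6-byte magic
// offset 6:  0x01 0x00     format version 1.0
// offset 8:  header_len    little-endian uint16
// offset 10: {'descr': '<f4', 'fortran_order': False, 'shape': (640,480,3)}
//            padded with spaces so that 10 + header_len is a multiple of 64,
//            then followed immediately by the raw element data.
```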
+ int d = im.dimensions() - 1; + for (int i = im.dim(d).min(); i <= im.dim(d).max(); i++) { + auto slice = im.sliced(d, i); + if (!write_planar_payload(slice, f)) { + return false; + } + } + } + return true; +} + +template +bool save_npy(ImageType &im, const std::string &filename) { + static_assert(!ImageType::has_static_halide_type, ""); + + if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) { + return false; + } + + const halide_type_t im_type = im.type(); + npy_dtype_info_t di = {0, 0, 0}; + for (const auto &d : npy_dtypes) { + if (d.first == im_type) { + di = d.second; + break; + } + } + if (!check(di.byte_order != 0, "Unsupported type in save_npy")) { + return false; + } + + std::string shape = "("; + for (int d = 0; d < im.dimensions(); ++d) { + if (d > 0) { + shape += ","; + } + shape += std::to_string(im.dim(d).extent()); + if (im.dimensions() == 1) { + shape += ","; // special-case for single-element tuples + } + } + shape += ")"; + + std::string header_dict_str = "{'descr': '" + di.descr() + "', 'fortran_order': False, 'shape': " + shape + "}\n"; + + const size_t unpadded_length = npy_magic_string.size() + npy_v1_bytes.size() + 2 + header_dict_str.size(); + const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1); + const size_t padding = padded_length - unpadded_length; + header_dict_str += std::string(padding, ' '); + + if (!check(header_dict_str.size() <= 65535, "Header is too large for v1 .npy file")) { + return false; + } + const uint16_t header_len = (uint16_t)(header_dict_str.size()); + const uint8_t header_len_le[2] = { + (uint8_t)((header_len >> 0) & 0xff), + (uint8_t)((header_len >> 8) & 0xff)}; + + FileOpener f(filename, "wb"); + if (!check(f.write_bytes(npy_magic_string.data(), npy_magic_string.size()), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(npy_v1_bytes.data(), npy_v1_bytes.size()), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(header_len_le, 2), ".npy write failed")) { + return false; + } + if (!check(f.write_bytes(header_dict_str.data(), header_dict_str.size()), ".npy write failed")) { + return false; + } + + if (!write_planar_payload(im, f)) { + return false; + } + + return true; +} + +inline const std::set &query_npy() { + auto build_set = []() -> std::set { + // NumPy doesn't support bfloat16, not sure if they plan to, + // so we don't attempt to support it here + std::set s; + for (halide_type_code_t code : {halide_type_int, halide_type_uint, halide_type_float}) { + for (int bits : {8, 16, 32, 64}) { + if (code == halide_type_float && bits < 16) { + continue; + } + for (int dims : {1, 2, 3, 4}) { + s.insert({halide_type_t(code, bits), dims}); + } + } + } + return s; + }; + + static std::set info = build_set(); + return info; +} + #ifndef HALIDE_NO_JPEG template @@ -1293,29 +1604,6 @@ inline const halide_type_t *tmp_code_to_halide_type() { return tmp_code_to_halide_type_; } -// return true iff the buffer storage has no padding between -// any elements, and is in strictly planar order. 
-template -bool buffer_is_compact_planar(ImageType &im) { - const halide_type_t im_type = im.type(); - const size_t elem_size = (im_type.bits / 8); - if (((const uint8_t *)im.begin() + (im.number_of_elements() * elem_size)) != (const uint8_t *)im.end()) { - return false; - } - for (int d = 1; d < im.dimensions(); ++d) { - if (im.dim(d - 1).stride() > im.dim(d).stride()) { - return false; - } - // Strides can only match if the previous dimension has extent 1 - // (this can happen when artificially adding dimension(s), e.g. - // to write a .tmp file) - if (im.dim(d - 1).stride() == im.dim(d).stride() && im.dim(d - 1).extent() != 1) { - return false; - } - } - return true; -} - // ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack) template bool load_tmp(const std::string &filename, ImageType *im) { @@ -1371,26 +1659,6 @@ inline const std::set &query_tmp() { return info; } -template -bool write_planar_payload(ImageType &im, FileOpener &f) { - if (im.dimensions() == 0 || buffer_is_compact_planar(im)) { - // Contiguous buffer! Write it all in one swell foop. - if (!check(f.write_bytes(im.begin(), im.size_in_bytes()), "Count not write .tmp payload")) { - return false; - } - } else { - // We have to do this the hard way. - int d = im.dimensions() - 1; - for (int i = im.dim(d).min(); i <= im.dim(d).max(); i++) { - auto slice = im.sliced(d, i); - if (!write_planar_payload(slice, f)) { - return false; - } - } - } - return true; -} - // ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack) template bool save_tmp(ImageType &im, const std::string &filename) { @@ -2121,6 +2389,7 @@ bool find_imageio(const std::string &filename, ImageIO *result {"jpeg", {load_jpg, save_jpg, query_jpg}}, {"jpg", {load_jpg, save_jpg, query_jpg}}, #endif + {"npy", {load_npy, save_npy, query_npy}}, {"pgm", {load_pgm, save_pgm, query_pgm}}, #ifndef HALIDE_NO_PNG {"png", {load_png, save_png, query_png}}, @@ -2441,7 +2710,7 @@ class load_image { operator ImageType() { using DynamicImageType = typename Internal::ImageTypeWithElemType::type; DynamicImageType im_d; - (void)load(filename, &im_d); + Internal::CheckFail(load(filename, &im_d), "load() failed"); Internal::CheckFail(ImageType::can_convert_from(im_d), "Type mismatch assigning the result of load_image. " "Did you mean to use load_and_convert_image?"); @@ -2464,7 +2733,7 @@ class load_and_convert_image { inline operator ImageType() { using DynamicImageType = typename Internal::ImageTypeWithElemType::type; DynamicImageType im_d; - (void)load(filename, &im_d); + Internal::CheckFail(load(filename, &im_d), "load() failed"); const halide_type_t expected_type = ImageType::static_halide_type(); if (im_d.type() == expected_type) { return im_d.template as(); From e3d3c8cacfe6d664a8994166d0998f362bf55ce8 Mon Sep 17 00:00:00 2001 From: Martijn Courteaux Date: Mon, 8 Apr 2024 17:29:33 +0200 Subject: [PATCH 100/186] Fix unused variable. 
(#8180) --- src/FindCalls.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/FindCalls.cpp b/src/FindCalls.cpp index 9345c89dcac5..1fca6de1175c 100644 --- a/src/FindCalls.cpp +++ b/src/FindCalls.cpp @@ -55,7 +55,7 @@ void populate_environment_helper(const Function &f, auto insert_func = [](const Function &f, std::map *env, std::vector *order) { - auto [it, inserted] = env->emplace(f.name(), f); + bool inserted = env->emplace(f.name(), f).second; if (inserted) { order->push_back(f); } From 8f3f6cff6996afe993883d4fbb3bf99f2f700fb1 Mon Sep 17 00:00:00 2001 From: Fabian Schuetze Date: Thu, 11 Apr 2024 18:58:36 +0200 Subject: [PATCH 101/186] Update Hexagon Install Instructions (#8182) update Hexagon install instructions --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c5dfe5507a8b..839785441292 100644 --- a/README.md +++ b/README.md @@ -406,15 +406,12 @@ branch.) ### 2. Download and install the Hexagon SDK and Hexagon Tools -Go to https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools - -1. Select the Hexagon Series 600 Software and download & run QPM and install - the Hexagon SDK 4.3.0 version or later for Linux. -2. untar the installer -3. Run the extracted installer to install the Hexagon SDK and Hexagon Tools, - selecting Installation of Hexagon SDK into `/location/of/SDK/Hexagon_SDK/4.x` - and the Hexagon tools into `/location/of/SDK/Hexagon_Tools/8.x` -4. Set an environment variable to point to the SDK installation location +Go to https://qpm.qualcomm.com/#/main/home + +1. Go to Tools, and download Qualcomm Package Manager 3. Install the package manager on your machine. +2. Run the installed Qualcomm Package Manager and install the Qualcomm Hexagon SDK 5.x (or 4.x). + The SDK can be selected from the Qualcomm Hexagon SDK Products. +3. Set an environment variable to point to the SDK installation location ``` export SDK_LOC=/location/of/SDK ``` From dc837074c4ca73583c3541ea54438d7fda84fdf9 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Thu, 11 Apr 2024 11:04:42 -0700 Subject: [PATCH 102/186] Add .npy support to debug_to_file() (#8177) * Add .npy support to halide_image_io The .npy format is NumPy's native format for storing multidimensional arrays (aka tensors/buffers). Being able to load/save in this format makes it (potentially) a lot easier to interchange data with the Python ecosystem, as well as providing a file format that support floating-point data more robustly than any of the others that we current support. This adds load/save support for a useful subset: - We support the int/uint/float types common in Halide (except for f16/bf16 for now) - We don't support reading or writing files that are in `fortran_order` - We don't support any object/struct/etc files, only numeric primitives - We only support loading files that are in the host's endianness (typically little-endian) Note that at present this doesn't support f16 / bf16 formats, but that could likely be added with minimal difficulty. The tricky bit of this is that the reading code has to parse a (limited) Python dict in text form. Please review that part carefully. TODO: we could probably add this as an option for `debug_to_file()` without too much pain in a followup PR. * clang-tidy * clang-tidy * Address review comments * Allow for "keys" as well as 'keys' * Add .npy support to debug_to_file() Built on top of https://github.com/halide/Halide/pull/8175, this adds .npy as an option. 
This is actually pretty great because it's easy to do something like ``` ss = numpy.load("my_file.npy") print(ss) ``` in Python and get nicely-formatted output, which can sometimes be a lot easier for debugging that inserting lots of print() statements (see https://github.com/halide/Halide/issues/8176) Did a drive-by change to the correctness test to use this format instead of .mat. * Add float16 support * Add support for Float16 images in npy * Assume little-endian * Remove redundant halide_error() * naming convention * naming convention * Test both mat and npy * Don't call halide_error() * Use old-school parser * clang-tidy --- src/DebugToFile.cpp | 4 + src/runtime/write_debug_image.cpp | 140 ++++++++++++++++++++++++--- test/correctness/debug_to_file.cpp | 147 +++++++++++++++-------------- 3 files changed, 207 insertions(+), 84 deletions(-) diff --git a/src/DebugToFile.cpp b/src/DebugToFile.cpp index 8147e4cfe7f1..8510b806a132 100644 --- a/src/DebugToFile.cpp +++ b/src/DebugToFile.cpp @@ -42,6 +42,8 @@ class DebugToFile : public IRMutator { num_elements *= bound.extent; } + // TODO: why do we bother with this? halide_debug_to_file() + // can infer the type-and-size it needs from the buffer's type field. int type_code = 0; Type t = op->types[0]; if (t == Float(32)) { @@ -64,6 +66,8 @@ class DebugToFile : public IRMutator { type_code = 8; } else if (t == Int(64)) { type_code = 9; + } else if (t == Float(16)) { + type_code = 10; } else { user_error << "Type " << t << " not supported for debug_to_file\n"; } diff --git a/src/runtime/write_debug_image.cpp b/src/runtime/write_debug_image.cpp index f51017c1fbb4..a5f8816db2c7 100644 --- a/src/runtime/write_debug_image.cpp +++ b/src/runtime/write_debug_image.cpp @@ -1,13 +1,16 @@ #include "HalideRuntime.h" -// We support three formats, tiff, mat, and tmp. +// We support four formats, npy, tiff, mat, and tmp. // // All formats support arbitrary types, and are easy to write in a // small amount of code. // +// npy: +// - Arbitrary dimensionality, type +// - Readable by NumPy and other Python tools // TIFF: // - 2/3-D only -// - Readable by the most tools +// - Readable by a lot of tools // mat: // - Arbitrary dimensionality, type // - Readable by matlab, ImageStack, and many other tools @@ -26,20 +29,22 @@ namespace Internal { // Mappings from the type_code passed in to the type codes of the // formats. See "type_code" in DebugToFile.cpp +constexpr int kNumTypeCodes = 11; + // TIFF sample type values are: // 1 => Unsigned int // 2 => Signed int // 3 => Floating-point -WEAK int16_t pixel_type_to_tiff_sample_type[] = { +WEAK int16_t pixel_type_to_tiff_sample_type[kNumTypeCodes] = { // float, double, uint8, int8, ... uint64, int64 - 3, 3, 1, 2, 1, 2, 1, 2, 1, 2}; + 3, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0}; // See the .mat level 5 documentation for matlab class codes. -WEAK uint8_t pixel_type_to_matlab_class_code[] = { - 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; +WEAK uint8_t pixel_type_to_matlab_class_code[kNumTypeCodes] = { + 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0}; -WEAK uint8_t pixel_type_to_matlab_type_code[] = { - 7, 9, 2, 1, 4, 3, 6, 5, 13, 12}; +WEAK uint8_t pixel_type_to_matlab_type_code[kNumTypeCodes] = { + 7, 9, 2, 1, 4, 3, 6, 5, 13, 12, 0}; #pragma pack(push) #pragma pack(2) @@ -125,6 +130,39 @@ struct ScopedFile { } }; +// Halide runtime has lots of assumptions that we are always little-endian, +// so we'll hardcode this here; leaving in the logic to make it clear. 
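A sketch of the user-facing effect (editorial note, not part of the patch; the output path is hypothetical): the scheduling directive can now point at a .npy path, and the dump is then trivial to inspect from Python.

```
#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x, y;
    f(x, y) = x + y;
    // Each realization of f is also dumped to f_debug.npy (relative paths
    // resolve against the current working directory).
    f.compute_root().debug_to_file("f_debug.npy");
    Buffer<int32_t> out = f.realize({16, 16});
    return 0;
}
```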
+constexpr bool host_is_big_endian = false; +constexpr char little_endian_char = '<'; +constexpr char big_endian_char = '>'; +constexpr char no_endian_char = '|'; +constexpr char host_endian_char = (host_is_big_endian ? big_endian_char : little_endian_char); + +struct npy_dtype_info_t { + char byte_order; + char kind; + size_t item_size; +}; + +struct htype_to_dtype { + halide_type_t htype; + npy_dtype_info_t dtype; +}; + +WEAK htype_to_dtype npy_dtypes[] = { + {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(float)}}, + {halide_type_of(), {host_endian_char, 'f', sizeof(double)}}, + {halide_type_of(), {no_endian_char, 'i', sizeof(int8_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int16_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int32_t)}}, + {halide_type_of(), {host_endian_char, 'i', sizeof(int64_t)}}, + {halide_type_of(), {no_endian_char, 'u', sizeof(uint8_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint16_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint32_t)}}, + {halide_type_of(), {host_endian_char, 'u', sizeof(uint64_t)}}, +}; + } // namespace Internal } // namespace Runtime } // namespace Halide @@ -142,11 +180,15 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam return halide_error_code_bad_dimensions; } - if (auto result = halide_copy_to_host(user_context, buf); - result != halide_error_code_success) { + if (auto result = halide_copy_to_host(user_context, buf); result != halide_error_code_success) { + // halide_error() has already been called return result; } + // Note: all calls to this function are wrapped in an assert that identifies + // the function that failed, so calling halide_error() anywhere after this is redundant + // and actually unhelpful. 
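For reference (editorial note, not from the patch), the table above yields NumPy descr strings like the following on a little-endian host:

```
// uint8   -> "|u1"   (single byte, so byte order is irrelevant)
// int16   -> "<i2"
// float16 -> "<f2"
// float32 -> "<f4"
```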
+ ScopedFile f(filename, "wb"); if (!f.open()) { return halide_error_code_debug_to_file_failed; @@ -167,7 +209,73 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam uint32_t final_padding_bytes = 0; - if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) { + if (ends_with(filename, ".npy")) { + npy_dtype_info_t di = {0, 0, 0}; + for (const auto &d : npy_dtypes) { + if (d.htype == buf->type) { + di = d.dtype; + break; + } + } + if (di.byte_order == 0) { + return halide_error_code_debug_to_file_failed; + } + + constexpr int max_dict_string_size = 1024; + char dict_string_buf[max_dict_string_size]; + char *dst = dict_string_buf; + char *end = dict_string_buf + max_dict_string_size - 1; + + dst = halide_string_to_string(dst, end, "{'descr': '"); + *dst++ = di.byte_order; + *dst++ = di.kind; + dst = halide_int64_to_string(dst, end, di.item_size, 1); + dst = halide_string_to_string(dst, end, "', 'fortran_order': False, 'shape': ("); + for (int d = 0; d < buf->dimensions; ++d) { + if (d > 0) { + dst = halide_string_to_string(dst, end, ","); + } + dst = halide_int64_to_string(dst, end, buf->dim[d].extent, 1); + if (buf->dimensions == 1) { + dst = halide_string_to_string(dst, end, ","); // special-case for single-element tuples + } + } + dst = halide_string_to_string(dst, end, ")}\n"); + if (dst >= end) { + // bloody unlikely, but just in case + return halide_error_code_debug_to_file_failed; + } + + const char *npy_magic_string_and_version = "\x93NUMPY\x01\x00"; + + const size_t unpadded_length = 8 + 2 + (dst - dict_string_buf); + const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1); + const size_t padding = padded_length - unpadded_length; + memset(dst, ' ', padding); + dst += padding; + + const size_t header_len = dst - dict_string_buf; + if (header_len > 65535) { + return halide_error_code_debug_to_file_failed; + } + const uint8_t header_len_le[2] = { + (uint8_t)((header_len >> 0) & 0xff), + (uint8_t)((header_len >> 8) & 0xff)}; + + if (!f.write(npy_magic_string_and_version, 8)) { + return halide_error_code_debug_to_file_failed; + } + if (!f.write(header_len_le, 2)) { + return halide_error_code_debug_to_file_failed; + } + if (!f.write(dict_string_buf, dst - dict_string_buf)) { + return halide_error_code_debug_to_file_failed; + } + } else if (ends_with(filename, ".tiff") || ends_with(filename, ".tif")) { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + int32_t channels; int32_t width = shape[0].extent; int32_t height = shape[1].extent; @@ -243,6 +351,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam } } } else if (ends_with(filename, ".mat")) { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + // Construct a name for the array from the filename const char *end = filename; while (*end) { @@ -279,7 +391,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam // level 5 .mat files have a size limit. (Padding itself should never cause the overflow. // Code written this way for safety.) 
if (((uint64_t)payload_bytes + final_padding_bytes) >> 32) { - halide_error(user_context, "Can't debug_to_file to a .mat file greater than 4GB\n"); return halide_error_code_debug_to_file_failed; } @@ -325,6 +436,10 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam return halide_error_code_debug_to_file_failed; } } else { + if (type_code == 10) { + return halide_error_code_debug_to_file_failed; + } + int32_t header[] = {shape[0].extent, shape[1].extent, shape[2].extent, @@ -370,7 +485,6 @@ WEAK extern "C" int halide_debug_to_file(void *user_context, const char *filenam const uint64_t zero = 0; if (final_padding_bytes) { if (final_padding_bytes > sizeof(zero)) { - halide_error(user_context, "Unexpectedly large final_padding_bytes"); return halide_error_code_debug_to_file_failed; } if (!f.write(&zero, final_padding_bytes)) { diff --git a/test/correctness/debug_to_file.cpp b/test/correctness/debug_to_file.cpp index 2b0aee28e8c0..780428c3389f 100644 --- a/test/correctness/debug_to_file.cpp +++ b/test/correctness/debug_to_file.cpp @@ -15,88 +15,93 @@ int main(int argc, char **argv) { return 0; } - std::string f_mat = Internal::get_test_tmp_dir() + "f.mat"; - std::string g_mat = Internal::get_test_tmp_dir() + "g.mat"; - std::string h_mat = Internal::get_test_tmp_dir() + "h.mat"; - - Internal::ensure_no_file_exists(f_mat); - Internal::ensure_no_file_exists(g_mat); - Internal::ensure_no_file_exists(h_mat); - - { - Func f, g, h, j; - Var x, y, z; - f(x, y, z) = cast(x + y + z); - g(x, y) = cast(f(x, y, 0) + f(x + 1, y, 1)); - h(x, y) = cast(f(x, y, -1) + g(x, y)); - - Target target = get_jit_target_from_environment(); - if (target.has_gpu_feature()) { - Var xi, yi; - f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_mat); - g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_mat); - h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_mat); - } else { - f.compute_root().debug_to_file(f_mat); - g.compute_root().debug_to_file(g_mat); - h.compute_root().debug_to_file(h_mat); - } + std::vector formats = {"npy", "mat"}; + for (const auto &format : formats) { + std::cout << "Testing format " << format << "...\n"; + + std::string f_path = Internal::get_test_tmp_dir() + "f." + format; + std::string g_path = Internal::get_test_tmp_dir() + "g." + format; + std::string h_path = Internal::get_test_tmp_dir() + "h." 
+ format; + + Internal::ensure_no_file_exists(f_path); + Internal::ensure_no_file_exists(g_path); + Internal::ensure_no_file_exists(h_path); + + { + Func f, g, h, j; + Var x, y, z; + f(x, y, z) = cast(x + y + z); + g(x, y) = cast(f(x, y, 0) + f(x + 1, y, 1)); + h(x, y) = cast(f(x, y, -1) + g(x, y)); + + Target target = get_jit_target_from_environment(); + if (target.has_gpu_feature()) { + Var xi, yi; + f.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(f_path); + g.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(g_path); + h.compute_root().gpu_tile(x, y, xi, yi, 1, 1).debug_to_file(h_path); + } else { + f.compute_root().debug_to_file(f_path); + g.compute_root().debug_to_file(g_path); + h.compute_root().debug_to_file(h_path); + } - Buffer im = h.realize({10, 10}, target); - } + Buffer im = h.realize({10, 10}, target); + } - { - Internal::assert_file_exists(f_mat); - Internal::assert_file_exists(g_mat); - Internal::assert_file_exists(h_mat); + { + Internal::assert_file_exists(f_path); + Internal::assert_file_exists(g_path); + Internal::assert_file_exists(h_path); + + Buffer f = Tools::load_image(f_path); + assert(f.dimensions() == 3 && + f.dim(0).extent() == 11 && + f.dim(1).extent() == 10 && + f.dim(2).extent() == 3); + + for (int z = 0; z < 3; z++) { + for (int y = 0; y < 10; y++) { + for (int x = 0; x < 11; x++) { + int32_t val = f(x, y, z); + // The min coord gets lost on debug_to_file, so f should be shifted up by one. + if (val != x + y + z - 1) { + printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y); + return 1; + } + } + } + } - Buffer f = Tools::load_image(f_mat); - assert(f.dimensions() == 3 && - f.dim(0).extent() == 11 && - f.dim(1).extent() == 10 && - f.dim(2).extent() == 3); + Buffer g = Tools::load_image(g_path); + assert(g.dimensions() == 2 && + g.dim(0).extent() == 10 && + g.dim(1).extent() == 10); - for (int z = 0; z < 3; z++) { for (int y = 0; y < 10; y++) { - for (int x = 0; x < 11; x++) { - int32_t val = f(x, y, z); - // The min coord gets lost on debug_to_file, so f should be shifted up by one. 
- if (val != x + y + z - 1) { - printf("f(%d, %d, %d) = %d instead of %d\n", x, y, z, val, x + y); + for (int x = 0; x < 10; x++) { + float val = g(x, y); + float correct = (float)(f(x, y, 1) + f(x + 1, y, 2)); + if (val != correct) { + printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct); return 1; } } } - } - Buffer g = Tools::load_image(g_mat); - assert(g.dimensions() == 2 && - g.dim(0).extent() == 10 && - g.dim(1).extent() == 10); - - for (int y = 0; y < 10; y++) { - for (int x = 0; x < 10; x++) { - float val = g(x, y); - float correct = (float)(f(x, y, 1) + f(x + 1, y, 2)); - if (val != correct) { - printf("g(%d, %d) = %f instead of %f\n", x, y, val, correct); - return 1; - } - } - } + Buffer h = Tools::load_image(h_path); + assert(h.dimensions() == 2 && + h.dim(0).extent() == 10 && + h.dim(1).extent() == 10); - Buffer h = Tools::load_image(h_mat); - assert(h.dimensions() == 2 && - h.dim(0).extent() == 10 && - h.dim(1).extent() == 10); - - for (int y = 0; y < 10; y++) { - for (int x = 0; x < 10; x++) { - int32_t val = h(x, y); - int32_t correct = f(x, y, 0) + g(x, y); - if (val != correct) { - printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct); - return 1; + for (int y = 0; y < 10; y++) { + for (int x = 0; x < 10; x++) { + int32_t val = h(x, y); + int32_t correct = f(x, y, 0) + g(x, y); + if (val != correct) { + printf("h(%d, %d) = %d instead of %d\n", x, y, val, correct); + return 1; + } } } } From f4c78317887b6df4d2486e1f81e81f9012943f0f Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 11 Apr 2024 15:07:20 -0700 Subject: [PATCH 103/186] Don't print on parallel task entry/exit with -debug flag (#8185) Fixes #8184 --- src/LowerParallelTasks.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/LowerParallelTasks.cpp b/src/LowerParallelTasks.cpp index a035b2af6d1f..70f47885528c 100644 --- a/src/LowerParallelTasks.cpp +++ b/src/LowerParallelTasks.cpp @@ -302,9 +302,6 @@ struct LowerParallelTasks : public IRMutator { // TODO(zvookin): Figure out how we want to handle name mangling of closures. // For now, the C++ backend makes them extern "C" so they have to be NameMangling::C. 
LoweredFunc closure_func{new_function_name, closure_args, std::move(wrapped_body), LinkageType::Internal, NameMangling::C}; - if (target.has_feature(Target::Debug)) { - debug_arguments(&closure_func, target); - } closure_implementations.emplace_back(std::move(closure_func)); } From 7994e7030976f9fcd321a4d1d5f76f4582e01905 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 16 Apr 2024 14:27:43 -0700 Subject: [PATCH 104/186] Fix corner case in if_then_else simplification (#8189) Co-authored-by: Steven Johnson --- src/Simplify_Call.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Simplify_Call.cpp b/src/Simplify_Call.cpp index 33d11ccb8d06..29bc75aa2bb2 100644 --- a/src/Simplify_Call.cpp +++ b/src/Simplify_Call.cpp @@ -576,7 +576,11 @@ Expr Simplify::visit(const Call *op, ExprInfo *bounds) { } in_unreachable = false; if (true_unreachable) { - return false_value; + if (false_value.defined()) { + return false_value; + } else { + return make_zero(op->type); + } } else if (false_unreachable) { return true_value; } From 4e0b313fa7f6d3897f960dd322cfd13daed97c98 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Thu, 18 Apr 2024 12:48:59 -0700 Subject: [PATCH 105/186] Rewrite IREquality to use a more compact stack instead of deep recursion (#8198) * Rewrite IREquality to use a more compact stack instead of deep recursion Deletes a bunch of code and speeds up lowering time of local laplacian with 20 pyramid levels by ~2.5% * clang-tidy * Fold in the version of equal in IRMatch.h/cpp * Add missing switch breaks * Add missing comments * Elaborate on why we treat NaNs as equal --- src/Associativity.cpp | 2 +- src/Bounds.cpp | 4 +- src/CSE.cpp | 18 +- src/IREquality.cpp | 1179 ++++++++++++++++--------------------- src/IREquality.h | 239 ++++---- src/IRMatch.cpp | 144 ----- src/IRMatch.h | 12 - src/ParallelRVar.cpp | 2 +- src/RDom.cpp | 2 +- src/ScheduleFunctions.cpp | 2 +- 10 files changed, 660 insertions(+), 944 deletions(-) diff --git a/src/Associativity.cpp b/src/Associativity.cpp index 39a0011391a6..6baa9e5fa7c6 100644 --- a/src/Associativity.cpp +++ b/src/Associativity.cpp @@ -145,7 +145,7 @@ bool associative_op_pattern_match(const Expr &e, debug(5) << "Adding result: " << iter.first << " -> " << iter.second << "\n"; match.emplace(iter.first, iter.second); } else { - if (!equal(iter.first, match_iter->first) || !equal(iter.second, match_iter->second)) { + if (iter.first != match_iter->first || !equal(iter.second, match_iter->second)) { return false; } } diff --git a/src/Bounds.cpp b/src/Bounds.cpp index d7d337dacfdf..a8ed2deba0d2 100644 --- a/src/Bounds.cpp +++ b/src/Bounds.cpp @@ -79,9 +79,9 @@ int static_sign(const Expr &x) { return -1; } else { Expr zero = make_zero(x.type()); - if (equal(const_true(), simplify(x > zero))) { + if (is_const_one(simplify(x > zero))) { return 1; - } else if (equal(const_true(), simplify(x < zero))) { + } else if (is_const_one(simplify(x < zero))) { return -1; } } diff --git a/src/CSE.cpp b/src/CSE.cpp index d8ecd619db81..0905562c4e63 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -76,7 +76,7 @@ class GVN : public IRMutator { Expr expr; int use_count = 0; // All consumer Exprs for which this is the last child Expr. 
- map uses; + map uses; Entry(const Expr &e) : expr(e) { } @@ -84,25 +84,15 @@ class GVN : public IRMutator { vector> entries; map shallow_numbering, output_numbering; - map leaves; + map leaves; - int number = -1; - - IRCompareCache cache; - - GVN() - : number(0), cache(8) { - } + int number = 0; Stmt mutate(const Stmt &s) override { internal_error << "Can't call GVN on a Stmt: " << s << "\n"; return Stmt(); } - ExprWithCompareCache with_cache(const Expr &e) { - return ExprWithCompareCache(e, &cache); - } - Expr mutate(const Expr &e) override { // Early out if we've already seen this exact Expr. { @@ -123,7 +113,7 @@ class GVN : public IRMutator { // that child has an identical parent to this one. auto &use_map = number == -1 ? leaves : entries[number]->uses; - auto p = use_map.emplace(with_cache(new_e), (int)entries.size()); + auto p = use_map.emplace(new_e, (int)entries.size()); auto iter = p.first; bool novel = p.second; if (novel) { diff --git a/src/IREquality.cpp b/src/IREquality.cpp index 0d21ca1e26b5..bb64c1035590 100644 --- a/src/IREquality.cpp +++ b/src/IREquality.cpp @@ -10,713 +10,561 @@ using std::vector; namespace { -/** The class that does the work of comparing two IR nodes. */ -class IRComparer : public IRVisitor { -public: - /** Different possible results of a comparison. Unknown should - * only occur internally due to a cache miss. */ - enum CmpResult { Unknown, - Equal, - LessThan, - GreaterThan }; - - /** The result of the comparison. Should be Equal, LessThan, or GreaterThan. */ - CmpResult result = Equal; - - /** Compare two expressions or statements and return the - * result. Returns the result immediately if it is already - * non-zero. */ - // @{ - CmpResult compare_expr(const Expr &a, const Expr &b); - CmpResult compare_stmt(const Stmt &a, const Stmt &b); - // @} - - /** If the expressions you're comparing may contain many repeated - * subexpressions, it's worth passing in a cache to use. - * Currently this is only done in common-subexpression - * elimination. 
*/ - IRComparer(IRCompareCache *c = nullptr) - : cache(c) { - } - -private: - Expr expr; - Stmt stmt; - IRCompareCache *cache; - - CmpResult compare_names(const std::string &a, const std::string &b); - CmpResult compare_types(Type a, Type b); - CmpResult compare_expr_vector(const std::vector &a, const std::vector &b); - - // Compare two things that already have a well-defined operator< - template - CmpResult compare_scalar(T a, T b); - - void visit(const IntImm *) override; - void visit(const UIntImm *) override; - void visit(const FloatImm *) override; - void visit(const StringImm *) override; - void visit(const Cast *) override; - void visit(const Reinterpret *) override; - void visit(const Variable *) override; - void visit(const Add *) override; - void visit(const Sub *) override; - void visit(const Mul *) override; - void visit(const Div *) override; - void visit(const Mod *) override; - void visit(const Min *) override; - void visit(const Max *) override; - void visit(const EQ *) override; - void visit(const NE *) override; - void visit(const LT *) override; - void visit(const LE *) override; - void visit(const GT *) override; - void visit(const GE *) override; - void visit(const And *) override; - void visit(const Or *) override; - void visit(const Not *) override; - void visit(const Select *) override; - void visit(const Load *) override; - void visit(const Ramp *) override; - void visit(const Broadcast *) override; - void visit(const Call *) override; - void visit(const Let *) override; - void visit(const LetStmt *) override; - void visit(const AssertStmt *) override; - void visit(const ProducerConsumer *) override; - void visit(const For *) override; - void visit(const Acquire *) override; - void visit(const Store *) override; - void visit(const Provide *) override; - void visit(const Allocate *) override; - void visit(const Free *) override; - void visit(const Realize *) override; - void visit(const Block *) override; - void visit(const Fork *) override; - void visit(const IfThenElse *) override; - void visit(const Evaluate *) override; - void visit(const Shuffle *) override; - void visit(const Prefetch *) override; - void visit(const Atomic *) override; - void visit(const VectorReduce *) override; - void visit(const HoistedStorage *) override; -}; - -template -IRComparer::CmpResult IRComparer::compare_scalar(T a, T b) { - if (result != Equal) { - return result; - } - - if constexpr (std::is_floating_point_v) { - // NaNs are equal to each other and less than non-nans - if (std::isnan(a) && std::isnan(b)) { - result = Equal; - return result; - } - if (std::isnan(a)) { - result = LessThan; - return result; +enum class Order { Equal, + LessThan, + GreaterThan }; + +// A helper class for comparing two pieces of IR with the minimum amount of +// recursion. +template +struct Comparer { + + // Points to any cache in use for comparing Expr graphs. Will be non-null + // exactly when cache_size > 0 + const IRNode **cache; + + // The compare method below does the actual work, but it needs to call out + // to a variety of template helper functions to compare specific types. We + // make the syntax in the giant switch statement in the compare method much + // simpler if we just give these helper functions access to the state in the + // compare method: The stack pointers, the currently-considered piece of + // IR, and the result of the comparison so far. 
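The callers' view is unchanged by this refactor: structural comparison still goes through the existing equal() helper declared in IREquality.h. A minimal sketch (editorial, not part of the patch):

```
#include "Halide.h"

int main() {
    Halide::Var x("x");
    Halide::Expr a = x + 1;
    Halide::Expr b = x + 1;
    bool same = Halide::Internal::equal(a, b);        // true: same structure
    bool different = Halide::Internal::equal(a, x);   // false
    return (same && !different) ? 0 : 1;
}
```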
+ const IRNode **stack_end = nullptr, **stack_ptr = nullptr; + const IRNode *next_a = nullptr, *next_b = nullptr; + Order result = Order::Equal; + + Comparer(const IRNode **cache) + : cache(cache) { + } + + // Compare the given member variable of next_a and next_b. If it's an Expr + // or Stmt, it's guaranteed to be defined. + template + HALIDE_ALWAYS_INLINE void cmp(MemberType Node::*member_ptr) { + if (result == Order::Equal) { + cmp(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr); } - if (std::isnan(b)) { - result = GreaterThan; - return result; - } - } - - if (a < b) { - result = LessThan; - } else if (a > b) { - result = GreaterThan; - } - - return result; -} - -IRComparer::CmpResult IRComparer::compare_expr(const Expr &a, const Expr &b) { - if (result != Equal) { - return result; - } - - if (a.same_as(b)) { - result = Equal; - return result; - } - - // Undefined values are equal to each other and less than defined values - if (!a.defined() && !b.defined()) { - result = Equal; - return result; - } - - if (!a.defined()) { - result = LessThan; - return result; } - if (!b.defined()) { - result = GreaterThan; - return result; - } - - // If in the future we have hashes for Exprs, this is a good place - // to compare the hashes: - // if (compare_scalar(a.hash(), b.hash()) != Equal) { - // return result; - // } - - if (compare_scalar(a->node_type, b->node_type) != Equal) { - return result; - } - - if (compare_types(a.type(), b.type()) != Equal) { - return result; - } - - // Check the cache - perhaps these exprs have already been compared and found equal. - if (cache && cache->contains(a, b)) { - result = Equal; - return result; - } - - expr = a; - b.accept(this); - - if (cache && result == Equal) { - cache->insert(a, b); - } - - return result; -} - -IRComparer::CmpResult IRComparer::compare_stmt(const Stmt &a, const Stmt &b) { - if (result != Equal) { - return result; - } - - if (a.same_as(b)) { - result = Equal; - return result; - } - - if (!a.defined() && !b.defined()) { - result = Equal; - return result; - } - - if (!a.defined()) { - result = LessThan; - return result; - } - - if (!b.defined()) { - result = GreaterThan; - return result; - } - - if (compare_scalar(a->node_type, b->node_type) != Equal) { - return result; - } - - stmt = a; - b.accept(this); - - return result; -} - -IRComparer::CmpResult IRComparer::compare_types(Type a, Type b) { - if (result != Equal) { - return result; - } - - compare_scalar(a.code(), b.code()); - compare_scalar(a.bits(), b.bits()); - compare_scalar(a.lanes(), b.lanes()); - - if (result != Equal) { - return result; - } - - const halide_handle_cplusplus_type *ha = a.handle_type; - const halide_handle_cplusplus_type *hb = b.handle_type; - - if (ha == hb) { - // Same handle type, or both not handles, or both void * - return result; + // The same as above, but with no guarantee. + template + HALIDE_ALWAYS_INLINE void cmp_if_defined(MemberType Node::*member_ptr) { + if (result == Order::Equal) { + cmp_if_defined(((const Node *)next_a)->*member_ptr, ((const Node *)next_b)->*member_ptr); + } } - if (ha == nullptr) { - // void* < T* - result = LessThan; - return result; - } + size_t hash(const IRNode *a, const IRNode *b) { + // A simple hash designed to get enough information into the low bits to + // avoid too many collisions, while being robust to weird things like + // having strided set of Exprs. 
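        // [Editor's note: illustrative only.] cache_size is assumed to be a
        // power of two, so the final mask below acts as h % cache_size. For
        // example, with cache_size = 256 the cache array holds 256 * 2
        // pointers; a pair (a, b) hashes to a slot h in [0, 255] and occupies
        // cache[2*h] and cache[2*h + 1]. A colliding pair simply overwrites
        // the slot, which is harmless: a cache miss only means the pair gets
        // compared again.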
+ uintptr_t pa = (uintptr_t)a; + uintptr_t pb = (uintptr_t)b; + uintptr_t h = (((pa * 17) ^ (pb * 13)) >> 4); + h ^= h >> 8; + h = h & (cache_size - 1); + return h; + } + + // See if we've already processed this pair of IR nodes + bool cache_contains(const IRNode *a, const IRNode *b) { + size_t h = hash(a, b); + const IRNode **c = cache + h * 2; + return (c[0] == a && c[1] == b); + } + + // Mark a pair of IR nodes as already processed. We don't do this until + // we're done processing their children, because there aren't going to be + // any queries to match a node with one of its children, because nodes can't + // be their own ancestors. Inserting it into the cache too soon just means + // it's going to be evicted before we need it. + void cache_insert(const IRNode *a, const IRNode *b) { + size_t h = hash(a, b); + const IRNode **c = cache + h * 2; + c[0] = a; + c[1] = b; + } + + // Compare two known-to-be-defined IR nodes. Well... don't actually compare + // them because that would be a recursive call. Just push them onto the + // pending tasks stack. + void cmp(const IRHandle &a, const IRHandle &b) { + if (cache_size > 0 && cache_contains(a.get(), b.get())) { + return; + } - if (hb == nullptr) { - // T* > void* - result = GreaterThan; - return result; + if (a.get() == b.get()) { + } else if (stack_ptr == stack_end) { + // Out of stack space. Make a recursive call to buy some more stack. + Comparer sub_comparer(cache); + result = sub_comparer.compare(*(a.get()), *(b.get())); + } else { + *stack_ptr++ = a.get(); + *stack_ptr++ = b.get(); + } } - // They're both non-void handle types with distinct type info - // structs. We now need to distinguish between different C++ - // pointer types (e.g. char * vs const float *). If would be nice - // if the structs were unique per C++ type. Then comparing the - // pointers above would be sufficient. Unfortunately, different - // shared libraries in the same process each create a distinct - // struct for the same type. We therefore have to do a deep - // comparison of the type info fields. - - compare_scalar(ha->reference_type, hb->reference_type); - compare_names(ha->inner_name.name, hb->inner_name.name); - compare_scalar(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type); - compare_scalar(ha->namespaces.size(), hb->namespaces.size()); - compare_scalar(ha->enclosing_types.size(), hb->enclosing_types.size()); - compare_scalar(ha->cpp_type_modifiers.size(), hb->cpp_type_modifiers.size()); - - if (result != Equal) { - return result; + // Compare two IR nodes, which may or may not be defined. 
+ HALIDE_ALWAYS_INLINE + void cmp_if_defined(const IRHandle &a, const IRHandle &b) { + if (a.defined() < b.defined()) { + result = Order::LessThan; + } else if (a.defined() > b.defined()) { + result = Order::GreaterThan; + } else if (a.defined() && b.defined()) { + cmp(a, b); + } } - for (size_t i = 0; i < ha->namespaces.size(); i++) { - compare_names(ha->namespaces[i], hb->namespaces[i]); + template + void cmp(const std::vector &a, const std::vector &b) { + if (a.size() < b.size()) { + result = Order::LessThan; + } else if (a.size() > b.size()) { + result = Order::GreaterThan; + } else { + for (size_t i = 0; i < a.size() && result == Order::Equal; i++) { + cmp(a[i], b[i]); + } + } } - if (result != Equal) { - return result; + HALIDE_ALWAYS_INLINE + void cmp(const Range &a, const Range &b) { + cmp(a.min, b.min); + cmp(a.extent, b.extent); + } + + HALIDE_ALWAYS_INLINE + void cmp(const ModulusRemainder &a, const ModulusRemainder &b) { + cmp(a.modulus, b.modulus); + cmp(a.remainder, b.remainder); + } + + void cmp(const halide_handle_cplusplus_type *ha, + const halide_handle_cplusplus_type *hb) { + if (ha == hb) { + return; + } else if (!ha) { + result = Order::LessThan; + } else if (!hb) { + result = Order::GreaterThan; + } else { + // They're both non-void handle types with distinct type info + // structs. We now need to distinguish between different C++ + // pointer types (e.g. char * vs const float *). If would be nice + // if the structs were unique per C++ type. Then comparing the + // pointers above would be sufficient. Unfortunately, different + // shared libraries in the same process each create a distinct + // struct for the same type. We therefore have to do a deep + // comparison of the type info fields. + cmp(ha->reference_type, hb->reference_type); + cmp(ha->inner_name.name, hb->inner_name.name); + cmp(ha->inner_name.cpp_type_type, hb->inner_name.cpp_type_type); + cmp(ha->namespaces, hb->namespaces); + cmp(ha->enclosing_types, hb->enclosing_types); + cmp(ha->cpp_type_modifiers, hb->cpp_type_modifiers); + } } - for (size_t i = 0; i < ha->enclosing_types.size(); i++) { - compare_scalar(ha->enclosing_types[i].cpp_type_type, - hb->enclosing_types[i].cpp_type_type); - compare_names(ha->enclosing_types[i].name, - hb->enclosing_types[i].name); + HALIDE_ALWAYS_INLINE + void cmp(const Type &a, const Type &b) { + uint32_t ta = ((halide_type_t)a).as_u32(); + uint32_t tb = ((halide_type_t)b).as_u32(); + if (ta < tb) { + result = Order::LessThan; + } else if (ta > tb) { + result = Order::GreaterThan; + } else { + if (a.handle_type || b.handle_type) { + cmp(a.handle_type, b.handle_type); + } + } } - if (result != Equal) { - return result; + void cmp(const PrefetchDirective &a, const PrefetchDirective &b) { + cmp(a.name, b.name); + cmp(a.at, b.at); + cmp(a.from, b.from); + cmp(a.offset, b.offset); + cmp(a.strategy, b.strategy); } - for (size_t i = 0; i < ha->cpp_type_modifiers.size(); i++) { - compare_scalar(ha->cpp_type_modifiers[i], - hb->cpp_type_modifiers[i]); + HALIDE_ALWAYS_INLINE + void cmp(double a, double b) { + // Floating point scalars need special handling, due to NaNs. + if (std::isnan(a) && std::isnan(b)) { + // Under numeric rules, NaNs aren't equal, but we're not actually + // comparing numbers here. We are comparing IR nodes to see if + // they'll compile to the same thing. Two NaN FloatImms will compile + // to the same thing, so they should be considered equal in this + // context, so we leave comparison state unchanged. 
+ // + // Note however that we consider -0 equal to 0 here, because + // otherwise you get tedious problems like std::nearbyint(-0.5) with + // round-to-nearest mode leaving it platform-dependent whether you + // get -0 or 0. So if we say -0 != 0, our constant folding would be + // platform-dependent. + } else if (std::isnan(a)) { + result = Order::LessThan; + } else if (std::isnan(b)) { + result = Order::GreaterThan; + } else if (a < b) { + result = Order::LessThan; + } else if (b < a) { + result = Order::GreaterThan; + } } - return result; -} - -IRComparer::CmpResult IRComparer::compare_names(const string &a, const string &b) { - if (result != Equal) { - return result; + HALIDE_ALWAYS_INLINE + void cmp(const std::string &a, const std::string &b) { + int r = a.compare(b); + if (r < 0) { + result = Order::LessThan; + } else if (r > 0) { + result = Order::GreaterThan; + } } - int string_cmp = a.compare(b); - if (string_cmp < 0) { - result = LessThan; - } else if (string_cmp > 0) { - result = GreaterThan; + // The method to use whenever we can just use operator< and get a bool. + template && + std::is_same_v() < std::declval()), bool>>> + HALIDE_NEVER_INLINE void cmp(const T &a, const T &b) { + if (a < b) { + result = Order::LessThan; + } else if (b < a) { + result = Order::GreaterThan; + } } - return result; -} + Order compare(const IRNode &root_a, const IRNode &root_b) { + constexpr size_t stack_size = 64; // 1 kb + const IRNode *stack_storage[stack_size * 2]; // Intentionally uninitialized + + stack_ptr = stack_storage; + stack_end = stack_storage + stack_size * 2; + result = Order::Equal; + + *stack_ptr++ = &root_a; + *stack_ptr++ = &root_b; + + while (result == Order::Equal && stack_ptr > stack_storage) { + stack_ptr -= 2; + next_a = stack_ptr[0]; + next_b = stack_ptr[1]; + + if (next_a == next_b) { + continue; + } + + if (cache_size > 0 && (((uintptr_t)next_a) & 1)) { + // If we are using a cache, we want to keep the nodes on the + // stack while processing their children, but mark them with a + // tombstone. We'll flip the low bit to 1 for our tombstone. We + // want to insert them into the cache when the tombstone is + // handled. This if statement triggers if we just hit a + // tombstone. + cache_insert((const IRNode *)((uintptr_t)next_a ^ 1), next_b); + continue; + } + + cmp(next_a->node_type, next_b->node_type); + if (result != Order::Equal) { + break; + } + + if (next_a->node_type < IRNodeType::LetStmt) { + cmp(&BaseExprNode::type); + } + + if (cache_size > 0) { + // Keep the parent nodes on the stack, but mark them with a + // tombstone bit. 
+ stack_ptr[0] = (const IRNode *)(((uintptr_t)next_a) | 1); + stack_ptr += 2; + } + + switch (next_a->node_type) { + case IRNodeType::IntImm: + cmp(&IntImm::value); + break; + case IRNodeType::UIntImm: + cmp(&UIntImm::value); + break; + case IRNodeType::FloatImm: + cmp(&FloatImm::value); + break; + case IRNodeType::StringImm: + cmp(&StringImm::value); + break; + case IRNodeType::Broadcast: + cmp(&Broadcast::value); + break; + case IRNodeType::Cast: + cmp(&Cast::value); + break; + case IRNodeType::Reinterpret: + cmp(&Cast::value); + break; + case IRNodeType::Variable: + cmp(&Variable::name); + break; + case IRNodeType::Add: + cmp(&Add::a); + cmp(&Add::b); + break; + case IRNodeType::Sub: + cmp(&Sub::a); + cmp(&Sub::b); + break; + case IRNodeType::Mod: + cmp(&Mod::a); + cmp(&Mod::b); + break; + case IRNodeType::Mul: + cmp(&Mul::a); + cmp(&Mul::b); + break; + case IRNodeType::Div: + cmp(&Div::a); + cmp(&Div::b); + break; + case IRNodeType::Min: + cmp(&Min::a); + cmp(&Min::b); + break; + case IRNodeType::Max: + cmp(&Max::a); + cmp(&Max::b); + break; + case IRNodeType::EQ: + cmp(&EQ::a); + cmp(&EQ::b); + break; + case IRNodeType::NE: + cmp(&NE::a); + cmp(&NE::b); + break; + case IRNodeType::LT: + cmp(<::a); + cmp(<::b); + break; + case IRNodeType::LE: + cmp(&LE::a); + cmp(&LE::b); + break; + case IRNodeType::GT: + cmp(>::a); + cmp(>::b); + case IRNodeType::GE: + cmp(&GE::a); + cmp(&GE::b); + break; + case IRNodeType::And: + cmp(&And::a); + cmp(&And::b); + break; + case IRNodeType::Or: + cmp(&Or::a); + cmp(&Or::b); + break; + case IRNodeType::Not: + cmp(&Not::a); + break; + case IRNodeType::Select: + cmp(&Select::condition); + cmp(&Select::true_value); + cmp(&Select::false_value); + break; + case IRNodeType::Load: + cmp(&Load::name); + cmp(&Load::alignment); + cmp(&Load::index); + cmp(&Load::predicate); + break; + case IRNodeType::Ramp: + cmp(&Ramp::stride); + cmp(&Ramp::base); + break; + case IRNodeType::Call: + cmp(&Call::name); + cmp(&Call::call_type); + cmp(&Call::value_index); + cmp(&Call::args); + break; + case IRNodeType::Let: + cmp(&Let::name); + cmp(&Let::value); + cmp(&Let::body); + break; + case IRNodeType::Shuffle: + cmp(&Shuffle::indices); + cmp(&Shuffle::vectors); + break; + case IRNodeType::VectorReduce: + cmp(&VectorReduce::op); + cmp(&VectorReduce::value); + break; + case IRNodeType::LetStmt: + cmp(&LetStmt::name); + cmp(&LetStmt::value); + cmp(&LetStmt::body); + break; + case IRNodeType::AssertStmt: + cmp(&AssertStmt::condition); + cmp(&AssertStmt::message); + break; + case IRNodeType::ProducerConsumer: + cmp(&ProducerConsumer::name); + cmp(&ProducerConsumer::is_producer); + cmp(&ProducerConsumer::body); + break; + case IRNodeType::For: + cmp(&For::name); + cmp(&For::for_type); + cmp(&For::device_api); + cmp(&For::partition_policy); + cmp(&For::min); + cmp(&For::extent); + cmp(&For::body); + break; + case IRNodeType::Acquire: + cmp(&Acquire::semaphore); + cmp(&Acquire::count); + cmp(&Acquire::body); + break; + case IRNodeType::Store: + cmp(&Store::name); + cmp(&Store::alignment); + cmp(&Store::predicate); + cmp(&Store::value); + cmp(&Store::index); + break; + case IRNodeType::Provide: + cmp(&Provide::name); + cmp(&Provide::args); + cmp(&Provide::values); + break; + case IRNodeType::Allocate: + cmp(&Allocate::name); + cmp(&Allocate::type); + cmp(&Allocate::free_function); + cmp_if_defined(&Allocate::new_expr); + cmp(&Allocate::condition); + cmp(&Allocate::extents); + cmp(&Allocate::body); + break; + case IRNodeType::Free: + cmp(&Free::name); + break; + case IRNodeType::Realize: 
+ cmp(&Realize::name); + cmp(&Realize::types); + cmp(&Realize::bounds); + cmp(&Realize::body); + cmp(&Realize::condition); + break; + case IRNodeType::Block: + cmp(&Block::first); + cmp(&Block::rest); + break; + case IRNodeType::Fork: + cmp(&Fork::first); + cmp(&Fork::rest); + break; + case IRNodeType::IfThenElse: + cmp(&IfThenElse::condition); + cmp(&IfThenElse::then_case); + cmp_if_defined(&IfThenElse::else_case); + break; + case IRNodeType::Evaluate: + cmp(&Evaluate::value); + break; + case IRNodeType::Prefetch: + cmp(&Prefetch::name); + cmp(&Prefetch::types); + cmp(&Prefetch::prefetch); + cmp(&Prefetch::bounds); + cmp(&Prefetch::condition); + cmp(&Prefetch::body); + break; + case IRNodeType::Atomic: + cmp(&Atomic::producer_name); + cmp(&Atomic::mutex_name); + cmp(&Atomic::body); + break; + case IRNodeType::HoistedStorage: + cmp(&HoistedStorage::name); + cmp(&HoistedStorage::body); + break; + } + } -IRComparer::CmpResult IRComparer::compare_expr_vector(const vector &a, const vector &b) { - if (result != Equal) { + // Don't hold onto pointers to this stack frame. + stack_ptr = stack_end = nullptr; return result; } - - compare_scalar(a.size(), b.size()); - for (size_t i = 0; (i < a.size()) && result == Equal; i++) { - compare_expr(a[i], b[i]); - } - - return result; -} - -void IRComparer::visit(const IntImm *op) { - const IntImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const UIntImm *op) { - const UIntImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const FloatImm *op) { - const FloatImm *e = expr.as(); - compare_scalar(e->value, op->value); -} - -void IRComparer::visit(const StringImm *op) { - const StringImm *e = expr.as(); - compare_names(e->value, op->value); -} - -void IRComparer::visit(const Cast *op) { - compare_expr(expr.as()->value, op->value); -} - -void IRComparer::visit(const Reinterpret *op) { - compare_expr(expr.as()->value, op->value); -} - -void IRComparer::visit(const Variable *op) { - const Variable *e = expr.as(); - compare_names(e->name, op->name); -} - -namespace { -template -void visit_binary_operator(IRComparer *cmp, const T *op, Expr expr) { - const T *e = expr.as(); - cmp->compare_expr(e->a, op->a); - cmp->compare_expr(e->b, op->b); -} -} // namespace - -void IRComparer::visit(const Add *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Sub *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Mul *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Div *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Mod *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Min *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Max *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const EQ *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const NE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const LT *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const LE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const GT *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const GE *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const And *op) { - visit_binary_operator(this, op, expr); -} -void IRComparer::visit(const Or *op) { - visit_binary_operator(this, 
 op, expr);
-}
-
-void IRComparer::visit(const Not *op) {
-    const Not *e = expr.as<Not>();
-    compare_expr(e->a, op->a);
-}
-
-void IRComparer::visit(const Select *op) {
-    const Select *e = expr.as