diff --git a/apps/hexagon_benchmarks/CMakeLists.txt b/apps/hexagon_benchmarks/CMakeLists.txt index 9cbcc541b76a..c01ad22035bd 100644 --- a/apps/hexagon_benchmarks/CMakeLists.txt +++ b/apps/hexagon_benchmarks/CMakeLists.txt @@ -22,23 +22,24 @@ endmacro() add_generator_and_library(dilate3x3) add_generator_and_library(gaussian5x5) add_generator_and_library(median3x3) +add_generator_and_library(sobel) # Main executable add_executable(process process.cpp) target_compile_options(process PRIVATE $<$:-O2>) if (Halide_TARGET MATCHES "hvx") - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 TARGET_HAS_HVX) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL TARGET_HAS_HVX) else() - target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3) + target_compile_definitions(process PRIVATE DILATE3X3 GAUSSIAN5X5 MEDIAN3X3 SOBEL) endif() target_link_libraries(process PRIVATE Halide::Tools - dilate3x3 gaussian5x5 median3x3) + dilate3x3 gaussian5x5 median3x3 sobel) # Test that the app actually works! add_test(NAME hexagon_benchmarks COMMAND process -n 1) set_tests_properties(hexagon_benchmarks PROPERTIES LABELS hexagon_benchmarks PASS_REGULAR_EXPRESSION "Success!" - SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") + SKIP_REGULAR_EXPRESSION "\\[SKIP\\]") \ No newline at end of file diff --git a/apps/hexagon_benchmarks/process.cpp b/apps/hexagon_benchmarks/process.cpp index 87a492c577d1..def519963ad0 100644 --- a/apps/hexagon_benchmarks/process.cpp +++ b/apps/hexagon_benchmarks/process.cpp @@ -43,10 +43,11 @@ int main(int argc, char **argv) { Dilate3x3Descriptor dilate3x3_pipeine(W, H); Median3x3Descriptor median3x3_pipeline(W, H); Gaussian5x5Descriptor gaussian5x5_pipeline(W, H); + SobelDescriptor sobel_pipeline(W, H); Conv3x3a32Descriptor conv3x3a32_pipeline(W, H); std::vector pipelines = {&conv3x3a16_pipeline, &dilate3x3_pipeine, &median3x3_pipeline, - &gaussian5x5_pipeline, &conv3x3a32_pipeline}; + &gaussian5x5_pipeline, &sobel_pipeline, &conv3x3a32_pipeline}; for (PipelineDescriptorBase *p : pipelines) { if (!p->defined()) { @@ -85,4 +86,4 @@ int main(int argc, char **argv) { printf("Success!\n"); return 0; -} +} \ No newline at end of file diff --git a/apps/onnx/Makefile b/apps/onnx/Makefile index f714b0254b75..5188c1c85068 100644 --- a/apps/onnx/Makefile +++ b/apps/onnx/Makefile @@ -90,7 +90,12 @@ ifeq ($(UNAME), Darwin) # Keep OS X builds from complaining about missing libpython symbols PYBIND11_CFLAGS += -undefined dynamic_lookup endif -PY_EXT = $(shell $(PYTHON)-config --extension-suffix) +# Get the python extension module suffix from python itself. You can +# also do this with python-config, but that's not resistant to version +# mismatches between python and python-config. This can happen when +# using a virtualenv, because virtualenvs override python, but not +# python-config. 
+PY_EXT = $(shell $(PYTHON) -c 'import sysconfig; print(sysconfig.get_config_var("EXT_SUFFIX"))') PY_MODEL_EXT = model_cpp$(PY_EXT) PYCXXFLAGS = $(PYBIND11_CFLAGS) $(CXXFLAGS) -Wno-deprecated-register diff --git a/src/AddAtomicMutex.cpp b/src/AddAtomicMutex.cpp index a2bf990e38f6..cf3b0ae8bb89 100644 --- a/src/AddAtomicMutex.cpp +++ b/src/AddAtomicMutex.cpp @@ -1,5 +1,4 @@ #include "AddAtomicMutex.h" - #include "ExprUsesVar.h" #include "Func.h" #include "IREquality.h" @@ -11,14 +10,10 @@ namespace Halide { namespace Internal { -using std::map; -using std::set; -using std::string; - namespace { /** Collect names of all stores matching the producer name inside a statement. */ -class CollectProducerStoreNames : public IRGraphVisitor { +class CollectProducerStoreNames : public IRVisitor { public: CollectProducerStoreNames(const std::string &producer_name) : producer_name(producer_name) { @@ -27,12 +22,12 @@ class CollectProducerStoreNames : public IRGraphVisitor { Scope store_names; protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { - // This is a Store for the desginated Producer. + // This is a Store for the designated Producer. store_names.push(op->name); } } @@ -42,7 +37,7 @@ class CollectProducerStoreNames : public IRGraphVisitor { /** Find Store inside of an Atomic node for the designated producer * and return their indices. */ -class FindProducerStoreIndex : public IRGraphVisitor { +class FindProducerStoreIndex : public IRVisitor { public: FindProducerStoreIndex(const std::string &producer_name) : producer_name(producer_name) { @@ -51,11 +46,11 @@ class FindProducerStoreIndex : public IRGraphVisitor { Expr index; // The returned index. protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; // Need to also extract the let bindings of a Store index. void visit(const Let *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -63,7 +58,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } } void visit(const LetStmt *op) override { - IRGraphVisitor::visit(op); // Make sure we visit the Store first. + IRVisitor::visit(op); // Make sure we visit the Store first. if (index.defined()) { if (expr_uses_var(index, op->name)) { index = Let::make(op->name, op->value, index); @@ -72,7 +67,7 @@ class FindProducerStoreIndex : public IRGraphVisitor { } void visit(const Store *op) override { - IRGraphVisitor::visit(op); + IRVisitor::visit(op); if (op->name == producer_name || starts_with(op->name, producer_name + ".")) { // This is a Store for the designated producer. @@ -94,11 +89,13 @@ class FindProducerStoreIndex : public IRGraphVisitor { /** Throws an assertion for cases where the indexing on left-hand-side of * an atomic update references to itself. * e.g. f(clamp(f(r), 0, 100)) = f(r) + 1 should be rejected. */ -class CheckAtomicValidity : public IRGraphVisitor { +class CheckAtomicValidity : public IRVisitor { protected: - using IRGraphVisitor::visit; + using IRVisitor::visit; void visit(const Atomic *op) override { + any_atomic = true; + // Collect the names of all Store nodes inside. 
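The doc comment on CheckAtomicValidity above is easier to read next to a user-level example. The sketch below is illustrative only (the input `in` and the bin count are assumed, not taken from this patch): the update shown is the accepted atomic scatter, and the trailing comment restates the self-referencing indexing the pass rejects.

```cpp
#include "Halide.h"
using namespace Halide;

void atomic_histogram_sketch() {
    ImageParam in(UInt(8), 1, "in");  // assumed input
    Func hist("hist");
    Var x("x");
    RDom r(0, 1024);

    hist(x) = 0;
    hist(clamp(cast<int>(in(r)), 0, 255)) += 1;  // index never reads hist itself
    hist.update().atomic().parallel(r);          // accepted: a plain atomic scatter

    // The rejected shape is an update whose *index* loads the function being
    // updated, e.g. f(clamp(f(r), 0, 100)) = f(r) + 1. No per-element lock can
    // make the load in the index agree with the store, so lowering errors out.
}
```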
CollectProducerStoreNames collector(op->producer_name); op->body.accept(&collector); @@ -115,13 +112,16 @@ class CheckAtomicValidity : public IRGraphVisitor { } op->body.accept(this); } + +public: + bool any_atomic = false; }; /** Search if the value of a Store node has a variable pointing to a let binding, * where the let binding contains the Store location. Use for checking whether * we need a mutex lock for Atomic since some lowering pass before lifted a let * binding from the Store node (currently only SplitTuple would do this). */ -class FindAtomicLetBindings : public IRGraphVisitor { +class FindAtomicLetBindings : public IRVisitor { public: FindAtomicLetBindings(const Scope &store_names) : store_names(store_names) { @@ -133,18 +133,18 @@ class FindAtomicLetBindings : public IRGraphVisitor { using IRVisitor::visit; void visit(const Let *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } void visit(const LetStmt *op) override { - include(op->value); + op->value.accept(this); { ScopedBinding bind(let_bindings, op->name, op->value); - include(op->body); + op->body.accept(this); } } @@ -159,19 +159,19 @@ class FindAtomicLetBindings : public IRGraphVisitor { } void visit(const Store *op) override { - include(op->predicate); + op->predicate.accept(this); + op->index.accept(this); if (store_names.contains(op->name)) { // If we are in a designated store and op->value has a let binding // that uses one of the store_names, we found a lifted let. - ScopedValue old_inside_store(inside_store, op->name); - include(op->value); + ScopedValue old_inside_store(inside_store, op->name); + op->value.accept(this); } else { - include(op->value); + op->value.accept(this); } - include(op->index); } - string inside_store; + std::string inside_store; const Scope &store_names; Scope let_bindings; }; @@ -179,7 +179,7 @@ class FindAtomicLetBindings : public IRGraphVisitor { /** Clear out the Atomic node's mutex usages if it doesn't need one. */ class RemoveUnnecessaryMutexUse : public IRMutator { public: - set remove_mutex_lock_names; + std::set remove_mutex_lock_names; protected: using IRMutator::visit; @@ -200,30 +200,30 @@ class RemoveUnnecessaryMutexUse : public IRMutator { remove_mutex_lock_names.insert(op->mutex_name); Stmt body = mutate(op->body); return Atomic::make(op->producer_name, - string(), + std::string{}, std::move(body)); } } }; /** Find Store inside an Atomic that matches the provided store_names. */ -class FindStoreInAtomicMutex : public IRGraphVisitor { +class FindStoreInAtomicMutex : public IRVisitor { public: - using IRGraphVisitor::visit; + using IRVisitor::visit; FindStoreInAtomicMutex(const std::set &store_names) : store_names(store_names) { } bool found = false; - string producer_name; - string mutex_name; + std::string producer_name; + std::string mutex_name; protected: void visit(const Atomic *op) override { if (!found && !op->mutex_name.empty()) { ScopedValue old_in_atomic_mutex(in_atomic_mutex, true); - include(op->body); + op->body.accept(this); if (found) { // We found a Store inside Atomic with matching name, // record the mutex information. 
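FindAtomicLetBindings is what decides between the lock-free path and the mutex path. A hedged, user-level illustration (my own example, not code from this patch): a tuple reduction such as an argmax reads its previous value, SplitTuple lifts that read into a let binding over the Store, and that lifted let is exactly what this visitor detects, so the Atomic keeps its per-element mutex.

```cpp
#include "Halide.h"
using namespace Halide;

void atomic_argmax_sketch() {
    ImageParam in(Float(32), 1, "in");  // assumed input
    Func arg_max("arg_max");
    RDom r(0, 1000);

    arg_max() = Tuple(0, in(0));
    arg_max() = Tuple(select(in(r) > arg_max()[1], r, arg_max()[0]),
                      max(in(r), arg_max()[1]));

    // The update reads arg_max's own previous value, so after SplitTuple the
    // Atomic body contains a lifted let loading the store location; the
    // mutex therefore survives RemoveUnnecessaryMutexUse.
    arg_max.update().atomic().parallel(r);
}
```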
@@ -231,7 +231,7 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { mutex_name = op->mutex_name; } } else { - include(op->body); + op->body.accept(this); } } @@ -241,11 +241,11 @@ class FindStoreInAtomicMutex : public IRGraphVisitor { found = true; } } - IRGraphVisitor::visit(op); + IRVisitor::visit(op); } bool in_atomic_mutex = false; - const set &store_names; + const std::set &store_names; }; /** Replace the indices in the Store nodes with the specified variable. */ @@ -276,26 +276,32 @@ class ReplaceStoreIndexWithVar : public IRMutator { /** Add mutex allocation & lock & unlock if required. */ class AddAtomicMutex : public IRMutator { public: - AddAtomicMutex(const map &env) - : env(env) { + AddAtomicMutex(const std::vector &o) { + for (const Function &f : o) { + outputs.emplace(f.name(), f); + } } protected: using IRMutator::visit; - const map &env; - // The set of producers that have allocated a mutex buffer - set allocated_mutexes; + // Maps from a producer name to a mutex name, for all encountered atomic + // nodes. + Scope needs_mutex_allocation; - Stmt allocate_mutex(const string &mutex_name, Expr extent, Stmt body) { + // Pipeline outputs + std::map outputs; + + Stmt allocate_mutex(const std::string &mutex_name, Expr extent, Stmt body) { Expr mutex_array = Call::make(type_of(), "halide_mutex_array_create", {std::move(extent)}, Call::Extern); + // Allocate a scalar of halide_mutex_array. // This generates halide_mutex_array mutex[1]; body = Allocate::make(mutex_name, - Handle(), + type_of(), MemoryType::Stack, {}, const_true(), @@ -309,37 +315,44 @@ class AddAtomicMutex : public IRMutator { // If this Allocate node is allocating a buffer for a producer, // and there is a Store node inside of an Atomic node requiring mutex lock // matching the name of the Allocate, allocate a mutex lock. - set store_names{op->name}; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + Stmt body = mutate(op->body); + + std::string producer_name; + if (ends_with(op->name, ".0")) { + producer_name = op->name.substr(0, op->name.size() - 2); + } else { + producer_name = op->name; } - allocated_mutexes.insert(finder.mutex_name); + if (const std::string *mutex_name = needs_mutex_allocation.find(producer_name)) { + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (const Expr &e : op->extents) { + extent = extent * e; + } - const string &mutex_name = finder.mutex_name; - Stmt body = mutate(op->body); - Expr extent = Expr(1); - for (const Expr &e : op->extents) { - extent = extent * e; + body = allocate_mutex(*mutex_name, extent, body); + + // At this stage in lowering it should be impossible to have an + // allocation that shadows the name of an outer allocation, but may as + // well handle it anyway by using a scope and popping at each allocate + // node. 
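A short standalone sketch (illustrative, not code from the patch) of why the mutex-array extent above is now accumulated in uint64_t, per the "to handle LargeBuffers" comment: the product of a large allocation's extents can exceed INT32_MAX.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // e.g. a 2-D allocation of 65536 x 65536 elements
    std::vector<uint64_t> extents = {65536, 65536};
    uint64_t extent = 1;  // same seed as the cast to uint64_t above
    for (uint64_t e : extents) {
        extent *= e;
    }
    printf("extent = %llu (INT32_MAX = %d)\n",
           (unsigned long long)extent, INT32_MAX);  // 4294967296 vs 2147483647
    return 0;
}
```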
+ needs_mutex_allocation.pop(producer_name); + } + + if (body.same_as(op->body)) { + return op; + } else { + return Allocate::make(op->name, + op->type, + op->memory_type, + op->extents, + op->condition, + std::move(body), + op->new_expr, + op->free_function, + op->padding); } - body = allocate_mutex(mutex_name, extent, body); - return Allocate::make(op->name, - op->type, - op->memory_type, - op->extents, - op->condition, - std::move(body), - op->new_expr, - op->free_function, - op->padding); } Stmt visit(const ProducerConsumer *op) override { @@ -348,50 +361,35 @@ class AddAtomicMutex : public IRMutator { // buffer at the producer node. if (!op->is_producer) { - // This is a consumer. + // This is a consumer return IRMutator::visit(op); } - // Find the corresponding output. - auto func_it = env.find(op->name); - if (func_it == env.end()) { - // Not an output. - return IRMutator::visit(op); - } - Func f = Func(func_it->second); - if (f.output_buffers().empty()) { - // Not an output. + auto it = outputs.find(op->name); + if (it == outputs.end()) { + // Not an output return IRMutator::visit(op); } - set store_names; - for (const auto &buffer : f.output_buffers()) { - store_names.insert(buffer.name()); - } + Function f = it->second; - FindStoreInAtomicMutex finder(store_names); - op->body.accept(&finder); - if (!finder.found) { - // No Atomic node that requires mutex lock from this node inside. - return IRMutator::visit(op); - } + Stmt body = mutate(op->body); - if (allocated_mutexes.find(finder.mutex_name) != allocated_mutexes.end()) { - // We've already allocated a mutex. - return IRMutator::visit(op); + if (const std::string *mutex_name = needs_mutex_allocation.find(it->first)) { + // All output buffers in a Tuple have the same extent. + OutputImageParam output_buffer = Func(f).output_buffers()[0]; + Expr extent = cast(1); // uint64_t to handle LargeBuffers + for (int i = 0; i < output_buffer.dimensions(); i++) { + extent *= output_buffer.dim(i).extent(); + } + body = allocate_mutex(*mutex_name, extent, body); } - allocated_mutexes.insert(finder.mutex_name); - - // We assume all output buffers in a Tuple have the same extent. - OutputImageParam output_buffer = f.output_buffers()[0]; - Expr extent = Expr(1); - for (int i = 0; i < output_buffer.dimensions(); i++) { - extent = extent * output_buffer.dim(i).extent(); + if (body.same_as(op->body)) { + return op; + } else { + return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } - Stmt body = mutate(op->body); - body = allocate_mutex(finder.mutex_name, extent, body); - return ProducerConsumer::make(op->name, op->is_producer, std::move(body)); } Stmt visit(const Atomic *op) override { @@ -414,7 +412,7 @@ class AddAtomicMutex : public IRMutator { // Lift the index outside of the atomic node. // This is for avoiding side-effects inside those expressions // being evaluated twice. 
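A plain-C++ sketch of the evaluate-once guarantee the comment above is after; `next_slot` is an invented stand-in for a store index with side effects, and the local `t` plays the role of the lifted let (the unique_name('t') variable in the hunk that follows).

```cpp
#include <cassert>

static int evaluations = 0;
static int next_slot() {  // stand-in for a store index with side effects
    ++evaluations;
    return 7;
}

int main() {
    int t = next_slot();  // the "let": the index is evaluated exactly once
    int lock_index = t;   // value handed to halide_mutex_array_lock()
    int store_index = t;  // value used by the Store node itself
    assert(evaluations == 1);
    assert(lock_index == store_index);
    return 0;
}
```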
- string name = unique_name('t'); + std::string name = unique_name('t'); index_let = index; index = Variable::make(index.type(), name); body = ReplaceStoreIndexWithVar(op->producer_name, index).mutate(body); @@ -444,17 +442,21 @@ class AddAtomicMutex : public IRMutator { internal_assert(index.as() != nullptr); ret = LetStmt::make(index.as()->name, index_let, ret); } + needs_mutex_allocation.push(op->producer_name, op->mutex_name); + return ret; } }; } // namespace -Stmt add_atomic_mutex(Stmt s, const map &env) { +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs) { CheckAtomicValidity check; s.accept(&check); - s = RemoveUnnecessaryMutexUse().mutate(s); - s = AddAtomicMutex(env).mutate(s); + if (check.any_atomic) { + s = RemoveUnnecessaryMutexUse().mutate(s); + s = AddAtomicMutex(outputs).mutate(s); + } return s; } diff --git a/src/AddAtomicMutex.h b/src/AddAtomicMutex.h index c27b0346f349..5b11de621e97 100644 --- a/src/AddAtomicMutex.h +++ b/src/AddAtomicMutex.h @@ -23,7 +23,7 @@ namespace Internal { class Function; -Stmt add_atomic_mutex(Stmt s, const std::map &env); +Stmt add_atomic_mutex(Stmt s, const std::vector &outputs); } // namespace Internal } // namespace Halide diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp index 5965303197bc..21ca06e07285 100644 --- a/src/BoundsInference.cpp +++ b/src/BoundsInference.cpp @@ -1152,7 +1152,7 @@ class BoundsInference : public IRMutator { map stage_name_to_func; if (producing >= 0) { - fused_group.insert(make_pair(f.name(), stage_index)); + fused_group.emplace(f.name(), stage_index); } if (!no_pipelines && producing >= 0 && !f.has_extern_definition()) { @@ -1164,12 +1164,12 @@ class BoundsInference : public IRMutator { if (!((pair.func_1 == stages[producing].name) && ((int)pair.stage_1 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_1, pair.stage_1, var)) { - fused_group.insert(make_pair(pair.func_1, pair.stage_1)); + fused_group.emplace(pair.func_1, pair.stage_1); } if (!((pair.func_2 == stages[producing].name) && ((int)pair.stage_2 == stage_index)) && is_fused_with_others(fused_groups, fused_pairs_in_groups, f, stage_index, pair.func_2, pair.stage_2, var)) { - fused_group.insert(make_pair(pair.func_2, pair.stage_2)); + fused_group.emplace(pair.func_2, pair.stage_2); } } diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp index 78fc4224fb61..697b9200fa33 100644 --- a/src/CodeGen_Internal.cpp +++ b/src/CodeGen_Internal.cpp @@ -610,7 +610,11 @@ void get_target_options(const llvm::Module &module, llvm::TargetOptions &options options.UseInitArray = true; options.FloatABIType = use_soft_float_abi ? 
llvm::FloatABI::Soft : llvm::FloatABI::Hard; +#if LLVM_VERSION >= 190 + options.MCOptions.X86RelaxRelocations = false; +#else options.RelaxELFRelocations = false; +#endif options.MCOptions.ABIName = mabi; } diff --git a/src/Deserialization.cpp b/src/Deserialization.cpp index 551acfcdebf2..0a1403362621 100644 --- a/src/Deserialization.cpp +++ b/src/Deserialization.cpp @@ -504,12 +504,14 @@ void Deserializer::deserialize_function(const Serialize::Func *function, Functio const std::vector trace_tags = deserialize_vector(function->trace_tags(), &Deserializer::deserialize_string); + const bool no_profiling = function->no_profiling(); const bool frozen = function->frozen(); hl_function.update_with_deserialization(name, origin_name, output_types, required_types, required_dim, args, func_schedule, init_def, updates, debug_file, output_buffers, extern_arguments, extern_function_name, name_mangling, extern_function_device_api, extern_proxy_expr, - trace_loads, trace_stores, trace_realizations, trace_tags, frozen); + trace_loads, trace_stores, trace_realizations, trace_tags, + no_profiling, frozen); } Stmt Deserializer::deserialize_stmt(Serialize::Stmt type_code, const void *stmt) { diff --git a/src/FlattenNestedRamps.cpp b/src/FlattenNestedRamps.cpp index f48bd75c37a2..92bcf3870d5d 100644 --- a/src/FlattenNestedRamps.cpp +++ b/src/FlattenNestedRamps.cpp @@ -81,19 +81,19 @@ class FlattenRamps : public IRMutator { // If they are, we'll have a full vector of const_indices if ((int)const_indices.size() == lanes) { - // Compute the stride for the underlying strided load - int stride = 0; - for (int c : const_indices) { - stride = (int)gcd(stride, c); - } - for (int &c : const_indices) { - c /= stride; + int stride = 0, extent = 1; + if (max_constant_offset > 0) { + for (int c : const_indices) { + stride = (int)gcd(stride, c); + } + for (int &c : const_indices) { + c /= stride; + } + // Compute the number of elements loaded + extent = (int)((max_constant_offset / stride) + 1); } - // Compute the number of elements loaded - int extent = (int)((max_constant_offset / stride) + 1); - // If we're gathering from a very large range, it // might be better to just do the gather rather than // doing a big dense load and then shuffling. We @@ -105,12 +105,22 @@ class FlattenRamps : public IRMutator { // in the schedule somehow. const int max_unused_lane_factor = 4; if (extent < max_unused_lane_factor * lanes) { - Expr dense_index = Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); - Expr dense_load = - Load::make(op->type.with_lanes(extent), op->name, dense_index, - op->image, op->param, - const_true(extent), ModulusRemainder{}); - return Shuffle::make({dense_load}, const_indices); + if (max_constant_offset == 0) { + // It's a load of a broadcast. 
Convert it to a broadcast of a load + Expr load = Load::make(op->type.element_of(), op->name, min_lane, + op->image, op->param, + const_true(), ModulusRemainder{}); + return Broadcast::make(load, lanes); + } else { + // Turn it into a dense load and a shuffle + Expr dense_index = + Ramp::make(min_lane, make_const(min_lane.type(), stride), extent); + Expr dense_load = + Load::make(op->type.with_lanes(extent), op->name, dense_index, + op->image, op->param, + const_true(extent), ModulusRemainder{}); + return Shuffle::make({dense_load}, const_indices); + } } } } diff --git a/src/Func.cpp b/src/Func.cpp index 7e0995cc33b5..1f480c99983c 100644 --- a/src/Func.cpp +++ b/src/Func.cpp @@ -3037,6 +3037,11 @@ Func &Func::add_trace_tag(const std::string &trace_tag) { return *this; } +Func &Func::no_profiling() { + func.do_not_profile(); + return *this; +} + void Func::debug_to_file(const string &filename) { invalidate_cache(); func.debug_file() = filename; diff --git a/src/Func.h b/src/Func.h index d4074ee18cc6..bced13f79481 100644 --- a/src/Func.h +++ b/src/Func.h @@ -2559,6 +2559,15 @@ class Func { */ Func &add_trace_tag(const std::string &trace_tag); + /** Marks this function as a function that should not be profiled + * when using the target feature Profile or ProfileByTimer. + * This is useful when this function is does too little work at once + * such that the overhead of setting the profiling token might + * become significant, or that the measured time is not representative + * due to modern processors (instruction level parallelism, out-of-order + * execution). */ + Func &no_profiling(); + /** Get a handle on the internal halide function that this Func * represents. Useful if you want to do introspection on Halide * functions */ diff --git a/src/Function.cpp b/src/Function.cpp index 88f5b851e986..b72a39e1c90a 100644 --- a/src/Function.cpp +++ b/src/Function.cpp @@ -110,6 +110,8 @@ struct FunctionContents { bool trace_loads = false, trace_stores = false, trace_realizations = false; std::vector trace_tags; + bool no_profiling = false; + bool frozen = false; void accept(IRVisitor *visitor) const { @@ -352,6 +354,7 @@ void Function::update_with_deserialization(const std::string &name, bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen) { contents->name = name; contents->origin_name = origin_name; @@ -373,6 +376,7 @@ void Function::update_with_deserialization(const std::string &name, contents->trace_stores = trace_stores; contents->trace_realizations = trace_realizations; contents->trace_tags = trace_tags; + contents->no_profiling = no_profiling; contents->frozen = frozen; } @@ -511,6 +515,7 @@ void Function::deep_copy(const FunctionPtr ©, DeepCopyMap &copied_map) const copy->trace_stores = contents->trace_stores; copy->trace_realizations = contents->trace_realizations; copy->trace_tags = contents->trace_tags; + copy->no_profiling = contents->no_profiling; copy->frozen = contents->frozen; copy->output_buffers = contents->output_buffers; copy->func_schedule = contents->func_schedule.deep_copy(copied_map); @@ -1141,10 +1146,6 @@ const std::vector &Function::get_trace_tags() const { return contents->trace_tags; } -void Function::freeze() { - contents->frozen = true; -} - void Function::lock_loop_levels() { auto &schedule = contents->func_schedule; schedule.compute_level().lock(); @@ -1168,6 +1169,16 @@ void Function::lock_loop_levels() { } } +void Function::do_not_profile() { + contents->no_profiling = true; +} +bool 
Function::should_not_profile() const { + return contents->no_profiling; +} + +void Function::freeze() { + contents->frozen = true; +} bool Function::frozen() const { return contents->frozen; } diff --git a/src/Function.h b/src/Function.h index 66b62a01f66b..49f68805d61e 100644 --- a/src/Function.h +++ b/src/Function.h @@ -88,6 +88,7 @@ class Function { bool trace_stores, bool trace_realizations, const std::vector &trace_tags, + bool no_profiling, bool frozen); /** Get a handle on the halide function contents that this Function @@ -290,6 +291,12 @@ class Function { * cannot be mutated further. */ void lock_loop_levels(); + /** Mark the function as too small for meaningful profiling. */ + void do_not_profile(); + + /** Check if the function is marked as one that should not be profiled. */ + bool should_not_profile() const; + /** Mark function as frozen, which means it cannot accept new * definitions. */ void freeze(); diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index deabd95d1d1b..f11fa3348399 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -1685,6 +1685,14 @@ class EliminateInterleaves : public IRMutator { return true; } + // Indicates the minimum Hexagon Vector Extension (HVX) target version required for using these instructions. + enum class HvxTarget { + v62orLater, // Use for Hexagon v62 target or later + v65orLater, // Use for Hexagon v65 target or later + v66orLater, // Use for Hexagon v66 target or later + }; + HvxTarget hvx_target; + Expr visit(const Call *op) override { vector args(op->args); @@ -1702,27 +1710,27 @@ class EliminateInterleaves : public IRMutator { // does not deinterleave, and then opportunistically select // the interleaving alternative when we can cancel out to the // interleave. - static std::map deinterleaving_alts = { - {"halide.hexagon.pack.vh", "halide.hexagon.trunc.vh"}, - {"halide.hexagon.pack.vw", "halide.hexagon.trunc.vw"}, - {"halide.hexagon.packhi.vh", "halide.hexagon.trunclo.vh"}, - {"halide.hexagon.packhi.vw", "halide.hexagon.trunclo.vw"}, - {"halide.hexagon.pack_satub.vh", "halide.hexagon.trunc_satub.vh"}, - {"halide.hexagon.pack_satub.vuh", "halide.hexagon.trunc_satub.vuh"}, - {"halide.hexagon.pack_sath.vw", "halide.hexagon.trunc_sath.vw"}, - {"halide.hexagon.pack_satuh.vw", "halide.hexagon.trunc_satuh.vw"}, - {"halide.hexagon.pack_satuh.vuw", "halide.hexagon.trunc_satuh.vuw"}, + static std::map> deinterleaving_alts = { + {"halide.hexagon.pack.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc.vh"}}, + {"halide.hexagon.pack.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc.vw"}}, + {"halide.hexagon.packhi.vh", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vh"}}, + {"halide.hexagon.packhi.vw", {HvxTarget::v62orLater, "halide.hexagon.trunclo.vw"}}, + {"halide.hexagon.pack_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.trunc_satub.vh"}}, + {"halide.hexagon.pack_satub.vuh", {HvxTarget::v65orLater, "halide.hexagon.trunc_satub.vuh"}}, + {"halide.hexagon.pack_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_sath.vw"}}, + {"halide.hexagon.pack_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vw"}}, + {"halide.hexagon.pack_satuh.vuw", {HvxTarget::v62orLater, "halide.hexagon.trunc_satuh.vuw"}}, }; // The reverse mapping of the above. 
- static std::map interleaving_alts = { - {"halide.hexagon.trunc.vh", "halide.hexagon.pack.vh"}, - {"halide.hexagon.trunc.vw", "halide.hexagon.pack.vw"}, - {"halide.hexagon.trunclo.vh", "halide.hexagon.packhi.vh"}, - {"halide.hexagon.trunclo.vw", "halide.hexagon.packhi.vw"}, - {"halide.hexagon.trunc_satub.vh", "halide.hexagon.pack_satub.vh"}, - {"halide.hexagon.trunc_sath.vw", "halide.hexagon.pack_sath.vw"}, - {"halide.hexagon.trunc_satuh.vw", "halide.hexagon.pack_satuh.vw"}, + static std::map> interleaving_alts = { + {"halide.hexagon.trunc.vh", {HvxTarget::v62orLater, "halide.hexagon.pack.vh"}}, + {"halide.hexagon.trunc.vw", {HvxTarget::v62orLater, "halide.hexagon.pack.vw"}}, + {"halide.hexagon.trunclo.vh", {HvxTarget::v62orLater, "halide.hexagon.packhi.vh"}}, + {"halide.hexagon.trunclo.vw", {HvxTarget::v62orLater, "halide.hexagon.packhi.vw"}}, + {"halide.hexagon.trunc_satub.vh", {HvxTarget::v62orLater, "halide.hexagon.pack_satub.vh"}}, + {"halide.hexagon.trunc_sath.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_sath.vw"}}, + {"halide.hexagon.trunc_satuh.vw", {HvxTarget::v62orLater, "halide.hexagon.pack_satuh.vw"}}, }; if (is_native_deinterleave(op) && yields_interleave(args[0])) { @@ -1738,7 +1746,8 @@ class EliminateInterleaves : public IRMutator { op->func, op->value_index, op->image, op->param); // Add the interleave back to the result of the call. return native_interleave(expr); - } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && + } else if (deinterleaving_alts.find(op->name) != deinterleaving_alts.end() && hvx_target >= deinterleaving_alts[op->name].first && + yields_removable_interleave(args)) { // This call has a deinterleaving alternative, and the // arguments are interleaved, so we should use the @@ -1746,14 +1755,14 @@ class EliminateInterleaves : public IRMutator { for (Expr &i : args) { i = remove_interleave(i); } - return Call::make(op->type, deinterleaving_alts[op->name], args, op->call_type); - } else if (interleaving_alts.count(op->name) && is_native_deinterleave(args[0])) { + return Call::make(op->type, deinterleaving_alts[op->name].second, args, op->call_type); + } else if (interleaving_alts.count(op->name) && hvx_target >= interleaving_alts[op->name].first && is_native_deinterleave(args[0])) { // This is an interleaving alternative with a // deinterleave, which can be generated when we // deinterleave storage. Revert back to the interleaving // op so we can remove the deinterleave. Expr arg = args[0].as()->args[0]; - return Call::make(op->type, interleaving_alts[op->name], {arg}, op->call_type, + return Call::make(op->type, interleaving_alts[op->name].second, {arg}, op->call_type, op->func, op->value_index, op->image, op->param); } else if (changed) { return Call::make(op->type, op->name, args, op->call_type, @@ -1896,8 +1905,15 @@ class EliminateInterleaves : public IRMutator { using IRMutator::visit; public: - EliminateInterleaves(int native_vector_bytes) + EliminateInterleaves(const Target &t, int native_vector_bytes) : native_vector_bits(native_vector_bytes * 8), alignment_analyzer(native_vector_bytes) { + if (t.features_any_of({Target::HVX_v65})) { + hvx_target = HvxTarget::v65orLater; + } else if (t.features_any_of({Target::HVX_v66})) { + hvx_target = HvxTarget::v66orLater; + } else { + hvx_target = HvxTarget::v62orLater; + } } }; @@ -2233,7 +2249,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { << s << "\n"; // Try to eliminate any redundant interleave/deinterleave pairs. 
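A hedged sketch of the classification the new EliminateInterleaves constructor performs, written here as a free function for clarity: the target's HVX feature flags are mapped onto the minimum-version buckets that gate the deinterleaving_alts/interleaving_alts tables above.

```cpp
#include "Halide.h"
using Halide::Target;

enum class HvxTarget { v62orLater, v65orLater, v66orLater };

// Same feature-to-bucket mapping as the constructor in this hunk.
HvxTarget classify_hvx(const Target &t) {
    if (t.features_any_of({Target::HVX_v65})) {
        return HvxTarget::v65orLater;
    } else if (t.features_any_of({Target::HVX_v66})) {
        return HvxTarget::v66orLater;
    }
    return HvxTarget::v62orLater;  // baseline for any other HVX target
}

// e.g. only a target classified as v65orLater may rewrite
// halide.hexagon.trunc_satub.vuh into its interleaving pack alternative.
```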
- s = EliminateInterleaves(t.natural_vector_size(Int(8))).mutate(s); + s = EliminateInterleaves(t, t.natural_vector_size(Int(8))).mutate(s); debug(4) << "Hexagon: Lowering after EliminateInterleaves\n" << s << "\n"; @@ -2246,4 +2262,4 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { } } // namespace Internal -} // namespace Halide +} // namespace Halide \ No newline at end of file diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index bc03dd124d9a..a186be1874d7 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -1112,11 +1112,12 @@ void IRPrinter::visit(const VectorReduce *op) { void IRPrinter::visit(const Atomic *op) { if (op->mutex_name.empty()) { - stream << get_indent() << "atomic {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ") {\n"; } else { - stream << get_indent() << "atomic ("; - stream << op->mutex_name; - stream << ") {\n"; + stream << get_indent() << "atomic (" + << op->producer_name << ", " + << op->mutex_name << ") {\n"; } indent += 2; print(op->body); diff --git a/src/Lower.cpp b/src/Lower.cpp index 3b357eb3061e..f092e2e711ef 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -280,10 +280,7 @@ void lower_impl(const vector &output_funcs, s = split_tuples(s, env); log("Lowering after destructuring tuple-valued realizations:", s); - // Vulkan relies on GPU var canonicalization occurring before - // storage flattening. - if (t.has_gpu_feature() || - t.has_feature(Target::Vulkan)) { + if (t.has_gpu_feature()) { debug(1) << "Canonicalizing GPU var names...\n"; s = canonicalize_gpu_vars(s); log("Lowering after canonicalizing GPU var names:", s); @@ -299,7 +296,7 @@ void lower_impl(const vector &output_funcs, log("Lowering after storage flattening:", s); debug(1) << "Adding atomic mutex allocation...\n"; - s = add_atomic_mutex(s, env); + s = add_atomic_mutex(s, outputs); log("Lowering after adding atomic mutex allocation:", s); debug(1) << "Unpacking buffer arguments...\n"; @@ -408,7 +405,7 @@ void lower_impl(const vector &output_funcs, if (t.has_feature(Target::Profile) || t.has_feature(Target::ProfileByTimer)) { debug(1) << "Injecting profiling...\n"; - s = inject_profiling(s, pipeline_name); + s = inject_profiling(s, pipeline_name, env); log("Lowering after injecting profiling:", s); } diff --git a/src/Profiling.cpp b/src/Profiling.cpp index 2be058b3c8a6..414578299df6 100644 --- a/src/Profiling.cpp +++ b/src/Profiling.cpp @@ -3,7 +3,7 @@ #include #include "CodeGen_Internal.h" -#include "ExprUsesVar.h" +#include "Function.h" #include "IRMutator.h" #include "IROperator.h" #include "InjectHostDevBufferCopies.h" @@ -71,13 +71,14 @@ class InjectProfiling : public IRMutator { vector stack; // What produce nodes are we currently inside of. string pipeline_name; + const map &env; bool in_fork = false; bool in_parallel = false; bool in_leaf_task = false; - InjectProfiling(const string &pipeline_name) - : pipeline_name(pipeline_name) { + InjectProfiling(const string &pipeline_name, const map &env) + : pipeline_name(pipeline_name), env(env) { stack.push_back(get_func_id("overhead")); // ID 0 is treated specially in the runtime as overhead internal_assert(stack.back() == 0); @@ -119,10 +120,28 @@ class InjectProfiling : public IRMutator { bool profiling_memory = true; // Strip down the tuple name, e.g. 
f.0 into f - string normalize_name(const string &name) { - vector v = split_string(name, "."); - internal_assert(!v.empty()); - return v[0]; + string normalize_name(const string &name) const { + size_t idx = name.find('.'); + if (idx != std::string::npos) { + internal_assert(idx != 0); + return name.substr(0, idx); + } else { + return name; + } + } + + Function lookup_function(const string &name) const { + auto it = env.find(name); + if (it != env.end()) { + return it->second; + } + string norm_name = normalize_name(name); + it = env.find(norm_name); + if (it != env.end()) { + return it->second; + } + internal_error << "No function in the environment found for name '" << name << "'.\n"; + return {}; } int get_func_id(const string &name) { @@ -185,7 +204,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Allocate *op) override { - int idx = get_func_id(op->name); auto [new_extents, changed] = mutate_with_changes(op->extents); Expr condition = mutate(op->condition); @@ -199,6 +217,13 @@ class InjectProfiling : public IRMutator { // always conditionally false. remove_dead_allocations() is called after // inject_profiling() so this is a possible scenario. if (!is_const_zero(size) && on_stack) { + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. + } else { + idx = get_func_id(op->name); + } const uint64_t *int_size = as_const_uint(size); internal_assert(int_size != nullptr); // Stack size is always a const int func_stack_current[idx] += *int_size; @@ -212,6 +237,7 @@ class InjectProfiling : public IRMutator { vector tasks; bool track_heap_allocation = !is_const_zero(size) && !on_stack && profiling_memory; if (track_heap_allocation) { + int idx = get_func_id(op->name); debug(3) << " Allocation on heap: " << op->name << "(" << size << ") in pipeline " << pipeline_name << "\n"; @@ -245,8 +271,6 @@ class InjectProfiling : public IRMutator { } Stmt visit(const Free *op) override { - int idx = get_func_id(op->name); - AllocSize alloc = func_alloc_sizes.get(op->name); internal_assert(alloc.size.type() == UInt(64)); func_alloc_sizes.pop(op->name); @@ -256,6 +280,7 @@ class InjectProfiling : public IRMutator { if (!is_const_zero(alloc.size)) { if (!alloc.on_stack) { if (profiling_memory) { + int idx = get_func_id(op->name); debug(3) << " Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n"; vector tasks{ @@ -271,6 +296,13 @@ class InjectProfiling : public IRMutator { const uint64_t *int_size = as_const_uint(alloc.size); internal_assert(int_size != nullptr); + int idx; + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + idx = stack.back(); // Attribute the stack size contribution to the deepest _profiled_ func. 
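The rewritten normalize_name() above just keeps everything before the first dot, without the split_string allocation. A standalone check of the two cases it must handle (an illustration, not a test from the patch):

```cpp
#include <cassert>
#include <string>

static std::string normalize_name(const std::string &name) {
    size_t idx = name.find('.');
    return idx == std::string::npos ? name : name.substr(0, idx);
}

int main() {
    assert(normalize_name("f.0") == "f");      // tuple component -> producer name
    assert(normalize_name("conv") == "conv");  // bare names pass through
    return 0;
}
```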
+ } else { + idx = get_func_id(op->name); + } func_stack_current[idx] -= *int_size; debug(3) << " Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n"; @@ -283,11 +315,19 @@ class InjectProfiling : public IRMutator { int idx; Stmt body; if (op->is_producer) { - idx = get_func_id(op->name); - stack.push_back(idx); - Stmt set_current = set_current_func(idx); - body = Block::make(set_current, mutate(op->body)); - stack.pop_back(); + Function func = lookup_function(op->name); + if (func.should_not_profile()) { + body = mutate(op->body); + if (body.same_as(op->body)) { + return op; + } + } else { + idx = get_func_id(op->name); + stack.push_back(idx); + Stmt set_current = set_current_func(idx); + body = Block::make(set_current, mutate(op->body)); + stack.pop_back(); + } } else { // At the beginning of the consume step, set the current task // back to the outer one. @@ -498,8 +538,8 @@ class InjectProfiling : public IRMutator { } // namespace -Stmt inject_profiling(Stmt s, const string &pipeline_name) { - InjectProfiling profiling(pipeline_name); +Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map &env) { + InjectProfiling profiling(pipeline_name, env); s = profiling.mutate(s); int num_funcs = (int)(profiling.indices.size()); diff --git a/src/Profiling.h b/src/Profiling.h index a6040b9160af..afaa47fe6d6e 100644 --- a/src/Profiling.h +++ b/src/Profiling.h @@ -23,6 +23,7 @@ * mandelbrot: 0.006444ms (10%) peak: 505344 num: 104000 avg: 5376 * argmin: 0.027715ms (46%) stack: 20 */ +#include #include #include "Expr.h" @@ -30,6 +31,8 @@ namespace Halide { namespace Internal { +class Function; + /** Take a statement representing a halide pipeline insert * high-resolution timing into the generated code (via spawning a * thread that acts as a sampling profiler); summaries of execution @@ -37,7 +40,7 @@ namespace Internal { * storage flattening, but after all bounds inference. 
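On the user side, the profiling exclusion added in this patch is driven from the Func::no_profiling() scheduling call documented in Func.h above. A hedged usage sketch; the pipeline and names are invented for illustration:

```cpp
#include "Halide.h"
using namespace Halide;

void no_profiling_schedule_sketch() {
    ImageParam in(Float(32), 2, "in");
    Var x("x"), y("y");
    Func blur_x("blur_x"), blur_y("blur_y");

    Func clamped = BoundaryConditions::repeat_edge(in);
    blur_x(x, y) = (clamped(x - 1, y) + clamped(x, y) + clamped(x + 1, y)) / 3.0f;
    blur_y(x, y) = (blur_x(x, y - 1) + blur_x(x, y) + blur_x(x, y + 1)) / 3.0f;

    // blur_x does very little work per scanline, so keep it out of the
    // profiler report; InjectProfiling then attributes its stack usage to
    // the deepest enclosing profiled producer instead.
    blur_x.compute_at(blur_y, y).no_profiling();
    blur_y.compute_root();

    Target t = get_host_target().with_feature(Target::Profile);
    (void)t;  // blur_y.compile_jit(t) would produce a report without blur_x
}
```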
* */ -Stmt inject_profiling(Stmt, const std::string &); +Stmt inject_profiling(Stmt, const std::string &, const std::map &env); } // namespace Internal } // namespace Halide diff --git a/src/Serialization.cpp b/src/Serialization.cpp index 144d79af7e5e..c1cb3a6d1193 100644 --- a/src/Serialization.cpp +++ b/src/Serialization.cpp @@ -1029,6 +1029,7 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde for (const auto &tag : function.get_trace_tags()) { trace_tags_serialized.push_back(serialize_string(builder, tag)); } + const bool no_profiling = function.should_not_profile(); const bool frozen = function.frozen(); auto func = Serialize::CreateFunc(builder, name_serialized, @@ -1050,7 +1051,9 @@ Offset Serializer::serialize_function(FlatBufferBuilder &builde trace_loads, trace_stores, trace_realizations, - builder.CreateVector(trace_tags_serialized), frozen); + builder.CreateVector(trace_tags_serialized), + no_profiling, + frozen); return func; } diff --git a/src/StorageFlattening.cpp b/src/StorageFlattening.cpp index ba4cc9b8acca..5860a7e50d0f 100644 --- a/src/StorageFlattening.cpp +++ b/src/StorageFlattening.cpp @@ -535,13 +535,9 @@ class FlattenDimensions : public IRMutator { Interval loop_bounds = Interval(expanded_min, simplify(expanded_min + expanded_extent - 1)); it->loop_vars.push(op->name, loop_bounds); } - bool old_in_gpu = in_gpu; - if (op->for_type == ForType::GPUBlock || - op->for_type == ForType::GPUThread) { - in_gpu = true; - } + + ScopedValue old_in_gpu(in_gpu, in_gpu || is_gpu(op->for_type)); Stmt stmt = IRMutator::visit(op); - in_gpu = old_in_gpu; for (auto &p : hoisted_storages) { p.loop_vars.pop(op->name); diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp index 2ce325538a86..e3cc2ec5e825 100644 --- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp +++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp @@ -1359,7 +1359,7 @@ Partitioner::Partitioner(const map &_pipeline_bounds, for (int s = 0; s < num_stages; s++) { FStage stg(f.second, s); Group g(stg, {stg}); - groups.insert(make_pair(stg, g)); + groups.emplace(stg, g); } } diff --git a/src/halide_ir.fbs b/src/halide_ir.fbs index 01a987b6f430..efc465cbee82 100644 --- a/src/halide_ir.fbs +++ b/src/halide_ir.fbs @@ -15,7 +15,7 @@ enum SerializationVersionMinor: int { Value = 0 } enum SerializationVersionPatch: int { - Value = 0 + Value = 1 } // from src/IR.cpp @@ -713,6 +713,7 @@ table Func { trace_stores: bool = false; trace_realizations: bool = false; trace_tags: [string]; + no_profiling: bool = false; frozen: bool = false; } diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h index 9408c59da167..1d0843be0329 100644 --- a/src/runtime/HalideRuntime.h +++ b/src/runtime/HalideRuntime.h @@ -195,7 +195,7 @@ extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mute /** Functions for constructing/destroying/locking/unlocking arrays of mutexes. 
*/ struct halide_mutex_array; //@{ -extern struct halide_mutex_array *halide_mutex_array_create(int sz); +extern struct halide_mutex_array *halide_mutex_array_create(uint64_t sz); extern void halide_mutex_array_destroy(void *user_context, void *array); extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); diff --git a/src/runtime/fake_thread_pool.cpp b/src/runtime/fake_thread_pool.cpp index 9c3cfddc5a47..531a16d1312e 100644 --- a/src/runtime/fake_thread_pool.cpp +++ b/src/runtime/fake_thread_pool.cpp @@ -96,7 +96,7 @@ WEAK void halide_mutex_unlock(halide_mutex *mutex) { // (e.g. correctness/multiple_scatter). Since we don't have threads, we don't // need to mutex to do anything, but returning a null would trigger an error // condition that would be misrepoted as out-of-memory. -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { return &halide_fake_mutex_array; } diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index feee56a4e531..89b1a929e79b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,10 +55,11 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,13 +87,13 @@ class BlockAllocator { int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure - BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *reserve_block_entry(void *user_context, const MemoryRequest &request); // Locates the "best-fit" block entry for the requested size, or nullptr if none was found - BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *find_block_entry(void *user_context, const MemoryRequest &request); - // Creates a new block entry and int the list - BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + // Creates a new block entry and adds it tos the list + BlockEntry *create_block_entry(void *user_context, const MemoryRequest &request); // Releases the block entry from being used, and makes it available for further allocations int release_block_entry(void 
*user_context, BlockEntry *block_entry); @@ -113,7 +114,7 @@ class BlockAllocator { bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; // Returns true if the given block is suitable for the request allocation - bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const; Config config; LinkedList block_list; @@ -162,7 +163,8 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "caching=" << halide_memory_caching_name(request.properties.caching) << " " << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + // Reserve a block entry for use + BlockEntry *block_entry = reserve_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" << (int32_t)(request.size) << " bytes)\n"; @@ -173,11 +175,12 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r halide_abort_if_false(user_context, block != nullptr); halide_abort_if_false(user_context, block->allocator != nullptr); + // Reserve an initial memory region for the block MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { // Unable to reserve region in an existing block ... create a new block and try again. - block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); + block_entry = create_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" << (int32_t)(request.size) << " bytes)\n"; @@ -299,8 +302,8 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { - if (!is_compatible_block(block, properties)) { +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const { + if (!is_compatible_block(block, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! 
(" << "block_resource=" << (void *)block << " " @@ -309,16 +312,16 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " - << "request_size=" << (uint32_t)size << " " - << "request_usage=" << halide_memory_usage_name(properties.usage) << " " - << "request_caching=" << halide_memory_caching_name(properties.caching) << " " - << "request_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; } - if (dedicated && (block->reserved > 0)) { + if (request.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" << "block_resource=" << (void *)block << " " @@ -340,7 +343,7 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } size_t available = (block->memory.size - block->reserved); - if (available >= size) { + if (available >= request.size) { return true; } @@ -348,23 +351,23 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::find_block_entry(void *user_context, const MemoryRequest &request) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); - if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { + if (is_block_suitable_for_request(user_context, block, request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: found suitable block (" << "user_context=" << (void *)(user_context) << " " << "block_resource=" << (void *)block << " " << "block_size=" << (uint32_t)block->memory.size << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_entry; } @@ -375,37 +378,37 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: couldn't find suitable block! (" << "user_context=" << (void *)(user_context) << " " - << "request_size=" << (uint32_t)size << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } return block_entry; } BlockAllocator::BlockEntry * -BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::reserve_block_entry(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: reserving block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + BlockEntry *block_entry = find_block_entry(user_context, request); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: creating block ... ! (" - << "requested_size=" << (uint32_t)size << " " - << "requested_is_dedicated=" << (dedicated ? "true" : "false") << " " - << "requested_usage=" << halide_memory_usage_name(properties.usage) << " " - << "requested_caching=" << halide_memory_caching_name(properties.caching) << " " - << "requested_visibility=" << halide_memory_visibility_name(properties.visibility) << ")"; + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - block_entry = create_block_entry(user_context, properties, size, dedicated); + block_entry = create_block_entry(user_context, request); } if (block_entry) { @@ -449,7 +452,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator } BlockAllocator::BlockEntry * -BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::create_block_entry(void *user_context, const MemoryRequest &request) { if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" << (int32_t)(config.maximum_pool_size) << " bytes or " @@ -476,12 +479,16 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif + // Constrain the request to the a valid block allocation + MemoryRequest block_request = request; + conform(user_context, &block_request); + + // Create the block resource itself BlockResource *block = static_cast(block_entry->value); - block->memory.size = constrain_requested_size(size); + block->memory.size = block_request.size; block->memory.handle = nullptr; - block->memory.properties = properties; - block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); - block->memory.dedicated = dedicated; + block->memory.properties = block_request.properties; + block->memory.dedicated = block_request.dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); alloc_memory_block(user_context, block); @@ -561,6 +568,33 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { return actual_size; } +int BlockAllocator::conform(void *user_context, MemoryRequest *request) const { + + request->properties.nearest_multiple = max(config.nearest_multiple, request->properties.nearest_multiple); + + if (request->properties.nearest_multiple) { + size_t nm = request->properties.nearest_multiple; + request->size = (((request->size + nm - 1) / nm) * nm); // round up to nearest multiple + } + + if (config.minimum_block_size) { + request->size = ((request->size < config.minimum_block_size) ? + config.minimum_block_size : + request->size); + } + if (config.maximum_block_size) { + request->size = ((request->size > config.maximum_block_size) ? 
+ config.maximum_block_size : + request->size); + } + + if (allocators.block.conform) { + return allocators.block.conform(user_context, request); + } + + return 0; +} + bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { if (properties.caching != MemoryCaching::DefaultCaching) { if (properties.caching != block->memory.properties.caching) { diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index d41fa57304fb..0be6041519a1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -202,18 +202,22 @@ struct HalideSystemAllocatorFns { typedef int (*AllocateBlockFn)(void *, MemoryBlock *); typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*ConformBlockRequestFn)(void *, MemoryRequest *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; + ConformBlockRequestFn conform = nullptr; }; typedef int (*AllocateRegionFn)(void *, MemoryRegion *); typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*ConformBlockRegionFn)(void *, MemoryRequest *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; DeallocateRegionFn deallocate = nullptr; + ConformBlockRegionFn conform = nullptr; }; // -- diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 02c2cd7e6aa0..3588389c3747 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -46,10 +46,11 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -73,13 +74,13 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size, size_t alignment) const; + bool can_split(const BlockRegion *region, const MemoryRequest &request) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining - BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request); // Creates a new block region and adds it to the region list - BlockRegion *create_block_region(void *user_context, const MemoryProperties 
&properties, size_t offset, size_t size, bool dedicated); + BlockRegion *create_block_region(void *user_context, const MemoryRequest &request); // Creates a new block region and adds it to the region list int destroy_block_region(void *user_context, BlockRegion *region); @@ -137,30 +138,55 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); halide_abort_if_false(user_context, arena != nullptr); + MemoryRequest block_request = {}; + block_request.size = block->memory.size; + block_request.offset = 0; + block_request.alignment = block->memory.properties.alignment; + block_request.properties = block->memory.properties; + block_request.dedicated = block->memory.dedicated; block->allocator = this; - block->regions = create_block_region( - user_context, - block->memory.properties, - 0, block->memory.size, - block->memory.dedicated); + block->regions = create_block_region(user_context, block_request); + return 0; +} + +int RegionAllocator::conform(void *user_context, MemoryRequest *request) const { + if (allocators.region.conform) { + return allocators.region.conform(user_context, request); + } else { + size_t actual_alignment = conform_alignment(request->alignment, block->memory.properties.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, block->memory.properties.nearest_multiple); + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + } return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + + MemoryRequest region_request = request; + + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; +#endif + return nullptr; + } + size_t remaining = block->memory.size - block->reserved; - if (remaining < actual_size) { + if (remaining < region_request.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " + << "-- requested size (" << (int32_t)(region_request.size) << " bytes) " << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } - BlockRegion *block_region = find_block_region(user_context, request); + BlockRegion *block_region = find_block_region(user_context, region_request); if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" @@ -169,12 +195,12 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest & return nullptr; } - if (can_split(block_region, request.size, request.alignment)) { + if (can_split(block_region, region_request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)"; + << "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)"; #endif - split_block_region(user_context, block_region, request.size, request.alignment); + split_block_region(user_context, block_region, region_request); } alloc_block_region(user_context, block_region); @@ -237,8 +263,17 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! Unable to reserve memory ...\n"; +#endif + return false; + } + // skip incompatible block regions for this request - if (!is_compatible_block_region(region, request.properties)) { + if (!is_compatible_block_region(region, region_request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... incompatible properties! (" << " block_region=" << (void *)region @@ -248,16 +283,13 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c return false; } - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); - // is the adjusted size larger than the current region? - if (actual_size > region->memory.size) { + if (region_request.size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " skipping block region ... not enough space for adjusted size! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -265,12 +297,12 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } // will the adjusted size fit within the remaining unallocated space? 
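 // (Illustrative sketch only; the numbers below are hypothetical and not taken from the patch.)
 // After conform(), the capacity test below compares the adjusted size against the space still
 // unreserved in the block. For example, with a 4 MB block that already has 3 MB reserved:
 //
 //   size_t block_size = 4 * 1024 * 1024;     // block->memory.size
 //   size_t reserved = 3 * 1024 * 1024;       // block->reserved
 //   size_t conformed_size = 1536 * 1024;     // region_request.size after conform()
 //   bool fits = (conformed_size + reserved) <= block_size;   // false -> region is skipped
 //
 //   conformed_size = 512 * 1024;             // a smaller conformed request
 //   fits = (conformed_size + reserved) <= block_size;        // true  -> region is suitable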
- if ((actual_size + block->reserved) <= block->memory.size) { + if ((region_request.size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " found suitable block region! (" << " block_region=" << (void *)region << " request_size=" << (uint32_t)(request.size) - << " actual_size=" << (uint32_t)(actual_size) + << " actual_size=" << (uint32_t)(region_request.size) << " region_size=" << (uint32_t)(region->memory.size) << ")"; #endif @@ -411,13 +443,11 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size, size_t alignment) const { - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - return (block_region && (block_region->memory.size > split_size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const { + return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)); } -BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -434,33 +464,17 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t split_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); - size_t empty_size = block_region->memory.size - split_size; - -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment (" - << "requested_size=" << (uint32_t)size << " " - << "split_size=" << (uint32_t)split_size << " " - << "split_offset=" << (uint32_t)split_size << " " - << "empty_size=" << (uint32_t)empty_size << " " - << "requested_alignment=" << (uint32_t)alignment << " " - << "required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << "actual_alignment=" << (uint32_t)actual_alignment << ")"; -#endif + MemoryRequest split_request = request; + split_request.size = block_region->memory.size - request.size; + split_request.offset = block_region->memory.offset + request.size; #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)"; + << "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)"; #endif - BlockRegion *next_region = block_region->next_ptr; - BlockRegion *empty_region = create_block_region(user_context, - block_region->memory.properties, - split_offset, empty_size, - block_region->memory.dedicated); + 
BlockRegion *empty_region = create_block_region(user_context, split_request); halide_abort_if_false(user_context, empty_region != nullptr); empty_region->next_ptr = next_region; @@ -469,42 +483,52 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size -= empty_size; + block_region->memory.size -= empty_region->memory.size; return empty_region; } -BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? "true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ..."; -#endif - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); - - if (actual_size == 0) { - error(user_context) << "RegionAllocator: Failed to allocate new block region ... region size was zero!\n"; + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "alignment=" << (uint32_t)request.properties.alignment << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; +#endif + + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform request for new block region!\n"; +#endif + return nullptr; + } + + if (region_request.size == 0) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region ... 
region size was zero!\n"; +#endif return nullptr; } BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); if (block_region == nullptr) { - error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#endif return nullptr; } block_region->memory.handle = nullptr; - block_region->memory.offset = actual_offset; - block_region->memory.size = actual_size; - block_region->memory.properties = properties; - block_region->memory.dedicated = dedicated; + block_region->memory.offset = region_request.offset; + block_region->memory.size = region_request.size; + block_region->memory.properties = region_request.properties; + block_region->memory.dedicated = region_request.dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; block_region->usage_count = 0; @@ -669,6 +693,8 @@ bool RegionAllocator::collect(void *user_context) { uint32_t collected_count = 0; uint32_t remaining_count = 0; + uint64_t available_bytes = 0; + uint64_t scanned_bytes = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " @@ -679,6 +705,8 @@ bool RegionAllocator::collect(void *user_context) { bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + scanned_bytes += block_region->memory.size; debug(user_context) << " checking region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " @@ -687,6 +715,7 @@ bool RegionAllocator::collect(void *user_context) { << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " << ")"; +#endif if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -705,6 +734,9 @@ bool RegionAllocator::collect(void *user_context) { remaining_count++; #endif } +#ifdef DEBUG_RUNTIME_INTERNAL + available_bytes += is_available(block_region) ? block_region->memory.size : 0; +#endif if (is_last_block_region(user_context, block_region)) { break; } @@ -715,6 +747,8 @@ bool RegionAllocator::collect(void *user_context) { << "block_ptr=" << (void *)block << " " << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " << "block_reserved=" << (uint32_t)(block->reserved) << " " + << "scanned_bytes=" << (uint32_t)(scanned_bytes) << " " + << "available_bytes=" << (uint32_t)(available_bytes) << " " << ")"; #endif diff --git a/src/runtime/synchronization_common.h b/src/runtime/synchronization_common.h index cb244f360eeb..778c423e4046 100644 --- a/src/runtime/synchronization_common.h +++ b/src/runtime/synchronization_common.h @@ -908,7 +908,7 @@ struct halide_mutex_array { struct halide_mutex *array; }; -WEAK halide_mutex_array *halide_mutex_array_create(int sz) { +WEAK halide_mutex_array *halide_mutex_array_create(uint64_t sz) { // TODO: If sz is huge, we should probably hash it down to something smaller // in the accessors below. Check for deadlocks before doing so. 
halide_mutex_array *array = (halide_mutex_array *)halide_malloc( diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 96535f3446ba..055fbef72277 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,11 +58,12 @@ class VulkanMemoryAllocator { static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods - MemoryRegion *reserve(void *user_context, MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + int conform(void *user_context, MemoryRequest *request); //< conforms the given memory request into one that can be allocated + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,9 +87,11 @@ class VulkanMemoryAllocator { static int allocate_block(void *instance_ptr, MemoryBlock *block); static int deallocate_block(void *instance_ptr, MemoryBlock *block); + static int conform_block_request(void *instance_ptr, MemoryRequest *request); static int allocate_region(void *instance_ptr, MemoryRegion *region); static int deallocate_region(void *instance_ptr, MemoryRegion *region); + static int conform_region_request(void *instance_ptr, MemoryRequest *request); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -113,6 +116,8 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; + int lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements); + size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -180,8 +185,8 @@ int VulkanMemoryAllocator::initialize(void *user_context, block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; - allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; - allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block, VulkanMemoryAllocator::conform_block_request}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region, VulkanMemoryAllocator::conform_region_request}; BlockAllocator::Config block_allocator_config = {0}; block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; @@ -202,7 +207,7 @@ int VulkanMemoryAllocator::initialize(void *user_context, return halide_error_code_success; } -MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, const MemoryRequest 
&request) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " @@ -272,6 +277,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! Invalid memory range !\n"; return nullptr; } +#if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: MapMemory (" << "user_context=" << user_context << "\n" << " region_size=" << (uint32_t)region->size << "\n" @@ -279,8 +285,8 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n" << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n" << " memory_offset=" << (uint32_t)memory_offset << "\n" - << " memory_size=" << (uint32_t)memory_size << ") ...\n"; - + << " memory_size=" << (uint32_t)memory_size << "\n)\n"; +#endif VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr)); if (result != VK_SUCCESS) { error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n"; @@ -528,6 +534,79 @@ VulkanMemoryAllocator::default_config() { } // -- +int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements (" + << "user_context=" << user_context << " " + << "size=" << (uint32_t)size << ", " + << "usage_flags=" << usage_flags << ") ... \n"; +#endif + VkBufferCreateInfo create_info = { + VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type + nullptr, // struct extending this + 0, // create flags + size, // buffer size (in bytes) + usage_flags, // buffer usage flags + VK_SHARING_MODE_EXCLUSIVE, // sharing mode + 0, nullptr}; + + // Create a buffer to determine alignment requirements + VkBuffer buffer = {0}; + VkResult result = vkCreateBuffer(this->device, &create_info, this->alloc_callbacks, &buffer); + if (result != VK_SUCCESS) { +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer to find requirements!\n\t" + << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; +#endif + return halide_error_code_device_malloc_failed; + } + + vkGetBufferMemoryRequirements(this->device, buffer, memory_requirements); + vkDestroyBuffer(this->device, buffer, this->alloc_callbacks); + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_block_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming block request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(request) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform block request! 
Invalid device handle!\n"; + return halide_error_code_internal_error; + } + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = instance->select_memory_usage(user_context, request->properties); + int error_code = instance->lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" + << "size=" << (uint32_t)request->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ", " + << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " + << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " + << "dedicated=" << (request->dedicated ? "true" : "false") << ")\n"; +#endif + + request->size = memory_requirements.size; + request->properties.alignment = memory_requirements.alignment; + return halide_error_code_success; +} int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -587,53 +666,6 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n"; #endif - uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties); - - VkBufferCreateInfo create_info = { - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // struct type - nullptr, // struct extending this - 0, // create flags - sizeof(uint32_t), // buffer size (in bytes) - usage_flags, // buffer usage flags - VK_SHARING_MODE_EXCLUSIVE, // sharing mode - 0, nullptr}; - - // Create a buffer to determine alignment requirements - VkBuffer buffer = {0}; - result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer); - if (result != VK_SUCCESS) { - debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t" - << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n"; - return halide_error_code_device_malloc_failed; - } - - VkMemoryRequirements memory_requirements = {0}; - vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements); - vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks); - -#if defined(HL_VK_DEBUG_MEM) - debug(nullptr) << "VulkanMemoryAllocator: Block allocated (" - << "size=" << (uint32_t)block->size << ", " - << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " - << "required_size=" << (uint32_t)memory_requirements.size << ", " - << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", " - << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", " - << "dedicated=" << (block->dedicated ? 
"true" : "false") << ")\n"; -#endif - - // Enforce any alignment constrainst reported by the device limits for each usage type - if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment; - } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { - block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment; - } - // Some drivers appear to report a buffer alignment constraint (regardless of usage) that can be larger than either of the above - if (memory_requirements.alignment > block->properties.alignment) { - block->properties.alignment = memory_requirements.alignment; - } - if (memory_requirements.alignment > block->properties.nearest_multiple) { - block->properties.nearest_multiple = memory_requirements.alignment; - } block->handle = (void *)device_memory; instance->block_byte_count += block->size; instance->block_count++; @@ -814,6 +846,98 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context, // -- +int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) { + + // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met. + // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements + // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver, + // whose size may be required to larger than our requested size (even though we will only ever touch the + // size of the region we're managing as within our block) + + VkMemoryRequirements memory_requirements = {0}; + uint32_t usage_flags = select_memory_usage(user_context, request->properties); + int error_code = lookup_requirements(user_context, request->size, usage_flags, &memory_requirements); + if (error_code != halide_error_code_success) { + error(user_context) << "VulkanRegionAllocator: Failed to conform block request! 
Unable to lookup requirements!\n"; + return error_code; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements (" + << "requested_size=" << (uint32_t)region->size << ", " + << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", " + << "required_size=" << (uint32_t)memory_requirements.size << ")\n"; +#endif + + // Enforce any alignment constraints reported by the device limits for each usage type + if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minStorageBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minStorageBufferOffsetAlignment; + } + } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) { + if ((request->alignment % this->physical_device_limits.minUniformBufferOffsetAlignment) != 0) { + request->alignment = this->physical_device_limits.minUniformBufferOffsetAlignment; + } + } + + // Ensure the request ends on an aligned address + if (request->alignment > config.nearest_multiple) { + request->properties.nearest_multiple = request->alignment; + } + + size_t actual_alignment = conform_alignment(request->alignment, memory_requirements.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, memory_requirements.size, actual_alignment, request->properties.nearest_multiple); + +#if defined(HL_VK_DEBUG_MEM) + if ((request->size != actual_size) || (request->alignment != actual_alignment) || (request->offset != actual_offset)) { + debug(nullptr) << "VulkanMemoryAllocator: Adjusting request to match requirements (\n" + << " size = " << (uint64_t)request->size << " => " << (uint64_t)actual_size << ",\n" + << " alignment = " << (uint64_t)request->alignment << " => " << (uint64_t)actual_alignment << ",\n" + << " offset = " << (uint64_t)request->offset << " => " << (uint64_t)actual_offset << ",\n" + << " required.size = " << (uint64_t)memory_requirements.size << ",\n" + << " required.alignment = " << (uint64_t)memory_requirements.alignment << "\n)\n"; + } +#endif + request->size = actual_size; + request->alignment = actual_alignment; + request->offset = actual_offset; + + return halide_error_code_success; +} + +int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequest *request) { + + VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); + if (instance == nullptr) { + return halide_error_code_internal_error; + } + + void *user_context = instance->owner_context; +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanMemoryAllocator: Conforming region request (" + << "user_context=" << user_context << " " + << "request=" << (void *)(region) << ") ... \n"; +#endif + + if ((instance->device == nullptr) || (instance->physical_device == nullptr)) { + error(user_context) << "VulkanRegionAllocator: Unable to conform region request! Invalid device handle!\n"; + return halide_error_code_internal_error; + } + +#if defined(HL_VK_DEBUG_MEM) + debug(nullptr) << "VulkanRegionAllocator: Conforming region request (" + << "size=" << (uint32_t)request->size << ", " + << "offset=" << (uint32_t)request->offset << ", " + << "dedicated=" << (request->dedicated ? 
"true" : "false") << " " + << "usage=" << halide_memory_usage_name(request->properties.usage) << " " + << "caching=" << halide_memory_caching_name(request->properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request->properties.visibility) << ")\n"; +#endif + + return instance->conform(user_context, request); +} + int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) { VulkanMemoryAllocator *instance = reinterpret_cast(instance_ptr); @@ -890,7 +1014,8 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg if (memory_requirements.size > region->size) { vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks); #ifdef DEBUG_RUNTIME - debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" << (uint64_t)memory_requirements.size << " bytes) ...\n"; + debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size (" + << (uint64_t)region->size << " => " << (uint64_t)memory_requirements.size << " bytes) ...\n"; #endif create_info.size = memory_requirements.size; VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer); diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp index b2190f63b592..26ce8066e118 100644 --- a/test/runtime/block_allocator.cpp +++ b/test/runtime/block_allocator.cpp @@ -1,3 +1,7 @@ +// NOTE: Uncomment the following two defines to enable debug output +// #define DEBUG_RUNTIME +// #define DEBUG_RUNTIME_INTERNAL + #include "HalideRuntime.h" #include "common.h" @@ -39,6 +43,17 @@ int deallocate_block(void *user_context, MemoryBlock *block) { return halide_error_code_success; } +int conform_block(void *user_context, MemoryRequest *request) { + + debug(user_context) << "Test : conform_block (" + << "request_size=" << int32_t(request->size) << " " + << "request_offset=" << int32_t(request->offset) << " " + << "request_alignment=" << int32_t(request->alignment) << " " + << ") ..."; + + return halide_error_code_success; +} + int allocate_region(void *user_context, MemoryRegion *region) { region->handle = (void *)1; allocated_region_memory += region->size; @@ -65,17 +80,38 @@ int deallocate_region(void *user_context, MemoryRegion *region) { return halide_error_code_success; } +int conform_region(void *user_context, MemoryRequest *request) { + size_t actual_alignment = conform_alignment(request->alignment, 0); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, actual_alignment); + + debug(user_context) << "Test : conform_region (\n " + << "request_size=" << int32_t(request->size) << "\n " + << "request_offset=" << int32_t(request->offset) << "\n " + << "request_alignment=" << int32_t(request->alignment) << "\n " + << "actual_size=" << int32_t(actual_size) << "\n " + << "actual_offset=" << int32_t(actual_offset) << "\n " + << "actual_alignment=" << int32_t(actual_alignment) << "\n" + << ") ..."; + + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + return halide_error_code_success; +} + } // end namespace int main(int argc, char **argv) { void *user_context = (void *)1; SystemMemoryAllocatorFns system_allocator = {allocate_system, deallocate_system}; - MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; - MemoryRegionAllocatorFns region_allocator = {allocate_region, 
deallocate_region}; // test region allocator class interface { + // Use custom conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; BlockResource block_resource = {}; @@ -164,8 +200,104 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test region allocator conform request + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + // test zero size request + MemoryRequest request = {0}; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=" << int32_t(request.size) << " " + << "request.alignment=" << int32_t(request.alignment) << " " + << ") ..."; + + halide_abort_if_false(user_context, request.size == size_t(0)); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + size_t nm = padded_size; + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + // test round up size and offset to alignment + request.size = 1; + request.offset = 1; + request.alignment = 32; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size == 32); + halide_abort_if_false(user_context, request.offset == 32); + halide_abort_if_false(user_context, request.alignment == 32); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t os = 1; os < sz; ++os) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.offset = os; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.offset=(" << os << " => " << int32_t(request.offset) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.offset 
== aligned_offset(os, a)); + halide_abort_if_false(user_context, request.alignment == a); + } + } + } + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // test region allocator nearest_multiple padding { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + // Manually create a block resource and allocate memory size_t block_size = 4 * 1024 * 1024; size_t padded_size = 32; @@ -245,6 +377,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -296,11 +431,58 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test conform request + { + uint32_t mbs = 1024; // min block size + BlockAllocator::Config config = {0}; + config.minimum_block_size = mbs; + + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 0); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : block_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(mbs, (((sz + a - 1) / a) * a))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + BlockAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; 
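 // (Illustrative note, not part of the patch.) The third entry in each allocator-fns struct is the
 // new optional conform callback: nullptr keeps the built-in conform path, while a custom hook such
 // as the conform_region() defined earlier in this file is wired in the same way, e.g.
 //
 //   MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region};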
BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -340,6 +522,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators);
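
As a worked example of how the new hooks compose, the following is a minimal sketch only: the helper pad_block_request and the wrapper conform_hook_example are hypothetical names not introduced by this diff, and the sketch assumes it is built alongside test/runtime/block_allocator.cpp so that the allocate_block/deallocate_block and allocate_region/deallocate_region callbacks defined there are visible. It shows a caller-supplied conform callback in the third slot of MemoryBlockAllocatorFns; BlockAllocator::conform() first applies nearest_multiple rounding and the configured minimum/maximum block sizes, then hands the adjusted request to the custom callback.

// Hypothetical conform hook: pad every block request up to a 256-byte multiple.
int pad_block_request(void *user_context, MemoryRequest *request) {
    const size_t multiple = 256;
    request->size = ((request->size + multiple - 1) / multiple) * multiple;
    return halide_error_code_success;
}

void conform_hook_example(void *user_context, SystemMemoryAllocatorFns system_allocator) {
    BlockAllocator::Config config = {0};
    config.minimum_block_size = 1024;

    // The third slot of each *AllocatorFns struct is the optional conform callback.
    MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, pad_block_request};
    MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr};
    BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator};
    BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators);

    MemoryRequest request = {0};
    request.size = 100;
    instance->conform(user_context, &request);
    // BlockAllocator::conform() raises the size to minimum_block_size (1024) and then calls
    // pad_block_request, which leaves it unchanged here since 1024 is already a multiple of 256.

    BlockAllocator::destroy(user_context, instance);
}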