From c4dbb060860e7b74722ee000802fbaf5ca8bcc39 Mon Sep 17 00:00:00 2001 From: Steven Johnson Date: Mon, 24 Jun 2024 13:23:56 -0700 Subject: [PATCH] Backport #8259 to release/17.x (#8270) * [vulkan] Fix Vulkan SIMT mappings for GPU loop vars. (#8259) * Fix Vulkan SIMT mappings for GPU loop vars. Previous refactoring accidentally used the fully qualified var name rather than the categorized vulkan intrinsic name. * Avoid formatting the GPU kernel to a string for Vulkan (since it's binary SPIR-V needs to remain intact). --------- Co-authored-by: Derek Gerstmann Co-authored-by: Steven Johnson * Update CodeGen_Vulkan_Dev.cpp * [vulkan] Add conform API methods to memory allocator to fix block allocations (#8130) * Add conform API methods to block and region allocator classes Override conform requests for Vulkan memory allocator Cleanup memory requirement constraints for Vulkan Add conform test cases to block_allocator runtime test. * Clang format/tidy pas * Fix unsigned int comparisons * Clang format pass * Fix other unsigned int comparisons * Fix mismatched template types for max() * Fix whitespace for clang format --------- Co-authored-by: Derek Gerstmann * Backport fixes for Vulkan in src/runtime/internal for allocations. --------- Co-authored-by: Derek Gerstmann Co-authored-by: Derek Gerstmann --- src/CodeGen_C.cpp | 5 +- src/CodeGen_Vulkan_Dev.cpp | 22 +- src/runtime/internal/block_allocator.h | 210 +++++++------ src/runtime/internal/memory_arena.h | 2 +- src/runtime/internal/memory_resources.h | 6 +- src/runtime/internal/region_allocator.h | 374 +++++++++++++++--------- src/runtime/vulkan_memory.h | 265 +++++++++++++---- test/runtime/block_allocator.cpp | 357 +++++++++++++++++++++- 8 files changed, 945 insertions(+), 296 deletions(-) diff --git a/src/CodeGen_C.cpp b/src/CodeGen_C.cpp index 3939edc4a678..133afad203e4 100644 --- a/src/CodeGen_C.cpp +++ b/src/CodeGen_C.cpp @@ -1159,8 +1159,9 @@ void CodeGen_C::compile(const Buffer<> &buffer) { bool is_constant = buffer.dimensions() != 0; // If it is an GPU source kernel, we would like to see the actual output, not the - // uint8 representation. We use a string literal for this. - if (ends_with(name, "gpu_source_kernels")) { + // uint8 representation. We use a string literal for this. Since the Vulkan backend + // actually generates a SPIR-V binary, keep it as raw data to avoid textual reformatting. + if (ends_with(name, "gpu_source_kernels") && !target.has_feature(Target::Vulkan)) { stream << "static const char *" << name << "_string = R\"BUFCHARSOURCE("; stream.write((char *)b.host, num_elems); stream << ")BUFCHARSOURCE\";\n"; diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 7e06447a27fc..5a142dab4799 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2514,12 +2514,20 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_workgroup_size(SpvId kernel_func namespace { // Locate all the unique GPU variables used as SIMT intrinsics +// This pass is used to identify if LocalInvocationID and/or WorkgroupID +// need to be declared as variables for the entrypoint to the Kernel. Since +// these can only be declared once and their type is vec3, we don't +// care about the specific dims that are mapped to loop variables. 
class FindIntrinsicsUsed : public IRVisitor { using IRVisitor::visit; void visit(const For *op) override { if (CodeGen_GPU_Dev::is_gpu_var(op->name)) { + + // map the block or thread id name to the SIMT intrinsic definition auto intrinsic = simt_intrinsic(op->name); - intrinsics_used.insert(intrinsic.first); + + // mark the name of the intrinsic being used (without the dimension) + intrinsics_used.insert(intrinsic.first); // name only! } op->body.accept(this); } @@ -2555,20 +2563,22 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::declare_entry_point(const Stmt &s, SpvId s.accept(&find_intrinsics); SpvFactory::Variables entry_point_variables; - for (const std::string &intrinsic_name : find_intrinsics.intrinsics_used) { + for (const std::string &used_intrinsic : find_intrinsics.intrinsics_used) { - // The builtins are pointers to vec3 + // The builtins are pointers to vec3 and can only be declared once per kernel entrypoint SpvStorageClass storage_class = SpvStorageClassInput; SpvId intrinsic_type_id = builder.declare_type(Type(Type::UInt, 32, 3)); SpvId intrinsic_ptr_type_id = builder.declare_pointer_type(intrinsic_type_id, storage_class); - const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + intrinsic_name; + const std::string intrinsic_var_name = std::string("k") + std::to_string(kernel_index) + std::string("_") + used_intrinsic; SpvId intrinsic_var_id = builder.declare_global_variable(intrinsic_var_name, intrinsic_ptr_type_id, storage_class); SpvId intrinsic_loaded_id = builder.reserve_id(); builder.append(SpvFactory::load(intrinsic_type_id, intrinsic_loaded_id, intrinsic_var_id)); symbol_table.push(intrinsic_var_name, {intrinsic_loaded_id, storage_class}); - // Annotate that this is the specific builtin - SpvBuiltIn built_in_kind = map_simt_builtin(intrinsic_name); + // Map the used intrinsic name to the specific builtin + SpvBuiltIn built_in_kind = map_simt_builtin(used_intrinsic); + + // Add an annotation that indicates this variable is bound to the requested intrinsic SpvBuilder::Literals annotation_literals = {(uint32_t)built_in_kind}; builder.add_annotation(intrinsic_var_id, SpvDecorationBuiltIn, annotation_literals); diff --git a/src/runtime/internal/block_allocator.h b/src/runtime/internal/block_allocator.h index 3ff850e5b19f..89b1a929e79b 100644 --- a/src/runtime/internal/block_allocator.h +++ b/src/runtime/internal/block_allocator.h @@ -55,10 +55,11 @@ class BlockAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase the usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,13 +87,13 @@ class 
BlockAllocator { int destroy_region_allocator(void *user_context, RegionAllocator *region_allocator); // Reserves a block of memory for the requested size and returns the corresponding block entry, or nullptr on failure - BlockEntry *reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *reserve_block_entry(void *user_context, const MemoryRequest &request); // Locates the "best-fit" block entry for the requested size, or nullptr if none was found - BlockEntry *find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + BlockEntry *find_block_entry(void *user_context, const MemoryRequest &request); - // Creates a new block entry and int the list - BlockEntry *create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated); + // Creates a new block entry and adds it to the list + BlockEntry *create_block_entry(void *user_context, const MemoryRequest &request); // Releases the block entry from being used, and makes it available for further allocations int release_block_entry(void *user_context, BlockEntry *block_entry); @@ -113,7 +114,7 @@ class BlockAllocator { bool is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const; // Returns true if the given block is suitable for the request allocation - bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const; + bool is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const; Config config; LinkedList block_list; @@ -126,7 +127,7 @@ BlockAllocator *BlockAllocator::create(void *user_context, const Config &cfg, co allocators.system.allocate(user_context, sizeof(BlockAllocator))); if (result == nullptr) { - error(user_context) << "BlockAllocator: Failed to create instance! Out of memory!\n"; + error(user_context) << "BlockAllocator: Failed to create instance! Out of memory\n"; return nullptr; } @@ -160,12 +161,13 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r << "dedicated=" << (request.dedicated ? "true" : "false") << " " << "usage=" << halide_memory_usage_name(request.properties.usage) << " " << "caching=" << halide_memory_caching_name(request.properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ...\n"; + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockEntry *block_entry = reserve_block_entry(user_context, request.properties, request.size, request.dedicated); + // Reserve a block entry for use + BlockEntry *block_entry = reserve_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Failed to allocate new empty block of requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -173,14 +175,15 @@ MemoryRegion *BlockAllocator::reserve(void *user_context, const MemoryRequest &r halide_abort_if_false(user_context, block != nullptr); halide_abort_if_false(user_context, block->allocator != nullptr); + // Reserve an initial memory region for the block MemoryRegion *result = reserve_memory_region(user_context, block->allocator, request); if (result == nullptr) { // Unable to reserve region in an existing block ... create a new block and try again.
- block_entry = create_block_entry(user_context, request.properties, request.size, request.dedicated); + block_entry = create_block_entry(user_context, request); if (block_entry == nullptr) { error(user_context) << "BlockAllocator: Out of memory! Failed to allocate empty block of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; return nullptr; } @@ -288,7 +291,7 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl if (result == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Failed to allocate region of size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)\n"; #endif // allocator has enough free space, but not enough contiguous space // -- collect and try to reallocate @@ -299,48 +302,48 @@ MemoryRegion *BlockAllocator::reserve_memory_region(void *user_context, RegionAl return result; } -bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryProperties &properties, size_t size, bool dedicated) const { - if (!is_compatible_block(block, properties)) { +bool BlockAllocator::is_block_suitable_for_request(void *user_context, const BlockResource *block, const MemoryRequest &request) const { + if (!is_compatible_block(block, request.properties)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... incompatible properties!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << "\n" - << " block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << "\n" - << " block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << "\n"; - debug(user_context) << " request_size=" << (uint32_t)size << "\n" - << " request_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " request_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " request_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... incompatible properties! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "block_usage=" << halide_memory_usage_name(block->memory.properties.usage) << " " + << "block_caching=" << halide_memory_caching_name(block->memory.properties.caching) << " " + << "block_visibility=" << halide_memory_visibility_name(block->memory.properties.visibility) << " " + << "request_size=" << (uint32_t)request.size << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif // skip blocks that are using incompatible memory return false; } - if (dedicated && (block->reserved > 0)) { + if (request.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... 
can be used for dedicated allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... can be used for dedicated allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip blocks that can't be dedicated to a single allocation return false; } else if (block->memory.dedicated && (block->reserved > 0)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation!\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n"; + debug(user_context) << "BlockAllocator: skipping block ... already dedicated to an allocation! (" + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif // skip dedicated blocks that are already allocated return false; } size_t available = (block->memory.size - block->reserved); - if (available >= size) { + if (available >= request.size) { return true; } @@ -348,23 +351,23 @@ bool BlockAllocator::is_block_suitable_for_request(void *user_context, const Blo } BlockAllocator::BlockEntry * -BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::find_block_entry(void *user_context, const MemoryRequest &request) { BlockEntry *block_entry = block_list.back(); while (block_entry != nullptr) { BlockEntry *prev_entry = block_entry->prev_ptr; const BlockResource *block = static_cast(block_entry->value); - if (is_block_suitable_for_request(user_context, block, properties, size, dedicated)) { + if (is_block_suitable_for_request(user_context, block, request)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: found suitable block ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? "true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: found suitable block (" + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_entry; } @@ -373,39 +376,39 @@ BlockAllocator::find_block_entry(void *user_context, const MemoryProperties &pro if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: couldn't find suitable block!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " request_size=" << (uint32_t)size << "\n" - << " dedicated=" << (dedicated ? "true" : "false") << "\n" - << " usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: couldn't find suitable block! (" + << "user_context=" << (void *)(user_context) << " " + << "request_size=" << (uint32_t)request.size << " " + << "request_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "request_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "request_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "request_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } return block_entry; } BlockAllocator::BlockEntry * -BlockAllocator::reserve_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::reserve_block_entry(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: reserving block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: reserving block ... ! (" + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - BlockEntry *block_entry = find_block_entry(user_context, properties, size, dedicated); + BlockEntry *block_entry = find_block_entry(user_context, request); if (block_entry == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: creating block ... !\n" - << " requested_size=" << (uint32_t)size << "\n" - << " requested_is_dedicated=" << (dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(properties.visibility) << "\n"; + debug(user_context) << "BlockAllocator: creating block ... ! 
(" + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif - block_entry = create_block_entry(user_context, properties, size, dedicated); + block_entry = create_block_entry(user_context, request); } if (block_entry) { @@ -422,14 +425,14 @@ BlockAllocator::create_region_allocator(void *user_context, BlockResource *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Creating region allocator (" << "user_context=" << (void *)(user_context) << " " - << "block_resource=" << (void *)(block) << ")...\n"; + << "block_resource=" << (void *)(block) << ")..."; #endif halide_abort_if_false(user_context, block != nullptr); RegionAllocator *region_allocator = RegionAllocator::create( user_context, block, {allocators.system, allocators.region}); if (region_allocator == nullptr) { - error(user_context) << "BlockAllocator: Failed to create new region allocator!\n"; + error(user_context) << "BlockAllocator: Failed to create new region allocator\n"; return nullptr; } @@ -440,7 +443,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Destroying region allocator (" << "user_context=" << (void *)(user_context) << " " - << "region_allocator=" << (void *)(region_allocator) << ")...\n"; + << "region_allocator=" << (void *)(region_allocator) << ")..."; #endif if (region_allocator == nullptr) { return 0; @@ -449,7 +452,7 @@ int BlockAllocator::destroy_region_allocator(void *user_context, RegionAllocator } BlockAllocator::BlockEntry * -BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &properties, size_t size, bool dedicated) { +BlockAllocator::create_block_entry(void *user_context, const MemoryRequest &request) { if (config.maximum_pool_size && (pool_size() >= config.maximum_pool_size)) { error(user_context) << "BlockAllocator: No free blocks found! Maximum pool size reached (" << (int32_t)(config.maximum_pool_size) << " bytes or " @@ -459,13 +462,13 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p if (config.maximum_block_count && (block_count() >= config.maximum_block_count)) { error(user_context) << "BlockAllocator: No free blocks found! 
Maximum block count reached (" - << (int32_t)(config.maximum_block_count) << ")!\n"; + << (int32_t)(config.maximum_block_count) << ")\n"; return nullptr; } BlockEntry *block_entry = block_list.append(user_context); if (block_entry == nullptr) { - debug(user_context) << "BlockAllocator: Failed to allocate new block entry!\n"; + debug(user_context) << "BlockAllocator: Failed to allocate new block entry\n"; return nullptr; } @@ -473,15 +476,19 @@ BlockAllocator::create_block_entry(void *user_context, const MemoryProperties &p debug(user_context) << "BlockAllocator: Creating block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "allocator=" << (void *)(allocators.block.allocate) << ")...\n"; + << "allocator=" << (void *)(allocators.block.allocate) << ")..."; #endif + // Constrain the request to a valid block allocation + MemoryRequest block_request = request; + conform(user_context, &block_request); + + // Create the block resource itself BlockResource *block = static_cast(block_entry->value); - block->memory.size = constrain_requested_size(size); + block->memory.size = block_request.size; block->memory.handle = nullptr; - block->memory.properties = properties; - block->memory.properties.nearest_multiple = max(config.nearest_multiple, properties.nearest_multiple); - block->memory.dedicated = dedicated; + block->memory.properties = block_request.properties; + block->memory.dedicated = block_request.dedicated; block->reserved = 0; block->allocator = create_region_allocator(user_context, block); alloc_memory_block(user_context, block); @@ -492,7 +499,7 @@ int BlockAllocator::release_block_entry(void *user_context, BlockAllocator::Bloc #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "BlockAllocator: Releasing block entry (" << "block_entry=" << (void *)(block_entry) << " " - << "block=" << (void *)(block_entry->value) << ")...\n"; + << "block=" << (void *)(block_entry->value) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -506,7 +513,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc debug(user_context) << "BlockAllocator: Destroying block entry (" << "block_entry=" << (void *)(block_entry) << " " << "block=" << (void *)(block_entry->value) << " " - << "deallocator=" << (void *)(allocators.block.deallocate) << ")...\n"; + << "deallocator=" << (void *)(allocators.block.deallocate) << ")..."; #endif BlockResource *block = static_cast(block_entry->value); if (block->allocator) { @@ -520,7 +527,7 @@ int BlockAllocator::destroy_block_entry(void *user_context, BlockAllocator::Bloc int BlockAllocator::alloc_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")...\n"; + debug(user_context) << "BlockAllocator: Allocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.allocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.allocate != nullptr); MemoryBlock *memory_block = &(block->memory); @@ -531,7 +538,7 @@ int BlockAllocator::free_memory_block(void *user_context, BlockResource *block) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void
*)allocators.block.deallocate << ")...\n"; + debug(user_context) << "BlockAllocator: Deallocating block (ptr=" << (void *)block << " allocator=" << (void *)allocators.block.deallocate << ")..."; #endif halide_abort_if_false(user_context, allocators.block.deallocate != nullptr); MemoryBlock *memory_block = &(block->memory); @@ -561,6 +568,33 @@ size_t BlockAllocator::constrain_requested_size(size_t size) const { return actual_size; } +int BlockAllocator::conform(void *user_context, MemoryRequest *request) const { + + request->properties.nearest_multiple = max(config.nearest_multiple, request->properties.nearest_multiple); + + if (request->properties.nearest_multiple) { + size_t nm = request->properties.nearest_multiple; + request->size = (((request->size + nm - 1) / nm) * nm); // round up to nearest multiple + } + + if (config.minimum_block_size) { + request->size = ((request->size < config.minimum_block_size) ? + config.minimum_block_size : + request->size); + } + if (config.maximum_block_size) { + request->size = ((request->size > config.maximum_block_size) ? + config.maximum_block_size : + request->size); + } + + if (allocators.block.conform) { + return allocators.block.conform(user_context, request); + } + + return 0; +} + bool BlockAllocator::is_compatible_block(const BlockResource *block, const MemoryProperties &properties) const { if (properties.caching != MemoryCaching::DefaultCaching) { if (properties.caching != block->memory.properties.caching) { diff --git a/src/runtime/internal/memory_arena.h b/src/runtime/internal/memory_arena.h index 5953e12e470a..7d6c33da8f5d 100644 --- a/src/runtime/internal/memory_arena.h +++ b/src/runtime/internal/memory_arena.h @@ -271,7 +271,7 @@ void *MemoryArena::create_entry(void *user_context, Block *block, uint32_t index void *entry_ptr = lookup_entry(user_context, block, index); block->free_index = block->indices[index]; block->status[index] = AllocationStatus::InUse; -#if DEBUG_RUNTIME_INTERNAL +#ifdef DEBUG_RUNTIME_INTERNAL memset(entry_ptr, 0, config.entry_size); #endif return entry_ptr; diff --git a/src/runtime/internal/memory_resources.h b/src/runtime/internal/memory_resources.h index e30afb0dd4ea..0be6041519a1 100644 --- a/src/runtime/internal/memory_resources.h +++ b/src/runtime/internal/memory_resources.h @@ -127,7 +127,7 @@ ALWAYS_INLINE bool is_power_of_two_alignment(size_t x) { // -- Alignment must be power of two! ALWAYS_INLINE size_t aligned_offset(size_t offset, size_t alignment) { halide_abort_if_false(nullptr, is_power_of_two_alignment(alignment)); - return (offset + (alignment - 1)) & ~(alignment - 1); + return (alignment == 0) ? 
(offset) : (offset + (alignment - 1)) & ~(alignment - 1); } // Returns a suitable alignment such that requested alignment is a suitable @@ -202,18 +202,22 @@ struct HalideSystemAllocatorFns { typedef int (*AllocateBlockFn)(void *, MemoryBlock *); typedef int (*DeallocateBlockFn)(void *, MemoryBlock *); +typedef int (*ConformBlockRequestFn)(void *, MemoryRequest *); struct MemoryBlockAllocatorFns { AllocateBlockFn allocate = nullptr; DeallocateBlockFn deallocate = nullptr; + ConformBlockRequestFn conform = nullptr; }; typedef int (*AllocateRegionFn)(void *, MemoryRegion *); typedef int (*DeallocateRegionFn)(void *, MemoryRegion *); +typedef int (*ConformBlockRegionFn)(void *, MemoryRequest *); struct MemoryRegionAllocatorFns { AllocateRegionFn allocate = nullptr; DeallocateRegionFn deallocate = nullptr; + ConformBlockRegionFn conform = nullptr; }; // -- diff --git a/src/runtime/internal/region_allocator.h b/src/runtime/internal/region_allocator.h index 13c6b69f12e7..3588389c3747 100644 --- a/src/runtime/internal/region_allocator.h +++ b/src/runtime/internal/region_allocator.h @@ -46,10 +46,11 @@ class RegionAllocator { // Public interface methods MemoryRegion *reserve(void *user_context, const MemoryRequest &request); - int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count - bool collect(void *user_context); //< returns true if any blocks were removed + int conform(void *user_context, MemoryRequest *request) const; //< conform the given request into a suitable allocation + int release(void *user_context, MemoryRegion *memory_region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *memory_region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *memory_region); //< retain the region and increase usage count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -73,13 +74,13 @@ class RegionAllocator { BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region); // Returns true if the given region can be split to accomodate the given size - bool can_split(const BlockRegion *region, size_t size) const; + bool can_split(const BlockRegion *region, const MemoryRequest &request) const; // Splits the given block region into a smaller region to accomodate the given size, followed by empty space for the remaining - BlockRegion *split_block_region(void *user_context, BlockRegion *region, size_t size, size_t alignment); + BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request); // Creates a new block region and adds it to the region list - BlockRegion *create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated); + BlockRegion *create_block_region(void *user_context, const MemoryRequest &request); // Creates a new block region and adds it to the region list int destroy_block_region(void *user_context, BlockRegion *region); @@ -137,44 +138,69 @@ int RegionAllocator::initialize(void *user_context, BlockResource *mb, const Mem allocators = ma; arena = MemoryArena::create(user_context, {sizeof(BlockRegion), MemoryArena::default_capacity, 0}, allocators.system); 
halide_abort_if_false(user_context, arena != nullptr); + MemoryRequest block_request = {}; + block_request.size = block->memory.size; + block_request.offset = 0; + block_request.alignment = block->memory.properties.alignment; + block_request.properties = block->memory.properties; + block_request.dedicated = block->memory.dedicated; block->allocator = this; - block->regions = create_block_region( - user_context, - block->memory.properties, - 0, block->memory.size, - block->memory.dedicated); + block->regions = create_block_region(user_context, block_request); + return 0; +} + +int RegionAllocator::conform(void *user_context, MemoryRequest *request) const { + if (allocators.region.conform) { + return allocators.region.conform(user_context, request); + } else { + size_t actual_alignment = conform_alignment(request->alignment, block->memory.properties.alignment); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, block->memory.properties.nearest_multiple); + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + } return 0; } MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &request) { halide_abort_if_false(user_context, request.size > 0); - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(request.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + + MemoryRequest region_request = request; + + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; +#endif + return nullptr; + } + size_t remaining = block->memory.size - block->reserved; - if (remaining < actual_size) { + if (remaining < region_request.size) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Unable to reserve more memory from block " - << "-- requested size (" << (int32_t)(request.size) << " bytes) " - << "greater than available (" << (int32_t)(remaining) << " bytes)!\n"; + << "-- requested size (" << (int32_t)(region_request.size) << " bytes) " + << "greater than available (" << (int32_t)(remaining) << " bytes)"; #endif return nullptr; } - BlockRegion *block_region = find_block_region(user_context, request); + BlockRegion *block_region = find_block_region(user_context, region_request); if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Failed to locate region for requested size (" - << (int32_t)(request.size) << " bytes)!\n"; + << (int32_t)(request.size) << " bytes)"; #endif return nullptr; } - if (can_split(block_region, request.size)) { + if (can_split(block_region, region_request)) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") " - << "to accomodate requested size (" << (int32_t)(request.size) << " bytes)!\n"; + << "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)"; #endif - split_block_region(user_context, block_region, request.size, request.alignment); + split_block_region(user_context, block_region, region_request); } alloc_block_region(user_context, block_region); @@ -200,9 +226,6 @@ int RegionAllocator::reclaim(void *user_context, MemoryRegion *memory_region) { } release_block_region(user_context, block_region); free_block_region(user_context, block_region); - if (can_coalesce(block_region)) { - block_region = coalesce_block_regions(user_context, block_region); - } return 0; } @@ -232,38 +255,56 @@ bool RegionAllocator::is_last_block_region(void *user_context, const BlockRegion bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, const BlockRegion *region, const MemoryRequest &request) const { if (!is_available(region)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not available! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not available! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } - // skip incompatible block regions for this request - if (!is_compatible_block_region(region, request.properties)) { + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... incompatible properties! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << "RegionAllocator: Failed to conform region request! 
Unable to reserve memory ...\n"; #endif return false; } - size_t actual_alignment = conform_alignment(request.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(region->memory.offset, request.size, actual_alignment, block->memory.properties.nearest_multiple); + // skip incompatible block regions for this request + if (!is_compatible_block_region(region, region_request.properties)) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << " skipping block region ... incompatible properties! (" + << " block_region=" << (void *)region + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; +#endif + return false; + } // is the adjusted size larger than the current region? - if (actual_size > region->memory.size) { + if (region_request.size > region->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: skipping block region ... not enough space for adjusted size! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " skipping block region ... not enough space for adjusted size! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(region_request.size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return false; } // will the adjusted size fit within the remaining unallocated space? - if ((actual_size + block->reserved) <= block->memory.size) { + if ((region_request.size + block->reserved) <= block->memory.size) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable block region! " - << " block_region=" << (void *)region << "\n"; + debug(user_context) << " found suitable block region! (" + << " block_region=" << (void *)region + << " request_size=" << (uint32_t)(request.size) + << " actual_size=" << (uint32_t)(region_request.size) + << " region_size=" << (uint32_t)(region->memory.size) + << ")"; #endif return true; // you betcha } @@ -272,20 +313,29 @@ bool RegionAllocator::is_block_region_suitable_for_request(void *user_context, c } BlockRegion *RegionAllocator::find_block_region(void *user_context, const MemoryRequest &request) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: find block region ( " + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; +#endif BlockRegion *block_region = block->regions; while (block_region != nullptr) { if (is_block_region_suitable_for_request(user_context, block_region, request)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: found suitable region ...\n" - << " user_context=" << (void *)(user_context) << "\n" - << " block_resource=" << (void *)block << "\n" - << " block_size=" << (uint32_t)block->memory.size << "\n" - << " block_reserved=" << (uint32_t)block->reserved << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: found suitable region ( " + << "user_context=" << (void *)(user_context) << " " + << "block_resource=" << (void *)block << " " + << "block_size=" << (uint32_t)block->memory.size << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? "true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif return block_region; } @@ -299,13 +349,13 @@ BlockRegion *RegionAllocator::find_block_region(void *user_context, const Memory if (block_region == nullptr) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: couldn't find suitable region!\n" - << " user_context=" << (void *)(user_context) << "\n" - << " requested_size=" << (uint32_t)request.size << "\n" - << " requested_is_dedicated=" << (request.dedicated ? "true" : "false") << "\n" - << " requested_usage=" << halide_memory_usage_name(request.properties.usage) << "\n" - << " requested_caching=" << halide_memory_caching_name(request.properties.caching) << "\n" - << " requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << "\n"; + debug(user_context) << "RegionAllocator: couldn't find suitable region! (" + << "user_context=" << (void *)(user_context) << " " + << "requested_size=" << (uint32_t)request.size << " " + << "requested_is_dedicated=" << (request.dedicated ? 
"true" : "false") << " " + << "requested_usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "requested_caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "requested_visibility=" << halide_memory_visibility_name(request.properties.visibility) << ")"; #endif } @@ -342,12 +392,12 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << "RegionAllocator: Freeing unused region to coalesce (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -361,7 +411,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "previous region (offset=" << (int32_t)prev_region->memory.offset << " size=" << (int32_t)(prev_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n!"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!"; #endif prev_region->next_ptr = block_region->next_ptr; @@ -379,7 +429,7 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Coalescing " << "next region (offset=" << (int32_t)next_region->memory.offset << " size=" << (int32_t)(next_region->memory.size) << " bytes) " - << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)!\n"; + << "into current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)"; #endif if (next_region->next_ptr) { @@ -393,11 +443,11 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe return block_region; } -bool RegionAllocator::can_split(const BlockRegion *block_region, size_t size) const { - return (block_region && (block_region->memory.size > size) && (block_region->usage_count == 0)); +bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const { + return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)); } -BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, size_t size, size_t alignment) { +BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) { if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL @@ -406,7 +456,7 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block_region->block_ptr->reserved << " " - << 
")\n"; + << ")"; #endif halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); @@ -414,31 +464,17 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion block_region->memory.handle = nullptr; } - size_t actual_alignment = conform_alignment(alignment, block->memory.properties.alignment); - size_t split_size = conform_size(block_region->memory.offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t split_offset = aligned_offset(block_region->memory.offset + size, actual_alignment); - size_t empty_size = block_region->memory.size - split_size; - -#ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Conforming size and alignment \n" - << " requested_size=" << (uint32_t)size << "\n" - << " split_size=" << (uint32_t)split_size << "\n" - << " requested_alignment=" << (uint32_t)alignment << " " - << " required_alignment=" << (uint32_t)block->memory.properties.alignment << " " - << " actual_alignment=" << (uint32_t)actual_alignment << ")\n"; -#endif + MemoryRequest split_request = request; + split_request.size = block_region->memory.size - request.size; + split_request.offset = block_region->memory.offset + request.size; #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Splitting " << "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) " - << "to create empty region (offset=" << (int32_t)split_offset << " size=" << (int32_t)(empty_size) << " bytes)!\n"; + << "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)"; #endif - BlockRegion *next_region = block_region->next_ptr; - BlockRegion *empty_region = create_block_region(user_context, - block_region->memory.properties, - split_offset, empty_size, - block_region->memory.dedicated); + BlockRegion *empty_region = create_block_region(user_context, split_request); halide_abort_if_false(user_context, empty_region != nullptr); empty_region->next_ptr = next_region; @@ -447,54 +483,64 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion } empty_region->prev_ptr = block_region; block_region->next_ptr = empty_region; - block_region->memory.size -= empty_size; + block_region->memory.size -= empty_region->memory.size; return empty_region; } -BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryProperties &properties, size_t offset, size_t size, bool dedicated) { +BlockRegion *RegionAllocator::create_block_region(void *user_context, const MemoryRequest &request) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Creating block region (" + debug(user_context) << "RegionAllocator: Creating block region request (" << "user_context=" << (void *)(user_context) << " " - << "offset=" << (uint32_t)offset << " " - << "size=" << (uint32_t)size << " " - << "alignment=" << (uint32_t)properties.alignment << " " - << "dedicated=" << (dedicated ? 
"true" : "false") << " " - << "usage=" << halide_memory_usage_name(properties.usage) << " " - << "caching=" << halide_memory_caching_name(properties.caching) << " " - << "visibility=" << halide_memory_visibility_name(properties.visibility) << ") ...\n"; + << "offset=" << (uint32_t)request.offset << " " + << "size=" << (uint32_t)request.size << " " + << "alignment=" << (uint32_t)request.properties.alignment << " " + << "dedicated=" << (request.dedicated ? "true" : "false") << " " + << "usage=" << halide_memory_usage_name(request.properties.usage) << " " + << "caching=" << halide_memory_caching_name(request.properties.caching) << " " + << "visibility=" << halide_memory_visibility_name(request.properties.visibility) << ") ..."; #endif - BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); - if (block_region == nullptr) { - error(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; + MemoryRequest region_request = request; + int error_code = conform(user_context, ®ion_request); + if (error_code) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to conform request for new block region!\n"; +#endif return nullptr; } + if (region_request.size == 0) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "RegionAllocator: Added block region (" - << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + debug(user_context) << "RegionAllocator: Failed to allocate new block region ... region size was zero!\n"; #endif + return nullptr; + } - size_t actual_alignment = conform_alignment(properties.alignment, block->memory.properties.alignment); - size_t actual_size = conform_size(offset, size, actual_alignment, block->memory.properties.nearest_multiple); - size_t actual_offset = aligned_offset(offset, actual_alignment); + BlockRegion *block_region = static_cast(arena->reserve(user_context, true)); + if (block_region == nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << "RegionAllocator: Failed to allocate new block region!\n"; +#endif + return nullptr; + } block_region->memory.handle = nullptr; - block_region->memory.offset = actual_offset; - block_region->memory.size = actual_size; - block_region->memory.properties = properties; - block_region->memory.dedicated = dedicated; + block_region->memory.offset = region_request.offset; + block_region->memory.size = region_request.size; + block_region->memory.properties = region_request.properties; + block_region->memory.dedicated = region_request.dedicated; block_region->status = AllocationStatus::Available; block_region->block_ptr = block; block_region->usage_count = 0; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Creating region (" + debug(user_context) << "RegionAllocator: Created block region allocation (" + << "user_context=" << (void *)(user_context) << " " << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " - << ")\n"; + << ")"; #endif return block_region; @@ -504,7 +550,12 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_ptr=" << ((block_region) ? 
((void *)block_region->block_ptr) : nullptr) << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << ((block_region) ? (uint32_t)(block_region->usage_count) : 0) << " " + << "memory_offset=" << ((block_region) ? (uint32_t)(block_region->memory.offset) : 0) << " " + << "memory_size=" << ((block_region) ? (uint32_t)(block_region->memory.size) : 0) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << ") ... "; #endif if (block_region == nullptr) { return 0; @@ -517,12 +568,13 @@ int RegionAllocator::release_block_region(void *user_context, BlockRegion *block if (block_region->status != AllocationStatus::Available) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Releasing region (" + debug(user_context) << " releasing region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " + << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)(block->reserved - block_region->memory.size) << " " - << ")\n"; + << ")"; #endif block->reserved -= block_region->memory.size; @@ -535,7 +587,7 @@ int RegionAllocator::destroy_block_region(void *user_context, BlockRegion *block #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying block region (" << "user_context=" << (void *)(user_context) << " " - << "block_region=" << (void *)(block_region) << ") ...\n"; + << "block_region=" << (void *)(block_region) << ") ..."; #endif block_region->usage_count = 0; @@ -549,7 +601,7 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Allocating region (user_context=" << (void *)(user_context) << " size=" << (int32_t)(block_region->memory.size) - << " offset=" << (int32_t)block_region->memory.offset << ")!\n"; + << " offset=" << (int32_t)block_region->memory.offset << ")"; #endif halide_abort_if_false(user_context, allocators.region.allocate != nullptr); halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available); @@ -560,25 +612,25 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r memory_region->is_owner = true; #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Allocating region (" + debug(user_context) << " allocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } else { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Re-using region (" + debug(user_context) << " re-using region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_offset=" << (uint32_t)(block_region->memory.offset) << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif } block_region->status = block_region->memory.dedicated ? 
AllocationStatus::Dedicated : AllocationStatus::InUse; @@ -590,24 +642,26 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Freeing block region (" << "user_context=" << (void *)(user_context) << " " + << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)(block_region) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "status=" << (uint32_t)block_region->status << " " - << "usage_count=" << (uint32_t)block_region->usage_count << ") ...\n"; + << "usage_count=" << (uint32_t)block_region->usage_count << " " + << "block_reserved=" << (uint32_t)block->reserved << ")"; #endif if ((block_region->usage_count == 0) && (block_region->memory.handle != nullptr)) { #ifdef DEBUG_RUNTIME_INTERNAL - debug(user_context) << "Freeing region (" + debug(user_context) << " deallocating region (" << "block_ptr=" << (void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif + // NOTE: Deallocate but leave memory size as is, so that coalesce can compute region merging sizes halide_abort_if_false(user_context, allocators.region.deallocate != nullptr); MemoryRegion *memory_region = &(block_region->memory); allocators.region.deallocate(user_context, memory_region); - block_region->memory.size = 0; - block_region->memory.offset = 0; block_region->memory.handle = nullptr; } block_region->usage_count = 0; @@ -618,7 +672,7 @@ int RegionAllocator::free_block_region(void *user_context, BlockRegion *block_re int RegionAllocator::release(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Releasing all regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif BlockRegion *block_region = block->regions; @@ -635,45 +689,77 @@ int RegionAllocator::release(void *user_context) { bool RegionAllocator::collect(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Collecting free block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; - uint32_t count = 0; + uint32_t collected_count = 0; + uint32_t remaining_count = 0; + uint64_t available_bytes = 0; + uint64_t scanned_bytes = 0; uint64_t reserved = block->reserved; debug(user_context) << " collecting unused regions (" << "block_ptr=" << (void *)block << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif bool has_collected = false; BlockRegion *block_region = block->regions; while (block_region != nullptr) { +#ifdef DEBUG_RUNTIME_INTERNAL + scanned_bytes += block_region->memory.size; + debug(user_context) << " checking region (" + << "block_ptr=" << (void *)block_region->block_ptr << " " + << "block_region=" << (void *)block_region << " " + << "usage_count=" << (uint32_t)(block_region->usage_count) << " " + << "status=" << (uint32_t)(block_region->status) << " " + << "memory_size=" << (uint32_t)(block_region->memory.size) << " " + << "block_reserved=" << (uint32_t)block->reserved << " " + << ")"; +#endif + if (can_coalesce(block_region)) { #ifdef DEBUG_RUNTIME_INTERNAL - count++; + collected_count++; debug(user_context) << " collecting region (" << "block_ptr=" << 
(void *)block_region->block_ptr << " " << "block_region=" << (void *)block_region << " " << "memory_size=" << (uint32_t)(block_region->memory.size) << " " << "block_reserved=" << (uint32_t)block->reserved << " " - << ")\n"; + << ")"; #endif block_region = coalesce_block_regions(user_context, block_region); has_collected = true; + } else { +#ifdef DEBUG_RUNTIME_INTERNAL + remaining_count++; +#endif } +#ifdef DEBUG_RUNTIME_INTERNAL + available_bytes += is_available(block_region) ? block_region->memory.size : 0; +#endif if (is_last_block_region(user_context, block_region)) { break; } block_region = block_region->next_ptr; } +#ifdef DEBUG_RUNTIME_INTERNAL + debug(user_context) << " scanned active regions (" + << "block_ptr=" << (void *)block << " " + << "total_count=" << (uint32_t)(collected_count + remaining_count) << " " + << "block_reserved=" << (uint32_t)(block->reserved) << " " + << "scanned_bytes=" << (uint32_t)(scanned_bytes) << " " + << "available_bytes=" << (uint32_t)(available_bytes) << " " + << ")"; +#endif if (has_collected) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << " collected unused regions (" << "block_ptr=" << (void *)block << " " - << "region_count=" << (uint32_t)count << " " - << "collected=" << (uint32_t)(reserved - block->reserved) << " " - << ")\n"; + << "collected_count=" << (uint32_t)collected_count << " " + << "remaining_count=" << (uint32_t)remaining_count << " " + << "reclaimed=" << (uint32_t)(reserved - block->reserved) << " " + << ")"; #endif } return has_collected; @@ -682,23 +768,27 @@ bool RegionAllocator::collect(void *user_context) { int RegionAllocator::destroy(void *user_context) { #ifdef DEBUG_RUNTIME_INTERNAL debug(user_context) << "RegionAllocator: Destroying all block regions (" - << "user_context=" << (void *)(user_context) << ") ...\n"; + << "user_context=" << (void *)(user_context) << ") ..."; #endif - for (BlockRegion *block_region = block->regions; block_region != nullptr;) { - - if (is_last_block_region(user_context, block_region)) { - destroy_block_region(user_context, block_region); - block_region = nullptr; - } else { - BlockRegion *prev_region = block_region; - block_region = block_region->next_ptr; - destroy_block_region(user_context, prev_region); + if (block->regions != nullptr) { + for (BlockRegion *block_region = block->regions; block_region != nullptr;) { + + if (is_last_block_region(user_context, block_region)) { + destroy_block_region(user_context, block_region); + block_region = nullptr; + } else { + BlockRegion *prev_region = block_region; + block_region = block_region->next_ptr; + destroy_block_region(user_context, prev_region); + } } } block->reserved = 0; block->regions = nullptr; block->allocator = nullptr; - MemoryArena::destroy(user_context, arena); + if (arena != nullptr) { + MemoryArena::destroy(user_context, arena); + } arena = nullptr; return 0; } diff --git a/src/runtime/vulkan_memory.h b/src/runtime/vulkan_memory.h index 70a6bda64e5d..055fbef72277 100644 --- a/src/runtime/vulkan_memory.h +++ b/src/runtime/vulkan_memory.h @@ -58,11 +58,12 @@ class VulkanMemoryAllocator { static int destroy(void *user_context, VulkanMemoryAllocator *allocator); // Public interface methods - MemoryRegion *reserve(void *user_context, MemoryRequest &request); - int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse - int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate - int retain(void *user_context, MemoryRegion *region); //< retain the region and 
increase its use count - bool collect(void *user_context); //< returns true if any blocks were removed + MemoryRegion *reserve(void *user_context, const MemoryRequest &request); + int conform(void *user_context, MemoryRequest *request); //< conforms the given memory request into one that can be allocated + int release(void *user_context, MemoryRegion *region); //< unmark and cache the region for reuse + int reclaim(void *user_context, MemoryRegion *region); //< free the region and consolidate + int retain(void *user_context, MemoryRegion *region); //< retain the region and increase its use count + bool collect(void *user_context); //< returns true if any blocks were removed int release(void *user_context); int destroy(void *user_context); @@ -86,9 +87,11 @@ class VulkanMemoryAllocator { static int allocate_block(void *instance_ptr, MemoryBlock *block); static int deallocate_block(void *instance_ptr, MemoryBlock *block); + static int conform_block_request(void *instance_ptr, MemoryRequest *request); static int allocate_region(void *instance_ptr, MemoryRegion *region); static int deallocate_region(void *instance_ptr, MemoryRegion *region); + static int conform_region_request(void *instance_ptr, MemoryRequest *request); size_t bytes_allocated_for_blocks() const; size_t blocks_allocated() const; @@ -113,6 +116,8 @@ class VulkanMemoryAllocator { MemoryProperties properties, uint32_t required_flags) const; + int lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements); + size_t block_byte_count = 0; size_t block_count = 0; size_t region_byte_count = 0; @@ -180,8 +185,8 @@ int VulkanMemoryAllocator::initialize(void *user_context, block_byte_count = 0; BlockAllocator::MemoryAllocators allocators; allocators.system = system_allocator; - allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block}; - allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region}; + allocators.block = {VulkanMemoryAllocator::allocate_block, VulkanMemoryAllocator::deallocate_block, VulkanMemoryAllocator::conform_block_request}; + allocators.region = {VulkanMemoryAllocator::allocate_region, VulkanMemoryAllocator::deallocate_region, VulkanMemoryAllocator::conform_region_request}; BlockAllocator::Config block_allocator_config = {0}; block_allocator_config.maximum_pool_size = cfg.maximum_pool_size; block_allocator_config.maximum_block_count = cfg.maximum_block_count; @@ -202,7 +207,7 @@ int VulkanMemoryAllocator::initialize(void *user_context, return halide_error_code_success; } -MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, MemoryRequest &request) { +MemoryRegion *VulkanMemoryAllocator::reserve(void *user_context, const MemoryRequest &request) { #if defined(HL_VK_DEBUG_MEM) debug(nullptr) << "VulkanMemoryAllocator: Reserving memory (" << "user_context=" << user_context << " " @@ -272,6 +277,7 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) { error(user_context) << "VulkanMemoryAllocator: Unable to map region! 
Invalid memory range !\n";
        return nullptr;
    }
+#if defined(HL_VK_DEBUG_MEM)
    debug(nullptr) << "VulkanMemoryAllocator: MapMemory ("
                   << "user_context=" << user_context << "\n"
                   << " region_size=" << (uint32_t)region->size << "\n"
@@ -279,8 +285,8 @@ void *VulkanMemoryAllocator::map(void *user_context, MemoryRegion *region) {
                   << " region_range.head_offset=" << (uint32_t)region->range.head_offset << "\n"
                   << " region_range.tail_offset=" << (uint32_t)region->range.tail_offset << "\n"
                   << " memory_offset=" << (uint32_t)memory_offset << "\n"
-                  << " memory_size=" << (uint32_t)memory_size << ") ...\n";
-
+                  << " memory_size=" << (uint32_t)memory_size << "\n)\n";
+#endif
    VkResult result = vkMapMemory(device, *device_memory, memory_offset, memory_size, 0, (void **)(&mapped_ptr));
    if (result != VK_SUCCESS) {
        error(user_context) << "VulkanMemoryAllocator: Mapping region failed! vkMapMemory returned error code: " << vk_get_error_name(result) << "\n";
@@ -528,6 +534,79 @@ VulkanMemoryAllocator::default_config() {
}
// --
+int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size, uint32_t usage_flags, VkMemoryRequirements *memory_requirements) {
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements ("
+                   << "user_context=" << user_context << " "
+                   << "size=" << (uint32_t)size << ", "
+                   << "usage_flags=" << usage_flags << ") ... \n";
+#endif
+    VkBufferCreateInfo create_info = {
+        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,  // struct type
+        nullptr,                               // struct extending this
+        0,                                     // create flags
+        size,                                  // buffer size (in bytes)
+        usage_flags,                           // buffer usage flags
+        VK_SHARING_MODE_EXCLUSIVE,             // sharing mode
+        0, nullptr};
+
+    // Create a buffer to determine alignment requirements
+    VkBuffer buffer = {0};
+    VkResult result = vkCreateBuffer(this->device, &create_info, this->alloc_callbacks, &buffer);
+    if (result != VK_SUCCESS) {
+#if defined(HL_VK_DEBUG_MEM)
+        debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer to find requirements!\n\t"
+                       << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";
+#endif
+        return halide_error_code_device_malloc_failed;
+    }
+
+    vkGetBufferMemoryRequirements(this->device, buffer, memory_requirements);
+    vkDestroyBuffer(this->device, buffer, this->alloc_callbacks);
+    return halide_error_code_success;
+}
+
+int VulkanMemoryAllocator::conform_block_request(void *instance_ptr, MemoryRequest *request) {
+
+    VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr);
+    if (instance == nullptr) {
+        return halide_error_code_internal_error;
+    }
+
+    void *user_context = instance->owner_context;
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Conforming block request ("
+                   << "user_context=" << user_context << " "
+                   << "request=" << (void *)(request) << ") ... \n";
+#endif
+
+    if ((instance->device == nullptr) || (instance->physical_device == nullptr)) {
+        error(user_context) << "VulkanRegionAllocator: Unable to conform block request! Invalid device handle!\n";
+        return halide_error_code_internal_error;
+    }
+
+    VkMemoryRequirements memory_requirements = {0};
+    uint32_t usage_flags = instance->select_memory_usage(user_context, request->properties);
+    int error_code = instance->lookup_requirements(user_context, request->size, usage_flags, &memory_requirements);
+    if (error_code != halide_error_code_success) {
+        error(user_context) << "VulkanRegionAllocator: Failed to conform block request! 
Unable to lookup requirements!\n";
+        return error_code;
+    }
+
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Block allocated ("
+                   << "size=" << (uint32_t)request->size << ", "
+                   << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", "
+                   << "required_size=" << (uint32_t)memory_requirements.size << ", "
+                   << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", "
+                   << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", "
+                   << "dedicated=" << (request->dedicated ? "true" : "false") << ")\n";
+#endif
+
+    request->size = memory_requirements.size;
+    request->properties.alignment = memory_requirements.alignment;
+    return halide_error_code_success;
+}
int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block) {
    VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr);
@@ -587,49 +666,6 @@ int VulkanMemoryAllocator::allocate_block(void *instance_ptr, MemoryBlock *block
    debug(nullptr) << "vkAllocateMemory: Allocated memory for device region (" << (uint64_t)block->size << " bytes) ...\n";
#endif
-    uint32_t usage_flags = instance->select_memory_usage(user_context, block->properties);
-
-    VkBufferCreateInfo create_info = {
-        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,  // struct type
-        nullptr,                               // struct extending this
-        0,                                     // create flags
-        sizeof(uint32_t),                      // buffer size (in bytes)
-        usage_flags,                           // buffer usage flags
-        VK_SHARING_MODE_EXCLUSIVE,             // sharing mode
-        0, nullptr};
-
-    // Create a buffer to determine alignment requirements
-    VkBuffer buffer = {0};
-    result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, &buffer);
-    if (result != VK_SUCCESS) {
-        debug(nullptr) << "VulkanMemoryAllocator: Failed to create buffer!\n\t"
-                       << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";
-        return halide_error_code_device_malloc_failed;
-    }
-
-    VkMemoryRequirements memory_requirements = {0};
-    vkGetBufferMemoryRequirements(instance->device, buffer, &memory_requirements);
-    vkDestroyBuffer(instance->device, buffer, instance->alloc_callbacks);
-
-#if defined(HL_VK_DEBUG_MEM)
-    debug(nullptr) << "VulkanMemoryAllocator: Block allocated ("
-                   << "size=" << (uint32_t)block->size << ", "
-                   << "alignment=" << (uint32_t)memory_requirements.alignment << ", "
-                   << "uniform_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minUniformBufferOffsetAlignment << ", "
-                   << "storage_buffer_offset_alignment=" << (uint32_t)instance->physical_device_limits.minStorageBufferOffsetAlignment << ", "
-                   << "dedicated=" << (block->dedicated ? "true" : "false") << ")\n";
-#endif
-
-    // Enforce any alignment constrainst reported by the device limits for each usage type
-    if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) {
-        block->properties.alignment = instance->physical_device_limits.minStorageBufferOffsetAlignment;
-    } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) {
-        block->properties.alignment = instance->physical_device_limits.minUniformBufferOffsetAlignment;
-    }
-    // Some drivers appear to report a buffer alignment constraint (regardless of usage) that can be larger than either of the above
-    if (memory_requirements.alignment > block->properties.alignment) {
-        block->properties.alignment = memory_requirements.alignment;
-    }
    block->handle = (void *)device_memory;
    instance->block_byte_count += block->size;
    instance->block_count++;
@@ -810,6 +846,98 @@ uint32_t VulkanMemoryAllocator::select_memory_type(void *user_context,
// --
+int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) {
+
+    // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met.
+    // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements
+    // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver,
+    // whose size may be required to be larger than our requested size (even though we will only ever touch the
+    // size of the region we're managing within our block)
+
+    VkMemoryRequirements memory_requirements = {0};
+    uint32_t usage_flags = select_memory_usage(user_context, request->properties);
+    int error_code = lookup_requirements(user_context, request->size, usage_flags, &memory_requirements);
+    if (error_code != halide_error_code_success) {
+        error(user_context) << "VulkanRegionAllocator: Failed to conform block request! 
Unable to lookup requirements!\n";
+        return error_code;
+    }
+
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements ("
+                   << "requested_size=" << (uint32_t)request->size << ", "
+                   << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", "
+                   << "required_size=" << (uint32_t)memory_requirements.size << ")\n";
+#endif
+
+    // Enforce any alignment constraints reported by the device limits for each usage type
+    if (usage_flags & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT) {
+        if ((request->alignment % this->physical_device_limits.minStorageBufferOffsetAlignment) != 0) {
+            request->alignment = this->physical_device_limits.minStorageBufferOffsetAlignment;
+        }
+    } else if (usage_flags & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT) {
+        if ((request->alignment % this->physical_device_limits.minUniformBufferOffsetAlignment) != 0) {
+            request->alignment = this->physical_device_limits.minUniformBufferOffsetAlignment;
+        }
+    }
+
+    // Ensure the request ends on an aligned address
+    if (request->alignment > config.nearest_multiple) {
+        request->properties.nearest_multiple = request->alignment;
+    }
+
+    size_t actual_alignment = conform_alignment(request->alignment, memory_requirements.alignment);
+    size_t actual_offset = aligned_offset(request->offset, actual_alignment);
+    size_t actual_size = conform_size(actual_offset, memory_requirements.size, actual_alignment, request->properties.nearest_multiple);
+
+#if defined(HL_VK_DEBUG_MEM)
+    if ((request->size != actual_size) || (request->alignment != actual_alignment) || (request->offset != actual_offset)) {
+        debug(nullptr) << "VulkanMemoryAllocator: Adjusting request to match requirements (\n"
+                       << " size = " << (uint64_t)request->size << " => " << (uint64_t)actual_size << ",\n"
+                       << " alignment = " << (uint64_t)request->alignment << " => " << (uint64_t)actual_alignment << ",\n"
+                       << " offset = " << (uint64_t)request->offset << " => " << (uint64_t)actual_offset << ",\n"
+                       << " required.size = " << (uint64_t)memory_requirements.size << ",\n"
+                       << " required.alignment = " << (uint64_t)memory_requirements.alignment << "\n)\n";
+    }
+#endif
+    request->size = actual_size;
+    request->alignment = actual_alignment;
+    request->offset = actual_offset;
+
+    return halide_error_code_success;
+}
+
+int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequest *request) {
+
+    VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr);
+    if (instance == nullptr) {
+        return halide_error_code_internal_error;
+    }
+
+    void *user_context = instance->owner_context;
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Conforming region request ("
+                   << "user_context=" << user_context << " "
+                   << "request=" << (void *)(request) << ") ... \n";
+#endif
+
+    if ((instance->device == nullptr) || (instance->physical_device == nullptr)) {
+        error(user_context) << "VulkanRegionAllocator: Unable to conform region request! Invalid device handle!\n";
+        return halide_error_code_internal_error;
+    }
+
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanRegionAllocator: Conforming region request ("
+                   << "size=" << (uint32_t)request->size << ", "
+                   << "offset=" << (uint32_t)request->offset << ", "
+                   << "dedicated=" << (request->dedicated ? "true" : "false") << " "
+                   << "usage=" << halide_memory_usage_name(request->properties.usage) << " "
+                   << "caching=" << halide_memory_caching_name(request->properties.caching) << " "
+                   << "visibility=" << halide_memory_visibility_name(request->properties.visibility) << ")\n";
+#endif
+
+    return instance->conform(user_context, request);
+}
+
int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *region) {
    VulkanMemoryAllocator *instance = reinterpret_cast<VulkanMemoryAllocator *>(instance_ptr);
@@ -867,6 +995,37 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg
                            << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";
        return halide_error_code_device_malloc_failed;
    }
+
+    // NOTE: Vulkan will only allow us to bind device memory to a buffer if the memory requirements are met.
+    // So now we have to check those (on every allocation) and potentially recreate the buffer if the requirements
+    // don't match the requested VkBuffer's properties. Note that this is the internal storage for the driver,
+    // whose size may be required to be larger than our requested size (even though we will only ever touch the
+    // size of the region we're managing within our block)
+    VkMemoryRequirements memory_requirements = {0};
+    vkGetBufferMemoryRequirements(instance->device, *buffer, &memory_requirements);
+
+#if defined(HL_VK_DEBUG_MEM)
+    debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements ("
+                   << "requested_size=" << (uint32_t)region->size << ", "
+                   << "required_alignment=" << (uint32_t)memory_requirements.alignment << ", "
+                   << "required_size=" << (uint32_t)memory_requirements.size << ")\n";
+#endif
+
+    if (memory_requirements.size > region->size) {
+        vkDestroyBuffer(instance->device, *buffer, instance->alloc_callbacks);
+#ifdef DEBUG_RUNTIME
+        debug(nullptr) << "VulkanMemoryAllocator: Reallocating buffer to match required size ("
+                       << (uint64_t)region->size << " => " << (uint64_t)memory_requirements.size << " bytes) ...\n";
+#endif
+        create_info.size = memory_requirements.size;
+        VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer);
+        if (result != VK_SUCCESS) {
+            error(user_context) << "VulkanRegionAllocator: Failed to recreate buffer!\n\t"
+                                << "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";
+            return halide_error_code_device_malloc_failed;
+        }
+    }
+
#ifdef DEBUG_RUNTIME
    debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n";
#endif
diff --git a/test/runtime/block_allocator.cpp b/test/runtime/block_allocator.cpp
index b56c817e1f4e..efe13b04bbf6 100644
--- a/test/runtime/block_allocator.cpp
+++ b/test/runtime/block_allocator.cpp
@@ -1,3 +1,7 @@
+// NOTE: Uncomment the following two defines to enable debug output
+// #define DEBUG_RUNTIME
+// #define DEBUG_RUNTIME_INTERNAL
+
#include "HalideRuntime.h"
#include "common.h"
@@ -39,6 +43,17 @@ int deallocate_block(void *user_context, MemoryBlock *block) {
    return halide_error_code_success;
}
+int conform_block(void *user_context, MemoryRequest *request) {
+
+    debug(user_context) << "Test : conform_block ("
+                        << "request_size=" << int32_t(request->size) << " "
+                        << "request_offset=" << int32_t(request->offset) << " "
+                        << "request_alignment=" << int32_t(request->alignment) << " "
+                        << ") ...";
+
+    return halide_error_code_success;
+}
+
int allocate_region(void *user_context, MemoryRegion *region) {
    region->handle = (void *)1;
    allocated_region_memory += region->size;
@@ -65,20 +80,306 @@ int 
deallocate_region(void *user_context, MemoryRegion *region) { return halide_error_code_success; } +int conform_region(void *user_context, MemoryRequest *request) { + size_t actual_alignment = conform_alignment(request->alignment, 0); + size_t actual_offset = aligned_offset(request->offset, actual_alignment); + size_t actual_size = conform_size(actual_offset, request->size, actual_alignment, actual_alignment); + + debug(user_context) << "Test : conform_region (\n " + << "request_size=" << int32_t(request->size) << "\n " + << "request_offset=" << int32_t(request->offset) << "\n " + << "request_alignment=" << int32_t(request->alignment) << "\n " + << "actual_size=" << int32_t(actual_size) << "\n " + << "actual_offset=" << int32_t(actual_offset) << "\n " + << "actual_alignment=" << int32_t(actual_alignment) << "\n" + << ") ..."; + + request->alignment = actual_alignment; + request->offset = actual_offset; + request->size = actual_size; + return halide_error_code_success; +} + } // end namespace int main(int argc, char **argv) { void *user_context = (void *)1; SystemMemoryAllocatorFns system_allocator = {allocate_system, deallocate_system}; - MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block}; - MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region}; - // test class interface + // test region allocator class interface + { + // Use custom conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, conform_region}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == request.size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->reclaim(user_context, r1); + HALIDE_CHECK(user_context, allocated_region_memory == (1 * request.size)); + + MemoryRegion *r3 = instance->reserve(user_context, request); + halide_abort_if_false(user_context, r3 != nullptr); + halide_abort_if_false(user_context, allocated_block_memory == block_size); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->retain(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->release(user_context, r3); + halide_abort_if_false(user_context, allocated_region_memory == (2 * request.size)); + instance->reclaim(user_context, r3); + 
instance->release(user_context, r1); + + // [r1 = available] [r2 = in use] [r3 = available] ... no contiguous regions + HALIDE_CHECK(user_context, false == instance->collect(user_context)); + + // release r2 to make three consecutive regions to collect + instance->release(user_context, r2); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test region allocator conform request + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + // test zero size request + MemoryRequest request = {0}; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=" << int32_t(request.size) << " " + << "request.alignment=" << int32_t(request.alignment) << " " + << ") ..."; + + halide_abort_if_false(user_context, request.size == size_t(0)); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + size_t nm = padded_size; + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + 
instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + // test round up size and offset to alignment + request.size = 1; + request.offset = 1; + request.alignment = 32; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size == 32); + halide_abort_if_false(user_context, request.offset == 32); + halide_abort_if_false(user_context, request.alignment == 32); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t os = 1; os < sz; ++os) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.offset = os; + request.alignment = a; + instance->conform(user_context, &request); + + debug(user_context) << "Test : region_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.offset=(" << os << " => " << int32_t(request.offset) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(nm, (((sz + nm - 1) / nm) * nm))); + halide_abort_if_false(user_context, request.offset == aligned_offset(os, a)); + halide_abort_if_false(user_context, request.alignment == a); + } + } + } + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test region allocator nearest_multiple padding + { + // Use default conform allocation request callbacks + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + + // Manually create a block resource and allocate memory + size_t block_size = 4 * 1024 * 1024; + size_t padded_size = 32; + BlockResource block_resource = {}; + MemoryBlock *memory_block = &(block_resource.memory); + memory_block->size = block_size; + memory_block->properties.nearest_multiple = padded_size; + allocate_block(user_context, memory_block); + + // Create a region allocator to manage the block resource + RegionAllocator::MemoryAllocators allocators = {system_allocator, region_allocator}; + RegionAllocator *instance = RegionAllocator::create(user_context, &block_resource, allocators); + + MemoryRequest request = {0}; + request.size = sizeof(int); + request.alignment = sizeof(int); + request.properties.visibility = MemoryVisibility::DefaultVisibility; + request.properties.caching = MemoryCaching::DefaultCaching; + request.properties.usage = MemoryUsage::DefaultUsage; + + MemoryRegion *r1 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r1 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == padded_size); + + MemoryRegion *r2 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r2 != nullptr); + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + + instance->release(user_context, r1); + 
instance->release(user_context, r2); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * padded_size)); + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size / 2; // request two half-size regions + MemoryRegion *r4 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r4 != nullptr); + MemoryRegion *r5 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r5 != nullptr); + HALIDE_CHECK(user_context, nullptr == instance->reserve(user_context, request)); // requesting a third should fail + + HALIDE_CHECK(user_context, allocated_block_memory == block_size); + HALIDE_CHECK(user_context, allocated_region_memory == (2 * request.size)); + + instance->release(user_context, r4); + instance->release(user_context, r5); + + HALIDE_CHECK(user_context, true == instance->collect(user_context)); + + request.size = block_size; + MemoryRegion *r6 = instance->reserve(user_context, request); + HALIDE_CHECK(user_context, r6 != nullptr); + + instance->destroy(user_context); + deallocate_block(user_context, memory_block); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_block_memory=" << int32_t(allocated_block_memory) << " " + << "allocated_region_memory=" << int32_t(allocated_region_memory) << " " + << ") ..."; + + HALIDE_CHECK(user_context, allocated_block_memory == 0); + HALIDE_CHECK(user_context, allocated_region_memory == 0); + + RegionAllocator::destroy(user_context, instance); + + debug(user_context) << "Test : region_allocator::destroy (" + << "allocated_system_memory=" << int32_t(get_allocated_system_memory()) << " " + << ") ..."; + + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + + // test block allocator class interface { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -130,11 +431,58 @@ int main(int argc, char **argv) { HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); } + // test conform request + { + uint32_t mbs = 1024; // min block size + BlockAllocator::Config config = {0}; + config.minimum_block_size = mbs; + + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; + BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; + BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); + + MemoryRequest request = {0}; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 0); + + // test round up size to alignment + request.size = 1; + request.alignment = 0; + request.properties.alignment = 4; + instance->conform(user_context, &request); + halide_abort_if_false(user_context, request.size != 4); + halide_abort_if_false(user_context, request.alignment != 4); + + for (uint32_t sz = 1; sz < 256; ++sz) { + for (uint32_t a = 2; a < sz; a *= 2) { + request.size = sz; + request.alignment = a; + 
instance->conform(user_context, &request); + + debug(user_context) << "Test : block_allocator::conform (" + << "request.size=(" << sz << " => " << int32_t(request.size) << ") " + << "request.alignment=(" << a << " => " << int32_t(request.alignment) << ") " + << "..."; + + halide_abort_if_false(user_context, request.size == max(mbs, (((sz + a - 1) / a) * a))); + halide_abort_if_false(user_context, request.alignment == a); + } + } + + BlockAllocator::destroy(user_context, instance); + HALIDE_CHECK(user_context, get_allocated_system_memory() == 0); + } + // allocation stress test { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators); @@ -174,6 +522,9 @@ int main(int argc, char **argv) { BlockAllocator::Config config = {0}; config.minimum_block_size = 1024; + // Use default conform allocation request callbacks + MemoryBlockAllocatorFns block_allocator = {allocate_block, deallocate_block, nullptr}; + MemoryRegionAllocatorFns region_allocator = {allocate_region, deallocate_region, nullptr}; BlockAllocator::MemoryAllocators allocators = {system_allocator, block_allocator, region_allocator}; BlockAllocator *instance = BlockAllocator::create(user_context, config, allocators);
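        // NOTE: illustrative sketch only -- `expected_size` / `expected_offset` below are not real
        // helpers, just shorthand for the rounding rules the conform tests above assert, assuming
        // the nearest_multiple padding `nm`, minimum_block_size `mbs`, and power-of-two alignment `a`
        // configured in those tests:
        //
        //   region_allocator::conform:
        //     expected_size(sz, nm)  == max(nm, ((sz + nm - 1) / nm) * nm)   // round up to a multiple of nm, never below nm
        //     expected_offset(os, a) == aligned_offset(os, a)                // round the offset up to the next a-byte boundary
        //     e.g. nm = 32: sz = 1 -> 32, sz = 33 -> 64, sz = 100 -> 128;  a = 8: os = 13 -> 16
        //
        //   block_allocator::conform:
        //     expected_size(sz, a)   == max(mbs, ((sz + a - 1) / a) * a)     // round up to a multiple of a, never below mbs
        //     e.g. mbs = 1024, a = 8: sz = 100 -> 1024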