15 changes: 13 additions & 2 deletions src/Target.cpp
@@ -1304,17 +1304,28 @@ int Target::get_arm_v8_lower_bound() const {
}

bool Target::supports_type(const Type &t) const {
if (has_feature(Vulkan)) {
if (t.is_float() && t.bits() == 64) {
return has_feature(Target::VulkanFloat64);
} else if (t.is_float() && t.bits() == 16) {
return has_feature(Target::VulkanFloat16);
} else if (t.is_int_or_uint() && t.bits() == 64) {
return has_feature(Target::VulkanInt64);
} else if (t.is_int_or_uint() && t.bits() == 16) {
return has_feature(Target::VulkanInt16);
} else if (t.is_int_or_uint() && t.bits() == 8) {
return has_feature(Target::VulkanInt8);
}
}
if (t.bits() == 64) {
if (t.is_float()) {
return (!has_feature(Metal) &&
!has_feature(D3D12Compute) &&
(!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)) &&
(!has_feature(Vulkan) || has_feature(Target::VulkanFloat64)) &&
!has_feature(WebGPU));
} else {
return (!has_feature(Metal) &&
!has_feature(D3D12Compute) &&
(!has_feature(Vulkan) || has_feature(Target::VulkanInt64)) &&
!has_feature(WebGPU));
}
}
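
A hedged usage sketch of the new per-type gating, from the Halide front end. The feature names come from the hunk above; `get_host_target()` is only an illustrative starting point, not part of this change:

```cpp
#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    // With plain Vulkan, 8/16/64-bit types are reported as unsupported until
    // the matching capability feature is also set on the Target.
    Target t = get_host_target().with_feature(Target::Vulkan);

    bool int8_plain = t.supports_type(UInt(8));                                      // false
    bool int8_gated = t.with_feature(Target::VulkanInt8).supports_type(UInt(8));     // true
    bool f64_gated = t.with_feature(Target::VulkanFloat64).supports_type(Float(64)); // true

    printf("%d %d %d\n", int8_plain, int8_gated, f64_gated);
    return 0;
}
```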
2 changes: 1 addition & 1 deletion src/runtime/internal/memory_resources.h
@@ -151,7 +151,7 @@ ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment)
ALWAYS_INLINE size_t conform_size(size_t offset, size_t size, size_t alignment, size_t nearest_multiple) {
size_t adjusted_size = aligned_size(offset, size, alignment);
adjusted_size = (alignment > adjusted_size) ? alignment : adjusted_size;
if (nearest_multiple > 0) {
if ((nearest_multiple > 0) && ((adjusted_size % nearest_multiple) != 0)) {
size_t rounded_size = (((adjusted_size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple);
return rounded_size;
} else {
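
As a worked example of the rounding this hunk changes, here is a minimal standalone sketch. The real aligned_size() pads for offset alignment and is not shown in this diff, so it is stubbed out here under the assumption that the offset is already aligned:

```cpp
#include <cstddef>
#include <cstdio>

// Simplifying stand-in for the real helper: assumes the offset is already aligned.
static size_t aligned_size(size_t /*offset*/, size_t size, size_t /*alignment*/) {
    return size;
}

static size_t conform_size(size_t offset, size_t size, size_t alignment, size_t nearest_multiple) {
    size_t adjusted_size = aligned_size(offset, size, alignment);
    adjusted_size = (alignment > adjusted_size) ? alignment : adjusted_size;
    // New early-out in this PR: only round up when a nearest_multiple is
    // requested and the size is not already a multiple of it.
    if ((nearest_multiple > 0) && ((adjusted_size % nearest_multiple) != 0)) {
        return ((adjusted_size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple;
    }
    return adjusted_size;
}

int main() {
    printf("%zu\n", conform_size(0, 100, 16, 48)); // rounded up to 144
    printf("%zu\n", conform_size(0, 96, 16, 48));  // already a multiple of 48: stays 96
    printf("%zu\n", conform_size(0, 8, 16, 0));    // grows to the 16-byte alignment: 16
    return 0;
}
```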
54 changes: 48 additions & 6 deletions src/runtime/internal/region_allocator.h
@@ -74,7 +74,7 @@ class RegionAllocator {
BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region);

// Returns true if the given region can be split to accommodate the given size
bool can_split(const BlockRegion *region, const MemoryRequest &request) const;
bool can_split(void *user_context, const BlockRegion *region, const MemoryRequest &request) const;

// Splits the given block region into a smaller region to accommodate the given size, followed by empty space for the remaining
BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request);
@@ -195,7 +195,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &
return nullptr;
}

if (can_split(block_region, region_request)) {
if (can_split(user_context, block_region, region_request)) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") "
<< "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)";
@@ -443,8 +443,29 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe
return block_region;
}

bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const {
return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0));
bool RegionAllocator::can_split(void *user_context, const BlockRegion *block_region, const MemoryRequest &split_request) const {

// See if we can actually split the block region and create empty space big enough
if (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)) {

// We can only split if there's still room left after conforming the allocation request since the
// conform method may actually grow the requested size to accommodate alignment constraints
MemoryRequest test_request = split_request;
test_request.size = block_region->memory.size - test_request.size;
test_request.offset = block_region->memory.offset + test_request.size;
int error_code = conform(user_context, &test_request);
if (error_code) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(nullptr) << "RegionAllocator: Failed to conform test request for splitting block region!\n";
#endif
return false;
}

if ((block_region->memory.size - test_request.size) > 0) {
return true;
}
}
return false;
}

BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) {
@@ -470,8 +491,9 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion

#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Splitting "
<< "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) "
<< "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)";
<< "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) into ...\n\t"
<< "existing region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size - split_request.size) << " bytes)\n\t"
<< "empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)\n";
#endif
BlockRegion *next_region = block_region->next_ptr;
BlockRegion *empty_region = create_block_region(user_context, split_request);
@@ -484,6 +506,12 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion
empty_region->prev_ptr = block_region;
block_region->next_ptr = empty_region;
block_region->memory.size -= empty_region->memory.size;

#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Split block region into ...\n\t"
<< "existing region (ptr=" << (void *)block_region << " prev_ptr=" << block_region->prev_ptr << " next_ptr=" << block_region->next_ptr << " offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n\t"
<< "empty region (ptr=" << (void *)empty_region << " prev_ptr=" << empty_region->prev_ptr << " next_ptr=" << empty_region->next_ptr << " offset=" << (int32_t)empty_region->memory.offset << " size=" << (int32_t)(empty_region->memory.size) << " bytes)\n";
#endif
return empty_region;
}
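
A toy model of the list splice performed above, using simplified stand-in types rather than the runtime's BlockRegion: the empty region is linked in directly after the region being split, which keeps only the bytes in front of it:

```cpp
#include <cstddef>
#include <cstdio>

struct Region {
    size_t offset = 0, size = 0;
    Region *prev_ptr = nullptr, *next_ptr = nullptr;
};

// Carve `empty_size` bytes off the tail of `r` and splice the new node in after it.
Region *split(Region *r, size_t empty_size) {
    Region *empty = new Region{r->offset + (r->size - empty_size), empty_size};
    empty->next_ptr = r->next_ptr;
    if (r->next_ptr != nullptr) {
        r->next_ptr->prev_ptr = empty;
    }
    empty->prev_ptr = r;
    r->next_ptr = empty;
    r->size -= empty_size;  // the existing region keeps the front portion
    return empty;
}

int main() {
    Region block = {0, 4096};
    Region *empty = split(&block, 1024);
    printf("existing: offset=%zu size=%zu\n", block.offset, block.size);    // 0, 3072
    printf("empty:    offset=%zu size=%zu\n", empty->offset, empty->size);  // 3072, 1024
    delete empty;
    return 0;
}
```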

@@ -605,8 +633,22 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r
#endif
halide_abort_if_false(user_context, allocators.region.allocate != nullptr);
halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available);

int error_code = 0;
MemoryRegion *memory_region = &(block_region->memory);
if (memory_region->size <= 0) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << " skipping zero size region ("
<< "block_ptr=" << (void *)block_region->block_ptr << " "
<< "block_region=" << (void *)block_region << " "
<< "memory_offset=" << (uint32_t)(block_region->memory.offset) << " "
<< "memory_size=" << (uint32_t)(block_region->memory.size) << " "
<< "block_reserved=" << (uint32_t)block->reserved << " "
<< ")\n";
#endif
return error_code;
}

if (memory_region->handle == nullptr) {
error_code = allocators.region.allocate(user_context, memory_region);
memory_region->is_owner = true;
92 changes: 63 additions & 29 deletions src/runtime/vulkan.cpp
@@ -1199,13 +1199,6 @@ WEAK int halide_vulkan_run(void *user_context,
}
}
}

// 2b. Create the pipeline layout
error_code = vk_create_pipeline_layout(user_context, ctx.allocator, shader_module->shader_count, shader_module->descriptor_set_layouts, &(shader_module->pipeline_layout));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to create pipeline layout!\n";
return error_code;
}
}

VulkanDispatchData dispatch_data = {};
@@ -1219,16 +1212,8 @@

VulkanShaderBinding *entry_point_binding = (shader_module->shader_bindings + entry_point_index);

// 2c. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size)
error_code = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, shader_module->shader_module, shader_module->pipeline_layout, &(entry_point_binding->compute_pipeline));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to setup compute pipeline!\n";
return error_code;
}

// 2d. Create a descriptor set
if (entry_point_binding->descriptor_set == VK_NULL_HANDLE) {

// 2c. If Push Descriptor Set isn't supported, then allocate a descriptor set
if ((vkCmdPushDescriptorSetKHR == nullptr) && (entry_point_binding->descriptor_set == VK_NULL_HANDLE)) {
// Construct a descriptor pool
//
// NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's
@@ -1250,7 +1235,7 @@
}
}

// 3a. Create a buffer for the scalar parameters
// 2d. Create a buffer for the scalar parameters
if ((entry_point_binding->args_region == nullptr) && entry_point_binding->uniform_buffer_count) {
size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer);
if (scalar_buffer_size > 0) {
@@ -1262,7 +1247,7 @@
}
}

// 3b. Update uniform buffer with scalar parameters
// 2e. Update uniform buffer with scalar parameters
VkBuffer *args_buffer = nullptr;
if ((entry_point_binding->args_region != nullptr) && entry_point_binding->uniform_buffer_count) {
error_code = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_binding->args_region, arg_sizes, args, arg_is_buffer);
@@ -1278,10 +1263,28 @@
}
}

// 3c. Update buffer bindings for descriptor set
error_code = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set);
// 2f. If Push Descriptor Set isn't supported, then update the buffer bindings for the allocated descriptor set
if (vkCmdPushDescriptorSetKHR == nullptr) {
error_code = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
return error_code;
}
}

// 2b. Create the pipeline layout
if (shader_module->pipeline_layout == VK_NULL_HANDLE) {
error_code = vk_create_pipeline_layout(user_context, ctx.allocator, shader_module->shader_count, shader_module->descriptor_set_layouts, &(shader_module->pipeline_layout));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to create pipeline layout!\n";
return error_code;
}
}

// 3. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size)
error_code = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, shader_module->shader_module, shader_module->pipeline_layout, &(entry_point_binding->compute_pipeline));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
error(user_context) << "Vulkan: Failed to setup compute pipeline!\n";
return error_code;
}

Expand All @@ -1293,18 +1296,49 @@ WEAK int halide_vulkan_run(void *user_context,
}

// 5. Fill the command buffer
error_code = vk_fill_command_buffer_with_dispatch_call(user_context,
ctx.device, cmds.command_buffer,
entry_point_binding->compute_pipeline,
shader_module->pipeline_layout,
entry_point_binding->descriptor_set,
entry_point_index,
blocksX, blocksY, blocksZ);
error_code = vk_begin_command_buffer(user_context, cmds.command_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to start command buffer for dispatch call!\n";
return error_code;
}
error_code = vk_bind_pipeline(user_context, cmds.command_buffer, entry_point_binding->compute_pipeline);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to bind compute pipeline to command buffer for dispatch call!\n";
return error_code;
}

if (vkCmdPushDescriptorSetKHR != nullptr) {
error_code = vk_push_descriptor_set(user_context, ctx.allocator, cmds.command_buffer, entry_point_binding->compute_pipeline, shader_module->pipeline_layout, entry_point_binding->descriptor_set, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
return error_code;
}
} else {
error_code = vk_bind_descriptor_sets(user_context, cmds.command_buffer, shader_module->pipeline_layout, entry_point_binding->descriptor_set, entry_point_index);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to bind descriptor set to command buffer for dispatch call!\n";
return error_code;
}
}

error_code = vk_dispatch_kernel(user_context,
ctx.device, cmds.command_buffer,
entry_point_binding->compute_pipeline,
shader_module->pipeline_layout,
entry_point_binding->descriptor_set,
entry_point_index,
blocksX, blocksY, blocksZ);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to fill command buffer with dispatch call!\n";
return error_code;
}

error_code = vk_end_command_buffer(user_context, cmds.command_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to end command buffer for dispatch call!\n";
return error_code;
}

// 6. Submit the command buffer to our command queue
error_code = vk_submit_command_buffer(user_context, ctx.queue, cmds.command_buffer);
if (error_code != halide_error_code_success) {
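
For readers unfamiliar with VK_KHR_push_descriptor, here is a minimal, non-Halide sketch of the pattern the new path uses: bindings are written straight into the command buffer instead of being allocated from a descriptor pool. The handles are assumed to be created elsewhere, and vkCmdPushDescriptorSetKHR is assumed to have been resolved already (the runtime loads it through its VULKAN_FN table):

```cpp
#include <vulkan/vulkan.h>

// Sketch only: pushes a single storage-buffer binding for a compute dispatch.
// Requires the descriptor set layout to be created with
// VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR.
void push_one_storage_buffer(VkCommandBuffer cmd,
                             VkPipelineLayout layout,
                             VkBuffer storage_buffer) {
    VkDescriptorBufferInfo buffer_info = {storage_buffer, 0, VK_WHOLE_SIZE};

    VkWriteDescriptorSet write = {};
    write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
    write.dstSet = VK_NULL_HANDLE;  // push descriptors have no allocated set
    write.dstBinding = 0;
    write.descriptorCount = 1;
    write.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
    write.pBufferInfo = &buffer_info;

    // Writes the binding directly into the command buffer.
    vkCmdPushDescriptorSetKHR(cmd, VK_PIPELINE_BIND_POINT_COMPUTE,
                              layout, /*set=*/0, /*writeCount=*/1, &write);
}
```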
10 changes: 9 additions & 1 deletion src/runtime/vulkan_extensions.h
@@ -203,10 +203,18 @@ uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_
uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_table) {
const char *optional_ext_table[] = {
"VK_KHR_portability_subset", //< necessary for running under Molten (aka Vulkan on Mac)
VK_KHR_MAINTENANCE_1_EXTENSION_NAME,
VK_KHR_MAINTENANCE_2_EXTENSION_NAME,
VK_KHR_MAINTENANCE_3_EXTENSION_NAME,
VK_KHR_MAINTENANCE_4_EXTENSION_NAME,
VK_KHR_MAINTENANCE_5_EXTENSION_NAME,
VK_KHR_MAINTENANCE_6_EXTENSION_NAME,
VK_KHR_MAINTENANCE_7_EXTENSION_NAME,
VK_KHR_16BIT_STORAGE_EXTENSION_NAME,
VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME};
VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME,
VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME};

const uint32_t optional_ext_count = sizeof(optional_ext_table) / sizeof(optional_ext_table[0]);
ext_table.fill(user_context, (const char **)optional_ext_table, optional_ext_count);
return optional_ext_count;
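
These are optional, so a device that lacks the maintenance or push-descriptor extensions must still be usable. A generic sketch (not the Halide runtime's code) of how such a table is typically filtered against what the physical device reports before device creation:

```cpp
#include <vulkan/vulkan.h>
#include <cstring>
#include <vector>

// Keep only the requested extension names the device actually advertises.
std::vector<const char *> filter_supported(VkPhysicalDevice gpu,
                                           const char *const *wanted, uint32_t count) {
    uint32_t n = 0;
    vkEnumerateDeviceExtensionProperties(gpu, nullptr, &n, nullptr);
    std::vector<VkExtensionProperties> props(n);
    vkEnumerateDeviceExtensionProperties(gpu, nullptr, &n, props.data());

    std::vector<const char *> enabled;
    for (uint32_t i = 0; i < count; i++) {
        for (const VkExtensionProperties &p : props) {
            if (std::strcmp(p.extensionName, wanted[i]) == 0) {
                enabled.push_back(wanted[i]);
                break;
            }
        }
    }
    return enabled;  // feed into VkDeviceCreateInfo::ppEnabledExtensionNames
}
```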
1 change: 1 addition & 0 deletions src/runtime/vulkan_functions.h
@@ -204,6 +204,7 @@ VULKAN_FN(vkCmdCopyBuffer2)
// VULKAN_FN(vkCmdCopyImageToBuffer2)
// VULKAN_FN(vkCmdEndRendering)
VULKAN_FN(vkCmdPipelineBarrier2)
VULKAN_FN(vkCmdPushDescriptorSetKHR)
VULKAN_FN(vkCmdResetEvent2)
// VULKAN_FN(vkCmdResolveImage2)
// VULKAN_FN(vkCmdSetCullMode)