Prefer the faster and larger device-local-only memory on AMD integrated graphics; the heap budget value follows the same strategy as the blob allocator
Tencent · Aug 12, 2023 · 9271623 · 9271623
1 parent 75e10c6
commit 9271623
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 14 deletions.
diff --git a/src/allocator.cpp b/src/allocator.cpp
@@ -738,6 +738,16 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
         {
             // integrated gpu, prefer unified memory
             buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+            // on amd integrated gpu, there is a faster and larger device-only heap
+            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
+            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+            {
+                buffer_memory_type_index = device_local_memory_type_index;
+            }
         }
         else
         {
@@ -990,6 +1000,16 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
         {
             // integrated gpu, prefer unified memory
             image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+            // on amd integrated gpu, there is a faster and larger device-only heap
+            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
+            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+            {
+                image_memory_type_index = device_local_memory_type_index;
+            }
         }
         else
         {
@@ -1299,6 +1319,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
                 {
                     // integrated gpu, prefer unified memory
                     buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+                    // on amd integrated gpu, there is a faster and larger device-only heap
+                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+                    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
+                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+                    {
+                        buffer_memory_type_index = device_local_memory_type_index;
+                    }
                 }
                 else
                 {
@@ -1348,6 +1378,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
         {
             // integrated gpu, prefer unified memory
             buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+            // on amd integrated gpu, there is a faster and larger device-only heap
+            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
+            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+            {
+                buffer_memory_type_index = device_local_memory_type_index;
+            }
         }
         else
         {
@@ -1484,6 +1524,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
                 {
                     // integrated gpu, prefer unified memory
                     image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+                    // on amd integrated gpu, there is a faster and larger device-only heap
+                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+                    uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
+                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+                    {
+                        image_memory_type_index = device_local_memory_type_index;
+                    }
                 }
                 else
                 {
@@ -1578,6 +1628,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
         {
             // integrated gpu, prefer unified memory
             image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
+
+            // on amd integrated gpu, there is a faster and larger device-only heap
+            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
+            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
+            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
+            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
+            {
+                image_memory_type_index = device_local_memory_type_index;
+            }
         }
         else
         {

diff --git a/src/gpu.cpp b/src/gpu.cpp
@@ -3153,23 +3153,13 @@ uint32_t VulkanDevice::get_heap_budget() const
 {
     const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();
 
-    // the first device local heap
-    uint32_t device_local_heap_index = 0;
-    uint32_t device_local_heap_size = 0;
-    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
-    {
-        const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
-        if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
-        {
-            device_local_heap_index = i;
-            device_local_heap_size = memoryHeap.size / 1024 / 1024;
-            break;
-        }
-    }
+    uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index;
+    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
 
     if (!info.support_VK_EXT_memory_budget())
     {
         //         NCNN_LOGE("heap budget from assumption\n");
+        uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024;
 
         // we usually cannot use all heap
         // 70% for 4G+
@@ -3187,7 +3177,7 @@ uint32_t VulkanDevice::get_heap_budget() const
 
     vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);
 
-    return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
+    return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024;
 }
 
 void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const