Skip to content

Commit

Permalink
prefer faster and larger device local only memory on amd integrated graphics, heap budget value follows the same strategy as blob allocator
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Aug 12, 2023
1 parent 75e10c6 commit 9271623
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 14 deletions.
60 changes: 60 additions & 0 deletions src/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,16 @@ VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down Expand Up @@ -990,6 +1000,16 @@ VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize,
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down Expand Up @@ -1299,6 +1319,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down Expand Up @@ -1348,6 +1378,16 @@ VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
// integrated gpu, prefer unified memory
buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
buffer_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down Expand Up @@ -1484,6 +1524,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down Expand Up @@ -1578,6 +1628,16 @@ VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsiz
{
// integrated gpu, prefer unified memory
image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

// on amd integrated gpu, there is a faster and larger device-only heap
uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physical_device_memory_properties();
uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
{
image_memory_type_index = device_local_memory_type_index;
}
}
else
{
Expand Down
18 changes: 4 additions & 14 deletions src/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3153,23 +3153,13 @@ uint32_t VulkanDevice::get_heap_budget() const
{
const VkPhysicalDeviceMemoryProperties& memory_properties = info.physical_device_memory_properties();

// the first device local heap
uint32_t device_local_heap_index = 0;
uint32_t device_local_heap_size = 0;
for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
{
const VkMemoryHeap& memoryHeap = memory_properties.memoryHeaps[i];
if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
{
device_local_heap_index = i;
device_local_heap_size = memoryHeap.size / 1024 / 1024;
break;
}
}
uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index;
uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;

if (!info.support_VK_EXT_memory_budget())
{
// NCNN_LOGE("heap budget from assumption\n");
uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024;

// we usually cannot use all heap
// 70% for 4G+
Expand All @@ -3187,7 +3177,7 @@ uint32_t VulkanDevice::get_heap_budget() const

vkGetPhysicalDeviceMemoryProperties2KHR(info.physical_device(), &memoryProperties);

return memoryBudgetProperties.heapBudget[device_local_heap_index] / 1024 / 1024;
return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024;
}

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const
Expand Down

0 comments on commit 9271623

Please sign in to comment.