Skip to content

Commit b068422

Browse files
committed
Adjust Vulkan queue selection and creation logic
- The queue selection logic rewrite addresses a bug in the old implementation where the code would pass an invalid queue index when selecting any queue other than number 0. - The queue priority has been changed to the lowest value, as it doesn't make sense to prioritize this workload over, e.g., graphics workloads. - The new implementation will attempt to use compute-only queues, which are common on AMD GPUs. It's not clear how much difference this will make, but hopefully it will lead to better scheduling. The priority changes were primarily made because the old implementation caused my system to hang, stutter, or otherwise become unstable and crash. After the change I'm able to run autotvm tuning inside a desktop environment. While it might be good to minimize the variance of execution time for autotvm, it isn't worth it when it causes system instability as described above.
1 parent 6d0351a commit b068422

File tree

1 file changed

+42
-30
lines changed

1 file changed

+42
-30
lines changed

src/runtime/vulkan/vulkan.cc

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
117117
}
118118
void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; }
119119
void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
120+
uint32_t FindComputeQueue(VkPhysicalDevice phy_dev);
120121
void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
121122
DLDataType type_hint) final {
122123
const auto& vctx = context(ctx.device_id);
@@ -490,33 +491,17 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
490491
std::vector<VkPhysicalDevice> all_phy_devs(phy_dev_count);
491492
VULKAN_CALL(vkEnumeratePhysicalDevices(instance_, &phy_dev_count, dmlc::BeginPtr(all_phy_devs)));
492493
for (VkPhysicalDevice phy_dev : all_phy_devs) {
493-
uint32_t queue_prop_count = 0;
494-
vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr);
495-
std::vector<VkQueueFamilyProperties> queue_props(queue_prop_count);
496-
vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count,
497-
dmlc::BeginPtr(queue_props));
498-
uint32_t queue_family_index = 0;
499-
std::vector<VkDeviceQueueCreateInfo> queue_create_info;
500-
float priority = 1.0f;
501-
for (uint32_t i = 0; i < queue_props.size(); i++) {
502-
// find queues that support compute
503-
if (VK_QUEUE_COMPUTE_BIT & queue_props[i].queueFlags) {
504-
VkDeviceQueueCreateInfo info;
505-
info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
506-
info.pNext = nullptr;
507-
info.flags = 0;
508-
info.queueFamilyIndex = i;
509-
info.queueCount = 1;
510-
info.pQueuePriorities = &priority;
511-
512-
queue_create_info.push_back(info);
513-
// only use the first available queue for now
514-
if (queue_create_info.size() == 0) {
515-
queue_family_index = i;
516-
}
517-
}
518-
}
519-
if (queue_create_info.size() == 0) continue;
494+
uint32_t queue_family_index = FindComputeQueue(phy_dev);
495+
if (queue_family_index == -1U) continue;
496+
float priority = 0.0f;
497+
498+
VkDeviceQueueCreateInfo queue_create_info;
499+
queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
500+
queue_create_info.pNext = nullptr;
501+
queue_create_info.flags = 0;
502+
queue_create_info.queueFamilyIndex = queue_family_index;
503+
queue_create_info.queueCount = 1;
504+
queue_create_info.pQueuePriorities = &priority;
520505

521506
VulkanContext ctx;
522507
// setup context
@@ -554,8 +539,8 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
554539
device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
555540
device_create_info.pNext = nullptr;
556541
device_create_info.flags = 0;
557-
device_create_info.queueCreateInfoCount = static_cast<uint32_t>(queue_create_info.size());
558-
device_create_info.pQueueCreateInfos = queue_create_info.data();
542+
device_create_info.queueCreateInfoCount = 1;
543+
device_create_info.pQueueCreateInfos = &queue_create_info;
559544
device_create_info.enabledLayerCount = 0;
560545
device_create_info.ppEnabledLayerNames = nullptr;
561546
device_create_info.enabledExtensionCount = extensions.size();
@@ -677,7 +662,34 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
677662
<< "\' phy_dev_id=" << context_[i].phy_device
678663
<< " use_immediate=" << context_[i].UseImmediate();
679664
}
680-
} // namespace vulkan
665+
}
666+
667+
// Selects the queue family of `phy_dev` to use for compute work.
//
// Returns the index (position in the list reported by
// vkGetPhysicalDeviceQueueFamilyProperties) of the first family that has the
// compute bit set but not the graphics bit; if there is none, the index of the
// first family with the compute bit set. If no family supports compute, returns
// -1, which as uint32_t is the maximum value (i.e. compares equal to -1U).
uint32_t VulkanDeviceAPI::FindComputeQueue(VkPhysicalDevice phy_dev) {
668+
// Two-step query: first obtain the family count, then fill the properties array.
uint32_t queue_prop_count = 0;
669+
vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr);
670+
std::vector<VkQueueFamilyProperties> queue_props(queue_prop_count);
671+
vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, dmlc::BeginPtr(queue_props));
672+
// Prefer compute-only queues. On certain devices supporting this (e.g. Mesa RADV), using
673+
// compute-only queues gives better responsiveness for other graphics workloads (e.g. desktop).
674+
auto compute_dedicated = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) {
675+
return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0 &&
676+
(VK_QUEUE_GRAPHICS_BIT & prop.queueFlags) == 0;
677+
});
678+
if (compute_dedicated == queue_props.end()) {
679+
// No compute-only family: fall back to the first family that supports compute at all.
auto compute = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) {
680+
return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0;
681+
});
682+
if (compute == queue_props.end()) {
683+
// No compute-capable family; -1 converts to the maximum uint32_t value as the failure sentinel.
return -1;
684+
} else {
685+
// The iterator offset from begin() is the queue family index.
return std::distance(queue_props.begin(), compute);
686+
}
687+
} else {
688+
return std::distance(queue_props.begin(), compute_dedicated);
689+
}
690+
}
691+
692+
// namespace vulkan
681693
class VulkanModuleNode;
682694

683695
// a wrapped function class to get packed func.

0 commit comments

Comments
 (0)