Adjust Vulkan queue selection and creation logic

ishitatsuyuki · ishitatsuyuki · commit b068422899f2 · 2020-10-11T23:49:03.000+09:00
- The queue selection logic rewrite addresses a bug in the old
implementation where the code would pass an invalid queue index when
selecting any queues other than number 0.
- The queue priority has been changed to the lowest value as it doesn't
make sense to prioritize this workload over e.g. graphics workload.
- The new implementation will attempt to use compute-only queues which
are common on AMD GPUs. It's not clear how much difference will this
make but hopefully it would lead to better scheduling.

The priority changes were primary made due to that the old
implementation causing my system to hang, stutter or otherwise become
unstable and crash. After the change I'm able to run autotvm tuning
inside a desktop environment. While it might be good to minimize the
variance of execution time for autotvm, it doesn't worth it when it causes
system instability as described above.
diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc
@@ -117,6 +117,7 @@ class VulkanDeviceAPI final : public DeviceAPI {
   }
   void SetDevice(TVMContext ctx) final { VulkanThreadEntry::ThreadLocal()->ctx = ctx; }
   void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final;
+  uint32_t FindComputeQueue(VkPhysicalDevice phy_dev);
   void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment,
                        DLDataType type_hint) final {
     const auto& vctx = context(ctx.device_id);
@@ -490,33 +491,17 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
   std::vector<VkPhysicalDevice> all_phy_devs(phy_dev_count);
   VULKAN_CALL(vkEnumeratePhysicalDevices(instance_, &phy_dev_count, dmlc::BeginPtr(all_phy_devs)));
   for (VkPhysicalDevice phy_dev : all_phy_devs) {
-    uint32_t queue_prop_count = 0;
-    vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr);
-    std::vector<VkQueueFamilyProperties> queue_props(queue_prop_count);
-    vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count,
-                                             dmlc::BeginPtr(queue_props));
-    uint32_t queue_family_index = 0;
-    std::vector<VkDeviceQueueCreateInfo> queue_create_info;
-    float priority = 1.0f;
-    for (uint32_t i = 0; i < queue_props.size(); i++) {
-      // find queues that support compute
-      if (VK_QUEUE_COMPUTE_BIT & queue_props[i].queueFlags) {
-        VkDeviceQueueCreateInfo info;
-        info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
-        info.pNext = nullptr;
-        info.flags = 0;
-        info.queueFamilyIndex = i;
-        info.queueCount = 1;
-        info.pQueuePriorities = &priority;
-
-        queue_create_info.push_back(info);
-        // only use the first available queue for now
-        if (queue_create_info.size() == 0) {
-          queue_family_index = i;
-        }
-      }
-    }
-    if (queue_create_info.size() == 0) continue;
+    uint32_t queue_family_index = FindComputeQueue(phy_dev);
+    if (queue_family_index == -1U) continue;
+    float priority = 0.0f;
+
+    VkDeviceQueueCreateInfo queue_create_info;
+    queue_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+    queue_create_info.pNext = nullptr;
+    queue_create_info.flags = 0;
+    queue_create_info.queueFamilyIndex = queue_family_index;
+    queue_create_info.queueCount = 1;
+    queue_create_info.pQueuePriorities = &priority;
 
     VulkanContext ctx;
     // setup context
@@ -554,8 +539,8 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
     device_create_info.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
     device_create_info.pNext = nullptr;
     device_create_info.flags = 0;
-    device_create_info.queueCreateInfoCount = static_cast<uint32_t>(queue_create_info.size());
-    device_create_info.pQueueCreateInfos = queue_create_info.data();
+    device_create_info.queueCreateInfoCount = 1;
+    device_create_info.pQueueCreateInfos = &queue_create_info;
     device_create_info.enabledLayerCount = 0;
     device_create_info.ppEnabledLayerNames = nullptr;
     device_create_info.enabledExtensionCount = extensions.size();
@@ -677,7 +662,34 @@ VulkanDeviceAPI::VulkanDeviceAPI() {
               << "\' phy_dev_id=" << context_[i].phy_device
               << " use_immediate=" << context_[i].UseImmediate();
   }
-}  // namespace vulkan
+}
+
+uint32_t VulkanDeviceAPI::FindComputeQueue(VkPhysicalDevice phy_dev) {
+  uint32_t queue_prop_count = 0;
+  vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, nullptr);
+  std::vector<VkQueueFamilyProperties> queue_props(queue_prop_count);
+  vkGetPhysicalDeviceQueueFamilyProperties(phy_dev, &queue_prop_count, dmlc::BeginPtr(queue_props));
+  // Prefer compute-only queues. On cerain devices supporting this (e.g. Mesa RADV), using
+  // compute-only queues gives better responsiveness for other graphics workload (e.g. desktop).
+  auto compute_dedicated = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) {
+    return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0 &&
+           (VK_QUEUE_GRAPHICS_BIT & prop.queueFlags) == 0;
+  });
+  if (compute_dedicated == queue_props.end()) {
+    auto compute = std::find_if(queue_props.begin(), queue_props.end(), [](auto prop) {
+      return (VK_QUEUE_COMPUTE_BIT & prop.queueFlags) != 0;
+    });
+    if (compute == queue_props.end()) {
+      return -1;
+    } else {
+      return std::distance(queue_props.begin(), compute);
+    }
+  } else {
+    return std::distance(queue_props.begin(), compute_dedicated);
+  }
+}
+
+// namespace vulkan
 class VulkanModuleNode;
 
 // a wrapped function class to get packed func.