intel · steffenlarsen · May 12, 2023 · Apr 18, 2023 · Apr 18, 2023 · Apr 18, 2023
@@ -0,0 +1,49 @@
+# sycl_ext_codeplay_max_registers_per_work_group_query
+
+## Notice
+
+This document describes an **experimental** API that applications can use to try
+out a new feature. Future versions of this API may change in ways that are
+incompatible with this experimental version.
+
+
+## Introduction
+
+This extension adds a new device information descriptor that provides the ability to query a device for the maximum number of registers available per work-group.
+
+OpenCL never offered such a query because it is very platform-specific, which is why it is also absent from SYCL. Now that SYCL supports back-ends where register usage is a limiting factor for the maximum possible work-group size of a kernel, the ability to query that limit is important for writing safe and portable code.
+
+## Feature test macro
+
+As encouraged by the SYCL specification, a feature-test macro, `SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY`, is provided to determine whether this extension is implemented.
+
+## New device descriptor
+
+| Device descriptor                                     | Return type | Description                                                                                                                                                                                                             |
+| ------------------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ext::codeplay::experimental::info::device::max_registers_per_work_group     |  unsigned int      | Returns the maximum number of registers available for use per work-group based on the capability of the device.       |
+
+### Note
+
+## Examples
+
+```c++
+sycl::device gpu = sycl::device{sycl::gpu_selector_v};
+std::cout << gpu.get_info<sycl::info::device::name>() << '\n';
+
+#ifdef SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY
+unsigned int registers_per_group = gpu.get_info<sycl::ext::codeplay::experimental::info::device::max_registers_per_work_group>();
+std::cout << "Max registers per work-group: " << registers_per_group << '\n';
+#endif
+```
+
+Outputs to the console:
+
+Executed using the CUDA back-end on NVIDIA.
+
+```
+NVIDIA ...
+Max registers per work-group: 65536
+```
+
+- See: [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities)
@@ -90,9 +90,11 @@
 // native handles.
 // 12.29 Support PI_EXT_PLATFORM_INFO_BACKEND query in piPlatformGetInfo
 // 12.30 Added PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT device info query.
+// 12.31 Added PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP device
+// info query.
 
 #define _PI_H_VERSION_MAJOR 12
-#define _PI_H_VERSION_MINOR 30
+#define _PI_H_VERSION_MINOR 31
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -350,6 +352,8 @@ typedef enum {
   PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 0x20006,
   PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 0x20007,
   PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 0x20008,
+  // Maximum number of registers per work-group (device specific)
+  PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 0x20009,
 } _pi_device_info;
 
 typedef enum {

@@ -3,6 +3,9 @@
 #define __SYCL_PARAM_TRAITS_TEMPLATE_SPEC __SYCL_PARAM_TRAITS_SPEC
 #endif
 __SYCL_PARAM_TRAITS_SPEC(ext::codeplay::experimental,device, supports_fusion, bool, PI_EXT_CODEPLAY_DEVICE_INFO_SUPPORTS_FUSION)
+__SYCL_PARAM_TRAITS_SPEC(
+    ext::codeplay::experimental, device, max_registers_per_work_group, uint32_t,
+    PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP)
 #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF

@@ -425,6 +425,27 @@ bool getMaxRegistersJitOptionValue(const std::string &build_options,
   return true;
 }
 
+/// Checks whether launching \p kernel with \p blockSize threads per block
+/// would need more registers than a single thread block may use on \p device.
+///
+/// The per-thread register count of the kernel (CU_FUNC_ATTRIBUTE_NUM_REGS) is
+/// multiplied by \p blockSize and compared against the device limit
+/// (CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK). If the registers required
+/// for the entire thread block exceed the hardware limit, the cuLaunchKernel
+/// call fails with a CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error.
+///
+/// \param device    Device that is queried for its limit; must not be null.
+/// \param kernel    Kernel whose register usage is queried; must not be null.
+/// \param blockSize Total number of threads in one thread block.
+/// \return true if the block needs more registers than the device allows.
+bool hasExceededMaxRegistersPerBlock(pi_device device, pi_kernel kernel,
+                                     size_t blockSize) {
+  assert(device);
+  assert(kernel);
+
+  int maxRegsPerBlock{0};
+  PI_CHECK_ERROR(cuDeviceGetAttribute(
+      &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+      device->get()));
+
+  int regsPerThread{0};
+  PI_CHECK_ERROR(cuFuncGetAttribute(&regsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS,
+                                    kernel->get()));
+
+  // The comparison is carried out in size_t; the casts make that explicit.
+  return blockSize * static_cast<size_t>(regsPerThread) >
+         static_cast<size_t>(maxRegsPerBlock);
+}
+
 } // anonymous namespace
 
 /// ------ Error handling, matching OpenCL plugin semantics.
@@ -2111,6 +2132,21 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                             false);
   }
 
+  case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: {
+    // Maximum number of 32-bit registers available to a thread block.
+    // Note: This number is shared by all thread blocks simultaneously resident
+    // on a multiprocessor.
+    int max_registers{-1};
+    PI_CHECK_ERROR(cuDeviceGetAttribute(
+        &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+        device->get()));
+
+    sycl::detail::pi::assertion(max_registers >= 0);
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<uint32_t>(max_registers));
+  }
+
     // TODO: Investigate if this information is available on CUDA.
   case PI_DEVICE_INFO_PCI_ADDRESS:
   case PI_DEVICE_INFO_GPU_EU_COUNT:
@@ -3218,10 +3254,18 @@ pi_result cuda_piEnqueueKernelLaunch(
           return PI_SUCCESS;
         };
 
+        // The number of work-items in a work-group is the product of the
+        // local sizes over all dimensions, so start from the multiplicative
+        // identity.
+        size_t kernelLocalWorkGroupSize = 1;
         for (size_t dim = 0; dim < work_dim; dim++) {
           auto err = isValid(dim);
           if (err != PI_SUCCESS)
             return err;
+          // This dimension is valid: fold it into the work-group size.
+          kernelLocalWorkGroupSize *= local_work_size[dim];
+        }
+
+        // Reject launches whose register demand for one work-group exceeds
+        // what the device offers per block.
+        if (hasExceededMaxRegistersPerBlock(command_queue->device_, kernel,
+                                            kernelLocalWorkGroupSize)) {
+          return PI_ERROR_INVALID_WORK_GROUP_SIZE;
         }
       } else {
         guessLocalWorkSize(command_queue->device_, threadsPerBlock,

@@ -1970,6 +1970,22 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
                             false);
   }
 
+  case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: {
+    // Maximum number of 32-bit registers available to a thread block.
+    // Note: This number is shared by all thread blocks simultaneously resident
+    // on a multiprocessor.
+    int max_registers{-1};
+    sycl::detail::pi::assertion(
+        hipDeviceGetAttribute(&max_registers,
+                              hipDeviceAttributeMaxRegistersPerBlock,
+                              device->get()) == hipSuccess);
+
+    sycl::detail::pi::assertion(max_registers >= 0);
+
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<uint32_t>(max_registers));
+  }
+
   // TODO: Investigate if this information is available on HIP.
   case PI_DEVICE_INFO_PCI_ADDRESS:
   case PI_DEVICE_INFO_GPU_EU_COUNT:

@@ -780,6 +780,22 @@ struct get_device_info_impl<
   }
 };
 
+// Specialization for max registers per work-group.
+// Forwards the query to the plugin's piDeviceGetInfo entry point and returns
+// the value written back by the plugin.
+template <>
+struct get_device_info_impl<
+    uint32_t,
+    ext::codeplay::experimental::info::device::max_registers_per_work_group> {
+  static uint32_t get(const DeviceImplPtr &Dev) {
+    using Descriptor = ext::codeplay::experimental::info::device::
+        max_registers_per_work_group;
+    // Value-initialized so that an indeterminate value is never returned.
+    uint32_t MaxRegsPerWG{0};
+    Dev->getPlugin().call<PiApiKind::piDeviceGetInfo>(
+        Dev->getHandleRef(), PiInfoCode<Descriptor>::value,
+        sizeof(MaxRegsPerWG), &MaxRegsPerWG, nullptr);
+    return MaxRegsPerWG;
+  }
+};
+
 template <typename Param>
 typename Param::return_type get_device_info(const DeviceImplPtr &Dev) {
   static_assert(is_device_info_desc<Param>::value,
@@ -1660,6 +1676,14 @@ inline bool get_device_info_host<
   return false;
 }
 
+// Host-device variant of the max_registers_per_work_group query: the query is
+// not available there, so it always throws a runtime_error carrying
+// PI_ERROR_INVALID_DEVICE.
+template <>
+inline uint32_t get_device_info_host<
+    ext::codeplay::experimental::info::device::max_registers_per_work_group>() {
+  const char *const Reason =
+      "Obtaining the maximum number of available registers per "
+      "work-group is not supported on HOST device";
+  throw runtime_error(Reason, PI_ERROR_INVALID_DEVICE);
+}
+
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
@@ -53,6 +53,7 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
   bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
   bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
   bool IsLevelZero = false;   // Backend is any OneAPI Level 0 version
+  bool IsCuda = false;        // Backend is CUDA
   auto Backend = Platform.get_backend();
   if (Backend == sycl::backend::opencl) {
     std::string VersionString =
@@ -63,6 +64,8 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
         (VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
   } else if (Backend == sycl::backend::ext_oneapi_level_zero) {
     IsLevelZero = true;
+  } else if (Backend == sycl::backend::ext_oneapi_cuda) {
+    IsCuda = true;
   }
 
   size_t CompileWGSize[3] = {0};
@@ -237,6 +240,46 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
           // else unknown.  fallback (below)
         }
       }
+    } else if (IsCuda) {
+      // CUDA:
+      // PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the registers the
+      // kernel requires for the launch configuration exceed the maximum number
+      // of registers per block
+      // (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP), i.e. when
+      // local_work_size[0] * ... * local_work_size[work_dim - 1] multiplied by
+      // PI_KERNEL_GROUP_INFO_NUM_REGS is greater than that value. See "Table
+      // 15: Technical Specifications per Compute Capability" in the CUDA C++
+      // Programming Guide for the limits.
+      const size_t TotalNumberOfWIs =
+          NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
+
+      uint32_t NumRegisters = 0;
+      Plugin.call<PiApiKind::piKernelGetGroupInfo>(
+          Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
+          &NumRegisters, nullptr);
+
+      uint32_t MaxRegistersPerBlock =
+          DeviceImpl.get_info<ext::codeplay::experimental::info::device::
+                                  max_registers_per_work_group>();
+
+      const bool HasExceededAvailableRegisters =
+          TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
+
+      if (HasExceededAvailableRegisters) {
+        std::string message(
+            "Exceeded the number of registers available on the hardware.\n");
+        throw sycl::nd_range_error(
+            // Additional information which can be helpful to the user.
+            message.append(
+                "\tThe number registers per work-group cannot exceed " +
+                std::to_string(MaxRegistersPerBlock) +
+                " for this kernel on this device.\n"
+                "\tThe kernel uses " +
+                std::to_string(NumRegisters) +
+                " registers per work-item for a total of " +
+                std::to_string(TotalNumberOfWIs) +
+                " work-items per work-group.\n"),
+            PI_ERROR_INVALID_WORK_GROUP_SIZE);
+      }
     } else {
       // TODO: Decide what checks (if any) we need for the other backends
     }

@@ -81,6 +81,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 #define SYCL_EXT_CODEPLAY_KERNEL_FUSION 1
 #endif
 #define SYCL_EXT_INTEL_CACHE_CONFIG 1
+#define SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY 1
 
 #ifndef __has_include
 #define __has_include(x) 0

@@ -0,0 +1,24 @@
+// REQUIRES: cuda || hip
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/sycl.hpp>
+
+// Queries max_registers_per_work_group on the default queue's device and
+// verifies that the reported limit is non-zero.
+// Returns 0 on success and 1 if the device reports zero registers.
+int main() {
+  sycl::queue q;
+  sycl::device dev = q.get_device();
+
+#if !defined(SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY)
+#error SYCL_EXT_CODEPLAY_MAX_REGISTERS_PER_WORK_GROUP_QUERY is not defined!
+#endif
+
+  const auto max_regs_per_wg =
+      dev.get_info<sycl::ext::codeplay::experimental::info::device::
+                       max_registers_per_work_group>();
+  std::cout << "Max register per work-group: " << max_regs_per_wg << std::endl;
+
+  // Checked explicitly rather than with assert() so that the test still fails
+  // when it is built with NDEBUG.
+  if (max_regs_per_wg == 0) {
+    std::cerr << "Failed: device reported zero registers per work-group"
+              << std::endl;
+    return 1;
+  }
+
+  std::cout << "Passed!" << std::endl;
+  return 0;
+}