[ROCm] Fixes the kernel asserts API declaration mismatch error (pytor…

…ch#81790) This problem updates the the PR [pytorch#73040](pytorch#73040) The compilation error in pyTorch with ROCm is successful with these changes when `NDEBUG` is enabled. Solution: For HIP we keep `__device__ __assert_fail()` and for host side compilation we want to use the `__assert_fail()` from the glibc library. Tested the code by compiling with below steps ``` python3 tools/amd_build/build_amd.py python3 setup.py develop --cmake-only cmake -DHIP_HIPCC_FLAGS_RELEASE="-DNDEBUG" build cmake --build build ``` The UT test_fixed_cuda_assert_async is still skipped due performance overhead. cc @jithunnair-amd Pull Request resolved: pytorch#81790 Approved by: https://github.com/shintaro-iwasaki, https://github.com/jeffdaily, https://github.com/malfet
lcy-seso · Aug 16, 2022 · 8473e69 · 8473e69
1 parent b156f33
commit 8473e69
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 14 deletions.
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
@@ -332,7 +332,9 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that would otherwise be suppressed when building Release.
-#if defined(__ANDROID__) || defined(__APPLE__) || defined(USE_ROCM)
+#if defined(__ANDROID__) || defined(__APPLE__) ||  \
+    (defined(USE_ROCM) && ROCM_VERSION < 40100) || \
+    (defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #elif defined(_MSC_VER)
@@ -361,22 +363,32 @@ extern SYCL_EXTERNAL void __assert_fail(
     const char* func);
 #else // __SYCL_DEVICE_ONLY__
 #if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+// CUDA supports __assert_fail function which are common for both device
+// and host side code.
 __host__ __device__
-#endif // __CUDA_ARCH__
+#endif
+
+    // This forward declaration matching the declaration of __assert_fail
+    // exactly how it is in glibc in case parts of the program are compiled with
+    // different NDEBUG settings. Otherwise we might get 'ambiguous declaration'
+    // error. Note: On ROCm - this declaration serves for host side compilation.
     void
     __assert_fail(
         const char* assertion,
         const char* file,
         unsigned int line,
-        const char* function) throw()
-// We match the declaration of __assert_fail exactly how it is in glibc in case
-// parts of the program are compiled with different NDEBUG settings. Otherwise
-// we might get 'ambiguous declaration' error.
-#ifdef __GNUC__
-        __attribute__((__noreturn__))
-#endif
-        ;
-#endif
+        const char* function) throw() __attribute__((__noreturn__));
+
+#if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
+    !defined(ROCM_DISABLE_GPU_ASSERTS)
+// ROCm supports __assert_fail only as a device side function.
+__device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
+    const char* assertion,
+    const char* file,
+    unsigned int line,
+    const char* function);
+#endif // defined(__HIP_ARCH__) || defined(__HIP__)
+#endif // __SYCL_DEVICE_ONLY__
 }
 #endif // NDEBUG
 #define CUDA_KERNEL_ASSERT(cond)                                         \

diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
@@ -143,6 +143,9 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
 
+#Disable kernel assert due to performance regression
+set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")
+
 macro(find_package_and_print_version PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" ${ARGN})
   message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -283,8 +286,18 @@ if(HIP_FOUND)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
 
-  # Disable Asserts In Code (Can't use asserts on HIP stack.)
-  add_definitions(-DNDEBUG)
+  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
+    if(ROCM_ENABLE_KERNEL_ASSERTS)
+      message("ROCm version >= 4.1; enabling asserts")
+    else()
+      add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
+      message("ROCm version >= 4.1; kernel asserts are disabled")
+    endif()
+  else()
+    # Disable Asserts In Code (Can't use asserts on HIP stack.)
+    add_definitions(-DNDEBUG)
+    message("ROCm version < 4.1; disablng asserts")
+  endif()
 
   if(HIP_COMPILER STREQUAL clang)
     set(hip_library_name amdhip64)

diff --git a/test/test_cuda.py b/test/test_cuda.py
@@ -1979,7 +1979,7 @@ def worker(rank):
 t2.start()
 """])
 
-    @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts")
+    @unittest.skipIf(TEST_WITH_ROCM, "In ROCm, kernel asserts are disabled due to performance overhead")
     def test_fixed_cuda_assert_async(self):
         with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with no values is ambiguous"):
             torch._assert_async(torch.tensor([], device="cuda"))