Skip to content

Commit

Permalink
[ROCm] Fixes the kernel asserts API declaration mismatch error (pytor…
Browse files Browse the repository at this point in the history
…ch#81790)

This problem updates the the PR [pytorch#73040](pytorch#73040)

The compilation error in pyTorch with ROCm is successful with these changes when `NDEBUG` is enabled.

Solution:
For HIP we keep `__device__ __assert_fail()`
and for host side compilation we want to use the `__assert_fail()` from the glibc library.

Tested the code by compiling with below steps
```
python3 tools/amd_build/build_amd.py
python3 setup.py develop --cmake-only
cmake -DHIP_HIPCC_FLAGS_RELEASE="-DNDEBUG" build
cmake --build build
```

The UT test_fixed_cuda_assert_async is still skipped due performance overhead.

cc @jithunnair-amd

Pull Request resolved: pytorch#81790
Approved by: https://github.com/shintaro-iwasaki, https://github.com/jeffdaily, https://github.com/malfet
  • Loading branch information
pruthvistony authored and pytorchmergebot committed Aug 16, 2022
1 parent b156f33 commit 8473e69
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 14 deletions.
34 changes: 23 additions & 11 deletions c10/macros/Macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,9 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
// CUDA_KERNEL_ASSERT checks the assertion
// even when NDEBUG is defined. This is useful for important assertions in CUDA
// code that would otherwise be suppressed when building Release.
#if defined(__ANDROID__) || defined(__APPLE__) || defined(USE_ROCM)
#if defined(__ANDROID__) || defined(__APPLE__) || \
(defined(USE_ROCM) && ROCM_VERSION < 40100) || \
(defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
// Those platforms do not support assert()
#define CUDA_KERNEL_ASSERT(cond)
#elif defined(_MSC_VER)
Expand Down Expand Up @@ -361,22 +363,32 @@ extern SYCL_EXTERNAL void __assert_fail(
const char* func);
#else // __SYCL_DEVICE_ONLY__
#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
// CUDA supports __assert_fail function which are common for both device
// and host side code.
__host__ __device__
#endif // __CUDA_ARCH__
#endif

// This forward declaration matching the declaration of __assert_fail
// exactly how it is in glibc in case parts of the program are compiled with
// different NDEBUG settings. Otherwise we might get 'ambiguous declaration'
// error. Note: On ROCm - this declaration serves for host side compilation.
void
__assert_fail(
const char* assertion,
const char* file,
unsigned int line,
const char* function) throw()
// We match the declaration of __assert_fail exactly how it is in glibc in case
// parts of the program are compiled with different NDEBUG settings. Otherwise
// we might get 'ambiguous declaration' error.
#ifdef __GNUC__
__attribute__((__noreturn__))
#endif
;
#endif
const char* function) throw() __attribute__((__noreturn__));

#if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
!defined(ROCM_DISABLE_GPU_ASSERTS)
// ROCm supports __assert_fail only as a device side function.
__device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
const char* assertion,
const char* file,
unsigned int line,
const char* function);
#endif // defined(__HIP_ARCH__) || defined(__HIP__)
#endif // __SYCL_DEVICE_ONLY__
}
#endif // NDEBUG
#define CUDA_KERNEL_ASSERT(cond) \
Expand Down
17 changes: 15 additions & 2 deletions cmake/public/LoadHIP.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
# Add HIP to the CMAKE Module Path
set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})

#Disable kernel assert due to performance regression
set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")

macro(find_package_and_print_version PACKAGE_NAME)
find_package("${PACKAGE_NAME}" ${ARGN})
message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
Expand Down Expand Up @@ -283,8 +286,18 @@ if(HIP_FOUND)
find_package_and_print_version(hipcub REQUIRED)
find_package_and_print_version(rocthrust REQUIRED)

# Disable Asserts In Code (Can't use asserts on HIP stack.)
add_definitions(-DNDEBUG)
if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
if(ROCM_ENABLE_KERNEL_ASSERTS)
message("ROCm version >= 4.1; enabling asserts")
else()
add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
message("ROCm version >= 4.1; kernel asserts are disabled")
endif()
else()
# Disable Asserts In Code (Can't use asserts on HIP stack.)
add_definitions(-DNDEBUG)
message("ROCm version < 4.1; disablng asserts")
endif()

if(HIP_COMPILER STREQUAL clang)
set(hip_library_name amdhip64)
Expand Down
2 changes: 1 addition & 1 deletion test/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -1979,7 +1979,7 @@ def worker(rank):
t2.start()
"""])

@unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts")
@unittest.skipIf(TEST_WITH_ROCM, "In ROCm, kernel asserts are disabled due to performance overhead")
def test_fixed_cuda_assert_async(self):
with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with no values is ambiguous"):
torch._assert_async(torch.tensor([], device="cuda"))
Expand Down

0 comments on commit 8473e69

Please sign in to comment.