From 19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Mon, 5 Aug 2024 12:28:13 -0600
Subject: [PATCH] Fix isnan namespace qualification in cutlass/functional.h
 (#1679)

* Fix unrelated MSVC build warnings

* Fix use of isnan in functional.h

Correct namespace qualification of isnan in functional.h so that it
invokes cutlass::isnan for half_t, instead of converting half_t to
float and invoking std::isnan (on host, or ::isnan on device).
---
 include/cutlass/detail/helper_macros.hpp   | 38 ++++++++++
 .../sm90_epilogue_tma_warpspecialized.hpp  |  5 +-
 include/cutlass/functional.h               | 69 ++++++++---------
 test/unit/core/functional.cu               | 76 +++++++++++++++++++
 4 files changed, 152 insertions(+), 36 deletions(-)

diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp
index 926ccc7308..4cd895f147 100644
--- a/include/cutlass/detail/helper_macros.hpp
+++ b/include/cutlass/detail/helper_macros.hpp
@@ -95,6 +95,44 @@ CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &)
 #endif
 #endif
 
+// CUTLASS_CMATH_NAMESPACE is the namespace where code can find
+// functions like isnan and log. Such functions are in
+// the std namespace in host code, but in the global namespace
+// in device code.
+//
+// The intended use case for this macro is in "using" declarations
+// for making argument-dependent lookup (ADL) work in generic code.
+// For example, if T is cutlass::half_t, the following code will
+// invoke cutlass::isnan(half_t). If T is float, it will invoke
+// std::isnan on host and ::isnan on device. (CUTLASS's support
+// for NVRTC prevents it from using things in the std namespace
+// in device code.) Correct use of "using" declarations can help
+// avoid unexpected implicit conversions, like from half_t to float.
+//
+// template<class T>
+// bool foo(T x) {
+//   using CUTLASS_CMATH_NAMESPACE :: isnan;
+//   return isnan(x);
+// }
+//
+// Without this macro, one would need to write the following.
+//
+// template<class T>
+// bool foo(T x) {
+// #if defined(__CUDA_ARCH__)
+//   using ::isnan;
+// #else
+//   using std::isnan;
+// #endif
+//   return isnan(x);
+// }
+
+#if defined(__CUDA_ARCH__)
+#  define CUTLASS_CMATH_NAMESPACE
+#else
+#  define CUTLASS_CMATH_NAMESPACE std
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {

diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
index 56b55292a8..4b2157b6c7 100644
--- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
@@ -670,7 +670,8 @@ class CollectiveEpilogue<
     // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions
     // Sync requirements of smem reuse may preclude this optimization
     // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD
-    int epi_m_prev = 0, epi_n_prev = 0;
+    [[maybe_unused]] int epi_m_prev = 0;
+    [[maybe_unused]] int epi_n_prev = 0;
     static_assert(not (DelayTmaStore and ReuseSmemC and StagesC == StagesD), "This TMA epilogue configuration will deadlock");
 
     // The TMA store sequence for one subtile iteration
@@ -725,7 +726,7 @@ class CollectiveEpilogue<
     for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) {
       CUTLASS_PRAGMA_UNROLL
       for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) {
-        bool is_first_iteration = epi_m == 0 && epi_n == 0;
+        [[maybe_unused]] bool is_first_iteration = epi_m == 0 && epi_n == 0;
         bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1;
 
         if (subtile_idx != -1 && (epi_n * static_cast<int>(size<2>(gD_epi)) + epi_m) != subtile_idx) {

diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index da946c7c5f..65e49d5290 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -369,11 +369,14 @@ template <typename T>
 struct maximum<T, true> {
   CUTLASS_HOST_DEVICE
   T operator()(T const &lhs, T const &rhs) const {
-#if defined(__CUDA_ARCH__)
-    return lhs > rhs or ::isnan(lhs) ? lhs : rhs;
-#else
-    return lhs > rhs or std::isnan(lhs) ? lhs : rhs;
-#endif
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+
+    // Call isnan unqualified, so argument-dependent lookup (ADL)
+    // will find overloads such as cutlass::isnan(half_t).
+    // Calling ::isnan or std::isnan directly would force
+    // implicit conversions to float of custom number types
+    // in the cutlass namespace (e.g., cutlass::half_t).
+    return lhs > rhs || isnan(lhs) ? lhs : rhs;
   }
 };
 
@@ -389,15 +392,14 @@ template <>
 struct maximum<float, true> {
   CUTLASS_HOST_DEVICE
   float operator()(float const lhs, float const rhs) const {
-    float res;
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+    float res;
     asm volatile("max.NaN.f32 %0, %1, %2;\n" : "=f"(res) : "f"(lhs), "f"(rhs));
-#elif defined(__CUDA_ARCH__)
-    res = lhs > rhs or ::isnan(lhs) ? lhs : rhs;
+    return res;
 #else
-    res = lhs > rhs or std::isnan(lhs) ? lhs : rhs;
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+    return lhs > rhs || isnan(lhs) ? lhs : rhs;
 #endif
-    return res;
   }
 };
 
@@ -427,11 +429,9 @@ template <typename T>
 struct minimum<T, true> {
   CUTLASS_HOST_DEVICE
   T operator()(T const &lhs, T const &rhs) const {
-#if defined(__CUDA_ARCH__)
-    return lhs < rhs or ::isnan(lhs) ? lhs : rhs;
-#else
-    return lhs < rhs or std::isnan(lhs) ? lhs : rhs;
-#endif
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+
+    return lhs < rhs || isnan(lhs) ? lhs : rhs;
   }
 };
 
@@ -512,6 +512,8 @@ template <typename A, typename B = A, typename C = A>
 struct guarded_multiply_add {
   CUTLASS_HOST_DEVICE
   C operator()(A const &a, B const &b, C const &c) const {
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+
     if (isnan(a) || isnan(b)) {
       return C(0);
     }
@@ -531,7 +533,10 @@ struct guarded_multiply_add<half_t, half_t, half_t> {
       : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
     return result;
 #else
-    if (isnan(a) || isnan(b)) {
+    // Namespace-qualifying isnan as cutlass::isnan saves the compiler
+    // the trouble of argument-dependent lookup. Calling std::isnan or
+    // ::isnan here would result in unwanted implicit conversion to float.
+    if (cutlass::isnan(a) || cutlass::isnan(b)) {
       return half_t(0);
     }
     return a * b + c;
@@ -544,13 +549,9 @@ template <typename A, typename B = A, typename C = A>
 struct guarded_multiply_add_relu0 {
   CUTLASS_HOST_DEVICE
   C operator()(A const &a, B const &b, C const &c) const {
-    if (
-#if defined(__CUDA_ARCH__)
-    ::isnan(a) || ::isnan(b)
-#else
-    std::isnan(a) || std::isnan(b)
-#endif
-    ) {
+    using CUTLASS_CMATH_NAMESPACE :: isnan;
+
+    if (isnan(a) || isnan(b)) {
       return C(0);
     }
     maximum<C> mx;
@@ -569,13 +570,7 @@ struct guarded_multiply_add_relu0<half_t, half_t, half_t> {
       : "h"(*reinterpret_cast<uint16_t const*>(&a)), "h"(*reinterpret_cast<uint16_t const*>(&b)), "h"(*reinterpret_cast<uint16_t const*>(&c)));
     return result;
 #else
-    if (
-#if defined(__CUDA_ARCH__)
-    ::isnan(a) || ::isnan(b)
-#else
-    std::isnan(a) || std::isnan(b)
-#endif
-    ) {
+    if (cutlass::isnan(a) || cutlass::isnan(b)) {
       return half_t(0);
     }
     maximum<half_t> mx;
@@ -782,6 +777,10 @@ struct atomic_add {
 #if defined(__CUDA_ARCH__)
     atomicAdd(ptr, data);
+#else
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(data);
+    CUTLASS_NOT_IMPLEMENTED();
 #endif
   }
 };
 
@@ -793,8 +792,9 @@ struct atomic_add<double>
   void operator()(double *ptr, const double &data)
   {
 #if !defined(__CUDA_ARCH__)
-      CUTLASS_UNUSED(ptr);
-      CUTLASS_UNUSED(data);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(data);
+    CUTLASS_NOT_IMPLEMENTED();
 #elif (__CUDA_ARCH__ >= 600)
     atomicAdd(ptr, data);
 #else
@@ -819,8 +819,9 @@ struct atomic_add<half2>
   void operator()(half2 *ptr, const half2 &data)
   {
 #if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))
-      CUTLASS_UNUSED(ptr);
-      CUTLASS_UNUSED(data);
+    CUTLASS_UNUSED(ptr);
+    CUTLASS_UNUSED(data);
+    CUTLASS_NOT_IMPLEMENTED();
 #else
     // Vector-2 atomic reduction requires .target sm_60 or higher
     uint32_t word = reinterpret_cast<const uint32_t&>(data);

diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu
index 174a089555..2b914b255f 100644
--- a/test/unit/core/functional.cu
+++ b/test/unit/core/functional.cu
@@ -491,4 +491,80 @@ TEST(Functional, multiply_add_quaternion_f32) {
   Functional_multiply_add_QuaternionT<float>();
 }
 
+namespace cutlass_test {
+
+__global__ void
+test_cutlass_maximum(cutlass::half_t const* in1, cutlass::half_t const* in2, cutlass::half_t* out)
+{
+  {
+    constexpr bool propagate_NaN = true;
+    cutlass::maximum<cutlass::half_t, propagate_NaN> op;
+    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0
+        && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
+      *out = op(*in1, *in2);
+    }
+  }
+  {
+    constexpr bool propagate_NaN = false;
+    cutlass::maximum<cutlass::half_t, propagate_NaN> op;
+    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0
+        && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0) {
+      *out = op(*in1, *in2);
+    }
+  }
+}
+
+} // cutlass_test
+
+// Test compilation on both host and device.
+TEST(Functional, maximum_half_host_propagate_NaN) {
+  constexpr bool propagate_NaN = true;
+  cutlass::maximum<cutlass::half_t, propagate_NaN> op;
+  cutlass::half_t x(1.0f);
+  cutlass::half_t y(2.0f);
+
+  auto result = op(x, y);
+  static_assert(std::is_same_v<decltype(result), cutlass::half_t>);
+  EXPECT_EQ(result, y);
+  result = op(y, x);
+  EXPECT_EQ(result, y);
+}
+
+TEST(Functional, maximum_half_host_dont_propagate_NaN) {
+  constexpr bool propagate_NaN = false;
+  cutlass::maximum<cutlass::half_t, propagate_NaN> op;
+  cutlass::half_t x(1.0f);
+  cutlass::half_t y(2.0f);
+
+  auto result = op(x, y);
+  static_assert(std::is_same_v<decltype(result), cutlass::half_t>);
+  EXPECT_EQ(result, y);
+  result = op(y, x);
+  EXPECT_EQ(result, y);
+}
+
+TEST(Functional, maximum_half_device) {
+  using Tensor = cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor>;
+
+  Tensor in1({1, 1});
+  Tensor in2({1, 1});
+  Tensor out({1, 1});
+  in1.host_data()[0] = cutlass::half_t(1.0f);
+  in2.host_data()[0] = cutlass::half_t(2.0f);
+  out.host_data()[0] = cutlass::half_t(0.0f);
+
+  in1.sync_device();
+  in2.sync_device();
+  out.sync_device();
+
+  cutlass_test::test_cutlass_maximum<<< 1, 1 >>>(
+    in1.device_data(),
+    in2.device_data(),
+    out.device_data()
+  );
+  out.sync_host();
+
+  EXPECT_EQ(out.host_data()[0], 2.0f);
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
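For illustration only, not part of the patch: a minimal host-only sketch of the using-declaration-plus-ADL pattern that CUTLASS_CMATH_NAMESPACE enables. The names toy::my_half, CMATH_NAMESPACE, and generic_isnan are illustrative stand-ins (for cutlass::half_t, CUTLASS_CMATH_NAMESPACE, and generic CUTLASS code); it assumes any C++17 host compiler and needs no CUTLASS headers.

// adl_isnan_sketch.cpp -- illustrative stand-in, assumptions noted above.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

namespace toy {

// Stand-in for cutlass::half_t: raw fp16 bits plus an implicit
// conversion to float (the conversion the patch wants to avoid).
struct my_half {
  std::uint16_t storage;
  operator float() const { return 0.0f; }  // decoding omitted in this sketch
};

// Stand-in for cutlass::isnan(half_t): inspects the bits directly,
// so no conversion to float is involved.
inline bool isnan(my_half h) {
  return (h.storage & 0x7C00u) == 0x7C00u && (h.storage & 0x03FFu) != 0u;
}

} // namespace toy

// Host-only stand-in for CUTLASS_CMATH_NAMESPACE.
#define CMATH_NAMESPACE std

template <class T>
bool generic_isnan(T x) {
  // Unqualified call plus using-declaration: ADL finds toy::isnan(my_half)
  // as an exact match for my_half, while float still resolves to std::isnan.
  using CMATH_NAMESPACE :: isnan;
  return isnan(x);
}

int main() {
  toy::my_half h{0x7E00u};  // an fp16 NaN bit pattern
  float f = std::numeric_limits<float>::quiet_NaN();
  std::printf("half is NaN:  %d\n", static_cast<int>(generic_isnan(h)));  // 1, via toy::isnan
  std::printf("float is NaN: %d\n", static_cast<int>(generic_isnan(f)));  // 1, via std::isnan
  return 0;
}

Had generic_isnan called std::isnan or ::isnan directly, overload resolution could only reach the float overload through my_half's implicit conversion, which is exactly the half_t-to-float conversion the patch removes.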