Skip to content

Commit c2a6b17

Browse files
authored
Merge pull request #1405 from LLNL/rc-v2022.10.4
Rc v2022.10.4
2 parents a83a448 + 887c9e0 commit c2a6b17

File tree

5 files changed

+31
-11
lines changed

5 files changed

+31
-11
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
1616
# Set version number
1717
set(RAJA_VERSION_MAJOR 2022)
1818
set(RAJA_VERSION_MINOR 10)
19-
set(RAJA_VERSION_PATCHLEVEL 3)
19+
set(RAJA_VERSION_PATCHLEVEL 4)
2020

2121
if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
2222
message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")

RELEASE_NOTES.md

+7
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ Notable changes include:
1919

2020
* Bug fixes/improvements:
2121

22+
Version 2022.10.4 -- Release date 2022-12-14
23+
============================================
24+
25+
This release fixes an issue that was found after the v2022.10.3 release.
26+
27+
* Fixes device alignment bug in workgroups which led to missing symbol errors
28+
with the AMD clang compiler.
2229

2330
Version 2022.10.3 -- Release date 2022-12-01
2431
============================================

docs/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
# The short X.Y version.
8989
version = u'2022.10'
9090
# The full version, including alpha/beta/rc tags.
91-
release = u'2022.10.3'
91+
release = u'2022.10.4'
9292

9393
# The language for content autogenerated by Sphinx. Refer to documentation
9494
# for a list of supported languages.

include/RAJA/config.hpp.in

+20-6
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#define RAJA_config_HPP
3333

3434
#include <utility>
35+
#include <cstddef>
3536
#include <type_traits>
3637

3738
#if defined(_MSVC_LANG)
@@ -239,6 +240,15 @@ static_assert(RAJA_HAS_SOME_CXX14,
239240
#define RAJA_PRAGMA(x) _Pragma(RAJA_STRINGIFY(x))
240241
#endif
241242

243+
244+
/* NOTE: Below we define RAJA_MAX_ALIGN for each compiler, currently it is set as 16 bytes
245+
for all cases, except MSVC. Previously this was set by alignof(std::max_align_t) which, in Clang,
246+
is based on the sizeof(long double). This causes an inconsistency as CUDA/HIP long doubles
247+
are demoted to doubles causing alignof(std::max_align_t) to return 8 bytes on the device and
248+
16 bytes on the host. We therefore set a standard size and ensure validity through a
249+
static_assert.
250+
*/
251+
242252
namespace RAJA {
243253

244254
#if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__)
@@ -374,7 +384,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
374384
//
375385
// Configuration options for Intel compilers
376386
//
377-
387+
#define RAJA_MAX_ALIGN 16
378388
#if defined (RAJA_ENABLE_FORCEINLINE_RECURSIVE)
379389
#define RAJA_FORCEINLINE_RECURSIVE RAJA_PRAGMA(forceinline recursive)
380390
#else
@@ -387,6 +397,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
387397
#define RAJA_INLINE inline __attribute__((always_inline))
388398
#endif
389399

400+
390401
#define RAJA_UNROLL RAJA_PRAGMA(unroll)
391402
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll(N))
392403

@@ -412,9 +423,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
412423
//
413424
// Configuration options for GNU compilers
414425
//
426+
#define RAJA_MAX_ALIGN 16
415427
#define RAJA_FORCEINLINE_RECURSIVE
416428
#define RAJA_INLINE inline __attribute__((always_inline))
417-
418429
#if !defined(__NVCC__)
419430
#define RAJA_UNROLL RAJA_PRAGMA(GCC unroll 10000)
420431
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(GCC unroll N)
@@ -446,11 +457,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
446457
//
447458
// Configuration options for xlc compiler (i.e., bgq/sequoia).
448459
//
460+
#define RAJA_MAX_ALIGN 16
449461
#define RAJA_FORCEINLINE_RECURSIVE
450462
#define RAJA_INLINE inline __attribute__((always_inline))
451463
#define RAJA_UNROLL
452464
#define RAJA_UNROLL_COUNT(N)
453-
454465
// FIXME: alignx is breaking CUDA+xlc
455466
#if defined(RAJA_ENABLE_CUDA)
456467
#define RAJA_ALIGN_DATA(d) d
@@ -476,12 +487,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
476487
//
477488
// Configuration options for clang compilers
478489
//
490+
#define RAJA_MAX_ALIGN 16
479491
#define RAJA_FORCEINLINE_RECURSIVE
480492
#define RAJA_INLINE inline __attribute__((always_inline))
481493
#define RAJA_UNROLL RAJA_PRAGMA(clang loop unroll(enable))
482494
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(clang loop unroll_count(N))
483-
484-
485495
// note that neither the nvcc nor the Apple Clang compiler currently supports
486496
// the __builtin_assume_aligned attribute
487497
#if defined(RAJA_ENABLE_CUDA) || defined(__APPLE__)
@@ -514,7 +524,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
514524

515525
// This is the same as undefined compiler, but squelches the warning message
516526
#elif defined(RAJA_COMPILER_MSVC)
517-
527+
#define RAJA_MAX_ALIGN alignof(std::max_align_t)
518528
#define RAJA_FORCEINLINE_RECURSIVE
519529
#define RAJA_INLINE inline
520530
#define RAJA_ALIGN_DATA(d) d
@@ -526,6 +536,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
526536
#else
527537

528538
#pragma message("RAJA_COMPILER unknown, using default empty macros.")
539+
#define RAJA_MAX_ALIGN 16
529540
#define RAJA_FORCEINLINE_RECURSIVE
530541
#define RAJA_INLINE inline
531542
#define RAJA_ALIGN_DATA(d) d
@@ -536,6 +547,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
536547

537548
#endif
538549

550+
static_assert(RAJA_MAX_ALIGN >= alignof(std::max_align_t) && (RAJA_MAX_ALIGN/alignof(std::max_align_t))*alignof(std::max_align_t) == RAJA_MAX_ALIGN,
551+
"Inconsistent RAJA_MAX_ALIGN size");
552+
539553
#cmakedefine RAJA_HAVE_POSIX_MEMALIGN
540554
#cmakedefine RAJA_HAVE_ALIGNED_ALLOC
541555
#cmakedefine RAJA_HAVE_MM_MALLOC

include/RAJA/pattern/WorkGroup/WorkStruct.hpp

+2-3
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ struct WorkStruct;
4545
* sizeof(GenericWorkStruct) <= sizeof(WorkStruct<size>)
4646
*/
4747
template < typename Dispatcher_T >
48-
using GenericWorkStruct = WorkStruct<alignof(std::max_align_t), Dispatcher_T>;
48+
using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>;
4949

5050
template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
5151
struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
@@ -71,7 +71,6 @@ struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, Call
7171
"WorkStruct and GenericWorkStruct must have obj at the same offset");
7272
static_assert(sizeof(value_type) <= sizeof(true_value_type),
7373
"WorkStruct must not be smaller than GenericWorkStruct");
74-
7574
true_value_type* value_ptr = static_cast<true_value_type*>(ptr);
7675

7776
value_ptr->dispatcher = dispatcher;
@@ -112,7 +111,7 @@ struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, Call
112111

113112
const dispatcher_type* dispatcher;
114113
typename dispatcher_type::invoker_type invoke;
115-
typename std::aligned_storage<size, alignof(std::max_align_t)>::type obj;
114+
typename std::aligned_storage<size, RAJA_MAX_ALIGN>::type obj;
116115
};
117116

118117
} // namespace detail

0 commit comments

Comments
 (0)