[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

JonChesterfield · 2025-03-13T13:26:45Z

Declare a few functions before including the target specific headers then define a fallback_match_{any,all} used by amdgpu and by older nvptx.

Fixes a minor bug on pre-Volta NVPTX targets, where one of the four fallback paths was missing a `__gpu_sync_lane` call.

llvmbot · 2025-03-13T13:27:19Z

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-backend-amdgpu

Author: Jon Chesterfield (JonChesterfield)

Changes

Declare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx.

Full diff: https://github.com/llvm/llvm-project/pull/131141.diff

3 Files Affected:

(modified) clang/lib/Headers/amdgpuintrin.h (+1-18)
(modified) clang/lib/Headers/gpuintrin.h (+47-1)
(modified) clang/lib/Headers/nvptxintrin.h (+2-17)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void);
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x);
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+ 
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+ 
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.

llvmbot · 2025-03-13T13:27:19Z

@llvm/pr-subscribers-clang

Author: Jon Chesterfield (JonChesterfield)

Changes

Declare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx.

Full diff: https://github.com/llvm/llvm-project/pull/131141.diff

3 Files Affected:

(modified) clang/lib/Headers/amdgpuintrin.h (+1-18)
(modified) clang/lib/Headers/gpuintrin.h (+47-1)
(modified) clang/lib/Headers/nvptxintrin.h (+2-17)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void);
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x);
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+ 
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+ 
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.

github-actions · 2025-03-13T13:32:40Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jhuber6 · 2025-03-13T15:27:02Z

clang/lib/Headers/gpuintrin.h

+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);


Suggested change

-__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

jhuber6

LG

llvm-ci · 2025-03-13T16:12:43Z

LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building clang at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/9091

Here is the relevant piece of the build log, for reference:

Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp   -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--

********************

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining (llvm#131141): Declare a few functions before including the target-specific headers, then define a fallback_match_{any,all} used by amdgpu and by older nvptx. Fixes a minor bug on pre-Volta where one of the four fallback paths was missing a sync_lane.

JonChesterfield requested a review from jhuber6 March 13, 2025 13:26

llvmbot added the labels clang (Clang issues not falling into any other category), backend:AMDGPU, backend:X86, and clang:headers (Headers provided by Clang, e.g. for intrinsics) on Mar 13, 2025

JonChesterfield force-pushed the jc_header_refactor branch 2 times, most recently from 4225329 to e1456be Compare March 13, 2025 13:29

JonChesterfield force-pushed the jc_header_refactor branch 2 times, most recently from 28cb801 to cde4232 Compare March 13, 2025 14:07

JonChesterfield changed the title ~~[Headers][NFC] Deduplicate gpu_match_any between targets~~ [Headers][NFC] Deduplicate gpu_match_ between targets via inlining Mar 13, 2025

JonChesterfield force-pushed the jc_header_refactor branch from cde4232 to 5e55b82 Compare March 13, 2025 14:14

jhuber6 reviewed Mar 13, 2025

View reviewed changes

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining

fbeb177

JonChesterfield force-pushed the jc_header_refactor branch from 5e55b82 to fbeb177 Compare March 13, 2025 15:29

jhuber6 approved these changes Mar 13, 2025

View reviewed changes

JonChesterfield merged commit c9d7f70 into llvm:main Mar 13, 2025
6 of 10 checks passed

JonChesterfield deleted the jc_header_refactor branch March 13, 2025 15:44

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

Uh oh!

JonChesterfield commented Mar 13, 2025 •

edited

Loading

Uh oh!

llvmbot commented Mar 13, 2025 •

edited

Loading

Uh oh!

llvmbot commented Mar 13, 2025

Uh oh!

github-actions bot commented Mar 13, 2025 •

edited

Loading

Uh oh!

jhuber6 Mar 13, 2025

Uh oh!

JonChesterfield Mar 13, 2025

Uh oh!

jhuber6 left a comment

Uh oh!

Uh oh!

llvm-ci commented Mar 13, 2025

Uh oh!

Uh oh!

	__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);
	__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

Uh oh!

Conversation

JonChesterfield commented Mar 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Mar 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Mar 13, 2025

Uh oh!

github-actions bot commented Mar 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jhuber6 Mar 13, 2025

Choose a reason for hiding this comment

Uh oh!

JonChesterfield Mar 13, 2025

Choose a reason for hiding this comment

Uh oh!

jhuber6 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Mar 13, 2025

Uh oh!

Uh oh!

JonChesterfield commented Mar 13, 2025 •

edited

Loading

llvmbot commented Mar 13, 2025 •

edited

Loading

github-actions bot commented Mar 13, 2025 •

edited

Loading