Skip to content

[Headers][NFC] Deduplicate gpu_match_ between targets via inlining #131141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 13, 2025

Conversation

JonChesterfield
Copy link
Collaborator

@JonChesterfield JonChesterfield commented Mar 13, 2025

Declare a few functions before including the target specific headers then define a fallback_match_{any,all} used by amdgpu and by older nvptx.

Fixes a minor bug on pre-volta where one of the four fallback paths was missing a sync_lane.

@JonChesterfield JonChesterfield requested a review from jhuber6 March 13, 2025 13:26
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:AMDGPU backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics labels Mar 13, 2025
@llvmbot
Copy link
Member

llvmbot commented Mar 13, 2025

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-backend-amdgpu

Author: Jon Chesterfield (JonChesterfield)

Changes

Declare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx.


Full diff: https://github.com/llvm/llvm-project/pull/131141.diff

3 Files Affected:

  • (modified) clang/lib/Headers/amdgpuintrin.h (+1-18)
  • (modified) clang/lib/Headers/gpuintrin.h (+47-1)
  • (modified) clang/lib/Headers/nvptxintrin.h (+2-17)
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+ 
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+ 
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.

@llvmbot
Copy link
Member

llvmbot commented Mar 13, 2025

@llvm/pr-subscribers-clang

Author: Jon Chesterfield (JonChesterfield)

Changes

Declare a few functions before including the target specific headers then define a fallback_match_any, used by amdgpu and by older nvptx.


Full diff: https://github.com/llvm/llvm-project/pull/131141.diff

3 Files Affected:

  • (modified) clang/lib/Headers/amdgpuintrin.h (+1-18)
  • (modified) clang/lib/Headers/gpuintrin.h (+47-1)
  • (modified) clang/lib/Headers/nvptxintrin.h (+2-17)
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..74054068c9714 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,20 +142,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
-
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  __gpu_sync_lane(__lane_mask);
-  return __match_mask;
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..e4a9a49e10e1f 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,52 @@ _Pragma("push_macro(\"bool\")");
 #define bool _Bool
 #endif
 
+
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Returns the bit-mask of active threads in the current warp or wavefront.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
+
+// Returns a bitmask of threads in the current lane for which \p x is true.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
+                                                          bool __x);
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint32_t
+__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
+
+
+// Copies the value from the first active thread to the rest.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
+
+ 
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+  uint32_t __match_mask = 0;
+
+  bool __done = 0;
+  while (__gpu_ballot(__lane_mask, !__done)) {
+    if (!__done) {
+      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+      if (__first == __x) {
+        __match_mask = __gpu_lane_mask();
+        __done = 1;
+      }
+    }
+  }
+  __gpu_sync_lane(__lane_mask);
+  return __match_mask;
+}
+
+ 
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
+
 #if defined(__NVPTX__)
 #include <nvptxintrin.h>
 #elif defined(__AMDGPU__)
@@ -115,7 +161,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..1da9402040b52 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
 // Attribute to declare a function as a kernel.
 #define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
 
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
 // Returns the number of CUDA blocks in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
   return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
   // Newer targets can use the dedicated CUDA support.
 #if __CUDA_ARCH__ >= 700
   return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+  return __gpu_fallback_match_any_u32(__lane_mask, __x);
 #endif
-
-  uint32_t __match_mask = 0;
-  bool __done = 0;
-  while (__gpu_ballot(__lane_mask, !__done)) {
-    if (!__done) {
-      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
-      if (__first == __x) {
-        __match_mask = __gpu_lane_mask();
-        __done = 1;
-      }
-    }
-  }
-  return __match_mask;
 }
 
 // Returns a bitmask marking all lanes that have the same value of __x.

@JonChesterfield JonChesterfield force-pushed the jc_header_refactor branch 2 times, most recently from 4225329 to e1456be Compare March 13, 2025 13:29
Copy link

github-actions bot commented Mar 13, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@JonChesterfield JonChesterfield force-pushed the jc_header_refactor branch 2 times, most recently from 28cb801 to cde4232 Compare March 13, 2025 14:07
@JonChesterfield JonChesterfield changed the title [Headers][NFC] Deduplicate gpu_match_any between targets [Headers][NFC] Deduplicate gpu_match_ between targets via inlining Mar 13, 2025

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
__gpu_fallback_match_any_u32(uint64_t __lane_mask, uint32_t __x);
__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, done

Copy link
Contributor

@jhuber6 jhuber6 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LG

@JonChesterfield JonChesterfield merged commit c9d7f70 into llvm:main Mar 13, 2025
6 of 10 checks passed
@JonChesterfield JonChesterfield deleted the jc_header_refactor branch March 13, 2025 15:44
@llvm-ci
Copy link
Collaborator

llvm-ci commented Mar 13, 2025

LLVM Buildbot has detected a new failure on builder openmp-s390x-linux running on systemz-1 while building clang at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/88/builds/9091

Here is the relevant piece of the build log for the reference
Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp   -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/./bin/clang -fopenmp -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test -L /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -mbackchain -I /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/ompt /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# executed command: /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--

********************


frederik-h pushed a commit to frederik-h/llvm-project that referenced this pull request Mar 18, 2025
…lvm#131141)

Declare a few functions before including the target specific headers
then define a fallback_match_{any,all} used by amdgpu and by older
nvptx.

Fixes a minor bug on pre-volta where one of the four fallback paths was
missing a sync_lane.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU backend:X86 clang:headers Headers provided by Clang, e.g. for intrinsics clang Clang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants