Skip to content

[AMDGPU] Fix backward compatibility kernarg preload prolog base offset #201355

Merged
aobolensk merged 3 commits into
llvm:main from
aobolensk:llvm-amdgpu-kernel-offset-nonhsa
Jun 3, 2026
Merged

[AMDGPU] Fix backward compatibility kernarg preload prolog base offset #201355
aobolensk merged 3 commits into
llvm:main from
aobolensk:llvm-amdgpu-kernel-offset-nonhsa

Conversation

@aobolensk
Copy link
Copy Markdown
Contributor

The backward-compatibility preload prolog loaded arguments starting at byte 0 of the kernarg segment, but on non-AMDHSA triples the explicit arguments start at getExplicitKernelArgOffset() (value: 36), so the preloaded SGPRs held the runtime header instead of the arguments. The prolog now starts its loads at getExplicitKernelArgOffset().

The backward-compatibility preload prolog loaded arguments starting at byte 0 of the kernarg segment, but on non-AMDHSA triples the explicit arguments start at getExplicitKernelArgOffset() (value: 36), so the preloaded SGPRs held the runtime header instead of the arguments. The prolog now starts its loads at getExplicitKernelArgOffset().
@llvmorg-github-actions
Copy link
Copy Markdown

@llvm/pr-subscribers-backend-amdgpu

Author: Arseniy Obolenskiy (aobolensk)

Changes

The backward-compatibility preload prolog loaded arguments starting at byte 0 of the kernarg segment, but on non-AMDHSA triples the explicit arguments start at getExplicitKernelArgOffset() (value: 36), so the preloaded SGPRs held the runtime header instead of the arguments. The prolog now starts its loads at getExplicitKernelArgOffset().


Full diff: https://github.com/llvm/llvm-project/pull/201355.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll (+24-24)
  • (modified) llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll (+32-6)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index c943d6fa2dd92..cf8f12c4704fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -183,7 +183,7 @@ void AMDGPUPreloadKernArgProlog::addBackCompatLoads(
     MachineBasicBlock *BackCompatMBB, Register KernArgSegmentPtr,
     unsigned NumKernArgPreloadSGPRs) {
   Register KernArgPreloadSGPR = MFI.getArgInfo().FirstKernArgPreloadReg;
-  unsigned Offset = 0;
+  unsigned Offset = ST.getExplicitKernelArgOffset();
   // Fill all user SGPRs used for kernarg preloading with sequential data from
   // the kernarg segment
   while (NumKernArgPreloadSGPRs > 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index f99718de97765..56e03aef279ef 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -35,8 +35,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-LABEL: barrier_release:
 ; GFX90A:       ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_branch .LBB0_0
 ; GFX90A-NEXT:    .p2align 8
@@ -57,8 +57,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-TGSPLIT-LABEL: barrier_release:
 ; GFX90A-TGSPLIT:       ; %bb.1:
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_branch .LBB0_0
 ; GFX90A-TGSPLIT-NEXT:    .p2align 8
@@ -80,8 +80,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-LABEL: barrier_release:
 ; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_branch .LBB0_0
 ; GFX942-NEXT:    .p2align 8
@@ -102,8 +102,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-TGSPLIT-LABEL: barrier_release:
 ; GFX942-TGSPLIT:       ; %bb.1:
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-TGSPLIT-NEXT:    s_branch .LBB0_0
 ; GFX942-TGSPLIT-NEXT:    .p2align 8
@@ -193,8 +193,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-LABEL: fence_fence:
 ; GFX90A:       ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_branch .LBB1_0
 ; GFX90A-NEXT:    .p2align 8
@@ -218,8 +218,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-TGSPLIT-LABEL: fence_fence:
 ; GFX90A-TGSPLIT:       ; %bb.1:
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_branch .LBB1_0
 ; GFX90A-TGSPLIT-NEXT:    .p2align 8
@@ -244,8 +244,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-LABEL: fence_fence:
 ; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_branch .LBB1_0
 ; GFX942-NEXT:    .p2align 8
@@ -269,8 +269,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-TGSPLIT-LABEL: fence_fence:
 ; GFX942-TGSPLIT:       ; %bb.1:
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-TGSPLIT-NEXT:    s_branch .LBB1_0
 ; GFX942-TGSPLIT-NEXT:    .p2align 8
@@ -378,8 +378,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-LABEL: release_acquire:
 ; GFX90A:       ; %bb.1:
-; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_branch .LBB2_0
 ; GFX90A-NEXT:    .p2align 8
@@ -403,8 +403,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ;
 ; GFX90A-TGSPLIT-LABEL: release_acquire:
 ; GFX90A-TGSPLIT:       ; %bb.1:
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-TGSPLIT-NEXT:    s_branch .LBB2_0
 ; GFX90A-TGSPLIT-NEXT:    .p2align 8
@@ -429,8 +429,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-LABEL: release_acquire:
 ; GFX942:       ; %bb.1:
-; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_branch .LBB2_0
 ; GFX942-NEXT:    .p2align 8
@@ -454,8 +454,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
 ;
 ; GFX942-TGSPLIT-LABEL: release_acquire:
 ; GFX942-TGSPLIT:       ; %bb.1:
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-TGSPLIT-NEXT:    s_branch .LBB2_0
 ; GFX942-TGSPLIT-NEXT:    .p2align 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 84aa948ac11b3..505b5d4bb2cd9 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=CHECK,ASM %s
+; On non-AMDHSA triples the explicit kernarg segment starts at byte 36, so the
+; back-compat prolog must load preloaded args from that offset, not from 0.
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=CHECK,NONHSA %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
 
@@ -16,6 +19,17 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
 ; ASM-NEXT:    v_mov_b64_e32 v[2:3], s[8:9]
 ; ASM-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; ASM-NEXT:    s_endpgm
+;
+; NONHSA-LABEL: preload_ptr_kernarg_header:
+; NONHSA:         s_load_dwordx2 s[8:9], s[4:5], 0x24
+; NONHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; NONHSA-NEXT:    s_branch .LBB0_0
+; NONHSA-NEXT:    .p2align 8
+; NONHSA-NEXT:  .LBB0_0:
+; NONHSA-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; NONHSA-NEXT:    v_mov_b64_e32 v[2:3], s[8:9]
+; NONHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; NONHSA-NEXT:    s_endpgm
     store ptr %arg, ptr %arg
     ret void
 }
@@ -34,6 +48,18 @@ define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg
 ; ASM-NEXT:    v_mov_b32_e32 v2, s10
 ; ASM-NEXT:    flat_store_dword v[0:1], v2
 ; ASM-NEXT:    s_endpgm
+;
+; NONHSA-LABEL: preload_i32_kernarg_header:
+; NONHSA:         s_load_dwordx2 s[8:9], s[4:5], 0x24
+; NONHSA-NEXT:    s_load_dword s10, s[4:5], 0x2c
+; NONHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; NONHSA-NEXT:    s_branch .LBB1_0
+; NONHSA-NEXT:    .p2align 8
+; NONHSA-NEXT:  .LBB1_0:
+; NONHSA-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; NONHSA-NEXT:    v_mov_b32_e32 v2, s10
+; NONHSA-NEXT:    flat_store_dword v[0:1], v2
+; NONHSA-NEXT:    s_endpgm
     store i32 %arg1, ptr %arg
     ret void
 }
@@ -43,11 +69,11 @@ define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg
 ; OBJ-NOT: s_branch
 ; ASM-NOT: s_branch
 define void @non_kernel_function(ptr %arg) {
-; ASM-LABEL: non_kernel_function:
-; ASM:         s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
-; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; ASM-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-LABEL: non_kernel_function:
+; CHECK:         s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
     store ptr %arg, ptr %arg
     ret void
 }

; GFX90A-TGSPLIT: ; %bb.1:
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would probably switch this test to use the amdhsa triple.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[image] That'll introduce changes anyway.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it will use the better-maintained ABI. The particular ABI isn't relevant for the test, so we might as well use the usual one.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok then. Updated

@aobolensk aobolensk merged commit cf9bf34 into llvm:main Jun 3, 2026
10 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants