[AMDGPU] Fix backward compatibility kernarg preload prolog base offset #201355
Merged
Conversation
The backward-compatibility preload prolog loaded arguments starting at kernarg-segment byte 0, but on non-AMDHSA triples the explicit arguments start at getExplicitKernelArgOffset() (value: 36), so the preloaded SGPRs held the runtime header instead of the arguments.
|
@llvm/pr-subscribers-backend-amdgpu Author: Arseniy Obolenskiy (aobolensk) Changes: Backward compatibility preload prolog loaded args from kernarg-segment byte 0, but on non-AMDHSA triples the explicit args start at getExplicitKernelArgOffset() (value: 36), so preloaded SGPRs held the runtime header instead of the arguments. Full diff: https://github.com/llvm/llvm-project/pull/201355.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
index c943d6fa2dd92..cf8f12c4704fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernArgProlog.cpp
@@ -183,7 +183,7 @@ void AMDGPUPreloadKernArgProlog::addBackCompatLoads(
MachineBasicBlock *BackCompatMBB, Register KernArgSegmentPtr,
unsigned NumKernArgPreloadSGPRs) {
Register KernArgPreloadSGPR = MFI.getArgInfo().FirstKernArgPreloadReg;
- unsigned Offset = 0;
+ unsigned Offset = ST.getExplicitKernelArgOffset();
// Fill all user SGPRs used for kernarg preloading with sequential data from
// the kernarg segment
while (NumKernArgPreloadSGPRs > 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
index f99718de97765..56e03aef279ef 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll
@@ -35,8 +35,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
;
; GFX90A-LABEL: barrier_release:
; GFX90A: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_branch .LBB0_0
; GFX90A-NEXT: .p2align 8
@@ -57,8 +57,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
;
; GFX90A-TGSPLIT-LABEL: barrier_release:
; GFX90A-TGSPLIT: ; %bb.1:
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0
; GFX90A-TGSPLIT-NEXT: .p2align 8
@@ -80,8 +80,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
;
; GFX942-LABEL: barrier_release:
; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB0_0
; GFX942-NEXT: .p2align 8
@@ -102,8 +102,8 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc,
;
; GFX942-TGSPLIT-LABEL: barrier_release:
; GFX942-TGSPLIT: ; %bb.1:
-; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0
; GFX942-TGSPLIT-NEXT: .p2align 8
@@ -193,8 +193,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
;
; GFX90A-LABEL: fence_fence:
; GFX90A: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_branch .LBB1_0
; GFX90A-NEXT: .p2align 8
@@ -218,8 +218,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
;
; GFX90A-TGSPLIT-LABEL: fence_fence:
; GFX90A-TGSPLIT: ; %bb.1:
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0
; GFX90A-TGSPLIT-NEXT: .p2align 8
@@ -244,8 +244,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
;
; GFX942-LABEL: fence_fence:
; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB1_0
; GFX942-NEXT: .p2align 8
@@ -269,8 +269,8 @@ define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc,
;
; GFX942-TGSPLIT-LABEL: fence_fence:
; GFX942-TGSPLIT: ; %bb.1:
-; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0
; GFX942-TGSPLIT-NEXT: .p2align 8
@@ -378,8 +378,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
;
; GFX90A-LABEL: release_acquire:
; GFX90A: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_branch .LBB2_0
; GFX90A-NEXT: .p2align 8
@@ -403,8 +403,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
;
; GFX90A-TGSPLIT-LABEL: release_acquire:
; GFX90A-TGSPLIT: ; %bb.1:
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0
; GFX90A-TGSPLIT-NEXT: .p2align 8
@@ -429,8 +429,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
;
; GFX942-LABEL: release_acquire:
; GFX942: ; %bb.1:
-; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_branch .LBB2_0
; GFX942-NEXT: .p2align 8
@@ -454,8 +454,8 @@ define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc,
;
; GFX942-TGSPLIT-LABEL: release_acquire:
; GFX942-TGSPLIT: ; %bb.1:
-; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
-; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0
; GFX942-TGSPLIT-NEXT: .p2align 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index 84aa948ac11b3..505b5d4bb2cd9 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=CHECK,ASM %s
+; On non-AMDHSA triples the explicit kernarg segment starts at byte 36, so the
+; back-compat prolog must load preloaded args from that offset, not from 0.
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=CHECK,NONHSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
@@ -16,6 +19,17 @@ define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
; ASM-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
; ASM-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; ASM-NEXT: s_endpgm
+;
+; NONHSA-LABEL: preload_ptr_kernarg_header:
+; NONHSA: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; NONHSA-NEXT: s_waitcnt lgkmcnt(0)
+; NONHSA-NEXT: s_branch .LBB0_0
+; NONHSA-NEXT: .p2align 8
+; NONHSA-NEXT: .LBB0_0:
+; NONHSA-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; NONHSA-NEXT: v_mov_b64_e32 v[2:3], s[8:9]
+; NONHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; NONHSA-NEXT: s_endpgm
store ptr %arg, ptr %arg
ret void
}
@@ -34,6 +48,18 @@ define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg
; ASM-NEXT: v_mov_b32_e32 v2, s10
; ASM-NEXT: flat_store_dword v[0:1], v2
; ASM-NEXT: s_endpgm
+;
+; NONHSA-LABEL: preload_i32_kernarg_header:
+; NONHSA: s_load_dwordx2 s[8:9], s[4:5], 0x24
+; NONHSA-NEXT: s_load_dword s10, s[4:5], 0x2c
+; NONHSA-NEXT: s_waitcnt lgkmcnt(0)
+; NONHSA-NEXT: s_branch .LBB1_0
+; NONHSA-NEXT: .p2align 8
+; NONHSA-NEXT: .LBB1_0:
+; NONHSA-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; NONHSA-NEXT: v_mov_b32_e32 v2, s10
+; NONHSA-NEXT: flat_store_dword v[0:1], v2
+; NONHSA-NEXT: s_endpgm
store i32 %arg1, ptr %arg
ret void
}
@@ -43,11 +69,11 @@ define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg
; OBJ-NOT: s_branch
; ASM-NOT: s_branch
define void @non_kernel_function(ptr %arg) {
-; ASM-LABEL: non_kernel_function:
-; ASM: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
-; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; ASM-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: non_kernel_function:
+; CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
store ptr %arg, ptr %arg
ret void
}
|
arsenm
approved these changes
Jun 3, 2026
| ; GFX90A-TGSPLIT: ; %bb.1: | ||
| ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 | ||
| ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 | ||
| ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 |
Contributor
There was a problem hiding this comment.
Probably would switch this test to be amdhsa
Contributor
Author
Contributor
There was a problem hiding this comment.
Yes, it will use the better-maintained ABI. The particular ABI isn't relevant for the test, so we might as well use the usual one.
arsenm
approved these changes
Jun 3, 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.

The backward-compatibility preload prolog loaded arguments starting at kernarg-segment byte 0, but on non-AMDHSA triples the explicit arguments start at
getExplicitKernelArgOffset() (value: 36), so the preloaded SGPRs held the runtime header instead of the arguments.