[AMDGPU] Kernel hangs when compiled with code-object version 5 due to insufficient stack

The `libc` test suite currently cannot be updated to code-object version 5 because of an observed hang while calling global constructors. The following LLVM IR (also available at https://godbolt.org/z/9j9TWPfaK) causes issues only when the `amdgpu_code_object_version` metadata is set to `500` and optimizations are turned on. It is taken from the kernel in `libc` that simply iterates over the array between `__init_array_start` and `__init_array_end` and invokes each function pointer. Note that this issue is present even when the loop body is never executed.

```llvm
; ModuleID = 'image.0.5.precodegen.bc'
source_filename = "ld-temp.o"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
target triple = "amdgcn-amd-amdhsa"

; Externally defined, hidden-visibility symbols in addrspace(1), each declared
; as a zero-length array of i64. The kernel below uses their addresses as the
; end/start bounds of a table whose i64 entries are converted to function
; pointers and called.
@__init_array_end = external hidden addrspace(1) global [0 x i64], align 8
@__init_array_start = external hidden addrspace(1) global [0 x i64], align 8

; Kernel entry point: walks the i64 entries starting at @__init_array_start and
; calls each one as a function pointer, forwarding the kernel's own
; (%argc, %argv, %env) arguments. The trip count is derived from the byte
; distance between @__init_array_end and @__init_array_start.
; NOTE(review): the only call in the loop is indirect, so the callee (and its
; stack usage) is not visible in this module -- presumably this is what makes
; the reported private-segment-size value matter; confirm against the backend.
; Function Attrs: mustprogress
define protected amdgpu_kernel void @_begin(i32 noundef %argc, ptr noundef %argv, ptr noundef %env) local_unnamed_addr #0 {
entry:
  ; Skip the loop entirely when (end - start) == 0, i.e. the table is empty.
  ; The comparison is a constant expression over the two symbol addresses.
  br i1 icmp eq (i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @__init_array_end to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @__init_array_start to ptr) to i64)), i64 0), label %exit, label %for.body.preheader.i

for.body.preheader.i:                             ; preds = %entry
  ; Entry count = byte distance >> 3 (each entry is an 8-byte i64).
  %sub.ptr.div.i = ashr exact i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @__init_array_end to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @__init_array_start to ptr) to i64)), 3
  ; Clamp the trip count to at least 1; the loop below tests its exit
  ; condition only after the body has run.
  %umax.i = tail call i64 @llvm.umax.i64(i64 %sub.ptr.div.i, i64 1)
  br label %for.body.i

for.body.i:                                       ; preds = %for.body.i, %for.body.preheader.i
  ; %i.04.i is the loop index: 0 on entry, %inc.i on the back edge.
  %i.04.i = phi i64 [ %inc.i, %for.body.i ], [ 0, %for.body.preheader.i ]
  ; Load entry i of the table as a raw i64 and reinterpret it as a pointer.
  %arrayidx.i = getelementptr inbounds [0 x i64], ptr addrspace(1) @__init_array_start, i64 0, i64 %i.04.i
  %0 = load i64, ptr addrspace(1) %arrayidx.i, align 8, !tbaa !8
  %1 = inttoptr i64 %0 to ptr
  ; Indirect call through the loaded pointer with the kernel's arguments.
  tail call void %1(i32 noundef %argc, ptr noundef %argv, ptr noundef %env) #4
  ; Advance the index and leave the loop once it equals the clamped count.
  %inc.i = add nuw i64 %i.04.i, 1
  %exitcond.not.i = icmp eq i64 %inc.i, %umax.i
  br i1 %exitcond.not.i, label %exit, label %for.body.i, !llvm.loop !12

exit: ; preds = %for.body.i, %entry
  ret void
}

; Declaration of the llvm.umax.i64 intrinsic, called once in
; %for.body.preheader.i of @_begin with the entry count and the constant 1.
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.umax.i64(i64, i64) #1

; Attribute groups. In the IR shown here, #0 is attached to @_begin, #1 to the
; @llvm.umax.i64 declaration, and #4 to the indirect call site inside the loop.
; #2 and #3 are not referenced by anything in this snippet.
attributes #0 = { mustprogress "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
attributes #4 = { nobuiltin }

; Named module-level metadata.
!opencl.ocl.version = !{!0, !0, !0}
!llvm.ident = !{!1, !2, !1, !2, !1, !2}
!llvm.module.flags = !{!3, !4, !5, !6, !7}

!0 = !{i32 2, i32 0}
!1 = !{!"clang version 18.0.0"}
!2 = !{!"clang version 16.0.0"}
; !3 is the module flag this report hinges on: the hang is reported with the
; value 500 and reported to go away when it is changed to 400.
!3 = !{i32 1, !"amdgpu_code_object_version", i32 500}
!4 = !{i32 1, !"wchar_size", i32 4}
!5 = !{i32 8, !"PIC Level", i32 1}
!6 = !{i32 1, !"ThinLTO", i32 0}
!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
; !8..!11: TBAA nodes referenced by the i64 load in @_begin (!tbaa !8).
!8 = !{!9, !9, i64 0}
!9 = !{!"long", !10, i64 0}
!10 = !{!"omnipotent char", !11, i64 0}
!11 = !{!"Simple C++ TBAA"}
; !12/!13: loop metadata attached to the back-edge branch in @_begin
; (!llvm.loop !12).
!12 = distinct !{!12, !13}
!13 = !{!"llvm.loop.mustprogress"}
```

The issue I have found is caused by the `.private_segment_fixed_size` kernel metadata being incorrectly emitted as zero after optimizations. If I take the generated GCN assembly and manually edit the metadata to set `private_segment_fixed_size` to a non-zero value, the kernel no longer hangs. To reproduce, save the IR above as `bad.ll` and use the following command-line invocation:
```
$ clang bad.ll --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O1 -c; llvm-readelf --notes bad.o | grep 'private_segment'
```

If you instead change the `amdgpu_code_object_version` value to `400` in the source, the issue goes away, as the stack size is no longer set to zero.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU] Kernel hangs when compiled with code-object version 5 due to insufficient stack #72517

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[AMDGPU] Kernel hangs when compiled with code-object version 5 due to insufficient stack #72517

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions