diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index accc18e623cb47..c27e69a0bcbbcf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -187,9 +187,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-
-  // If there's only one exit, we don't need to do anything.
-  if (PDT.root_size() <= 1)
+  if (PDT.root_size() == 0 ||
+      (PDT.root_size() == 1 &&
+       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
     return false;

   LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
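The hunk above is the core of this patch: AMDGPUUnifyDivergentExitNodes used to bail out whenever the post-dominator tree had at most one root, which also skipped functions whose only "exit" is an infinite loop — a lone PDT root whose terminator is a branch rather than a return or unreachable. With the new condition the pass still skips a genuine single exit, but proceeds for the infinite-loop case and inserts a DummyReturnBlock. A minimal before/after sketch in LLVM IR (an illustrative kernel, not taken from the patch; it mirrors the IR checks in infinite-loop.ll below):

; Before: the kernel can never leave %loop, so no return block exists.
define amdgpu_kernel void @inf_loop(ptr addrspace(1) %out) {
entry:
  br label %loop

loop:
  store volatile i32 999, ptr addrspace(1) %out
  br label %loop
}

; After -amdgpu-unify-divergent-exit-nodes: the back edge becomes a branch
; on constant true, so a return block is reachable in the CFG.
define amdgpu_kernel void @inf_loop(ptr addrspace(1) %out) {
entry:
  br label %loop

loop:
  store volatile i32 999, ptr addrspace(1) %out
  br i1 true, label %loop, label %DummyReturnBlock

DummyReturnBlock:
  ret void
}

In the generated ISA this is why the tests below trade unconditional s_branch loops for s_cbranch_vccnz plus a reachable s_endpgm.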
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index a7f4bfe64373af..bc2ed12067ada1 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -567,29 +567,29 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT: s_add_i32 s1, s9, s1
 ; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX908-NEXT: s_branch .LBB3_2
-; GFX908-NEXT: .LBB3_1: ; %bb12
+; GFX908-NEXT: .LBB3_1: ; %Flow20
 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_add_u32 s6, s6, s4
-; GFX908-NEXT: s_addc_u32 s7, s7, 0
-; GFX908-NEXT: s_add_u32 s10, s10, s12
-; GFX908-NEXT: s_addc_u32 s11, s11, s13
-; GFX908-NEXT: .LBB3_2: ; %bb9
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15]
+; GFX908-NEXT: s_cbranch_vccz .LBB3_12
+; GFX908-NEXT: .LBB3_2: ; %bb9
 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
 ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
-; GFX908-NEXT: s_cbranch_scc0 .LBB3_1
-; GFX908-NEXT: ; %bb.3: ; %bb14
+; GFX908-NEXT: s_mov_b64 s[16:17], -1
+; GFX908-NEXT: s_cbranch_scc0 .LBB3_10
+; GFX908-NEXT: ; %bb.3: ; %bb14
 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX908-NEXT: s_mov_b32 s9, s8
 ; GFX908-NEXT: v_mov_b32_e32 v4, s8
-; GFX908-NEXT: v_mov_b32_e32 v6, s8
 ; GFX908-NEXT: v_mov_b32_e32 v8, s8
+; GFX908-NEXT: v_mov_b32_e32 v6, s8
 ; GFX908-NEXT: v_mov_b32_e32 v5, s9
-; GFX908-NEXT: v_mov_b32_e32 v7, s9
 ; GFX908-NEXT: v_mov_b32_e32 v9, s9
+; GFX908-NEXT: v_mov_b32_e32 v7, s9
 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
+; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
 ; GFX908-NEXT: v_mov_b32_e32 v11, v5
-; GFX908-NEXT: s_mov_b64 s[16:17], s[10:11]
+; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11]
 ; GFX908-NEXT: v_mov_b32_e32 v10, v4
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_readfirstlane_b32 s5, v2
@@ -597,38 +597,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT: s_add_u32 s5, s5, 1
 ; GFX908-NEXT: s_addc_u32 s9, s9, 0
 ; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5
-; GFX908-NEXT: s_mul_i32 s20, s3, s5
+; GFX908-NEXT: s_mul_i32 s22, s3, s5
 ; GFX908-NEXT: s_mul_i32 s18, s2, s5
 ; GFX908-NEXT: s_mul_i32 s5, s2, s9
 ; GFX908-NEXT: s_add_i32 s5, s19, s5
-; GFX908-NEXT: s_add_i32 s5, s5, s20
+; GFX908-NEXT: s_add_i32 s5, s5, s22
 ; GFX908-NEXT: s_branch .LBB3_5
 ; GFX908-NEXT: .LBB3_4: ; %bb58
 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX908-NEXT: s_add_u32 s16, s16, s0
-; GFX908-NEXT: s_addc_u32 s17, s17, s1
-; GFX908-NEXT: s_cbranch_vccz .LBB3_1
+; GFX908-NEXT: s_add_u32 s20, s20, s0
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
+; GFX908-NEXT: s_addc_u32 s21, s21, s1
+; GFX908-NEXT: s_mov_b64 s[22:23], 0
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
+; GFX908-NEXT: s_cbranch_vccz .LBB3_9
 ; GFX908-NEXT: .LBB3_5: ; %bb16
 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: s_add_u32 s20, s16, s18
-; GFX908-NEXT: s_addc_u32 s21, s17, s5
-; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX908-NEXT: s_add_u32 s22, s20, s18
+; GFX908-NEXT: s_addc_u32 s23, s21, s5
+; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc
+; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc
+; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: ds_read_b64 v[12:13], v19
 ; GFX908-NEXT: ds_read_b64 v[14:15], v0
-; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15]
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17]
 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_cbranch_vccnz .LBB3_4
+; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
 ; GFX908-NEXT: ; %bb.6: ; %bb51
 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -645,12 +647,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
 ; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
-; GFX908-NEXT: v_add_f32_e32 v7, v7, v27
-; GFX908-NEXT: v_add_f32_e32 v6, v6, v26
-; GFX908-NEXT: v_add_f32_e32 v8, v8, v14
-; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
+; GFX908-NEXT: v_add_f32_e32 v9, v9, v27
+; GFX908-NEXT: v_add_f32_e32 v8, v8, v26
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v14
+; GFX908-NEXT: v_add_f32_e32 v7, v7, v15
 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
+; GFX908-NEXT: s_mov_b64 s[22:23], -1
 ; GFX908-NEXT: s_branch .LBB3_4
 ;
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
@@ -700,25 +703,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT: s_add_i32 s1, s9, s1
 ; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
 ; GFX90A-NEXT: s_branch .LBB3_2
-; GFX90A-NEXT: .LBB3_1: ; %bb12
+; GFX90A-NEXT: .LBB3_1: ; %Flow20
 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_add_u32 s6, s6, s4
-; GFX90A-NEXT: s_addc_u32 s7, s7, 0
-; GFX90A-NEXT: s_add_u32 s10, s10, s12
-; GFX90A-NEXT: s_addc_u32 s11, s11, s13
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15]
+; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
 ; GFX90A-NEXT: .LBB3_2: ; %bb9
 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
-; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1
+; GFX90A-NEXT: s_mov_b64 s[16:17], -1
+; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10
 ; GFX90A-NEXT: ; %bb.3: ; %bb14
 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX90A-NEXT: s_mov_b32 s9, s8
 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
-; GFX90A-NEXT: s_mov_b64 s[16:17], s[10:11]
+; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
+; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
 ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_readfirstlane_b32 s5, v4
@@ -726,39 +729,41 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT: s_add_u32 s5, s5, 1
 ; GFX90A-NEXT: s_addc_u32 s9, s9, 0
 ; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5
-; GFX90A-NEXT: s_mul_i32 s20, s3, s5
+; GFX90A-NEXT: s_mul_i32 s22, s3, s5
 ; GFX90A-NEXT: s_mul_i32 s18, s2, s5
 ; GFX90A-NEXT: s_mul_i32 s5, s2, s9
 ; GFX90A-NEXT: s_add_i32 s5, s19, s5
-; GFX90A-NEXT: s_add_i32 s5, s5, s20
+; GFX90A-NEXT: s_add_i32 s5, s5, s22
 ; GFX90A-NEXT: s_branch .LBB3_5
 ; GFX90A-NEXT: .LBB3_4: ; %bb58
 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: s_add_u32 s16, s16, s0
-; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: s_addc_u32 s17, s17, s1
-; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
+; GFX90A-NEXT: s_add_u32 s20, s20, s0
+; GFX90A-NEXT: s_addc_u32 s21, s21, s1
+; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
+; GFX90A-NEXT: s_mov_b64 s[22:23], 0
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
+; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
 ; GFX90A-NEXT: .LBB3_5: ; %bb16
 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: s_add_u32 s20, s16, s18
-; GFX90A-NEXT: s_addc_u32 s21, s17, s5
-; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX90A-NEXT: s_add_u32 s22, s20, s18
+; GFX90A-NEXT: s_addc_u32 s23, s21, s5
+; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc
+; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc
+; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: ds_read_b64 v[14:15], v19
 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0
-; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15]
-; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17]
+; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4
+; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
 ; GFX90A-NEXT: ; %bb.6: ; %bb51
 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -770,10 +775,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
 ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15]
 ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25]
-; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27]
-; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
+; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27]
+; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17]
 ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
+; GFX90A-NEXT: s_mov_b64 s[22:23], -1
 ; GFX90A-NEXT: s_branch .LBB3_4
+; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15]
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
+; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
+; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13
+; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11
+; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
+; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
+; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
+; GFX90A-NEXT: .LBB3_10: ; %Flow19
+; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT: s_mov_b64 s[14:15], -1
+; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
+; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
+; GFX90A-NEXT: ; %bb.11: ; %bb12
+; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; GFX90A-NEXT: s_add_u32 s6, s6, s4
+; GFX90A-NEXT: s_addc_u32 s7, s7, 0
+; GFX90A-NEXT: s_add_u32 s10, s10, s12
+; GFX90A-NEXT: s_addc_u32 s11, s11, s13
+; GFX90A-NEXT: s_mov_b64 s[14:15], 0
+; GFX90A-NEXT: s_branch .LBB3_1
+; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
+; GFX90A-NEXT: s_endpgm
 bb:
   %i = load volatile i16, ptr addrspace(4) undef, align 2
   %i6 = zext i16 %i to i64
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
index 731c560f1a3e51..051d8f1e0d3dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
@@ -44,9 +45,9 @@ bb:

 bb2:
 ; 24 bytes
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   call void @llvm.amdgcn.s.sleep(i32 0)
   br label %bb3
@@ -87,10 +88,10 @@ bb0:
 bb2:
 ; 32 bytes
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %bb3

 bb3:
@@ -129,10 +130,10 @@ bb0:

 bb2:
   call void asm sideeffect " ; 32 bytes
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %bb3

 bb3:
@@ -165,10 +166,10 @@ bb:

 bb2:
   call void asm sideeffect " ; 32 bytes
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %bb3

 bb3:
@@ -209,11 +210,11 @@ bb:

 bb2:
   %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
-   ; 24 bytes
+  ; 24 bytes
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   %inc = add nsw i32 %loop.idx, 1 ; add cost 4
   %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
   br i1 %cmp, label %bb2, label %bb3 ; -
@@ -264,10 +265,10 @@ bb2:
 bb3:
 ; 32 byte asm
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %bb4

 bb4:
@@ -277,7 +278,7 @@ bb4:
 ; GCN-LABEL: {{^}}uniform_unconditional_min_long_backward_branch:
 ; GCN-NEXT: ; %bb.0: ; %entry
-
+; GCN-NEXT: s_and_b64 vcc, exec, -1
 ; GCN-NEXT: .L[[LOOP:BB[0-9]_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: ;;#ASMSTART
@@ -286,7 +287,8 @@ bb4:
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: v_nop_e64
 ; GCN-NEXT: ;;#ASMEND
-
+; GCN-NEXT: s_mov_b64 vcc, vcc
+; GCN-NEXT: s_cbranch_vccz .LBB6_2
 ; GCN-NEXT: {{.LBB[0-9]+_[0-9]+}}: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
@@ -295,6 +297,8 @@ bb4:
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (.L[[LOOP]]-[[POST_GETPC]])&4294967295
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (.L[[LOOP]]-[[POST_GETPC]])>>32
 ; GCN-NEXT: s_setpc_b64 s[[[PC_LO]]:[[PC_HI]]]
+; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock
+; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) {
 entry:
@@ -303,10 +307,10 @@ entry:
 loop:
 ; 32 byte asm
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %loop
 }
@@ -361,19 +365,19 @@ bb1:

 bb2:
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %bb3

 bb3:
 ; These NOPs prevent tail-duplication-based outlining
 ; from firing, which defeats the need to expand the branches and this test.
   call void asm sideeffect
-   "v_nop_e64", ""() #0
+  "v_nop_e64", ""() #0
   call void asm sideeffect
-   "v_nop_e64", ""() #0
+  "v_nop_e64", ""() #0
   ret void
 }
@@ -465,17 +469,17 @@ entry:
 loop:
   %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
   call void asm sideeffect
-   "v_nop_e64
-   v_nop_e64", ""() #0
+  "v_nop_e64
+  v_nop_e64", ""() #0
   %cmp1 = fcmp olt float %phi, 8.0
   br i1 %cmp1, label %loop_body, label %ret

 loop_body:
   call void asm sideeffect
   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br label %loop

 ret:
@@ -521,9 +525,9 @@ bb9: ; preds = %bb
 bb13: ; preds = %bb
   call void asm sideeffect
   "v_nop_e64
-   v_nop_e64
-   v_nop_e64
-   v_nop_e64", ""() #0
+  v_nop_e64
+  v_nop_e64
+  v_nop_e64", ""() #0
   br i1 %tmp6, label %bb19, label %bb14

 bb14: ; preds = %bb13, %bb9
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 449d57f09e68b0..8b6fcc74e5f044 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -92,6 +92,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_addk_i32 s0, 0x80
+; GCN-NEXT: s_and_b64 vcc, exec, -1
 ; GCN-NEXT: s_mov_b32 m0, -1
 ; GCN-NEXT: .LBB1_1: ; %for.body
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -101,7 +102,10 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
 ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT: ds_write_b32 v0, v1
 ; GCN-NEXT: s_add_i32 s0, s0, 4
-; GCN-NEXT: s_branch .LBB1_1
+; GCN-NEXT: s_mov_b64 vcc, vcc
+; GCN-NEXT: s_cbranch_vccnz .LBB1_1
+; GCN-NEXT: ; %bb.2: ; %DummyReturnBlock
+; GCN-NEXT: s_endpgm
 ;
 ; GCN_DBG-LABEL: loop_const_true:
 ; GCN_DBG: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
index c06e56ec2eb21f..5ceea9ef63a4a5 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; optnone disables AMDGPUAnnotateUniformValues, so no branch is known
@@ -16,7 +17,7 @@
 ; GCN: s_or_b64 exec, exec
 ; GCN: {{[s|v]}}_cmp_eq_u32
-; GCN: s_cbranch
+; GCN: s_cbranch_execz
 ; GCN-NEXT: s_branch
 define amdgpu_kernel void @copytoreg_divergent_brcond(i32 %arg, i32 %arg1, i32 %arg2) #0 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 83d424d93453ff..3be8470a82061b 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s
 ; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -simplifycfg-require-and-preserve-domtree=1 %s | FileCheck -check-prefix=IR %s
@@ -10,18 +9,25 @@ define amdgpu_kernel void @infinite_loop(ptr addrspace(1) %out) {
 ; SI-NEXT: s_mov_b32 s3, 0xf000
 ; SI-NEXT: s_mov_b32 s2, -1
 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_and_b64 vcc, exec, -1
 ; SI-NEXT: .LBB0_1: ; %loop
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: s_branch .LBB0_1
+; SI-NEXT: s_mov_b64 vcc, vcc
+; SI-NEXT: s_cbranch_vccnz .LBB0_1
+; SI-NEXT: ; %bb.2: ; %DummyReturnBlock
+; SI-NEXT: s_endpgm
 ;
 ; IR-LABEL: @infinite_loop(
 ; IR-NEXT: entry:
 ; IR-NEXT: br label [[LOOP:%.*]]
 ; IR: loop:
 ; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
-; IR-NEXT: br label [[LOOP]]
+; IR-NEXT: br i1 true, label [[LOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR: DummyReturnBlock:
+; IR-NEXT: ret void
+;
 entry:
   br label %loop
@@ -61,6 +67,7 @@ define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
 ; IR-NEXT: br i1 true, label [[LOOP]], label [[UNIFIEDRETURNBLOCK]]
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: ret void
+;
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cond = icmp eq i32 %tmp, 1
@@ -124,6 +131,7 @@ define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
 ; IR-NEXT: br i1 true, label [[LOOP2]], label [[DUMMYRETURNBLOCK]]
 ; IR: DummyReturnBlock:
 ; IR-NEXT: ret void
+;
 entry:
   br i1 undef, label %loop1, label %loop2
@@ -184,6 +192,7 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
 ; IR-NEXT: br i1 [[COND3]], label [[INNER_LOOP]], label [[OUTER_LOOP]]
 ; IR: UnifiedReturnBlock:
 ; IR-NEXT: ret void
+;
 entry:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %cond1 = icmp eq i32 %tmp, 1
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
index 6b33b6bea4d8c8..39ac423bd091e4 100644
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -105,15 +105,18 @@ define amdgpu_ps void @only_kill() #0 {
 ; CHECK-NEXT: .LBB2_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT: s_cbranch_scc0 .LBB2_3
+; CHECK-NEXT: s_cbranch_scc0 .LBB2_4
 ; CHECK-NEXT: ; %bb.2: ; %loop
 ; CHECK-NEXT: ; in Loop: Header=BB2_1 Depth=1
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_branch .LBB2_1
-; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_mov_b64 vcc, exec
+; CHECK-NEXT: s_cbranch_execnz .LBB2_1
+; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock
 ; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
 main_body:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
index 19a9261f817c9a..6c858efaf37d21 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
@@ -10,24 +10,23 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) {
 ; CHECK: ; %bb.0: ; %bb
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_add_f32_e32 v3, v2, v2
-; CHECK-NEXT: ; kill: killed $vgpr1
 ; CHECK-NEXT: v_add_f32_e32 v0, v0, v0
-; CHECK-NEXT: .LBB0_1: ; %bb1
-; CHECK-NEXT: ; =>This Loop Header: Depth=1
-; CHECK-NEXT: ; Child Loop BB0_2 Depth 2
 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: .LBB0_2: ; %bb1
-; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1
-; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
+; CHECK-NEXT: ; kill: killed $vgpr1
+; CHECK-NEXT: .LBB0_1: ; %bb1
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2
 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_2
-; CHECK-NEXT: ; %bb.3: ; %bb2
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: ; %bb.2: ; %bb2
 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2
-; CHECK-NEXT: s_branch .LBB0_1
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: ; %bb.3: ; %DummyReturnBlock
+; CHECK-NEXT: s_setpc_b64 s[30:31]
 bb:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 33fdfacc167642..52ae259be44f0d 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; GCN-LABEL: {{^}}negated_cond:
@@ -7,7 +8,8 @@
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp
 ; GCN: s_andn2_b64 vcc, exec, [[CC]]
-; GCN: s_cbranch_vccnz .LBB0_2
+; GCN: s_lshl_b32 s12, s12, 5
+; GCN: s_cbranch_vccz .LBB0_6
 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 bb:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
new file mode 100644
index 00000000000000..58d5dc20d5ac57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+
+define void @nested_inf_loop(i1 %0, i1 %1) {
+; CHECK-LABEL: nested_inf_loop:
+; CHECK-NEXT: %bb.0: ; %BB
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: .LBB0_1: ; %BB1
+; CHECK: s_and_b64 s[10:11], exec, s[6:7]
+; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: %bb.2: ; %BB2
+; CHECK: s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: .LBB0_3: ; %BB4
+; CHECK: s_and_b64 s[10:11], exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT: s_cbranch_execnz .LBB0_3
+; CHECK-NEXT: %bb.4: ; %loop.exit.guard
+; CHECK: s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 vcc, 0
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: s_branch .LBB0_1
+; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+BB:
+  br label %BB1
+
+BB1:
+  br i1 %0, label %BB3, label %BB2
+
+BB2:
+  br label %BB4
+
+BB4:
+  br i1 %1, label %BB3, label %BB4
+
+BB3:
+  br label %BB1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 6dc9b9d0f7e955..4c0310f0153faa 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -296,97 +296,189 @@ define hidden void @blam() {
 ; GCN-NEXT: v_writelane_b32 v40, s47, 15
 ; GCN-NEXT: v_writelane_b32 v40, s48, 16
 ; GCN-NEXT: v_writelane_b32 v40, s49, 17
+; GCN-NEXT: v_writelane_b32 v40, s50, 18
+; GCN-NEXT: v_writelane_b32 v40, s51, 19
+; GCN-NEXT: v_writelane_b32 v40, s52, 20
+; GCN-NEXT: v_writelane_b32 v40, s53, 21
+; GCN-NEXT: v_writelane_b32 v40, s54, 22
+; GCN-NEXT: v_writelane_b32 v40, s55, 23
+; GCN-NEXT: v_writelane_b32 v40, s56, 24
+; GCN-NEXT: v_writelane_b32 v40, s57, 25
 ; GCN-NEXT: v_mov_b32_e32 v41, v31
-; GCN-NEXT: s_mov_b32 s44, s15
-; GCN-NEXT: s_mov_b32 s45, s14
-; GCN-NEXT: s_mov_b32 s46, s13
-; GCN-NEXT: s_mov_b32 s47, s12
+; GCN-NEXT: s_mov_b32 s46, s15
+; GCN-NEXT: s_mov_b32 s47, s14
+; GCN-NEXT: s_mov_b32 s48, s13
+; GCN-NEXT: s_mov_b32 s49, s12
 ; GCN-NEXT: s_mov_b64 s[34:35], s[10:11]
 ; GCN-NEXT: s_mov_b64 s[36:37], s[8:9]
 ; GCN-NEXT: s_mov_b64 s[38:39], s[6:7]
 ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5]
-; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_mov_b64 s[50:51], 0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41
-; GCN-NEXT: v_mov_b32_e32 v43, 0
 ; GCN-NEXT: flat_load_dword v44, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000
-; GCN-NEXT: s_getpc_b64 s[48:49]
-; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12
-; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v41
+; GCN-NEXT: v_mov_b32_e32 v43, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44
-; GCN-NEXT: s_branch .LBB1_3
-; GCN-NEXT: .LBB1_1: ; %bb10
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
-; GCN-NEXT: .LBB1_2: ; %bb18
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: .LBB1_3: ; %bb2
-; GCN-NEXT: ; =>This Loop Header: Depth=1
-; GCN-NEXT: ; Child Loop BB1_4 Depth 2
-; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: .LBB1_4: ; %bb2
-; GCN-NEXT: ; Parent Loop BB1_3 Depth=1
-; GCN-NEXT: ; => This Inner Loop Header: Depth=2
+; GCN-NEXT: v_cmp_eq_f32_e64 s[52:53], 0, v44
+; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v44
+; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000
+; GCN-NEXT: s_branch .LBB1_2
+; GCN-NEXT: .LBB1_1: ; %Flow7
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_or_b64 s[50:51], s[4:5], s[50:51]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[50:51]
+; GCN-NEXT: s_cbranch_execz .LBB1_18
+; GCN-NEXT: .LBB1_2: ; %bb2
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: flat_load_dword v0, v[42:43]
 ; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0
+; GCN-NEXT: s_mov_b64 s[4:5], -1
 ; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0
-; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT: s_cbranch_execz .LBB1_6
-; GCN-NEXT: ; %bb.5: ; %bb8
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GCN-NEXT: s_cbranch_execnz .LBB1_4
-; GCN-NEXT: s_branch .LBB1_1
-; GCN-NEXT: .LBB1_6: ; %bb6
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
-; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0
 ; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_cbranch_execnz .LBB1_4
-; GCN-NEXT: ; %bb.7: ; %bb11
-; GCN-NEXT: ; in Loop: Header=BB1_4 Depth=2
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: s_xor_b64 s[54:55], exec, s[8:9]
+; GCN-NEXT: s_cbranch_execz .LBB1_12
+; GCN-NEXT: ; %bb.3: ; %bb6
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: v_cmp_eq_u32_e64 s[44:45], 3, v0
+; GCN-NEXT: s_and_saveexec_b64 s[56:57], s[44:45]
+; GCN-NEXT: s_cbranch_execz .LBB1_11
+; GCN-NEXT: ; %bb.4: ; %bb11
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, spam@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, spam@rel32@hi+12
 ; GCN-NEXT: s_mov_b64 s[4:5], s[40:41]
 ; GCN-NEXT: s_mov_b64 s[6:7], s[38:39]
 ; GCN-NEXT: s_mov_b64 s[8:9], s[36:37]
 ; GCN-NEXT: s_mov_b64 s[10:11], s[34:35]
-; GCN-NEXT: s_mov_b32 s12, s47
-; GCN-NEXT: s_mov_b32 s13, s46
-; GCN-NEXT: s_mov_b32 s14, s45
-; GCN-NEXT: s_mov_b32 s15, s44
+; GCN-NEXT: s_mov_b32 s12, s49
+; GCN-NEXT: s_mov_b32 s13, s48
+; GCN-NEXT: s_mov_b32 s14, s47
+; GCN-NEXT: s_mov_b32 s15, s46
 ; GCN-NEXT: v_mov_b32_e32 v31, v41
-; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49]
-; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
 ; GCN-NEXT: s_mov_b64 s[6:7], 0
-; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT: s_cbranch_execnz .LBB1_4
-; GCN-NEXT: ; %bb.8: ; %bb14
-; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB1_10
+; GCN-NEXT: ; %bb.5: ; %bb14
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[8:9], s[52:53]
+; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[42:43]
+; GCN-NEXT: s_cbranch_execz .LBB1_7
+; GCN-NEXT: ; %bb.6: ; %bb16
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT: s_or_b64 s[8:9], s[52:53], exec
+; GCN-NEXT: .LBB1_7: ; %Flow3
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9]
+; GCN-NEXT: s_xor_b64 s[8:9], exec, s[10:11]
+; GCN-NEXT: s_cbranch_execz .LBB1_9
+; GCN-NEXT: ; %bb.8: ; %bb17
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_mov_b64 s[6:7], exec
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0
+; GCN-NEXT: .LBB1_9: ; %Flow4
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: .LBB1_10: ; %Flow2
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_andn2_b64 s[4:5], s[44:45], exec
+; GCN-NEXT: s_and_b64 s[8:9], vcc, exec
+; GCN-NEXT: s_or_b64 s[44:45], s[4:5], s[8:9]
+; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: .LBB1_11: ; %Flow1
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[56:57]
+; GCN-NEXT: s_orn2_b64 s[4:5], s[44:45], exec
+; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: .LBB1_12: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_andn2_saveexec_b64 s[8:9], s[54:55]
+; GCN-NEXT: s_cbranch_execz .LBB1_16
+; GCN-NEXT: ; %bb.13: ; %bb8
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GCN-NEXT: s_cbranch_execz .LBB1_15
+; GCN-NEXT: ; %bb.14: ; %bb10
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec
+; GCN-NEXT: .LBB1_15: ; %Flow6
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[12:13]
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_and_b64 s[12:13], vcc, exec
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13]
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GCN-NEXT: .LBB1_16: ; %Flow5
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GCN-NEXT: s_cbranch_execz .LBB1_1
+; GCN-NEXT: ; %bb.17: ; %bb18
+; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; GCN-NEXT: s_branch .LBB1_1
+; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec, s[50:51]
+; GCN-NEXT: v_readlane_b32 s57, v40, 25
+; GCN-NEXT: v_readlane_b32 s56, v40, 24
+; GCN-NEXT: v_readlane_b32 s55, v40, 23
+; GCN-NEXT: v_readlane_b32 s54, v40, 22
+; GCN-NEXT: v_readlane_b32 s53, v40, 21
+; GCN-NEXT: v_readlane_b32 s52, v40, 20
+; GCN-NEXT: v_readlane_b32 s51, v40, 19
+; GCN-NEXT: v_readlane_b32 s50, v40, 18
+; GCN-NEXT: v_readlane_b32 s49, v40, 17
+; GCN-NEXT: v_readlane_b32 s48, v40, 16
+; GCN-NEXT: v_readlane_b32 s47, v40, 15
+; GCN-NEXT: v_readlane_b32 s46, v40, 14
+; GCN-NEXT: v_readlane_b32 s45, v40, 13
+; GCN-NEXT: v_readlane_b32 s44, v40, 12
+; GCN-NEXT: v_readlane_b32 s43, v40, 11
+; GCN-NEXT: v_readlane_b32 s42, v40, 10
+; GCN-NEXT: v_readlane_b32 s41, v40, 9
+; GCN-NEXT: v_readlane_b32 s40, v40, 8
+; GCN-NEXT: v_readlane_b32 s39, v40, 7
+; GCN-NEXT: v_readlane_b32 s38, v40, 6
+; GCN-NEXT: v_readlane_b32 s37, v40, 5
+; GCN-NEXT: v_readlane_b32 s36, v40, 4
+; GCN-NEXT: v_readlane_b32 s35, v40, 3
+; GCN-NEXT: v_readlane_b32 s34, v40, 2
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s4, v46, 0
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_addk_i32 s32, 0xf800
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
 bb:
   %tmp = load float, ptr null, align 16
   br label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index ad7170eae81f9d..c07ac636c5bb5e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -34,7 +34,8 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GCN-NEXT: s_cbranch_execnz .LBB0_2
 ; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1
 ; GCN-NEXT: s_mov_b32 exec_lo, s5
-; GCN-NEXT: s_branch .LBB0_1
+; GCN-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GCN-NEXT: s_cbranch_vccnz .LBB0_1
 ;
 ; GFX11-LABEL: vgpr_descriptor_waterfall_loop_idom_update:
 ; GFX11: ; %bb.0: ; %entry
@@ -65,7 +66,12 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GFX11-NEXT: s_cbranch_execnz .LBB0_2
 ; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_branch .LBB0_1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b32 vcc_lo, exec_lo
+; GFX11-NEXT: s_cbranch_vccnz .LBB0_1
+; GFX11-NEXT: ; %bb.4: ; %DummyReturnBlock
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   br label %bb0
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
index b1ad87b0efc5ce..e52ed009865df3 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: llc < %s | FileCheck %s

 target triple = "amdgcn--"
@@ -12,7 +13,7 @@
 ;CHECK: [[LOOP_LABEL:.LBB[0-9]+_[0-9]+]]:
 ;CHECK: buffer_store_dword
 ;CHECK: buffer_store_dword
-;CHECK: s_branch [[LOOP_LABEL]]
+;CHECK: s_cbranch_vccnz [[LOOP_LABEL]]
 define amdgpu_kernel void @foo() {
 entry: