diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f2d25513d8d72a..d197f6d50f8039 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4324,10 +4324,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); break; } + case Intrinsic::amdgcn_wqm_demote: case Intrinsic::amdgcn_kill: { OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); break; } + case Intrinsic::amdgcn_wqm_helper: { + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + break; + } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_atomic_buffer_load: case Intrinsic::amdgcn_raw_tbuffer_load: { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 13380ff85fee85..4d8dd8dd37e53b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -786,12 +786,12 @@ def : Pat < def : Pat < (int_amdgcn_wqm_demote i1:$src), - (SI_DEMOTE_I1 $src, 0) + (SI_DEMOTE_I1 SCSrc_i1:$src, 0) >; def : Pat < (int_amdgcn_wqm_demote (i1 (not i1:$src))), - (SI_DEMOTE_I1 $src, -1) + (SI_DEMOTE_I1 SCSrc_i1:$src, -1) >; def : Pat < diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll new file mode 100644 index 00000000000000..7f331e5716a5dc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,1275 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -march=amdgcn 
-mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s + +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +; SI-LABEL: static_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB0_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB0_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: static_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB0_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB0_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: static_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s0, 0, 0 +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB0_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB0_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: static_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 0 +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; 
GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB0_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB0_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +; SI-LABEL: dynamic_exact: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB1_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB1_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: dynamic_exact: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB1_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB1_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: dynamic_exact: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB1_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry 
+; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB1_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: dynamic_exact: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB1_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB1_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps void @branch(float %arg0, float %arg1) { +; SI-LABEL: branch: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; SI-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB2_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: BB2_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: s_cbranch_execz BB2_4 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB2_4: +; SI-NEXT: exp null 
off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: branch: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB2_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-NEXT: BB2_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB2_4 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB2_4: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: branch: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s0, 0, 1 +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, s0 +; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB2_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s1, 0, 0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: BB2_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB2_4 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB2_4: +; 
GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: branch: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, 1 +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB2_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, 0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: BB2_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB2_4 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB2_4: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_1: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; SI-NEXT: 
s_xor_b64 s[14:15], vcc, s[14:15] +; SI-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; SI-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; SI-NEXT: s_cbranch_execz BB3_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: BB3_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_5 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_5 +; SI-NEXT: ; %bb.4: ; %.continue +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB3_6 +; SI-NEXT: BB3_5: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB3_6: +; +; GFX9-LABEL: wqm_demote_1: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX9-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: BB3_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_5 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_5 +; GFX9-NEXT: ; %bb.4: ; 
%.continue +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB3_6 +; GFX9-NEXT: BB3_5: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB3_6: +; +; GFX10-32-LABEL: wqm_demote_1: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s13, 0, 1 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, s13 +; GFX10-32-NEXT: s_and_saveexec_b32 s14, s13 +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s14 +; GFX10-32-NEXT: s_cbranch_execz BB3_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s14, 0, 0 +; GFX10-32-NEXT: s_and_b32 s12, s12, s14 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-32-NEXT: BB3_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_5 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_5 +; GFX10-32-NEXT: ; %bb.4: ; %.continue +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB3_6 +; GFX10-32-NEXT: BB3_5: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB3_6: +; +; GFX10-64-LABEL: wqm_demote_1: +; 
GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX10-64-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX10-64-NEXT: s_cbranch_execz BB3_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: BB3_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_5 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_5 +; GFX10-64-NEXT: ; %bb.4: ; %.continue +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB3_6 +; GFX10-64-NEXT: BB3_5: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB3_6: +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_2: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; SI-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; SI-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; SI-NEXT: s_cbranch_execz BB4_2 +; SI-NEXT: ; %bb.1: ; %.demote +; SI-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[16:17] +; SI-NEXT: BB4_2: ; %.continue +; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_5 +; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_5 +; SI-NEXT: ; %bb.4: ; %.continue +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB4_6 +; SI-NEXT: BB4_5: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB4_6: +; +; GFX9-LABEL: wqm_demote_2: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX9-NEXT: 
s_and_saveexec_b64 s[16:17], s[14:15] +; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX9-NEXT: s_cbranch_execz BB4_2 +; GFX9-NEXT: ; %bb.1: ; %.demote +; GFX9-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX9-NEXT: BB4_2: ; %.continue +; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_5 +; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_5 +; GFX9-NEXT: ; %bb.4: ; %.continue +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB4_6 +; GFX9-NEXT: BB4_5: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB4_6: +; +; GFX10-32-LABEL: wqm_demote_2: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s13, 0, 1 +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s13, vcc_lo, s13 +; GFX10-32-NEXT: s_and_saveexec_b32 s14, s13 +; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s14 +; GFX10-32-NEXT: s_cbranch_execz BB4_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s14, 0, 0 +; GFX10-32-NEXT: s_and_b32 s12, s12, s14 +; GFX10-32-NEXT: s_wqm_b32 s14, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 +; GFX10-32-NEXT: BB4_2: ; %.continue +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_5 +; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; 
GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_5 +; GFX10-32-NEXT: ; %bb.4: ; %.continue +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB4_6 +; GFX10-32-NEXT: BB4_5: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB4_6: +; +; GFX10-64-LABEL: wqm_demote_2: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[14:15], 0, 1 +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[14:15], vcc, s[14:15] +; GFX10-64-NEXT: s_and_saveexec_b64 s[16:17], s[14:15] +; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[16:17] +; GFX10-64-NEXT: s_cbranch_execz BB4_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[16:17], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], s[16:17] +; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] +; GFX10-64-NEXT: BB4_2: ; %.continue +; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_5 +; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_5 +; GFX10-64-NEXT: ; %bb.4: ; %.continue +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB4_6 +; GFX10-64-NEXT: BB4_5: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; 
GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB4_6: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +; SI-LABEL: wqm_demote_dynamic: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[12:13], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] +; SI-NEXT: s_and_b64 exec, exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB5_2 +; SI-NEXT: ; %bb.1: ; %.entry +; SI-NEXT: v_add_f32_e32 v0, v0, v0 +; SI-NEXT: s_and_b64 exec, exec, s[12:13] +; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_branch BB5_3 +; SI-NEXT: BB5_2: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB5_3: +; +; GFX9-LABEL: wqm_demote_dynamic: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[12:13], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] 
+; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB5_2 +; GFX9-NEXT: ; %bb.1: ; %.entry +; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_branch BB5_3 +; GFX9-NEXT: BB5_2: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB5_3: +; +; GFX10-32-LABEL: wqm_demote_dynamic: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s12, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_and_b32 s12, s12, vcc_lo +; GFX10-32-NEXT: s_wqm_b32 s13, s12 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB5_2 +; GFX10-32-NEXT: ; %bb.1: ; %.entry +; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-32-NEXT: s_waitcnt vmcnt(0) +; GFX10-32-NEXT: s_branch BB5_3 +; GFX10-32-NEXT: BB5_2: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB5_3: +; +; GFX10-64-LABEL: wqm_demote_dynamic: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[12:13], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_and_b64 s[12:13], s[12:13], vcc +; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB5_2 +; GFX10-64-NEXT: ; %bb.1: ; %.entry +; GFX10-64-NEXT: v_add_f32_e32 v0, v0, 
v0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] +; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-64-NEXT: s_waitcnt vmcnt(0) +; GFX10-64-NEXT: s_branch BB5_3 +; GFX10-64-NEXT: BB5_2: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB5_3: +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +; SI-LABEL: wqm_deriv: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_wqm_b64 exec, exec +; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 +; SI-NEXT: s_movk_i32 s0, 0x3c00 +; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s1, s0, 0x100000 +; SI-NEXT: s_lshl_b32 s0, s4, 16 +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_or_b32 s1, s4, s1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; SI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_cbranch_execz BB6_2 +; SI-NEXT: ; %bb.1: ; %.demote0 +; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7] +; SI-NEXT: s_wqm_b64 s[6:7], s[2:3] +; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: BB6_2: ; %.continue0 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: s_wqm_b64 s[4:5], s[2:3] +; 
SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.4: ; %.continue0 +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; SI-NEXT: s_and_b64 exec, exec, s[2:3] +; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_6 +; SI-NEXT: ; %bb.5: ; %.demote1 +; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; SI-NEXT: s_and_b64 exec, exec, s[4:5] +; SI-NEXT: BB6_6: ; %.continue1 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB6_8 +; SI-NEXT: ; %bb.7: ; %.continue1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; SI-NEXT: s_endpgm +; SI-NEXT: BB6_8: +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; +; GFX9-LABEL: wqm_deriv: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_wqm_b64 exec, exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX9-NEXT: s_movk_i32 s3, 0x3c00 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: ; %bb.1: ; %.demote0 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: BB6_2: ; %.continue0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: 
s_cbranch_execz BB6_7 +; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3 +; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_5 +; GFX9-NEXT: ; %bb.4: ; %.demote1 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX9-NEXT: BB6_5: ; %.continue1 +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execz BB6_7 +; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: s_endpgm +; GFX9-NEXT: BB6_7: +; GFX9-NEXT: exp null off, off, off, off done vm +; GFX9-NEXT: s_endpgm +; +; GFX10-32-LABEL: wqm_deriv: +; GFX10-32: ; %bb.0: ; %.entry +; GFX10-32-NEXT: s_mov_b32 s0, exec_lo +; GFX10-32-NEXT: ; implicit-def: $vcc_hi +; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s2, 0, 1 +; GFX10-32-NEXT: s_movk_i32 s1, 0x3c00 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: s_xor_b32 s2, vcc_lo, s2 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2 +; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB6_2 +; GFX10-32-NEXT: ; %bb.1: ; %.demote0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_and_b32 s0, s0, s3 +; GFX10-32-NEXT: s_wqm_b32 
s3, s0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: BB6_2: ; %.continue0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_7 +; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, s1, 0 +; GFX10-32-NEXT: s_pack_ll_b32_b16 s1, 0, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 1 +; GFX10-32-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s0, s0, s3 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, s0 +; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s3 +; GFX10-32-NEXT: s_cbranch_execz BB6_5 +; GFX10-32-NEXT: ; %bb.4: ; %.demote1 +; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 0 +; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 +; GFX10-32-NEXT: BB6_5: ; %.continue1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-32-NEXT: s_cbranch_execz BB6_7 +; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-32-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: s_endpgm +; GFX10-32-NEXT: BB6_7: +; GFX10-32-NEXT: exp null off, off, off, off done vm +; GFX10-32-NEXT: s_endpgm +; +; GFX10-64-LABEL: wqm_deriv: +; GFX10-64: ; %bb.0: ; %.entry +; GFX10-64-NEXT: s_mov_b64 s[0:1], exec +; GFX10-64-NEXT: s_wqm_b64 exec, exec +; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, 
s[6:7] +; GFX10-64-NEXT: s_cbranch_execz BB6_2 +; GFX10-64-NEXT: ; %bb.1: ; %.demote0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: BB6_2: ; %.continue0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_7 +; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] +; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0 +; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2 +; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1 +; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], vcc +; GFX10-64-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_5 +; GFX10-64-NEXT: ; %bb.4: ; %.demote1 +; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 0 +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: BB6_5: ; %.continue1 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX10-64-NEXT: s_cbranch_execz BB6_7 +; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-64-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: s_endpgm +; GFX10-64-NEXT: BB6_7: +; GFX10-64-NEXT: exp null off, off, off, off done vm +; GFX10-64-NEXT: s_endpgm +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call 
float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+ %argi = fptosi float %arg to i32
+ %cond0 = icmp eq i32 %argi, 0
+ br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+ call void @llvm.amdgcn.wqm.demote(i1 false)
+ br label %.continue0
+
+.continue0:
+ %live = call i1 @llvm.amdgcn.wqm.helper()
+ %live.cond = select i1 %live, i32 0, i32 1065353216
+ %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+ %live.v0f = bitcast i32 %live.v0 to float
+ %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+ %live.v1f = bitcast i32 %live.v1 to float
+ %v0 = fsub float %live.v0f, %live.v1f
+ %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+ %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+ %cond2 = and i1 %live, %cond1
+ br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+ call void @llvm.amdgcn.wqm.demote(i1 false)
+ br label %.continue1
+
+.continue1:
+ ; Export <1.0, 0.0> and <0.0, 1.0> as packed v2f16 (0xH3C00 is half 1.0);
+ ; matches the s_pack_ll_b32_b16 pairs in the generated checks above.
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+ ret void
+}
+
+; Same as wqm_deriv, but the derivative/demote sequence sits inside a loop so
+; the live mask must be re-applied (s_wqm/s_and of exec) on every iteration.
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI: ; %bb.0: ; %.entry
+; SI-NEXT: s_mov_b64 s[0:1], exec
+; SI-NEXT: s_wqm_b64 exec, exec
+; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT: s_movk_i32 s2, 0x3c00
+; SI-NEXT: s_bfe_u32 s4, 0, 0x100000
+; SI-NEXT: s_bfe_u32 s3, s2, 0x100000
+; SI-NEXT: s_lshl_b32 s2, s4, 16
+; SI-NEXT: s_or_b32 s2, s3, s2
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s3, s4, s3
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
+; SI-NEXT: s_cbranch_execz BB7_2
+; SI-NEXT: ; %bb.1: ; %.demote0
+; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[8:9]
+; SI-NEXT: BB7_2: ; %.continue0.preheader
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execz BB7_9
+; SI-NEXT: ; %bb.3: ; %.continue0.preheader
+; SI-NEXT: s_mov_b64 s[4:5], 0
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: BB7_4: ; %.continue0
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; SI-NEXT: v_mov_b32_e32 v3, v2
+; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1
+; SI-NEXT: s_nop 0
+; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT: s_nop 1
+; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc
+; SI-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
+; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT: s_cbranch_execz BB7_6
+; SI-NEXT: ; %bb.5: ; %.demote1
+; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; SI-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[8:9]
+; SI-NEXT: BB7_6: ; %.continue1
+; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; SI-NEXT: s_or_b64 exec, exec, s[6:7]
+; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[6:7]
+; SI-NEXT: ; %bb.7: ; %.continue1
+; SI-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; SI-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
+; SI-NEXT: s_xor_b64 s[6:7], vcc, s[6:7]
+; SI-NEXT: s_and_b64 s[6:7], exec, s[6:7]
+; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT: s_cbranch_execnz BB7_4
+; SI-NEXT: ; %bb.8: ; %.return
+; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; SI-NEXT: s_endpgm
+; SI-NEXT: BB7_9:
+; SI-NEXT: exp null off, off, off, off done vm
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: wqm_deriv_loop:
+; GFX9: ; %bb.0: ; %.entry
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_wqm_b64 exec, exec
+; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1
+; GFX9-NEXT: s_movk_i32 s3, 0x3c00
+; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execz BB7_2
+; GFX9-NEXT: ; %bb.1: ; %.demote0
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX9-NEXT: BB7_2: ; %.continue0.preheader
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz BB7_9
+; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, 0
+; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, s3
+; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: BB7_4: ; %.continue0
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 1
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT: s_nop 1
+; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_b64 s[6:7], s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
+; GFX9-NEXT: s_cbranch_execz BB7_6
+; GFX9-NEXT: ; %bb.5: ; %.demote1
+; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX9-NEXT: BB7_6: ; %.continue1
+; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: ; %bb.7: ; %.continue1
+; GFX9-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
+; GFX9-NEXT: s_xor_b64 s[6:7], vcc, s[6:7]
+; GFX9-NEXT: s_and_b64 s[6:7], exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz BB7_4
+; GFX9-NEXT: ; %bb.8: ; %.return
+; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT: s_endpgm
+; GFX9-NEXT: BB7_9:
+; GFX9-NEXT: exp null off, off, off, off done vm
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv_loop:
+; GFX10-32: ; %bb.0: ; %.entry
+; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT: ; implicit-def: $vcc_hi
+; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT: v_cmp_ne_u32_e64 s3, 0, 1
+; GFX10-32-NEXT: s_movk_i32 s2, 0x3c00
+; GFX10-32-NEXT: s_mov_b32 s1, 0
+; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT: s_xor_b32 s3, vcc_lo, s3
+; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
+; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
+; GFX10-32-NEXT: s_cbranch_execz BB7_2
+; GFX10-32-NEXT: ; %bb.1: ; %.demote0
+; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 0
+; GFX10-32-NEXT: s_and_b32 s0, s0, s4
+; GFX10-32-NEXT: s_wqm_b32 s4, s0
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
+; GFX10-32-NEXT: BB7_2: ; %.continue0.preheader
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT: s_cbranch_execz BB7_9
+; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-32-NEXT: s_pack_ll_b32_b16 s3, s2, 0
+; GFX10-32-NEXT: s_pack_ll_b32_b16 s2, 0, s2
+; GFX10-32-NEXT: BB7_4: ; %.continue0
+; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s0
+; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1
+; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-32-NEXT: s_and_b32 s5, s0, vcc_lo
+; GFX10-32-NEXT: s_xor_b32 s4, s5, s4
+; GFX10-32-NEXT: s_and_saveexec_b32 s5, s4
+; GFX10-32-NEXT: s_xor_b32 s4, exec_lo, s5
+; GFX10-32-NEXT: s_cbranch_execz BB7_6
+; GFX10-32-NEXT: ; %bb.5: ; %.demote1
+; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-32-NEXT: v_cmp_ne_u32_e64 s5, 0, 0
+; GFX10-32-NEXT: s_and_b32 s0, s0, s5
+; GFX10-32-NEXT: s_wqm_b32 s5, s0
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s5
+; GFX10-32-NEXT: BB7_6: ; %.continue1
+; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-32-NEXT: s_wqm_b32 s4, s0
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
+; GFX10-32-NEXT: ; %bb.7: ; %.continue1
+; GFX10-32-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX10-32-NEXT: v_cmp_ne_u32_e64 s4, 0, 1
+; GFX10-32-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v1
+; GFX10-32-NEXT: s_xor_b32 s4, vcc_lo, s4
+; GFX10-32-NEXT: s_and_b32 s4, exec_lo, s4
+; GFX10-32-NEXT: s_or_b32 s1, s4, s1
+; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT: s_cbranch_execnz BB7_4
+; GFX10-32-NEXT: ; %bb.8: ; %.return
+; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-32-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT: s_endpgm
+; GFX10-32-NEXT: BB7_9:
+; GFX10-32-NEXT: exp null off, off, off, off done vm
+; GFX10-32-NEXT: s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv_loop:
+; GFX10-64: ; %bb.0: ; %.entry
+; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT: s_wqm_b64 exec, exec
+; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, 1
+; GFX10-64-NEXT: s_movk_i32 s2, 0x3c00
+; GFX10-64-NEXT: s_mov_b32 s3, 0
+; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT: s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
+; GFX10-64-NEXT: s_cbranch_execz BB7_2
+; GFX10-64-NEXT: ; %bb.1: ; %.demote0
+; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 0
+; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: BB7_2: ; %.continue0.preheader
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: s_cbranch_execz BB7_9
+; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-64-NEXT: s_pack_ll_b32_b16 s3, s2, 0
+; GFX10-64-NEXT: s_pack_ll_b32_b16 s2, 0, s2
+; GFX10-64-NEXT: s_mov_b64 s[4:5], 0
+; GFX10-64-NEXT: BB7_4: ; %.continue0
+; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
+; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX10-64-NEXT: s_and_b64 s[8:9], s[0:1], vcc
+; GFX10-64-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7]
+; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9]
+; GFX10-64-NEXT: s_cbranch_execz BB7_6
+; GFX10-64-NEXT: ; %bb.5: ; %.demote1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, 0
+; GFX10-64-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
+; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
+; GFX10-64-NEXT: BB7_6: ; %.continue1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: ; %bb.7: ; %.continue1
+; GFX10-64-NEXT: ; in Loop: Header=BB7_4 Depth=1
+; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX10-64-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
+; GFX10-64-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1
+; GFX10-64-NEXT: s_xor_b64 s[6:7], vcc, s[6:7]
+; GFX10-64-NEXT: s_and_b64 s[6:7], exec, s[6:7]
+; GFX10-64-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: s_cbranch_execnz BB7_4
+; GFX10-64-NEXT: ; %bb.8: ; %.return
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-64-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT: s_endpgm
+; GFX10-64-NEXT: BB7_9:
+; GFX10-64-NEXT: exp null off, off, off, off done vm
+; GFX10-64-NEXT: s_endpgm
+.entry:
+ %p0 = extractelement <2 x float> %input, i32 0
+ %p1 = extractelement <2 x float> %input, i32 1
+ %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+ %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+ %argi = fptosi float %arg to i32
+ %cond0 = icmp eq i32 %argi, 0
+ br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+ call void @llvm.amdgcn.wqm.demote(i1 false)
+ br label %.continue0
+
+.continue0:
+ %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
+ %live = call i1 @llvm.amdgcn.wqm.helper()
+ %live.cond = select i1 %live, i32 0, i32 %count
+ %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+ %live.v0f = bitcast i32 %live.v0 to float
+ %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+ %live.v1f = bitcast i32 %live.v1 to float
+ %v0 = fsub float %live.v0f, %live.v1f
+ %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+ %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+ %cond2 = and i1 %live, %cond1
+ br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+ call void @llvm.amdgcn.wqm.demote(i1 false)
+ br label %.continue1
+
+.continue1:
+ %next = add i32 %count, 1
+ %loop.cond = icmp slt i32 %next, %limit
+ br i1 %loop.cond, label %.continue0, label %.return
+
+.return:
+ ; Export <1.0, 0.0> and <0.0, 1.0> as packed v2f16 (0xH3C00 is half 1.0);
+ ; matches the s_pack_ll_b32_b16 pairs in the generated checks above.
+ call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+ ret void
+}
+
+declare void @llvm.amdgcn.wqm.demote(i1) #0
+declare i1 @llvm.amdgcn.wqm.helper() #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }