Skip to content

Commit 9eff001

Browse files
committed
[TargetLowering] Correctly yield NaN from FP_TO_BF16
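For NaN inputs, the expansion kept only the sign bit and set the quiet bit but left the exponent field cleared, so it produced a tiny (denormal) number instead of a NaN. Keep the exponent field along with the sign bit so the result is a quiet NaN.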
1 parent 2b2881b commit 9eff001

File tree

7 files changed

+966
-967
lines changed

7 files changed

+966
-967
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

+4-5
Original file line numberDiff line numberDiff line change
@@ -10948,12 +10948,11 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
1094810948
Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
1094910949
Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);
1095010950

10951-
// Extract the sign bit.
10952-
SDValue SignBit =
10953-
DAG.getNode(ISD::AND, dl, I32, Op,
10954-
DAG.getConstant(APInt::getSignMask(32), dl, I32));
10951+
// Extract the sign bit and exponent.
10952+
SDValue SignBitAndExponentField = DAG.getNode(
10953+
ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
1095510954
// Set the quiet bit.
10956-
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBit,
10955+
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
1095710956
DAG.getConstant(0x400000, dl, I32));
1095810957

1095910958
// Factor in the contribution of the low 16 bits.

llvm/test/CodeGen/AMDGPU/bf16.ll

+932-932
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -790,7 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
790790
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
791791
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
792792
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
793-
; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0
793+
; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
794794
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
795795
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
796796
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
@@ -806,7 +806,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
806806
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
807807
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
808808
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
809-
; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0
809+
; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
810810
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
811811
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
812812
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0

llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll

+10-10
Original file line numberDiff line numberDiff line change
@@ -1524,7 +1524,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15241524
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15251525
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
15261526
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1527-
; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1527+
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
15281528
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
15291529
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
15301530
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1566,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
15661566
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15671567
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
15681568
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1569-
; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1569+
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
15701570
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
15711571
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
15721572
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1608,7 +1608,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16081608
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
16091609
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
16101610
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1611-
; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1611+
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
16121612
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
16131613
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
16141614
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1632,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16321632
; GFX10: ; %bb.0:
16331633
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
16341634
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1635-
; GFX10-NEXT: s_brev_b32 s5, 1
1635+
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
16361636
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
16371637
; GFX10-NEXT: s_and_b32 s0, s2, -4
16381638
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1673,7 +1673,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
16731673
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
16741674
; GFX11: ; %bb.0:
16751675
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1676-
; GFX11-NEXT: s_brev_b32 s5, 1
1676+
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
16771677
; GFX11-NEXT: v_mov_b32_e32 v0, 0
16781678
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
16791679
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1744,7 +1744,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17441744
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17451745
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
17461746
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
1747-
; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1747+
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
17481748
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
17491749
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
17501750
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1786,7 +1786,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
17861786
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
17871787
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
17881788
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
1789-
; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1789+
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
17901790
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
17911791
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
17921792
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1828,7 +1828,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18281828
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18291829
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
18301830
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
1831-
; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1
1831+
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
18321832
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
18331833
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
18341834
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
@@ -1854,7 +1854,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18541854
; GFX10: ; %bb.0:
18551855
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
18561856
; GFX10-NEXT: v_mov_b32_e32 v0, 0
1857-
; GFX10-NEXT: s_brev_b32 s5, 1
1857+
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
18581858
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
18591859
; GFX10-NEXT: s_and_b32 s0, s2, -4
18601860
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1895,7 +1895,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
18951895
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
18961896
; GFX11: ; %bb.0:
18971897
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
1898-
; GFX11-NEXT: s_brev_b32 s5, 1
1898+
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
18991899
; GFX11-NEXT: v_mov_b32_e32 v0, 0
19001900
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
19011901
; GFX11-NEXT: s_and_b32 s0, s2, -4

llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
912912
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
913913
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
914914
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
915-
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
915+
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
916916
; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
917917
; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
918918
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
@@ -934,7 +934,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
934934
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
935935
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
936936
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
937-
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
937+
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
938938
; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
939939
; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
940940
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
@@ -956,7 +956,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
956956
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
957957
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
958958
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
959-
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
959+
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
960960
; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
961961
; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
962962
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
@@ -978,7 +978,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
978978
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
979979
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
980980
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
981-
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
981+
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
982982
; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
983983
; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
984984
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec

llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -1413,7 +1413,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
14131413
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
14141414
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
14151415
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
1416-
; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
1416+
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3
14171417
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
14181418
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
14191419
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
@@ -1451,7 +1451,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
14511451
; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
14521452
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
14531453
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
1454-
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3
1454+
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
14551455
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6
14561456
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
14571457
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
@@ -1560,7 +1560,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
15601560
; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
15611561
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
15621562
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
1563-
; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v4
1563+
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4
15641564
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
15651565
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
15661566
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
@@ -1597,7 +1597,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
15971597
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
15981598
; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
15991599
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
1600-
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4
1600+
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
16011601
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6
16021602
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
16031603
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

+10-10
Original file line numberDiff line numberDiff line change
@@ -4262,20 +4262,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
42624262
; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
42634263
; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
42644264
; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
4265-
; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v7
4265+
; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7
42664266
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
4267-
; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v1
4267+
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1
42684268
; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
42694269
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
42704270
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
42714271
; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
4272-
; GFX9-NEXT: v_and_b32_e32 v14, 0x80000000, v8
4272+
; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8
42734273
; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
42744274
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
42754275
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
42764276
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
42774277
; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
4278-
; GFX9-NEXT: v_and_b32_e32 v16, 0x80000000, v2
4278+
; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2
42794279
; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
42804280
; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14
42814281
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
@@ -4298,20 +4298,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
42984298
; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
42994299
; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
43004300
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
4301-
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1
4301+
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
43024302
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
4303-
; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v3
4303+
; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3
43044304
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
43054305
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
43064306
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
43074307
; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
4308-
; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v2
4308+
; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2
43094309
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
43104310
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
43114311
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
43124312
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
43134313
; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
4314-
; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v4
4314+
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4
43154315
; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
43164316
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
43174317
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
@@ -4332,7 +4332,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
43324332
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
43334333
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
43344334
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
4335-
; GFX10-NEXT: s_brev_b32 s2, 1
4335+
; GFX10-NEXT: s_mov_b32 s2, 0xff800000
43364336
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
43374337
; GFX10-NEXT: s_clause 0x2
43384338
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
@@ -4416,7 +4416,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
44164416
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
44174417
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
44184418
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
4419-
; GFX11-NEXT: s_brev_b32 s0, 1
4419+
; GFX11-NEXT: s_mov_b32 s0, 0xff800000
44204420
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
44214421
; GFX11-NEXT: s_clause 0x2
44224422
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]

0 commit comments

Comments
 (0)