Skip to content

Commit aa7ccbf

Browse files
committed
AMDGPU: Improve v8f16/v8bf16 copysign handling
1 parent dfbb9a0 commit aa7ccbf

File tree

3 files changed

+74
-459
lines changed

3 files changed

+74
-459
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
758758

759759
// Can do this in one BFI plus a constant materialize.
760760
setOperationAction(ISD::FCOPYSIGN,
761-
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
761+
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
762+
MVT::v8f16, MVT::v8bf16},
762763
Custom);
763764

764765
setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
59405941
EVT VT = Op.getValueType();
59415942
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
59425943
VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
5943-
VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
5944-
VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
5945-
VT == MVT::v32f16);
5944+
VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
5945+
VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
5946+
VT == MVT::v32i16 || VT == MVT::v32f16);
59465947

59475948
auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
59485949
auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 41 additions & 254 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
13901390
;
13911391
; GFX8-LABEL: s_copysign_v8bf16:
13921392
; GFX8: ; %bb.0:
1393-
; GFX8-NEXT: s_movk_i32 s8, 0x7fff
1393+
; GFX8-NEXT: s_mov_b32 s8, 0x7fff7fff
13941394
; GFX8-NEXT: v_mov_b32_e32 v0, s3
13951395
; GFX8-NEXT: v_mov_b32_e32 v1, s7
1396-
; GFX8-NEXT: s_lshr_b32 s7, s7, 16
1397-
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
13981396
; GFX8-NEXT: v_bfi_b32 v0, s8, v0, v1
1399-
; GFX8-NEXT: v_mov_b32_e32 v1, s3
1400-
; GFX8-NEXT: v_mov_b32_e32 v2, s7
1401-
; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1402-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1403-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14041397
; GFX8-NEXT: v_mov_b32_e32 v1, s2
14051398
; GFX8-NEXT: v_mov_b32_e32 v2, s6
1406-
; GFX8-NEXT: s_lshr_b32 s3, s6, 16
1407-
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
14081399
; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1409-
; GFX8-NEXT: v_mov_b32_e32 v2, s2
1410-
; GFX8-NEXT: v_mov_b32_e32 v3, s3
1411-
; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1412-
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1413-
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14141400
; GFX8-NEXT: v_mov_b32_e32 v2, s1
14151401
; GFX8-NEXT: v_mov_b32_e32 v3, s5
1416-
; GFX8-NEXT: s_lshr_b32 s2, s5, 16
1417-
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
14181402
; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1419-
; GFX8-NEXT: v_mov_b32_e32 v3, s1
1420-
; GFX8-NEXT: v_mov_b32_e32 v4, s2
1421-
; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1422-
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1423-
; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14241403
; GFX8-NEXT: v_mov_b32_e32 v3, s0
14251404
; GFX8-NEXT: v_mov_b32_e32 v4, s4
1426-
; GFX8-NEXT: s_lshr_b32 s1, s4, 16
1427-
; GFX8-NEXT: s_lshr_b32 s0, s0, 16
14281405
; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1429-
; GFX8-NEXT: v_mov_b32_e32 v4, s0
1430-
; GFX8-NEXT: v_mov_b32_e32 v5, s1
1431-
; GFX8-NEXT: v_bfi_b32 v4, s8, v4, v5
1432-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1433-
; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14341406
; GFX8-NEXT: v_readfirstlane_b32 s0, v3
14351407
; GFX8-NEXT: v_readfirstlane_b32 s1, v2
14361408
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
14391411
;
14401412
; GFX9-LABEL: s_copysign_v8bf16:
14411413
; GFX9: ; %bb.0:
1442-
; GFX9-NEXT: s_movk_i32 s8, 0x7fff
1414+
; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff
14431415
; GFX9-NEXT: v_mov_b32_e32 v0, s3
14441416
; GFX9-NEXT: v_mov_b32_e32 v1, s7
1445-
; GFX9-NEXT: s_lshr_b32 s7, s7, 16
1446-
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
14471417
; GFX9-NEXT: v_bfi_b32 v0, s8, v0, v1
1448-
; GFX9-NEXT: v_mov_b32_e32 v1, s3
1449-
; GFX9-NEXT: v_mov_b32_e32 v2, s7
1450-
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1451-
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
1452-
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
14531418
; GFX9-NEXT: v_mov_b32_e32 v1, s2
14541419
; GFX9-NEXT: v_mov_b32_e32 v2, s6
1455-
; GFX9-NEXT: s_lshr_b32 s3, s6, 16
1456-
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
14571420
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1458-
; GFX9-NEXT: v_mov_b32_e32 v2, s2
1459-
; GFX9-NEXT: v_mov_b32_e32 v3, s3
1460-
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1461-
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1462-
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
14631421
; GFX9-NEXT: v_mov_b32_e32 v2, s1
14641422
; GFX9-NEXT: v_mov_b32_e32 v3, s5
1465-
; GFX9-NEXT: s_lshr_b32 s2, s5, 16
1466-
; GFX9-NEXT: s_lshr_b32 s1, s1, 16
14671423
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1468-
; GFX9-NEXT: v_mov_b32_e32 v3, s1
1469-
; GFX9-NEXT: v_mov_b32_e32 v4, s2
1470-
; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1471-
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
1472-
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
14731424
; GFX9-NEXT: v_mov_b32_e32 v3, s0
14741425
; GFX9-NEXT: v_mov_b32_e32 v4, s4
1475-
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
1476-
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
14771426
; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1478-
; GFX9-NEXT: v_mov_b32_e32 v4, s0
1479-
; GFX9-NEXT: v_mov_b32_e32 v5, s1
1480-
; GFX9-NEXT: v_bfi_b32 v4, s8, v4, v5
1481-
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
1482-
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
14831427
; GFX9-NEXT: v_readfirstlane_b32 s0, v3
14841428
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
14851429
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
@@ -1488,85 +1432,36 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
14881432
;
14891433
; GFX10-LABEL: s_copysign_v8bf16:
14901434
; GFX10: ; %bb.0:
1491-
; GFX10-NEXT: v_mov_b32_e32 v0, s7
1492-
; GFX10-NEXT: s_lshr_b32 s7, s7, 16
1435+
; GFX10-NEXT: v_mov_b32_e32 v0, s4
1436+
; GFX10-NEXT: v_mov_b32_e32 v1, s5
14931437
; GFX10-NEXT: v_mov_b32_e32 v2, s6
1494-
; GFX10-NEXT: v_mov_b32_e32 v1, s7
1495-
; GFX10-NEXT: s_lshr_b32 s7, s6, 16
1496-
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
14971438
; GFX10-NEXT: v_mov_b32_e32 v3, s7
1498-
; GFX10-NEXT: s_lshr_b32 s3, s3, 16
1499-
; GFX10-NEXT: v_mov_b32_e32 v4, s5
1500-
; GFX10-NEXT: v_mov_b32_e32 v5, s4
1501-
; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1502-
; GFX10-NEXT: s_lshr_b32 s3, s2, 16
1503-
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1504-
; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1505-
; GFX10-NEXT: s_lshr_b32 s2, s5, 16
1506-
; GFX10-NEXT: s_lshr_b32 s3, s4, 16
1507-
; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1508-
; GFX10-NEXT: v_mov_b32_e32 v6, s2
1509-
; GFX10-NEXT: v_mov_b32_e32 v7, s3
1510-
; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1511-
; GFX10-NEXT: s_lshr_b32 s1, s1, 16
1512-
; GFX10-NEXT: s_lshr_b32 s0, s0, 16
1513-
; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1514-
; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1515-
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
1516-
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
1517-
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
1518-
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
1519-
; GFX10-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1520-
; GFX10-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1521-
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1522-
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1523-
; GFX10-NEXT: v_readfirstlane_b32 s0, v5
1524-
; GFX10-NEXT: v_readfirstlane_b32 s1, v4
1439+
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1440+
; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1441+
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1442+
; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
1443+
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1444+
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
15251445
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1526-
; GFX10-NEXT: v_readfirstlane_b32 s3, v0
1446+
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
15271447
; GFX10-NEXT: ; return to shader part epilog
15281448
;
15291449
; GFX11-LABEL: s_copysign_v8bf16:
15301450
; GFX11: ; %bb.0:
1531-
; GFX11-NEXT: v_mov_b32_e32 v0, s7
1532-
; GFX11-NEXT: s_lshr_b32 s7, s7, 16
1533-
; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s4
1534-
; GFX11-NEXT: v_mov_b32_e32 v1, s7
1535-
; GFX11-NEXT: s_lshr_b32 s7, s6, 16
1536-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1451+
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
15371452
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
1538-
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
1539-
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
1540-
; GFX11-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1541-
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1542-
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
1543-
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1544-
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1545-
; GFX11-NEXT: s_lshr_b32 s2, s5, 16
1546-
; GFX11-NEXT: s_lshr_b32 s3, s4, 16
1547-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1548-
; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
1549-
; GFX11-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1550-
; GFX11-NEXT: s_lshr_b32 s1, s1, 16
1551-
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
1552-
; GFX11-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1553-
; GFX11-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1554-
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
1555-
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
1556-
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
1557-
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1558-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1559-
; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1560-
; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1561-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1562-
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1563-
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1453+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1454+
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1455+
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1456+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1457+
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1458+
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
15641459
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1565-
; GFX11-NEXT: v_readfirstlane_b32 s0, v5
1566-
; GFX11-NEXT: v_readfirstlane_b32 s1, v4
1460+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1461+
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
15671462
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
15681463
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
1569-
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1464+
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
15701465
; GFX11-NEXT: ; return to shader part epilog
15711466
%out = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %arg_mag, <8 x bfloat> %arg_sign)
15721467
%cast = bitcast <8 x bfloat> %out to <4 x i32>
@@ -2542,148 +2437,40 @@ define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) {
25422437
; GFX8-LABEL: v_copysign_v8bf16:
25432438
; GFX8: ; %bb.0:
25442439
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545-
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
2546-
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
2547-
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
2548-
; GFX8-NEXT: v_bfi_b32 v8, s4, v9, v8
2549-
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
2550-
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
2551-
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v2
2552-
; GFX8-NEXT: v_bfi_b32 v7, s4, v9, v7
2553-
; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2554-
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
2555-
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2556-
; GFX8-NEXT: v_bfi_b32 v6, s4, v9, v6
2557-
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2558-
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2559-
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2560-
; GFX8-NEXT: v_bfi_b32 v5, s4, v9, v5
2440+
; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
25612441
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4
2562-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2563-
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2564-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
2565-
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2566-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
2567-
; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2568-
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
2569-
; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2442+
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2443+
; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2444+
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
25702445
; GFX8-NEXT: s_setpc_b64 s[30:31]
25712446
;
25722447
; GFX9-LABEL: v_copysign_v8bf16:
25732448
; GFX9: ; %bb.0:
25742449
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575-
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2576-
; GFX9-NEXT: v_bfi_b32 v8, s4, v3, v7
2577-
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2578-
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2579-
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
2580-
; GFX9-NEXT: v_bfi_b32 v7, s4, v2, v6
2581-
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2582-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2583-
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2584-
; GFX9-NEXT: v_bfi_b32 v6, s4, v1, v5
2585-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2586-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2587-
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2588-
; GFX9-NEXT: v_bfi_b32 v5, s4, v0, v4
2589-
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2590-
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2450+
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
25912451
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
2592-
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
2593-
; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
2594-
; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
2595-
; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
2596-
; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
2452+
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2453+
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2454+
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
25972455
; GFX9-NEXT: s_setpc_b64 s[30:31]
25982456
;
25992457
; GFX10-LABEL: v_copysign_v8bf16:
26002458
; GFX10: ; %bb.0:
26012459
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602-
; GFX10-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2603-
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2604-
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2605-
; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2606-
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2607-
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2608-
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2609-
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2610-
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2611-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2612-
; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2613-
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2614-
; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2615-
; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2616-
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2617-
; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2618-
; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2619-
; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2620-
; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2621-
; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2460+
; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2461+
; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2462+
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2463+
; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
26222464
; GFX10-NEXT: s_setpc_b64 s[30:31]
26232465
;
2624-
; GFX11TRUE16-LABEL: v_copysign_v8bf16:
2625-
; GFX11TRUE16: ; %bb.0:
2626-
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
2628-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
2629-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
2630-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
2631-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
2632-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
2633-
; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v9
2634-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
2635-
; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v3, v7
2636-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
2637-
; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v10, v11
2638-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
2639-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.l
2640-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
2641-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
2642-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
2643-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
2644-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
2645-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
2646-
; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v12
2647-
; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v0, v4
2648-
; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0x7fff, v9, v10
2649-
; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fff, v1, v5
2650-
; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v2, v6
2651-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
2652-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
2653-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
2654-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v5.l
2655-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
2656-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
2657-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
2658-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
2659-
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
2660-
;
2661-
; GFX11FAKE16-LABEL: v_copysign_v8bf16:
2662-
; GFX11FAKE16: ; %bb.0:
2663-
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664-
; GFX11FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2665-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2666-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2667-
; GFX11FAKE16-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2668-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2669-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2670-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2671-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2672-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2673-
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2674-
; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2675-
; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2676-
; GFX11FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2677-
; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2678-
; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2679-
; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2680-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2681-
; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2682-
; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2683-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2684-
; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2685-
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2686-
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
2466+
; GFX11-LABEL: v_copysign_v8bf16:
2467+
; GFX11: ; %bb.0:
2468+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2469+
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2470+
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2471+
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2472+
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
2473+
; GFX11-NEXT: s_setpc_b64 s[30:31]
26872474
%result = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign)
26882475
ret <8 x bfloat> %result
26892476
}

0 commit comments

Comments
 (0)