@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
1390
1390
;
1391
1391
; GFX8-LABEL: s_copysign_v8bf16:
1392
1392
; GFX8: ; %bb.0:
1393
- ; GFX8-NEXT: s_movk_i32 s8, 0x7fff
1393
+ ; GFX8-NEXT: s_mov_b32 s8, 0x7fff7fff
1394
1394
; GFX8-NEXT: v_mov_b32_e32 v0, s3
1395
1395
; GFX8-NEXT: v_mov_b32_e32 v1, s7
1396
- ; GFX8-NEXT: s_lshr_b32 s7, s7, 16
1397
- ; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1398
1396
; GFX8-NEXT: v_bfi_b32 v0, s8, v0, v1
1399
- ; GFX8-NEXT: v_mov_b32_e32 v1, s3
1400
- ; GFX8-NEXT: v_mov_b32_e32 v2, s7
1401
- ; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1402
- ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1403
- ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1404
1397
; GFX8-NEXT: v_mov_b32_e32 v1, s2
1405
1398
; GFX8-NEXT: v_mov_b32_e32 v2, s6
1406
- ; GFX8-NEXT: s_lshr_b32 s3, s6, 16
1407
- ; GFX8-NEXT: s_lshr_b32 s2, s2, 16
1408
1399
; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2
1409
- ; GFX8-NEXT: v_mov_b32_e32 v2, s2
1410
- ; GFX8-NEXT: v_mov_b32_e32 v3, s3
1411
- ; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1412
- ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1413
- ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1414
1400
; GFX8-NEXT: v_mov_b32_e32 v2, s1
1415
1401
; GFX8-NEXT: v_mov_b32_e32 v3, s5
1416
- ; GFX8-NEXT: s_lshr_b32 s2, s5, 16
1417
- ; GFX8-NEXT: s_lshr_b32 s1, s1, 16
1418
1402
; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3
1419
- ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1420
- ; GFX8-NEXT: v_mov_b32_e32 v4, s2
1421
- ; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1422
- ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1423
- ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1424
1403
; GFX8-NEXT: v_mov_b32_e32 v3, s0
1425
1404
; GFX8-NEXT: v_mov_b32_e32 v4, s4
1426
- ; GFX8-NEXT: s_lshr_b32 s1, s4, 16
1427
- ; GFX8-NEXT: s_lshr_b32 s0, s0, 16
1428
1405
; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4
1429
- ; GFX8-NEXT: v_mov_b32_e32 v4, s0
1430
- ; GFX8-NEXT: v_mov_b32_e32 v5, s1
1431
- ; GFX8-NEXT: v_bfi_b32 v4, s8, v4, v5
1432
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1433
- ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1434
1406
; GFX8-NEXT: v_readfirstlane_b32 s0, v3
1435
1407
; GFX8-NEXT: v_readfirstlane_b32 s1, v2
1436
1408
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
1439
1411
;
1440
1412
; GFX9-LABEL: s_copysign_v8bf16:
1441
1413
; GFX9: ; %bb.0:
1442
- ; GFX9-NEXT: s_movk_i32 s8, 0x7fff
1414
+ ; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff
1443
1415
; GFX9-NEXT: v_mov_b32_e32 v0, s3
1444
1416
; GFX9-NEXT: v_mov_b32_e32 v1, s7
1445
- ; GFX9-NEXT: s_lshr_b32 s7, s7, 16
1446
- ; GFX9-NEXT: s_lshr_b32 s3, s3, 16
1447
1417
; GFX9-NEXT: v_bfi_b32 v0, s8, v0, v1
1448
- ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1449
- ; GFX9-NEXT: v_mov_b32_e32 v2, s7
1450
- ; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1451
- ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
1452
- ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1453
1418
; GFX9-NEXT: v_mov_b32_e32 v1, s2
1454
1419
; GFX9-NEXT: v_mov_b32_e32 v2, s6
1455
- ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
1456
- ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
1457
1420
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
1458
- ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1459
- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1460
- ; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1461
- ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1462
- ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
1463
1421
; GFX9-NEXT: v_mov_b32_e32 v2, s1
1464
1422
; GFX9-NEXT: v_mov_b32_e32 v3, s5
1465
- ; GFX9-NEXT: s_lshr_b32 s2, s5, 16
1466
- ; GFX9-NEXT: s_lshr_b32 s1, s1, 16
1467
1423
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
1468
- ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1469
- ; GFX9-NEXT: v_mov_b32_e32 v4, s2
1470
- ; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1471
- ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
1472
- ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1473
1424
; GFX9-NEXT: v_mov_b32_e32 v3, s0
1474
1425
; GFX9-NEXT: v_mov_b32_e32 v4, s4
1475
- ; GFX9-NEXT: s_lshr_b32 s1, s4, 16
1476
- ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
1477
1426
; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
1478
- ; GFX9-NEXT: v_mov_b32_e32 v4, s0
1479
- ; GFX9-NEXT: v_mov_b32_e32 v5, s1
1480
- ; GFX9-NEXT: v_bfi_b32 v4, s8, v4, v5
1481
- ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
1482
- ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
1483
1427
; GFX9-NEXT: v_readfirstlane_b32 s0, v3
1484
1428
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
1485
1429
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
@@ -1488,85 +1432,36 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x
1488
1432
;
1489
1433
; GFX10-LABEL: s_copysign_v8bf16:
1490
1434
; GFX10: ; %bb.0:
1491
- ; GFX10-NEXT: v_mov_b32_e32 v0, s7
1492
- ; GFX10-NEXT: s_lshr_b32 s7, s7, 16
1435
+ ; GFX10-NEXT: v_mov_b32_e32 v0, s4
1436
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s5
1493
1437
; GFX10-NEXT: v_mov_b32_e32 v2, s6
1494
- ; GFX10-NEXT: v_mov_b32_e32 v1, s7
1495
- ; GFX10-NEXT: s_lshr_b32 s7, s6, 16
1496
- ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
1497
1438
; GFX10-NEXT: v_mov_b32_e32 v3, s7
1498
- ; GFX10-NEXT: s_lshr_b32 s3, s3, 16
1499
- ; GFX10-NEXT: v_mov_b32_e32 v4, s5
1500
- ; GFX10-NEXT: v_mov_b32_e32 v5, s4
1501
- ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1502
- ; GFX10-NEXT: s_lshr_b32 s3, s2, 16
1503
- ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1504
- ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1505
- ; GFX10-NEXT: s_lshr_b32 s2, s5, 16
1506
- ; GFX10-NEXT: s_lshr_b32 s3, s4, 16
1507
- ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1508
- ; GFX10-NEXT: v_mov_b32_e32 v6, s2
1509
- ; GFX10-NEXT: v_mov_b32_e32 v7, s3
1510
- ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1511
- ; GFX10-NEXT: s_lshr_b32 s1, s1, 16
1512
- ; GFX10-NEXT: s_lshr_b32 s0, s0, 16
1513
- ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1514
- ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1515
- ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
1516
- ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
1517
- ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
1518
- ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
1519
- ; GFX10-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1520
- ; GFX10-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1521
- ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1522
- ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1523
- ; GFX10-NEXT: v_readfirstlane_b32 s0, v5
1524
- ; GFX10-NEXT: v_readfirstlane_b32 s1, v4
1439
+ ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1440
+ ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1441
+ ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1442
+ ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
1443
+ ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
1444
+ ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
1525
1445
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
1526
- ; GFX10-NEXT: v_readfirstlane_b32 s3, v0
1446
+ ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
1527
1447
; GFX10-NEXT: ; return to shader part epilog
1528
1448
;
1529
1449
; GFX11-LABEL: s_copysign_v8bf16:
1530
1450
; GFX11: ; %bb.0:
1531
- ; GFX11-NEXT: v_mov_b32_e32 v0, s7
1532
- ; GFX11-NEXT: s_lshr_b32 s7, s7, 16
1533
- ; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s4
1534
- ; GFX11-NEXT: v_mov_b32_e32 v1, s7
1535
- ; GFX11-NEXT: s_lshr_b32 s7, s6, 16
1536
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1451
+ ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
1537
1452
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
1538
- ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s3, v0
1539
- ; GFX11-NEXT: s_lshr_b32 s3, s3, 16
1540
- ; GFX11-NEXT: v_bfi_b32 v4, 0x7fff, s1, v4
1541
- ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s3, v1
1542
- ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
1543
- ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s2, v2
1544
- ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s3, v3
1545
- ; GFX11-NEXT: s_lshr_b32 s2, s5, 16
1546
- ; GFX11-NEXT: s_lshr_b32 s3, s4, 16
1547
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1548
- ; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
1549
- ; GFX11-NEXT: v_bfi_b32 v5, 0x7fff, s0, v5
1550
- ; GFX11-NEXT: s_lshr_b32 s1, s1, 16
1551
- ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
1552
- ; GFX11-NEXT: v_bfi_b32 v6, 0x7fff, s1, v6
1553
- ; GFX11-NEXT: v_bfi_b32 v7, 0x7fff, s0, v7
1554
- ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
1555
- ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
1556
- ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
1557
- ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1558
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1559
- ; GFX11-NEXT: v_lshl_or_b32 v5, v7, 16, v5
1560
- ; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v4
1561
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1562
- ; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2
1563
- ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1453
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1454
+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
1455
+ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
1456
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1457
+ ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
1458
+ ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
1564
1459
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1565
- ; GFX11-NEXT: v_readfirstlane_b32 s0, v5
1566
- ; GFX11-NEXT: v_readfirstlane_b32 s1, v4
1460
+ ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1461
+ ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
1567
1462
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1568
1463
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
1569
- ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1464
+ ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
1570
1465
; GFX11-NEXT: ; return to shader part epilog
1571
1466
%out = call <8 x bfloat> @llvm.copysign.v8bf16 (<8 x bfloat> %arg_mag , <8 x bfloat> %arg_sign )
1572
1467
%cast = bitcast <8 x bfloat> %out to <4 x i32 >
@@ -2542,148 +2437,40 @@ define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) {
2542
2437
; GFX8-LABEL: v_copysign_v8bf16:
2543
2438
; GFX8: ; %bb.0:
2544
2439
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2545
- ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
2546
- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3
2547
- ; GFX8-NEXT: s_movk_i32 s4, 0x7fff
2548
- ; GFX8-NEXT: v_bfi_b32 v8, s4, v9, v8
2549
- ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
2550
- ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v6
2551
- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v2
2552
- ; GFX8-NEXT: v_bfi_b32 v7, s4, v9, v7
2553
- ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2554
- ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5
2555
- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2556
- ; GFX8-NEXT: v_bfi_b32 v6, s4, v9, v6
2557
- ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2558
- ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
2559
- ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2560
- ; GFX8-NEXT: v_bfi_b32 v5, s4, v9, v5
2440
+ ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
2561
2441
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4
2562
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
2563
- ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2564
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
2565
- ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2566
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7
2567
- ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2568
- ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v8
2569
- ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2442
+ ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
2443
+ ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6
2444
+ ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7
2570
2445
; GFX8-NEXT: s_setpc_b64 s[30:31]
2571
2446
;
2572
2447
; GFX9-LABEL: v_copysign_v8bf16:
2573
2448
; GFX9: ; %bb.0:
2574
2449
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575
- ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2576
- ; GFX9-NEXT: v_bfi_b32 v8, s4, v3, v7
2577
- ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2578
- ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2579
- ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
2580
- ; GFX9-NEXT: v_bfi_b32 v7, s4, v2, v6
2581
- ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2582
- ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2583
- ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2584
- ; GFX9-NEXT: v_bfi_b32 v6, s4, v1, v5
2585
- ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
2586
- ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2587
- ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2588
- ; GFX9-NEXT: v_bfi_b32 v5, s4, v0, v4
2589
- ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2590
- ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2450
+ ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
2591
2451
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
2592
- ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
2593
- ; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
2594
- ; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
2595
- ; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4
2596
- ; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4
2452
+ ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
2453
+ ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
2454
+ ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
2597
2455
; GFX9-NEXT: s_setpc_b64 s[30:31]
2598
2456
;
2599
2457
; GFX10-LABEL: v_copysign_v8bf16:
2600
2458
; GFX10: ; %bb.0:
2601
2459
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2602
- ; GFX10-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2603
- ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2604
- ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2605
- ; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2606
- ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2607
- ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2608
- ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2609
- ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2610
- ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2611
- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2612
- ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2613
- ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2614
- ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2615
- ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2616
- ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2617
- ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2618
- ; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2619
- ; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2620
- ; GFX10-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2621
- ; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2460
+ ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2461
+ ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2462
+ ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2463
+ ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
2622
2464
; GFX10-NEXT: s_setpc_b64 s[30:31]
2623
2465
;
2624
- ; GFX11TRUE16-LABEL: v_copysign_v8bf16:
2625
- ; GFX11TRUE16: ; %bb.0:
2626
- ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2627
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
2628
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l
2629
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
2630
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h
2631
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l
2632
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l
2633
- ; GFX11TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v9
2634
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
2635
- ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v3, v7
2636
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
2637
- ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v10, v11
2638
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l
2639
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v0.l
2640
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l
2641
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
2642
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h
2643
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
2644
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h
2645
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
2646
- ; GFX11TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v12
2647
- ; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v0, v4
2648
- ; GFX11TRUE16-NEXT: v_bfi_b32 v9, 0x7fff, v9, v10
2649
- ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fff, v1, v5
2650
- ; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v2, v6
2651
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.l
2652
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
2653
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l
2654
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v5.l
2655
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
2656
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
2657
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v8.l
2658
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v7.l
2659
- ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
2660
- ;
2661
- ; GFX11FAKE16-LABEL: v_copysign_v8bf16:
2662
- ; GFX11FAKE16: ; %bb.0:
2663
- ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2664
- ; GFX11FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, v3, v7
2665
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2666
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2667
- ; GFX11FAKE16-NEXT: v_bfi_b32 v9, 0x7fff, v2, v6
2668
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2669
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
2670
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v4
2671
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
2672
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2673
- ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2674
- ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v5
2675
- ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v4
2676
- ; GFX11FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, v12, v11
2677
- ; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fff, v13, v10
2678
- ; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v6
2679
- ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, v7
2680
- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2681
- ; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100
2682
- ; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x5040100
2683
- ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2684
- ; GFX11FAKE16-NEXT: v_perm_b32 v2, v2, v9, 0x5040100
2685
- ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x5040100
2686
- ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
2466
+ ; GFX11-LABEL: v_copysign_v8bf16:
2467
+ ; GFX11: ; %bb.0:
2468
+ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2469
+ ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
2470
+ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
2471
+ ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
2472
+ ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
2473
+ ; GFX11-NEXT: s_setpc_b64 s[30:31]
2687
2474
%result = call <8 x bfloat> @llvm.copysign.v8bf16 (<8 x bfloat> %mag , <8 x bfloat> %sign )
2688
2475
ret <8 x bfloat> %result
2689
2476
}
0 commit comments