@@ -523,8 +523,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
523
523
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
524
524
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
525
525
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
526
- ; VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
527
- ; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v0
526
+ ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
528
527
; VI-NEXT: v_mov_b32_e32 v0, s0
529
528
; VI-NEXT: v_mov_b32_e32 v1, s1
530
529
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -556,8 +555,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
556
555
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
557
556
; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
558
557
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
559
- ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
560
- ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
558
+ ; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v1
561
559
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
562
560
; GFX9-NEXT: s_endpgm
563
561
;
@@ -590,9 +588,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
590
588
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
591
589
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
592
590
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
593
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
594
- ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fff7fff, v0
595
- ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
591
+ ; GFX11-NEXT: v_mov_b32_e32 v1, 0
592
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
593
+ ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
596
594
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
597
595
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
598
596
; GFX11-NEXT: s_endpgm
@@ -634,8 +632,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
634
632
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
635
633
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
636
634
; VI-NEXT: s_waitcnt lgkmcnt(0)
637
- ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
638
- ; VI-NEXT: s_xor_b32 s2, s2, 0x80008000
635
+ ; VI-NEXT: s_or_b32 s2, s2, 0x80008000
639
636
; VI-NEXT: v_mov_b32_e32 v0, s0
640
637
; VI-NEXT: v_mov_b32_e32 v1, s1
641
638
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -648,8 +645,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
648
645
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
649
646
; GFX9-NEXT: v_mov_b32_e32 v0, 0
650
647
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
651
- ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
652
- ; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
648
+ ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
653
649
; GFX9-NEXT: v_mov_b32_e32 v1, s2
654
650
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
655
651
; GFX9-NEXT: s_endpgm
@@ -660,9 +656,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
660
656
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
661
657
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
662
658
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
663
- ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
664
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
665
- ; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
659
+ ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
660
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
666
661
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
667
662
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
668
663
; GFX11-NEXT: s_endpgm
@@ -977,7 +972,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
977
972
; VI-NEXT: v_mov_b32_e32 v0, s0
978
973
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
979
974
; VI-NEXT: v_mov_b32_e32 v1, s1
980
- ; VI-NEXT: s_xor_b32 s1, s0, 0x80008000
975
+ ; VI-NEXT: s_or_b32 s1, s4, 0x80008000
981
976
; VI-NEXT: v_mov_b32_e32 v4, s0
982
977
; VI-NEXT: v_mov_b32_e32 v2, s2
983
978
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -992,12 +987,12 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
992
987
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
993
988
; GFX9-NEXT: v_mov_b32_e32 v0, 0
994
989
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
995
- ; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff
996
- ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000
997
- ; GFX9-NEXT: v_mov_b32_e32 v1, s4
998
- ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
990
+ ; GFX9-NEXT: s_and_b32 s5, s4, 0x7fff7fff
991
+ ; GFX9-NEXT: s_or_b32 s4, s4, 0x80008000
999
992
; GFX9-NEXT: v_mov_b32_e32 v1, s5
1000
- ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
993
+ ; GFX9-NEXT: v_mov_b32_e32 v2, s4
994
+ ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
995
+ ; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
1001
996
; GFX9-NEXT: s_endpgm
1002
997
;
1003
998
; GFX11-LABEL: s_fneg_multi_use_fabs_v2bf16:
@@ -1007,9 +1002,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
1007
1002
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1008
1003
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1009
1004
; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
1010
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1005
+ ; GFX11-NEXT: s_or_b32 s5, s6, 0x80008000
1011
1006
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
1012
- ; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000
1013
1007
; GFX11-NEXT: v_mov_b32_e32 v2, s5
1014
1008
; GFX11-NEXT: s_clause 0x1
1015
1009
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
0 commit comments