Skip to content

Commit d39da2e

Browse files
committed
[AMDGPU] Patterns for <2 x bfloat> fneg (fabs)
1 parent 9b992f2 commit d39da2e

File tree

2 files changed

+21
-28
lines changed

2 files changed

+21
-28
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1840,22 +1840,21 @@ def : GCNPat <
18401840
(UniformUnaryFrag<fabs> (v2fp16vt SReg_32:$src)),
18411841
(S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
18421842
>;
1843-
}
18441843

18451844
// This is really (fneg (fabs v2f16:$src))
18461845
//
18471846
// fabs is not reported as free because there is no modifier for it in
18481847
// VOP3P instructions, so it is turned into the bit op.
18491848
def : GCNPat <
1850-
(UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
1849+
(UniformUnaryFrag<fneg> (v2fp16vt (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
18511850
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
18521851
>;
18531852

18541853
def : GCNPat <
1855-
(UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))),
1854+
(UniformUnaryFrag<fneg> (v2fp16vt (fabs SReg_32:$src))),
18561855
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
18571856
>;
1858-
1857+
}
18591858

18601859
// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
18611860
// of the real value.
@@ -1986,12 +1985,12 @@ def : GCNPat <
19861985
(fabs (v2fp16vt VGPR_32:$src)),
19871986
(V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
19881987
>;
1989-
}
19901988

19911989
def : GCNPat <
1992-
(fneg (v2f16 (fabs VGPR_32:$src))),
1990+
(fneg (v2fp16vt (fabs VGPR_32:$src))),
19931991
(V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
19941992
>;
1993+
}
19951994

19961995
def : GCNPat <
19971996
(fabs (f64 VReg_64:$src)),

llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -523,8 +523,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
523523
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
524524
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
525525
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
526-
; VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
527-
; VI-NEXT: v_xor_b32_e32 v2, 0x80008000, v0
526+
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
528527
; VI-NEXT: v_mov_b32_e32 v0, s0
529528
; VI-NEXT: v_mov_b32_e32 v1, s1
530529
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -556,8 +555,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
556555
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
557556
; GFX9-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
558557
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
559-
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
560-
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
558+
; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v1
561559
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
562560
; GFX9-NEXT: s_endpgm
563561
;
@@ -590,9 +588,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
590588
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
591589
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
592590
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
593-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
594-
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fff7fff, v0
595-
; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
591+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
592+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
593+
; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
596594
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
597595
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
598596
; GFX11-NEXT: s_endpgm
@@ -634,8 +632,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
634632
; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
635633
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
636634
; VI-NEXT: s_waitcnt lgkmcnt(0)
637-
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
638-
; VI-NEXT: s_xor_b32 s2, s2, 0x80008000
635+
; VI-NEXT: s_or_b32 s2, s2, 0x80008000
639636
; VI-NEXT: v_mov_b32_e32 v0, s0
640637
; VI-NEXT: v_mov_b32_e32 v1, s1
641638
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -648,8 +645,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
648645
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
649646
; GFX9-NEXT: v_mov_b32_e32 v0, 0
650647
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
651-
; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
652-
; GFX9-NEXT: s_xor_b32 s2, s2, 0x80008000
648+
; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
653649
; GFX9-NEXT: v_mov_b32_e32 v1, s2
654650
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
655651
; GFX9-NEXT: s_endpgm
@@ -660,9 +656,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x
660656
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
661657
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
662658
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
663-
; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
664-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
665-
; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
659+
; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
660+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
666661
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
667662
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
668663
; GFX11-NEXT: s_endpgm
@@ -977,7 +972,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
977972
; VI-NEXT: v_mov_b32_e32 v0, s0
978973
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
979974
; VI-NEXT: v_mov_b32_e32 v1, s1
980-
; VI-NEXT: s_xor_b32 s1, s0, 0x80008000
975+
; VI-NEXT: s_or_b32 s1, s4, 0x80008000
981976
; VI-NEXT: v_mov_b32_e32 v4, s0
982977
; VI-NEXT: v_mov_b32_e32 v2, s2
983978
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -992,12 +987,12 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
992987
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
993988
; GFX9-NEXT: v_mov_b32_e32 v0, 0
994989
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
995-
; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff
996-
; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000
997-
; GFX9-NEXT: v_mov_b32_e32 v1, s4
998-
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
990+
; GFX9-NEXT: s_and_b32 s5, s4, 0x7fff7fff
991+
; GFX9-NEXT: s_or_b32 s4, s4, 0x80008000
999992
; GFX9-NEXT: v_mov_b32_e32 v1, s5
1000-
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
993+
; GFX9-NEXT: v_mov_b32_e32 v2, s4
994+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
995+
; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
1001996
; GFX9-NEXT: s_endpgm
1002997
;
1003998
; GFX11-LABEL: s_fneg_multi_use_fabs_v2bf16:
@@ -1007,9 +1002,8 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0,
10071002
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
10081003
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
10091004
; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
1010-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1005+
; GFX11-NEXT: s_or_b32 s5, s6, 0x80008000
10111006
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
1012-
; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000
10131007
; GFX11-NEXT: v_mov_b32_e32 v2, s5
10141008
; GFX11-NEXT: s_clause 0x1
10151009
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]

0 commit comments

Comments
 (0)