Skip to content

Commit a066b83

Browse files
committed
v_s_xx_f16 support in moveToVALU
1 parent 77ce1b4 commit a066b83

8 files changed

+1364
-130
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7725,6 +7725,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77257725
Inst.eraseFromParent();
77267726
return;
77277727
}
7728+
// Scalar f16 exp/log/rcp/rsq/sqrt pseudos that must be moved to the VALU.
// Build the replacement (NewOpcode) by hand into a fresh vector destination,
// then reroute every use of the old result to the new register.
case AMDGPU::V_S_EXP_F16_e64:
case AMDGPU::V_S_LOG_F16_e64:
case AMDGPU::V_S_RCP_F16_e64:
case AMDGPU::V_S_RSQ_F16_e64:
case AMDGPU::V_S_SQRT_F16_e64: {
  const DebugLoc &DL = Inst.getDebugLoc();
  // With real true16 instructions the result lives in a 16-bit VGPR;
  // otherwise a full 32-bit VGPR is used.
  Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                  ? &AMDGPU::VGPR_16RegClass
                                                  : &AMDGPU::VGPR_32RegClass);
  // Forward the operands of the original instruction instead of
  // synthesizing zeros. Operand 0 is the destination; operand 1 is the
  // src0_modifiers immediate, so the actual source value is operand 2.
  // Passing operand 1 as the source turned the input into a literal 0
  // (e.g. "v_rcp_f16_e32 v0, 0").
  // NOTE(review): assumes the V_S_*_F16_e64 pseudos use the operand order
  // vdst, src0_modifiers, src0, clamp, omod - confirm against the .td
  // definitions.
  auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                      .add(Inst.getOperand(1)) // src0_modifiers
                      .add(Inst.getOperand(2)) // src0
                      .add(Inst.getOperand(3)) // clamp
                      .add(Inst.getOperand(4)) // omod
                      .setMIFlags(Inst.getFlags());
  if (ST.useRealTrue16Insts())
    NewInstr.addImm(0); // opsel0
  MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
  // Legalize the freshly built instruction before queuing its users.
  legalizeOperandsVALUt16(*NewInstr, MRI);
  legalizeOperands(*NewInstr, MDT);
  addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
  Inst.eraseFromParent();
  return;
}
77287751
}
77297752

77307753
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/test/CodeGen/AMDGPU/frem.ll

Lines changed: 897 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,86 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
23
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
34
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
47

58
declare half @llvm.amdgcn.rcp.f16(half %a)
69

7-
; GCN-LABEL: {{^}}rcp_f16
8-
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
9-
; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
10-
; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
11-
; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
12-
; GCN: buffer_store_short v[[R_F16]]
13-
; GCN: s_endpgm
1410
define amdgpu_kernel void @rcp_f16(
11+
; GCN-LABEL: rcp_f16:
12+
; GCN: ; %bb.0: ; %entry
13+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
14+
; GCN-NEXT: s_mov_b32 s7, 0xf000
15+
; GCN-NEXT: s_mov_b32 s6, -1
16+
; GCN-NEXT: s_mov_b32 s10, s6
17+
; GCN-NEXT: s_mov_b32 s11, s7
18+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
19+
; GCN-NEXT: s_mov_b32 s8, s2
20+
; GCN-NEXT: s_mov_b32 s9, s3
21+
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
22+
; GCN-NEXT: s_mov_b32 s4, s0
23+
; GCN-NEXT: s_mov_b32 s5, s1
24+
; GCN-NEXT: s_waitcnt vmcnt(0)
25+
; GCN-NEXT: v_rcp_f16_e32 v0, v0
26+
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
27+
; GCN-NEXT: s_endpgm
28+
;
29+
; GFX11-TRUE16-LABEL: rcp_f16:
30+
; GFX11-TRUE16: ; %bb.0: ; %entry
31+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
32+
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
33+
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
34+
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
35+
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
36+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
37+
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
38+
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
39+
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
40+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
41+
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
42+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
43+
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
44+
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
45+
; GFX11-TRUE16-NEXT: s_endpgm
46+
;
47+
; GFX11-FAKE16-LABEL: rcp_f16:
48+
; GFX11-FAKE16: ; %bb.0: ; %entry
49+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
50+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
51+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
52+
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
53+
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
54+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
56+
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
57+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
58+
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
59+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
60+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
61+
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
62+
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
63+
; GFX11-FAKE16-NEXT: s_endpgm
64+
;
65+
; GFX12-TRUE16-LABEL: rcp_f16:
66+
; GFX12-TRUE16: ; %bb.0: ; %entry
67+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68+
; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, 0
69+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
72+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
73+
; GFX12-TRUE16-NEXT: s_endpgm
74+
;
75+
; GFX12-FAKE16-LABEL: rcp_f16:
76+
; GFX12-FAKE16: ; %bb.0: ; %entry
77+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
78+
; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, 0
79+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
80+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
81+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
82+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
83+
; GFX12-FAKE16-NEXT: s_endpgm
1584
ptr addrspace(1) %r,
1685
ptr addrspace(1) %a) {
1786
entry:
@@ -20,3 +89,5 @@ entry:
2089
store half %r.val, ptr addrspace(1) %r
2190
ret void
2291
}
92+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
93+
; VI: {{.*}}
Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,86 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
23
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
34
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
47

58
declare half @llvm.amdgcn.rsq.f16(half %a)
69

7-
; GCN-LABEL: {{^}}rsq_f16
8-
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
9-
; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
10-
; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
11-
; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
12-
; GCN: buffer_store_short v[[R_F16]]
13-
; GCN: s_endpgm
1410
define amdgpu_kernel void @rsq_f16(
11+
; GCN-LABEL: rsq_f16:
12+
; GCN: ; %bb.0: ; %entry
13+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
14+
; GCN-NEXT: s_mov_b32 s7, 0xf000
15+
; GCN-NEXT: s_mov_b32 s6, -1
16+
; GCN-NEXT: s_mov_b32 s10, s6
17+
; GCN-NEXT: s_mov_b32 s11, s7
18+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
19+
; GCN-NEXT: s_mov_b32 s8, s2
20+
; GCN-NEXT: s_mov_b32 s9, s3
21+
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
22+
; GCN-NEXT: s_mov_b32 s4, s0
23+
; GCN-NEXT: s_mov_b32 s5, s1
24+
; GCN-NEXT: s_waitcnt vmcnt(0)
25+
; GCN-NEXT: v_rsq_f16_e32 v0, v0
26+
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
27+
; GCN-NEXT: s_endpgm
28+
;
29+
; GFX11-TRUE16-LABEL: rsq_f16:
30+
; GFX11-TRUE16: ; %bb.0: ; %entry
31+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
32+
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
33+
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
34+
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
35+
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
36+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
37+
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
38+
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
39+
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
40+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
41+
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
42+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
43+
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
44+
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
45+
; GFX11-TRUE16-NEXT: s_endpgm
46+
;
47+
; GFX11-FAKE16-LABEL: rsq_f16:
48+
; GFX11-FAKE16: ; %bb.0: ; %entry
49+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
50+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
51+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
52+
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
53+
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
54+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
56+
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
57+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
58+
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
59+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
60+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
61+
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
62+
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
63+
; GFX11-FAKE16-NEXT: s_endpgm
64+
;
65+
; GFX12-TRUE16-LABEL: rsq_f16:
66+
; GFX12-TRUE16: ; %bb.0: ; %entry
67+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68+
; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, 0
69+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
72+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
73+
; GFX12-TRUE16-NEXT: s_endpgm
74+
;
75+
; GFX12-FAKE16-LABEL: rsq_f16:
76+
; GFX12-FAKE16: ; %bb.0: ; %entry
77+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
78+
; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, 0
79+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
80+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
81+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
82+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
83+
; GFX12-FAKE16-NEXT: s_endpgm
1584
ptr addrspace(1) %r,
1685
ptr addrspace(1) %a) {
1786
entry:
@@ -20,3 +89,5 @@ entry:
2089
store half %r.val, ptr addrspace(1) %r
2190
ret void
2291
}
92+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
93+
; VI: {{.*}}

llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
22
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
33
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
5-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
4+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
6+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
7+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
68

79
declare half @llvm.sqrt.f16(half %a)
810
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -81,6 +83,26 @@ define amdgpu_kernel void @sqrt_f16(
8183
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
8284
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
8385
; GFX11-FAKE16-NEXT: s_endpgm
86+
;
87+
; GFX12-TRUE16-LABEL: sqrt_f16:
88+
; GFX12-TRUE16: ; %bb.0: ; %entry
89+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, 0
91+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
92+
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
93+
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
94+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
95+
; GFX12-TRUE16-NEXT: s_endpgm
96+
;
97+
; GFX12-FAKE16-LABEL: sqrt_f16:
98+
; GFX12-FAKE16: ; %bb.0: ; %entry
99+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
100+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, 0
101+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
102+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
103+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
104+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
105+
; GFX12-FAKE16-NEXT: s_endpgm
84106
ptr addrspace(1) %r,
85107
ptr addrspace(1) %a) {
86108
entry:
@@ -189,6 +211,32 @@ define amdgpu_kernel void @sqrt_v2f16(
189211
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
190212
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
191213
; GFX11-FAKE16-NEXT: s_endpgm
214+
;
215+
; GFX12-TRUE16-LABEL: sqrt_v2f16:
216+
; GFX12-TRUE16: ; %bb.0: ; %entry
217+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
218+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, 0
219+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, 0
220+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
221+
; GFX12-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
222+
; GFX12-TRUE16-NEXT: s_mov_b32 s2, -1
223+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
224+
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
225+
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
226+
; GFX12-TRUE16-NEXT: s_endpgm
227+
;
228+
; GFX12-FAKE16-LABEL: sqrt_v2f16:
229+
; GFX12-FAKE16: ; %bb.0: ; %entry
230+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
231+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, 0
232+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, 0
233+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
234+
; GFX12-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
235+
; GFX12-FAKE16-NEXT: s_mov_b32 s2, -1
236+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
237+
; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
238+
; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], null
239+
; GFX12-FAKE16-NEXT: s_endpgm
192240
ptr addrspace(1) %r,
193241
ptr addrspace(1) %a) {
194242
entry:
@@ -197,5 +245,3 @@ entry:
197245
store <2 x half> %r.val, ptr addrspace(1) %r
198246
ret void
199247
}
200-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
201-
; GFX11: {{.*}}

0 commit comments

Comments
 (0)