[AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling in movetoVALU process #141152
Conversation
Force-pushed from b7fa490 to a066b83, then from a066b83 to f92e958.
@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes: Add op_sel for v_s_xxx_f16 when moving them to VALU, and update a few related codegen tests for gfx12 in true16 mode.

Patch is 100.66 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/141152.diff

8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a27d4eeee97f4..f59b3ac43d2eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7736,6 +7736,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
+ case AMDGPU::V_S_EXP_F16_e64:
+ case AMDGPU::V_S_LOG_F16_e64:
+ case AMDGPU::V_S_RCP_F16_e64:
+ case AMDGPU::V_S_RSQ_F16_e64:
+ case AMDGPU::V_S_SQRT_F16_e64: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
+ ? &AMDGPU::VGPR_16RegClass
+ : &AMDGPU::VGPR_32RegClass);
+ auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(1))
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ if (ST.useRealTrue16Insts())
+ NewInstr.addImm(0); // opsel0
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ legalizeOperandsVALUt16(*NewInstr, MRI);
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7a1351174733b..8613691c09517 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -8,6 +8,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -331,6 +333,82 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -537,6 +615,50 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: fast_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: fast_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -743,6 +865,50 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -985,6 +1151,42 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v5, v4
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v5
+; GFX1200-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v5, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1142,6 +1344,27 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1299,6 +1522,27 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1551,6 +1795,39 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v12, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1772,6 +2049,35 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:...
[truncated]
CI failure is not related.
; GFX12-TRUE16-LABEL: rcp_f16:
; GFX12-TRUE16: ; %bb.0: ; %entry
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, 0 |
This looks fairly suspicious, like it might not be the right ISA. How do we avoid the buffer_load_u16 on GFX12? Probably not related to this patch, since it is the same in fake16.
Seems there is an issue with u16 load/store in both the true16 and fake16 flows on gfx12. I'll take a look at it.
; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 0, 0, 0, 0, 0, implicit $mode, implicit $exec |
all arguments are 0?
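For reference, my reading of how these zeros line up with the fields added by the BuildMI call in this patch (not verified against the instruction definition):

; V_EXP_F16_t16_e64 <src0_modifiers>, <src0>, <clamp>, <omod>, <op_sel>
;                    0                0       0        0       0
; i.e. src0 itself is the literal 0 here rather than the loaded vgpr_16 value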
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s |
Should have a +real-true16 runline as well?
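Something along these lines, following the pattern of the frem.ll RUN lines above (just a sketch; the CHECK-TRUE16/CHECK-FAKE16 prefix names are placeholders and the existing checks would need regenerating):

; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-FAKE16 %s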