[AMDGPU][True16][CodeGen] fp conversion in true/fake16 format #101678


Merged: 1 commit merged into llvm:main on Oct 16, 2024

Conversation

@broxigarchen (Contributor) commented on Aug 2, 2024

The true16 formats of the fp conversion instructions (V_CVT_F_F / V_CVT_F_U) were previously implemented using the fake16 profile.

With the MC support now in place, this patch corrects these instructions and adds CodeGen support for both the true16 and fake16 formats.
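
For reference, here is a minimal standalone sketch (not the actual SIInstrInfo code path; the helper name and include paths are assumptions) of the opcode choice this patch applies in getVALUOp: the true16 encoding is used only when the subtarget really executes true16 instructions, otherwise CodeGen falls back to the fake16 variant.

```cpp
// Hypothetical helper mirroring the selection added to
// SIInstrInfo::getVALUOp in this patch; include paths are assumed.
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

static unsigned getCvtF32F16VALUOp(const GCNSubtarget &ST) {
  // Real true16 targets use the _t16 encodings that address 16-bit
  // VGPR halves; otherwise CodeGen emits the _fake16 encodings that
  // keep full 32-bit VGPR operands.
  return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
                                 : AMDGPU::V_CVT_F32_F16_fake16_e64;
}
```

The same true16/fake16 split is applied to V_CVT_F16_F32 in the diff below.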

@broxigarchen changed the title from "tmp" to "[AMDGPU][CodeGen] Support VOP1 Fake16 ISA" on Aug 2, 2024
@broxigarchen force-pushed the main-merge-fake16-vop1 branch from 0b3a31e to ffa9d95 on August 2, 2024 18:58
@broxigarchen changed the title from "[AMDGPU][CodeGen] Support VOP1 Fake16 ISA" to "[AMDGPU][CodeGen] Support VOP1 True/Fake16 ISA" on Aug 9, 2024
@broxigarchen changed the title from "[AMDGPU][CodeGen] Support VOP1 True/Fake16 ISA" to "[AMDGPU][CodeGen] Support VOP1 fp conversion in True/Fake16 format" on Aug 9, 2024
@broxigarchen changed the title from "[AMDGPU][CodeGen] Support VOP1 fp conversion in True/Fake16 format" to "[AMDGPU][True16][CodeGen] Support fp conversion in true/fake16 format" on Aug 13, 2024
@broxigarchen force-pushed the main-merge-fake16-vop1 branch 2 times, most recently from 605ea1c to db1ff0d, on August 13, 2024 21:22
@broxigarchen marked this pull request as ready for review on August 13, 2024 21:22
@broxigarchen changed the title from "[AMDGPU][True16][CodeGen] Support fp conversion in true/fake16 format" to "[AMDGPU][True16][CodeGen] fp conversion in true/fake16 format" on Aug 13, 2024
@llvmbot (Member) commented on Aug 13, 2024

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Author: Brox Chen (broxigarchen)

Changes

Patch is 207.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101678.diff

19 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+7-3)
  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+6-3)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+61-25)
  • (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+57-53)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir (+76-51)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir (+76-51)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir (+27-27)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir (+27-27)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir (+177-101)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir (+167-97)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir (+50-25)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir (+52-23)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll (+33-16)
  • (added) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir (+39)
  • (added) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir (+41)
  • (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir (-37)
  • (modified) llvm/test/CodeGen/AMDGPU/schedule-regpressure-ilp-metric-spills.mir (+128-128)
  • (modified) llvm/test/CodeGen/AMDGPU/sitofp.f16.ll (+251-120)
  • (modified) llvm/test/CodeGen/AMDGPU/uitofp.f16.ll (+251-120)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8af5c364509f0e..b3c061a2bac338 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5424,9 +5424,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
   case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
   case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
-  case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
-  case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
-  case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
+  case AMDGPU::S_CVT_F32_F16:
+  case AMDGPU::S_CVT_HI_F32_F16:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
+                                   : AMDGPU::V_CVT_F32_F16_fake16_e64;
+  case AMDGPU::S_CVT_F16_F32:
+    return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
+                                   : AMDGPU::V_CVT_F16_F32_fake16_e64;
   case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
   case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
   case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e99b43afd1c3a2..31b32f65b68f5c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1747,9 +1747,11 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
             (ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
       /* else */,
         // VOP1 without modifiers
-        !if (HasClamp,
-          (ins Src0RC:$src0, Clamp0:$clamp),
-          (ins Src0RC:$src0))
+        !if(HasOMod,
+          (ins Src0RC:$src0, Clamp0:$clamp, omod0:$omod),
+          !if (HasClamp,
+            (ins Src0RC:$src0, Clamp0:$clamp),
+            (ins Src0RC:$src0)))
       /* endif */ ),
     !if (!eq(NumSrcArgs, 2),
       !if (HasModifiers,
@@ -2537,6 +2539,7 @@ class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> {
   // Most DstVT are 16-bit, but not all
   let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
+  let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
   let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
   let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
   let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5a139d1cf8d825..49132ddf7c5476 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1104,7 +1104,7 @@ def : Pat <
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
 
-multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
+multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
   // f16_to_fp patterns
   def : GCNPat <
     (f32 (any_f16_to_fp i32:$src0)),
@@ -1131,25 +1131,42 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
     (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
   >;
 
+  // fp_to_fp16 patterns
   def : GCNPat <
-    (f64 (any_fpextend f16:$src)),
-    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
   >;
 
-  // fp_to_fp16 patterns
+  // This is only used on targets without half support
+  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
   def : GCNPat <
-    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
     (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
   >;
+}
+
+let SubtargetPredicate = NotHasTrue16BitInsts in
+defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
+
+let SubtargetPredicate = UseFakeTrue16Insts in
+defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
+
+multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
+                       Instruction cvt_f32_f16_inst_e64,
+                       RegOrImmOperand VSrc> {
+  def : GCNPat <
+    (f64 (any_fpextend f16:$src)),
+    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+  >;
 
   def : GCNPat <
     (i32 (fp_to_sint f16:$src)),
-    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
   >;
 
   def : GCNPat <
     (i32 (fp_to_uint f16:$src)),
-    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
   >;
 
   def : GCNPat <
@@ -1161,20 +1178,16 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
     (f16 (uint_to_fp i32:$src)),
     (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
   >;
-
-  // This is only used on targets without half support
-  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
-  def : GCNPat <
-    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
-    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
-  >;
 }
 
 let SubtargetPredicate = NotHasTrue16BitInsts in
-defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
+defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;
 
-let SubtargetPredicate = HasTrue16BitInsts in
-defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64>;
+let SubtargetPredicate = UseRealTrue16Insts in
+defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;
+
+let SubtargetPredicate = UseFakeTrue16Insts in
+defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;
 
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
@@ -2784,13 +2797,24 @@ def : GCNPat <
                         SSrc_i1:$src))
 >;
 
-let SubtargetPredicate = HasTrue16BitInsts in
+let SubtargetPredicate = UseRealTrue16Insts in
 def : GCNPat <
   (f16 (sint_to_fp i1:$src)),
-  (V_CVT_F16_F32_t16_e32 (
-      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+  (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
-                        SSrc_i1:$src))
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
+>;
+
+let SubtargetPredicate = UseFakeTrue16Insts in
+def : GCNPat <
+  (f16 (sint_to_fp i1:$src)),
+  (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0)
 >;
 
 let SubtargetPredicate = NotHasTrue16BitInsts in
@@ -2801,13 +2825,25 @@ def : GCNPat <
                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                         SSrc_i1:$src))
 >;
-let SubtargetPredicate = HasTrue16BitInsts in
+
+let SubtargetPredicate = UseRealTrue16Insts in
 def : GCNPat <
   (f16 (uint_to_fp i1:$src)),
-  (V_CVT_F16_F32_t16_e32 (
-      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+  (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
-                        SSrc_i1:$src))
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
+>;
+
+let SubtargetPredicate = UseFakeTrue16Insts in
+def : GCNPat <
+  (f16 (uint_to_fp i1:$src)),
+  (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0)
 >;
 
 def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 03e4cb9fcf49b7..362744f51db475 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -187,21 +187,17 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
   let HasClamp = 1;
 }
 
-class VOPProfileI2F_True16<ValueType dstVt, ValueType srcVt> :
-  VOPProfile_Fake16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> {
-
-  let Ins64 = (ins Src0RC64:$src0, Clamp:$clamp, omod:$omod);
-  let InsVOP3Base = (ins Src0VOP3DPP:$src0, Clamp:$clamp, omod:$omod);
-  let AsmVOP3Base = "$vdst, $src0$clamp$omod";
-
-  let HasModifiers = 0;
-  let HasClamp = 1;
-}
-
 def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
 def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
 def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
-def VOP1_F16_I16_t16 : VOPProfileI2F_True16 <f16, i16>;
+def VOP1_F16_I16_t16 : VOPProfile_True16 <VOP1_F16_I16> {
+  let HasClamp = 1;
+}
+def VOP1_F16_I16_fake16 : VOPProfile_Fake16 <VOP1_F16_I16> {
+  let HasModifiers = 0;
+  let HasOMod = 1;
+  let HasClamp = 1;
+}
 
 def VOP_NOP_PROFILE : VOPProfile <[untyped, untyped, untyped, untyped]>{
   let HasExtVOP3DPP = 0;
@@ -217,7 +213,10 @@ class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
 def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>;
 def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>;
 def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
-def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_Fake16<VOP_I16_F16> {
+def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_True16<VOP_I16_F16> {
+  let HasOMod = 1;
+}
+def VOP_I16_F16_SPECIAL_OMOD_fake16 : VOPProfile_Fake16<VOP_I16_F16> {
   let HasOMod = 1;
 }
 
@@ -294,16 +293,22 @@ defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>;
 let FPDPRounding = 1, isReMaterializable = 0 in {
+  // V_CVT_F16_F32 and V_CVT_F32_F16 are special cases because they are
+  // present in targets without Has16BitInsts. Otherwise they can use
+  // class VOP1Inst_t16
   let OtherPredicates = [NotHasTrue16BitInsts] in
   defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>;
-  let OtherPredicates = [HasTrue16BitInsts] in
-  defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_Fake16<VOP_F16_F32>, any_fpround>;
+  let OtherPredicates = [UseRealTrue16Insts] in
+  defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>;
+  let OtherPredicates = [UseFakeTrue16Insts] in
+  defm V_CVT_F16_F32_fake16 : VOP1Inst <"v_cvt_f16_f32_fake16", VOPProfile_Fake16<VOP_F16_F32>, any_fpround>;
 } // End FPDPRounding = 1, isReMaterializable = 0
-
 let OtherPredicates = [NotHasTrue16BitInsts] in
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>;
-let OtherPredicates = [HasTrue16BitInsts] in
-defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>;
+  defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>;
+let OtherPredicates = [UseRealTrue16Insts] in
+  defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>;
+let OtherPredicates = [UseFakeTrue16Insts] in
+  defm V_CVT_F32_F16_fake16 : VOP1Inst <"v_cvt_f32_f16_fake16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>;
 
 let ReadsModeReg = 0, mayRaiseFPException = 0 in {
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
@@ -473,24 +478,15 @@ let SubtargetPredicate = isGFX7Plus in {
 } // End isReMaterializable = 1
 
 let FPDPRounding = 1 in {
-let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
-defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
-}
-let OtherPredicates = [HasTrue16BitInsts] in {
-defm V_CVT_F16_U16_t16 : VOP1Inst <"v_cvt_f16_u16_t16", VOP1_F16_I16_t16, uint_to_fp>;
-defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, sint_to_fp>;
-}
+defm V_CVT_F16_U16 : VOP1Inst_t16_with_profiles <"v_cvt_f16_u16", VOP1_F16_I16, VOP1_F16_I16_t16, VOP1_F16_I16_fake16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst_t16_with_profiles <"v_cvt_f16_i16", VOP1_F16_I16, VOP1_F16_I16_t16, VOP1_F16_I16_fake16, sint_to_fp>;
 } // End FPDPRounding = 1
 // OMod clears exceptions when set in these two instructions
-let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
-defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_uint>;
-defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, fp_to_sint>;
-}
-let OtherPredicates = [HasTrue16BitInsts] in {
-defm V_CVT_U16_F16_t16 : VOP1Inst <"v_cvt_u16_f16_t16", VOP_I16_F16_SPECIAL_OMOD_t16, fp_to_uint>;
-defm V_CVT_I16_F16_t16 : VOP1Inst <"v_cvt_i16_f16_t16", VOP_I16_F16_SPECIAL_OMOD_t16, fp_to_sint>;
-}
+defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16",
+   VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD, fp_to_uint>;
+defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16",
+   VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD, fp_to_sint>;
+
 let TRANS = 1, SchedRW = [WriteTrans32] in {
 defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
 defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
@@ -501,12 +497,8 @@ defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
 defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
 } // End TRANS = 1, SchedRW = [WriteTrans32]
 defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
-let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
-defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
-}
-let OtherPredicates = [HasTrue16BitInsts] in {
-defm V_FREXP_EXP_I16_F16_t16 : VOP1Inst <"v_frexp_exp_i16_f16_t16", VOP_I16_F16_SPECIAL_OMOD_t16, int_amdgcn_frexp_exp>;
-}
+defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16",
+   VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD, int_amdgcn_frexp_exp>;
 defm V_FLOOR_F16 : VOP1Inst_t16 <"v_floor_f16", VOP_F16_F16, ffloor>;
 defm V_CEIL_F16 : VOP1Inst_t16 <"v_ceil_f16", VOP_F16_F16, fceil>;
 defm V_TRUNC_F16 : VOP1Inst_t16 <"v_trunc_f16", VOP_F16_F16, ftrunc>;
@@ -525,7 +517,7 @@ def : GCNPat<
     (V_CVT_F16_F32_e32 $src)
 >;
 }
-let OtherPredicates = [HasTrue16BitInsts] in {
+let OtherPredicates = [UseRealTrue16Insts] in {
 def : GCNPat<
     (f32 (f16_to_fp i16:$src)),
     (V_CVT_F32_F16_t16_e32 $src)
@@ -535,6 +527,16 @@ def : GCNPat<
     (V_CVT_F16_F32_t16_e32 $src)
 >;
 }
+let OtherPredicates = [UseFakeTrue16Insts] in {
+def : GCNPat<
+    (f32 (f16_to_fp i16:$src)),
+    (V_CVT_F32_F16_fake16_e32 $src)
+>;
+def : GCNPat<
+    (i16 (AMDGPUfp_to_f16 f32:$src)),
+    (V_CVT_F16_F32_fake16_e32 $src)
+>;
+}
 
 def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
   let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
@@ -554,14 +556,10 @@ let SubtargetPredicate = isGFX9Plus in {
   defm V_SAT_PK_U8_I16    : VOP1Inst_t16<"v_sat_pk_u8_i16", VOP_I16_I32>;
 
   let mayRaiseFPException = 0 in {
-    let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
-      defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16_SPECIAL_OMOD>;
-      defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16_SPECIAL_OMOD>;
-    }
-    let OtherPredicates = [HasTrue16BitInsts] in {
-      defm V_CVT_NORM_I16_F16_t16 : VOP1Inst<"v_cvt_norm_i16_f16_t16", VOP_I16_F16_SPECIAL_OMOD_t16>;
-      defm V_CVT_NORM_U16_F16_t16 : VOP1Inst<"v_cvt_norm_u16_f16_t16", VOP_I16_F16_SPECIAL_OMOD_t16>;
-    }
+    defm V_CVT_NORM_I16_F16 : VOP1Inst_t16_with_profiles<"v_cvt_norm_i16_f16",
+      VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
+    defm V_CVT_NORM_U16_F16 : VOP1Inst_t16_with_profiles<"v_cvt_norm_u16_f16",
+      VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16>;
   } // End mayRaiseFPException = 0
 } // End SubtargetPredicate = isGFX9Plus
 
@@ -975,9 +973,13 @@ defm V_CVT_I32_I16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_
 defm V_CVT_U32_U16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
 
 defm V_CVT_F16_U16_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
+defm V_CVT_F16_U16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
 defm V_CVT_F16_I16_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
+defm V_CVT_F16_I16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
 defm V_CVT_U16_F16_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
+defm V_CVT_U16_F16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
 defm V_CVT_I16_F16_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
+defm V_CVT_I16_F16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
 defm V_RCP_F16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
 defm V_RCP_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
 defm V_SQRT_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
@@ -990,6 +992,7 @@ defm V_EXP_F16_t16           : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16"
 defm V_EXP_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
 defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
 defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
+defm V_FREXP_EXP_I16_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
 defm V_FLOOR_F16_t16         : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_FLOOR_F16_fake16      : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
 defm V_CEIL_F16_t16          : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
@@ -1001,10 +1004,14 @@ defm V_SIN_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16"
 defm V_COS_F16_fake16        : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
 defm V_SAT_PK_U8_I16_fake16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
 defm V_CVT_NORM_I16_F16_t16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
+defm V_CVT_NORM_I16_F16_fake16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
 defm V_CVT_NORM_U16_F16_t16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
+defm V_CVT_NORM_U16_F16_fake16  : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
 
 defm V_CVT_F16_F32_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x00a, "v_cvt_f16_f32">;
+defm V_CVT_F16_F32_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x00a, "v_cvt_f16_f32">;
 defm V_CVT_F32_F16_t16       : VOP1_Real_FULL_t16_gfx11_gfx12<0x00b, "v_cvt_f32_f16">;
+defm V_CVT_F32_F16_fake16    : VOP1_Real_FULL_t16_gfx11_gfx12<0x00b, "v_cvt_f32_f16">;
 
 //===----------------------------------------------------------------------===//
 // GFX10.
@@ -1430,7 +1437,6 @@ def : GCNPat<
 >;
 } // End OtherPredicates = [UseFakeTrue16Insts]
 
-
 let OtherPredicates = [UseRealTrue16Insts] in {
 def : GCNPat<
   (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
@@ -1456,9 +1462,7 @@ def : GCNPat <
   (i16 (trunc i64:$src)),
   (EXTRACT_SUBREG $src, lo16)
 >;
-
 } // End OtherPredicates = [UseRealTrue16Insts]
-
 //===----------------------------------------------------------------------===//
 // GFX9
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalI...
[truncated]

@broxigarchen (Contributor, Author)

Hi @arsenm, @Sisyph, this PR is ready for review. Thanks!

@broxigarchen (Contributor, Author) commented on Aug 14, 2024

It seems this patch is missing MC-level changes. I will reduce this PR to CodeGen-only and open another PR, #104510, for the MC-level changes.

@broxigarchen (Contributor, Author)

> It seems this patch is missing MC-level changes. I will reduce this PR to CodeGen-only and open another PR, #104510, for the MC-level changes.

Waiting for the MC PR to be merged, then I will update this PR. Thanks!

@broxigarchen force-pushed the main-merge-fake16-vop1 branch from db1ff0d to 3630154 on October 3, 2024 20:37
@broxigarchen requested a review from kosarev on October 3, 2024 20:39
@broxigarchen (Contributor, Author)

> It seems this patch is missing MC-level changes. I will reduce this PR to CodeGen-only and open another PR, #104510, for the MC-level changes.
>
> Waiting for the MC PR to be merged, then I will update this PR. Thanks!

It took a while, but this PR is now ready for review. Thanks!

@broxigarchen force-pushed the main-merge-fake16-vop1 branch from 6c2fcd0 to e32a678 on October 7, 2024 15:21
@Sisyph (Contributor) left a comment


LGTM. Thanks!

@broxigarchen force-pushed the main-merge-fake16-vop1 branch from e32a678 to 5038288 on October 15, 2024 20:35
@broxigarchen (Contributor, Author)

Squashed into one commit and pulled it down to the downstream branch for review.

@broxigarchen merged commit 35e937b into llvm:main on Oct 16, 2024
8 checks passed