Skip to content

[AMDGPU][True16][CodeGen] fp conversion in true/fake16 format #101678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 19 additions & 8 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7361,14 +7361,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
const DebugLoc &DL = Inst.getDebugLoc();
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
.addImm(16)
.add(Inst.getOperand(1));
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
.addReg(TmpReg)
.addImm(0) // clamp
.addImm(0); // omod
// Build the NewOpcode instruction from operand 1 of Inst using one of two
// sequences, selected by ST.useRealTrue16Insts(). Both sequences write the
// intermediate value to TmpReg and the final result to NewDst.
if (ST.useRealTrue16Insts()) {
// Real True16 path: COPY operand 1 into TmpReg, then pass TmpReg to
// NewOpcode through the AMDGPU::hi16 subregister index (no shift is emitted).
BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
.add(Inst.getOperand(1));
// This form carries one extra trailing immediate (op_sel0) compared with
// the else branch.
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
.addReg(TmpReg, 0, AMDGPU::hi16)
.addImm(0) // clamp
.addImm(0) // omod
.addImm(0); // op_sel0
} else {
// Non-real-True16 path: V_LSHRREV_B32_e64 with immediate 16 and operand 1
// produces TmpReg (presumably a logical right shift by 16 that moves the
// upper half into the low bits -- confirm against the ISA definition).
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
.addImm(16)
.add(Inst.getOperand(1));
// TmpReg is then used as a whole register, with no subregister index and
// no op_sel operand.
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
.addReg(TmpReg)
.addImm(0) // clamp
.addImm(0); // omod
}

MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
Expand Down
86 changes: 61 additions & 25 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1094,7 +1094,7 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//

multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
// f16_to_fp patterns
def : GCNPat <
(f32 (any_f16_to_fp i32:$src0)),
Expand All @@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
>;

// fp_to_fp16 patterns
def : GCNPat <
(f64 (any_fpextend f16:$src)),
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
>;

// fp_to_fp16 patterns
// This is only used on targets without half support
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
def : GCNPat <
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
>;
}

// Instantiate the f16_to_fp_Pats patterns under the NotHasTrue16BitInsts
// predicate, using the plain e64 conversion opcodes.
let True16Predicate = NotHasTrue16BitInsts in
defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;

// Instantiate the same patterns under the UseFakeTrue16Insts predicate,
// using the fake16 e64 conversion opcodes.
// NOTE(review): no f16_to_fp_Pats instantiation guarded by UseRealTrue16Insts
// is visible in this hunk -- confirm whether that omission is intentional.
let True16Predicate = UseFakeTrue16Insts in
defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;

multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
Instruction cvt_f32_f16_inst_e64,
RegOrImmOperand VSrc> {
def : GCNPat <
(f64 (any_fpextend f16:$src)),
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
>;

def : GCNPat <
(i32 (fp_to_sint f16:$src)),
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
>;

def : GCNPat <
(i32 (fp_to_uint f16:$src)),
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
>;

def : GCNPat <
Expand All @@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
(f16 (uint_to_fp i32:$src)),
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;

// This is only used on targets without half support
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
def : GCNPat <
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
>;
}

let True16Predicate = NotHasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;

let True16Predicate = UseRealTrue16Insts in
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;

let True16Predicate = UseFakeTrue16Insts in
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
Expand Down Expand Up @@ -2774,30 +2787,53 @@ def : GCNPat <
SSrc_i1:$src))
>;

let SubtargetPredicate = HasTrue16BitInsts in
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
(V_CVT_F16_F32_fake16_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
SSrc_i1:$src))
SSrc_i1:$src),
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
>;

let SubtargetPredicate = NotHasTrue16BitInsts in
// Fake16 selection pattern for (f16 (sint_to_fp i1)).
// V_CNDMASK_B32_e64 chooses between the operands (i32 0) and
// CONST.FP32_NEG_ONE under control of $src; the result is then converted by
// V_CVT_F16_F32_fake16_e64. All modifier operands (src0_modifiers, clamp,
// omod) are passed as 0, and this form takes no op_sel operand.
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(f16 (sint_to_fp i1:$src)),
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
SSrc_i1:$src),
/*clamp*/ 0, /*omod*/ 0)
>;

let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
SSrc_i1:$src))
>;
let SubtargetPredicate = HasTrue16BitInsts in

let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
(V_CVT_F16_F32_fake16_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
SSrc_i1:$src))
SSrc_i1:$src),
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
>;

// Fake16 selection pattern for (f16 (uint_to_fp i1)).
// V_CNDMASK_B32_e64 chooses between the operands (i32 0) and CONST.FP32_ONE
// under control of $src; the result is then converted by
// V_CVT_F16_F32_fake16_e64. All modifier operands (src0_modifiers, clamp,
// omod) are passed as 0, and this form takes no op_sel operand.
let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(f16 (uint_to_fp i1:$src)),
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
SSrc_i1:$src),
/*clamp*/ 0, /*omod*/ 0)
>;

def : GCNPat <
Expand Down
21 changes: 15 additions & 6 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ let FPDPRounding = 1 in {
defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
} // End FPDPRounding = 1

let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_e32 $src)
Expand All @@ -513,7 +513,7 @@ def : GCNPat<
(V_CVT_F16_F32_e32 $src)
>;
}
let OtherPredicates = [HasTrue16BitInsts] in {
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_t16_e32 $src)
Expand All @@ -523,6 +523,16 @@ def : GCNPat<
(V_CVT_F16_F32_t16_e32 $src)
>;
}
// Conversion patterns that apply only under the UseFakeTrue16Insts predicate.
let True16Predicate = UseFakeTrue16Insts in {
// f16_to_fp on an i16 source producing f32 selects V_CVT_F32_F16_fake16_e32.
def : GCNPat<
(f32 (f16_to_fp i16:$src)),
(V_CVT_F32_F16_fake16_e32 $src)
>;
// AMDGPUfp_to_f16 on an f32 source producing i16 selects
// V_CVT_F16_F32_fake16_e32.
def : GCNPat<
(i16 (AMDGPUfp_to_f16 f32:$src)),
(V_CVT_F16_F32_fake16_e32 $src)
>;
}

def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
Expand Down Expand Up @@ -1414,15 +1424,14 @@ def : GCNPat <

} // End OtherPredicates = [isGFX8Plus, p]

let OtherPredicates = [UseFakeTrue16Insts] in {
let True16Predicate = UseFakeTrue16Insts in {
def : GCNPat<
(i32 (DivergentUnaryFrag<anyext> i16:$src)),
(COPY $src)
>;
} // End OtherPredicates = [UseFakeTrue16Insts]

} // End True16Predicate = UseFakeTrue16Insts

let OtherPredicates = [UseRealTrue16Insts] in {
let True16Predicate = UseRealTrue16Insts in {
def : GCNPat<
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
(COPY $src)
Expand Down
Loading
Loading