Skip to content

Commit 5038288

Browse files
committed
[AMDGPU][True16][CodeGen] fp conversion instructions in true/fake16 format
1 parent ae5bd2a commit 5038288

23 files changed

+2562
-1272
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7361,14 +7361,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
73617361
const DebugLoc &DL = Inst.getDebugLoc();
73627362
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
73637363
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7364-
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7365-
.addImm(16)
7366-
.add(Inst.getOperand(1));
7367-
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7368-
.addImm(0) // src0_modifiers
7369-
.addReg(TmpReg)
7370-
.addImm(0) // clamp
7371-
.addImm(0); // omod
7364+
if (ST.useRealTrue16Insts()) {
7365+
BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7366+
.add(Inst.getOperand(1));
7367+
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7368+
.addImm(0) // src0_modifiers
7369+
.addReg(TmpReg, 0, AMDGPU::hi16)
7370+
.addImm(0) // clamp
7371+
.addImm(0) // omod
7372+
.addImm(0); // op_sel0
7373+
} else {
7374+
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7375+
.addImm(16)
7376+
.add(Inst.getOperand(1));
7377+
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7378+
.addImm(0) // src0_modifiers
7379+
.addReg(TmpReg)
7380+
.addImm(0) // clamp
7381+
.addImm(0); // omod
7382+
}
73727383

73737384
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
73747385
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 61 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -1094,7 +1094,7 @@ def : Pat <
10941094
// VOP1 Patterns
10951095
//===----------------------------------------------------------------------===//
10961096

1097-
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
1097+
multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
10981098
// f16_to_fp patterns
10991099
def : GCNPat <
11001100
(f32 (any_f16_to_fp i32:$src0)),
@@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
11211121
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
11221122
>;
11231123

1124+
// fp_to_fp16 patterns
11241125
def : GCNPat <
1125-
(f64 (any_fpextend f16:$src)),
1126-
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
1126+
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1127+
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
11271128
>;
11281129

1129-
// fp_to_fp16 patterns
1130+
// This is only used on targets without half support
1131+
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
11301132
def : GCNPat <
1131-
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1133+
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
11321134
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
11331135
>;
1136+
}
1137+
1138+
let True16Predicate = NotHasTrue16BitInsts in
1139+
defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
1140+
1141+
let True16Predicate = UseFakeTrue16Insts in
1142+
defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
1143+
1144+
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
1145+
Instruction cvt_f32_f16_inst_e64,
1146+
RegOrImmOperand VSrc> {
1147+
def : GCNPat <
1148+
(f64 (any_fpextend f16:$src)),
1149+
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
1150+
>;
11341151

11351152
def : GCNPat <
11361153
(i32 (fp_to_sint f16:$src)),
1137-
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1154+
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
11381155
>;
11391156

11401157
def : GCNPat <
11411158
(i32 (fp_to_uint f16:$src)),
1142-
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1159+
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
11431160
>;
11441161

11451162
def : GCNPat <
@@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
11511168
(f16 (uint_to_fp i32:$src)),
11521169
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
11531170
>;
1154-
1155-
// This is only used on targets without half support
1156-
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
1157-
def : GCNPat <
1158-
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1159-
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
1160-
>;
11611171
}
11621172

11631173
let True16Predicate = NotHasTrue16BitInsts in
1164-
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
1174+
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;
1175+
1176+
let True16Predicate = UseRealTrue16Insts in
1177+
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;
11651178

11661179
let True16Predicate = UseFakeTrue16Insts in
1167-
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
1180+
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;
11681181

11691182
//===----------------------------------------------------------------------===//
11701183
// VOP2 Patterns
@@ -2774,30 +2787,53 @@ def : GCNPat <
27742787
SSrc_i1:$src))
27752788
>;
27762789

2777-
let SubtargetPredicate = HasTrue16BitInsts in
2790+
let True16Predicate = UseRealTrue16Insts in
27782791
def : GCNPat <
27792792
(f16 (sint_to_fp i1:$src)),
2780-
(V_CVT_F16_F32_fake16_e32 (
2781-
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2793+
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
2794+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
27822795
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2783-
SSrc_i1:$src))
2796+
SSrc_i1:$src),
2797+
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
27842798
>;
27852799

2786-
let SubtargetPredicate = NotHasTrue16BitInsts in
2800+
let True16Predicate = UseFakeTrue16Insts in
2801+
def : GCNPat <
2802+
(f16 (sint_to_fp i1:$src)),
2803+
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
2804+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2805+
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2806+
SSrc_i1:$src),
2807+
/*clamp*/ 0, /*omod*/ 0)
2808+
>;
2809+
2810+
let True16Predicate = NotHasTrue16BitInsts in
27872811
def : GCNPat <
27882812
(f16 (uint_to_fp i1:$src)),
27892813
(V_CVT_F16_F32_e32 (
27902814
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
27912815
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
27922816
SSrc_i1:$src))
27932817
>;
2794-
let SubtargetPredicate = HasTrue16BitInsts in
2818+
2819+
let True16Predicate = UseRealTrue16Insts in
27952820
def : GCNPat <
27962821
(f16 (uint_to_fp i1:$src)),
2797-
(V_CVT_F16_F32_fake16_e32 (
2798-
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2822+
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
2823+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
27992824
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2800-
SSrc_i1:$src))
2825+
SSrc_i1:$src),
2826+
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
2827+
>;
2828+
2829+
let True16Predicate = UseFakeTrue16Insts in
2830+
def : GCNPat <
2831+
(f16 (uint_to_fp i1:$src)),
2832+
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
2833+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2834+
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2835+
SSrc_i1:$src),
2836+
/*clamp*/ 0, /*omod*/ 0)
28012837
>;
28022838

28032839
def : GCNPat <

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -503,7 +503,7 @@ let FPDPRounding = 1 in {
503503
defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
504504
} // End FPDPRounding = 1
505505

506-
let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
506+
let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
507507
def : GCNPat<
508508
(f32 (f16_to_fp i16:$src)),
509509
(V_CVT_F32_F16_e32 $src)
@@ -513,7 +513,7 @@ def : GCNPat<
513513
(V_CVT_F16_F32_e32 $src)
514514
>;
515515
}
516-
let OtherPredicates = [HasTrue16BitInsts] in {
516+
let True16Predicate = UseRealTrue16Insts in {
517517
def : GCNPat<
518518
(f32 (f16_to_fp i16:$src)),
519519
(V_CVT_F32_F16_t16_e32 $src)
@@ -523,6 +523,16 @@ def : GCNPat<
523523
(V_CVT_F16_F32_t16_e32 $src)
524524
>;
525525
}
526+
let True16Predicate = UseFakeTrue16Insts in {
527+
def : GCNPat<
528+
(f32 (f16_to_fp i16:$src)),
529+
(V_CVT_F32_F16_fake16_e32 $src)
530+
>;
531+
def : GCNPat<
532+
(i16 (AMDGPUfp_to_f16 f32:$src)),
533+
(V_CVT_F16_F32_fake16_e32 $src)
534+
>;
535+
}
526536

527537
def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
528538
let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
@@ -1414,15 +1424,14 @@ def : GCNPat <
14141424

14151425
} // End OtherPredicates = [isGFX8Plus, p]
14161426

1417-
let OtherPredicates = [UseFakeTrue16Insts] in {
1427+
let True16Predicate = UseFakeTrue16Insts in {
14181428
def : GCNPat<
14191429
(i32 (DivergentUnaryFrag<anyext> i16:$src)),
14201430
(COPY $src)
14211431
>;
1422-
} // End OtherPredicates = [UseFakeTrue16Insts]
1423-
1432+
} // End True16Predicate = UseFakeTrue16Insts
14241433

1425-
let OtherPredicates = [UseRealTrue16Insts] in {
1434+
let True16Predicate = UseRealTrue16Insts in {
14261435
def : GCNPat<
14271436
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
14281437
(COPY $src)

0 commit comments

Comments
 (0)