Skip to content

Commit bb50c8a

Browse files
Ruihan-Yintmds
authored and committed
Enable EVEX feature: Embedded Rounding for Avx512F.Add() (dotnet#94684)
* some workaround with embedded rounding in compiler backend. * extend _idEvexbContext to 2 bits to distinguish embedded broadcast and embedded rounding * Expose APIs with rounding mode. * Apply format patch * Do not include the third parameter in Avx512.Add(left, right) * split _idEvexbContext bits and made an explicit conversion function from uint8_t to insOpts for embedded rounding mode. * Remove unexpected comment-out * Fix unexpected deletion * resolve comments: removed redundant bits in instrDesc for EVEX.b context. Introduced `emitDispEmbRounding` to display the embedded rounding feature in the disassembly. * bug fix: fix unneeded assertion check. * Apply format patch. * Resolve comments: merge INS_OPTS_EVEX_b and INS_OPTS_EVEX_er_rd. Do a pre-check for embedded rounding before lowering. * Add a helper function to generalize the logic when lowering the embedded rounding intrinsics. * Resolve comments: 1. fix typo in comments 2. Add SetEvexBroadcastIfNeeded 3. Added mask in insOpts * 1. Add unit case for non-default rounding mode 2. removed round-to-even, the default option from InsOpts as it will be handled on the default path. * formatting * 1. Create a fallback jump table for embedded rounding APIs when control byte is not constant. 2. Create a template to generate the unit tests for embedded rounding APIs. 3. nit: fix naming. * remove hand-written unit tests for embedded rounding. * formatting * Resolve comments. * formatting * revert changes: let SetEmbRoundingMode accept unexpected values to accommodate the jump table generation logic.
1 parent a36c365 commit bb50c8a

File tree

19 files changed

+760
-45
lines changed

19 files changed

+760
-45
lines changed

src/coreclr/jit/codegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
971971
void genHWIntrinsic_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber reg, GenTree* rmOp);
972972
void genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
973973
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr);
974+
void genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
974975
void genHWIntrinsic_R_R_RM(
975976
GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTree* op2);
976977
void genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);

src/coreclr/jit/emit.h

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -774,8 +774,12 @@ class emitter
774774
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
775775
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
776776
#if defined(TARGET_XARCH)
777-
unsigned _idEvexbContext : 1; // does EVEX.b need to be set.
778-
#endif // TARGET_XARCH
777+
// EVEX.b can indicate several contexts: embedded broadcast or embedded rounding.
778+
// For normal and embedded broadcast intrinsics, EVEX.L'L has the same semantics: vector length.
779+
// For embedded rounding, the EVEX.L'L semantics change to indicate the rounding mode.
780+
// Multiple bits in _idEvexbContext are used to inform the emitter to specially handle the EVEX.L'L bits.
781+
unsigned _idEvexbContext : 2;
782+
#endif // TARGET_XARCH
779783

780784
#ifdef TARGET_ARM64
781785

@@ -808,8 +812,8 @@ class emitter
808812

809813
////////////////////////////////////////////////////////////////////////
810814
// Space taken up to here:
811-
// x86: 47 bits
812-
// amd64: 47 bits
815+
// x86: 48 bits
816+
// amd64: 48 bits
813817
// arm: 48 bits
814818
// arm64: 53 bits
815819
// loongarch64: 46 bits
@@ -828,7 +832,7 @@ class emitter
828832
#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
829833
#define ID_EXTRA_BITFIELD_BITS (14)
830834
#elif defined(TARGET_XARCH)
831-
#define ID_EXTRA_BITFIELD_BITS (15)
835+
#define ID_EXTRA_BITFIELD_BITS (16)
832836
#else
833837
#error Unsupported or unset target architecture
834838
#endif
@@ -863,8 +867,8 @@ class emitter
863867

864868
////////////////////////////////////////////////////////////////////////
865869
// Space taken up to here (with/without prev offset, assuming host==target):
866-
// x86: 53/49 bits
867-
// amd64: 54/49 bits
870+
// x86: 54/50 bits
871+
// amd64: 55/50 bits
868872
// arm: 54/50 bits
869873
// arm64: 60/55 bits
870874
// loongarch64: 53/48 bits
@@ -880,8 +884,8 @@ class emitter
880884

881885
////////////////////////////////////////////////////////////////////////
882886
// Small constant size (with/without prev offset, assuming host==target):
883-
// x86: 11/15 bits
884-
// amd64: 10/15 bits
887+
// x86: 10/14 bits
888+
// amd64: 9/14 bits
885889
// arm: 10/14 bits
886890
// arm64: 4/9 bits
887891
// loongarch64: 11/16 bits
@@ -1578,15 +1582,35 @@ class emitter
15781582
}
15791583

15801584
#ifdef TARGET_XARCH
1581-
bool idIsEvexbContext() const
1585+
bool idIsEvexbContextSet() const
15821586
{
15831587
return _idEvexbContext != 0;
15841588
}
1585-
void idSetEvexbContext()
1589+
1590+
void idSetEvexbContext(insOpts instOptions)
15861591
{
15871592
assert(_idEvexbContext == 0);
1588-
_idEvexbContext = 1;
1589-
assert(_idEvexbContext == 1);
1593+
if (instOptions == INS_OPTS_EVEX_eb_er_rd)
1594+
{
1595+
_idEvexbContext = 1;
1596+
}
1597+
else if (instOptions == INS_OPTS_EVEX_er_ru)
1598+
{
1599+
_idEvexbContext = 2;
1600+
}
1601+
else if (instOptions == INS_OPTS_EVEX_er_rz)
1602+
{
1603+
_idEvexbContext = 3;
1604+
}
1605+
else
1606+
{
1607+
unreached();
1608+
}
1609+
}
1610+
1611+
unsigned idGetEvexbContext() const
1612+
{
1613+
return _idEvexbContext;
15901614
}
15911615
#endif
15921616

@@ -2166,6 +2190,7 @@ class emitter
21662190
void emitDispInsOffs(unsigned offs, bool doffs);
21672191
void emitDispInsHex(instrDesc* id, BYTE* code, size_t sz);
21682192
void emitDispEmbBroadcastCount(instrDesc* id);
2193+
void emitDispEmbRounding(instrDesc* id);
21692194
void emitDispIns(instrDesc* id,
21702195
bool isNew,
21712196
bool doffs,
@@ -3814,7 +3839,7 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
38143839
//
38153840
emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
38163841
{
3817-
if (id->idIsEvexbContext())
3842+
if (id->idIsEvexbContextSet())
38183843
{
38193844
// This assumes that EVEX.b stands for the embedded broadcast context here.
38203845
// reference: Section 2.7.5 in Intel 64 and ia-32 architectures software developer's manual volume 2.

src/coreclr/jit/emitxarch.cpp

Lines changed: 106 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,30 @@ static bool isLowSimdReg(regNumber reg)
11391139
#endif
11401140
}
11411141

1142+
//------------------------------------------------------------------------
1143+
// GetEmbRoundingMode: Get the rounding mode for embedded rounding
1144+
//
1145+
// Arguments:
1146+
// mode -- the flag from the corresponding GenTree node indicating the mode.
1147+
//
1148+
// Return Value:
1149+
// the instruction option carrying the rounding mode information.
1150+
//
1151+
insOpts emitter::GetEmbRoundingMode(uint8_t mode) const
1152+
{
1153+
switch (mode)
1154+
{
1155+
case 1:
1156+
return INS_OPTS_EVEX_eb_er_rd;
1157+
case 2:
1158+
return INS_OPTS_EVEX_er_ru;
1159+
case 3:
1160+
return INS_OPTS_EVEX_er_rz;
1161+
default:
1162+
unreached();
1163+
}
1164+
}
1165+
11421166
//------------------------------------------------------------------------
11431167
// encodeRegAsIval: Encodes a register as an ival for use by a SIMD instruction
11441168
//
@@ -1309,18 +1333,50 @@ emitter::code_t emitter::AddEvexPrefix(const instrDesc* id, code_t code, emitAtt
13091333

13101334
if (attr == EA_32BYTE)
13111335
{
1312-
// Set L bit to 1 in case of instructions that operate on 256-bits.
1336+
// Set EVEX.L'L bits to 01 in case of instructions that operate on 256-bits.
13131337
code |= LBIT_IN_BYTE_EVEX_PREFIX;
13141338
}
13151339
else if (attr == EA_64BYTE)
13161340
{
1317-
// Set L' bits to 11 in case of instructions that operate on 512-bits.
1341+
// Set EVEX.L'L bits to 10 in case of instructions that operate on 512-bits.
13181342
code |= LPRIMEBIT_IN_BYTE_EVEX_PREFIX;
13191343
}
13201344

1321-
if (id->idIsEvexbContext())
1345+
if (id->idIsEvexbContextSet())
13221346
{
13231347
code |= EVEX_B_BIT;
1348+
1349+
if (!id->idHasMem())
1350+
{
1351+
// embedded rounding case.
1352+
unsigned roundingMode = id->idGetEvexbContext();
1353+
if (roundingMode == 1)
1354+
{
1355+
// {rd-sae}
1356+
code &= ~(LPRIMEBIT_IN_BYTE_EVEX_PREFIX);
1357+
code |= LBIT_IN_BYTE_EVEX_PREFIX;
1358+
}
1359+
else if (roundingMode == 2)
1360+
{
1361+
// {ru-sae}
1362+
code |= LPRIMEBIT_IN_BYTE_EVEX_PREFIX;
1363+
code &= ~(LBIT_IN_BYTE_EVEX_PREFIX);
1364+
}
1365+
else if (roundingMode == 3)
1366+
{
1367+
// {rz-sae}
1368+
code |= LPRIMEBIT_IN_BYTE_EVEX_PREFIX;
1369+
code |= LBIT_IN_BYTE_EVEX_PREFIX;
1370+
}
1371+
else
1372+
{
1373+
unreached();
1374+
}
1375+
}
1376+
else
1377+
{
1378+
assert(id->idGetEvexbContext() == 1);
1379+
}
13241380
}
13251381

13261382
regNumber maskReg = REG_NA;
@@ -6742,11 +6798,7 @@ void emitter::emitIns_R_R_A(
67426798
id->idIns(ins);
67436799
id->idReg1(reg1);
67446800
id->idReg2(reg2);
6745-
if (instOptions == INS_OPTS_EVEX_b)
6746-
{
6747-
assert(UseEvexEncoding());
6748-
id->idSetEvexbContext();
6749-
}
6801+
SetEvexBroadcastIfNeeded(id, instOptions);
67506802

67516803
emitHandleMemOp(indir, id, (ins == INS_mulx) ? IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins);
67526804

@@ -6871,11 +6923,7 @@ void emitter::emitIns_R_R_C(instruction ins,
68716923
id->idReg1(reg1);
68726924
id->idReg2(reg2);
68736925
id->idAddr()->iiaFieldHnd = fldHnd;
6874-
if (instOptions == INS_OPTS_EVEX_b)
6875-
{
6876-
assert(UseEvexEncoding());
6877-
id->idSetEvexbContext();
6878-
}
6926+
SetEvexBroadcastIfNeeded(id, instOptions);
68796927

68806928
UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins));
68816929
id->idCodeSize(sz);
@@ -6889,7 +6937,8 @@ void emitter::emitIns_R_R_C(instruction ins,
68896937
* Add an instruction with three register operands.
68906938
*/
68916939

6892-
void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2)
6940+
void emitter::emitIns_R_R_R(
6941+
instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, insOpts instOptions)
68936942
{
68946943
assert(IsAvx512OrPriorInstruction(ins));
68956944
assert(IsThreeOperandAVXInstruction(ins) || IsKInstruction(ins));
@@ -6901,6 +6950,13 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg,
69016950
id->idReg2(reg1);
69026951
id->idReg3(reg2);
69036952

6953+
if ((instOptions & INS_OPTS_b_MASK) != INS_OPTS_NONE)
6954+
{
6955+
// if EVEX.b needs to be set in this path, then it should be embedded rounding.
6956+
assert(UseEvexEncoding());
6957+
id->idSetEvexbContext(instOptions);
6958+
}
6959+
69046960
UNATIVE_OFFSET sz = emitInsSizeRR(id, insCodeRM(ins));
69056961
id->idCodeSize(sz);
69066962

@@ -6921,12 +6977,8 @@ void emitter::emitIns_R_R_S(
69216977
id->idReg1(reg1);
69226978
id->idReg2(reg2);
69236979
id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs);
6980+
SetEvexBroadcastIfNeeded(id, instOptions);
69246981

6925-
if (instOptions == INS_OPTS_EVEX_b)
6926-
{
6927-
assert(UseEvexEncoding());
6928-
id->idSetEvexbContext();
6929-
}
69306982
#ifdef DEBUG
69316983
id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
69326984
#endif
@@ -8224,11 +8276,11 @@ void emitter::emitIns_SIMD_R_R_C(instruction ins,
82248276
// op2Reg -- The register of the second operand
82258277
//
82268278
void emitter::emitIns_SIMD_R_R_R(
8227-
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg)
8279+
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, insOpts instOptions)
82288280
{
82298281
if (UseSimdEncoding())
82308282
{
8231-
emitIns_R_R_R(ins, attr, targetReg, op1Reg, op2Reg);
8283+
emitIns_R_R_R(ins, attr, targetReg, op1Reg, op2Reg, instOptions);
82328284
}
82338285
else
82348286
{
@@ -10656,7 +10708,7 @@ void emitter::emitDispInsHex(instrDesc* id, BYTE* code, size_t sz)
1065610708
//
1065710709
void emitter::emitDispEmbBroadcastCount(instrDesc* id)
1065810710
{
10659-
if (!id->idIsEvexbContext())
10711+
if (!id->idIsEvexbContextSet())
1066010712
{
1066110713
return;
1066210714
}
@@ -10665,6 +10717,37 @@ void emitter::emitDispEmbBroadcastCount(instrDesc* id)
1066510717
printf(" {1to%d}", vectorSize / baseSize);
1066610718
}
1066710719

10720+
// emitDispEmbRounding: Display the tag where embedded rounding is activated
10721+
//
10722+
// Arguments:
10723+
// id - The instruction descriptor
10724+
//
10725+
void emitter::emitDispEmbRounding(instrDesc* id)
10726+
{
10727+
if (!id->idIsEvexbContextSet())
10728+
{
10729+
return;
10730+
}
10731+
assert(!id->idHasMem());
10732+
unsigned roundingMode = id->idGetEvexbContext();
10733+
if (roundingMode == 1)
10734+
{
10735+
printf(" {rd-sae}");
10736+
}
10737+
else if (roundingMode == 2)
10738+
{
10739+
printf(" {ru-sae}");
10740+
}
10741+
else if (roundingMode == 3)
10742+
{
10743+
printf(" {rz-sae}");
10744+
}
10745+
else
10746+
{
10747+
unreached();
10748+
}
10749+
}
10750+
1066810751
//--------------------------------------------------------------------
1066910752
// emitDispIns: Dump the given instruction to jitstdout.
1067010753
//
@@ -11533,6 +11616,7 @@ void emitter::emitDispIns(
1153311616
printf("%s, ", emitRegName(id->idReg1(), attr));
1153411617
printf("%s, ", emitRegName(reg2, attr));
1153511618
printf("%s", emitRegName(reg3, attr));
11619+
emitDispEmbRounding(id);
1153611620
break;
1153711621
}
1153811622

0 commit comments

Comments
 (0)