Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 817ec7f

Browse files
committed
Enable SIMD for RyuJIT/x86
This change implements support for Vector<long>, handling SIMDIntrinsicInit, which takes a LONG, and decomposition of SIMDIntrinsicGetItem, which produces a LONG. It also enables SIMD, including AVX, by default for RyuJIT/x86.
1 parent d227cd0 commit 817ec7f

18 files changed

+371
-87
lines changed

src/inc/clrconfigvalues.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -562,13 +562,13 @@ CONFIG_DWORD_INFO_EX(INTERNAL_JitLoopHoistStats, W("JitLoopHoistStats"), 0, "Dis
562562
CONFIG_DWORD_INFO_EX(INTERNAL_JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0, "In debug builds log places where loop cloning optimizations are performed on the fast path.", CLRConfig::REGUTIL_default);
563563
CONFIG_DWORD_INFO_EX(INTERNAL_JitVNMapSelLimit, W("JitVNMapSelLimit"), 0, "If non-zero, assert if # of VNF_MapSelect applications considered reaches this", CLRConfig::REGUTIL_default)
564564
RETAIL_CONFIG_DWORD_INFO(INTERNAL_JitVNMapSelBudget, W("JitVNMapSelBudget"), 100, "Max # of MapSelect's considered for a particular top-level invocation.")
565-
#if defined(_TARGET_AMD64_)
565+
#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
566566
#define EXTERNAL_FeatureSIMD_Default 1
567567
#define EXTERNAL_JitEnableAVX_Default 1
568-
#else // !defined(_TARGET_AMD64_)
568+
#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
569569
#define EXTERNAL_FeatureSIMD_Default 0
570570
#define EXTERNAL_JitEnableAVX_Default 0
571-
#endif // !defined(_TARGET_AMD64_)
571+
#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
572572
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_FeatureSIMD, W("FeatureSIMD"), EXTERNAL_FeatureSIMD_Default, "Enable SIMD support with companion SIMDVector.dll", CLRConfig::REGUTIL_default)
573573
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_EnableAVX, W("EnableAVX"), EXTERNAL_JitEnableAVX_Default, "Enable AVX instruction set for wide operations as default", CLRConfig::REGUTIL_default)
574574

src/jit/decomposelongs.cpp

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,12 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree)
249249
nextNode = DecomposeRotate(use);
250250
break;
251251

252+
#ifdef FEATURE_SIMD
253+
case GT_SIMD:
254+
nextNode = DecomposeSimd(use);
255+
break;
256+
#endif // FEATURE_SIMD
257+
252258
case GT_LOCKADD:
253259
case GT_XADD:
254260
case GT_XCHG:
@@ -1562,6 +1568,125 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use)
15621568
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
15631569
}
15641570

1571+
#ifdef FEATURE_SIMD
1572+
1573+
//------------------------------------------------------------------------
1574+
// DecomposeSimd: Decompose GT_SIMD by dispatching on the node's SIMD intrinsic ID.
1575+
//
// Only SIMDIntrinsicGetItem is actually decomposed here (it is forwarded to
// DecomposeSimdGetItem). The SIMDIntrinsicInit* intrinsics hit NYI, and any
// other intrinsic trips a noway_assert.
//
1576+
// Arguments:
1577+
// use - the LIR::Use object for the def that needs to be decomposed.
1578+
//
1579+
// Return Value:
1580+
// The next node to process. For SIMDIntrinsicGetItem this is whatever
// DecomposeSimdGetItem returns; if control falls out of the switch
// (the Init* and default cases) the function returns nullptr.
1581+
//
1582+
GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
1583+
{
1584+
GenTree* tree = use.Def();
// 'oper' is only consumed by the assert below.
1585+
genTreeOps oper = tree->OperGet();
1586+
1587+
assert(oper == GT_SIMD);
1588+
1589+
GenTreeSIMD* simdTree = tree->AsSIMD();
1590+
1591+
switch (simdTree->gtSIMDIntrinsicID)
1592+
{
// The three Init* intrinsics share one not-yet-implemented path.
1593+
case SIMDIntrinsicInit:
1594+
case SIMDIntrinsicInitN:
1595+
case SIMDIntrinsicInitArray:
// NOTE(review): NYI presumably does not return normally; if it does, the
// 'break' falls through to the 'return nullptr' below -- confirm.
1596+
NYI("SIMDIntrinsicInit*");
1597+
break;
1598+
1599+
case SIMDIntrinsicGetItem:
1600+
return DecomposeSimdGetItem(use);
1601+
1602+
default:
1603+
noway_assert(!"unexpected GT_SIMD node in long decomposition");
1604+
break;
1605+
}
1606+
// Reached only from the Init* and default cases above.
1607+
return nullptr;
1608+
}
1609+
1610+
//------------------------------------------------------------------------
1611+
// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem.
1612+
//
1613+
// Decompose a get[i] node on Vector<long>. For:
1614+
//
1615+
// GT_SIMD{get_item}[long](simd_var, index)
1616+
//
1617+
// create:
1618+
//
1619+
// tmp_simd_var = simd_var
1620+
// tmp_index = index
1621+
// lowResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
1622+
// highResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
1623+
// return: GT_LONG(lowResult, highResult)
1624+
//
1625+
// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires
1626+
// temps that could be shared, for example.
1627+
//
// Both operands are first spilled to temp locals because each one is read
// twice: once for the low-half get_item and once for the high-half get_item.
// NOTE(review): taking the low half from int element (2 * index) and the high
// half from (2 * index + 1) presumably relies on little-endian element layout
// on the target -- confirm.
//
1628+
// Arguments:
1629+
// use - the LIR::Use object for the def that needs to be decomposed.
1630+
//
1631+
// Return Value:
1632+
// The next node to process (the value returned by FinalizeDecomposition).
1633+
//
1634+
GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
1635+
{
1636+
GenTree* tree = use.Def();
// 'oper' is only consumed by the assert below.
1637+
genTreeOps oper = tree->OperGet();
1638+
1639+
assert(oper == GT_SIMD);
1640+
1641+
GenTreeSIMD* simdTree = tree->AsSIMD();
// 'baseType' is only checked by an assert; 'simdSize' is passed on to the two
// new get_item nodes.
1642+
var_types baseType = simdTree->gtSIMDBaseType;
1643+
unsigned simdSize = simdTree->gtSIMDSize;
1644+
// Preconditions: a long-typed get_item over a long base type, whose first
// operand is a SIMD value and whose second operand (the index) is TYP_INT.
1645+
assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
1646+
assert(varTypeIsLong(baseType));
1647+
assert(varTypeIsLong(simdTree));
1648+
assert(varTypeIsSIMD(simdTree->gtOp.gtOp1->gtType));
1649+
assert(simdTree->gtOp.gtOp2->gtType == TYP_INT);
1650+
// Spill the SIMD operand to a temp local; the returned local number is used
// below to create two independent reads of the same value.
1651+
LIR::Use op1(Range(), &simdTree->gtOp.gtOp1, simdTree);
1652+
unsigned simdTmpVarNum = op1.ReplaceWithLclVar(m_compiler, m_blockWeight);
1653+
JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n");
1654+
DISPTREERANGE(Range(), op1.Def());
1655+
// Likewise spill the index operand to a temp local.
1656+
LIR::Use op2(Range(), &simdTree->gtOp.gtOp2, simdTree);
1657+
unsigned indexTmpVarNum = op2.ReplaceWithLclVar(m_compiler, m_blockWeight);
1658+
JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n");
1659+
DISPTREERANGE(Range(), op2.Def());
1660+
// Low half: get_item[int](tmp_simd_var, tmp_index * 2).
// The SIMD temp is read with the type currently found on the original node's op1.
1661+
GenTree* simdTmpVar1 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
1662+
GenTree* indexTmpVar1 = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
1663+
GenTree* two1 = m_compiler->gtNewIconNode(2, TYP_INT);
1664+
GenTree* indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1);
1665+
// The new node has TYP_INT as both its result type and its SIMD base type, and
// keeps the original node's simdSize.
1666+
GenTree* loResult =
1667+
m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem, TYP_INT, simdSize);
1668+
// High half: get_item[int](tmp_simd_var, tmp_index * 2 + 1). Fresh nodes are
// created for the local reads and constants rather than reusing the ones above.
1669+
GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
1670+
GenTree* indexTmpVar2 = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
1671+
GenTree* two2 = m_compiler->gtNewIconNode(2, TYP_INT);
1672+
GenTree* indexTimesTwo2 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2);
1673+
GenTree* one = m_compiler->gtNewIconNode(1, TYP_INT);
1674+
GenTree* indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one);
1675+
1676+
GenTree* hiResult =
1677+
m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem, TYP_INT, simdSize);
1678+
// Insert all twelve new nodes before the original node. Taken together the
// three calls list every operand ahead of the node that consumes it, with the
// whole low-half computation ahead of the high-half computation.
1679+
Range().InsertBefore(tree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
1680+
Range().InsertBefore(tree, loResult, simdTmpVar2, indexTmpVar2, two2);
1681+
Range().InsertBefore(tree, indexTimesTwo2, one, indexTimesTwoPlusOne, hiResult);
1682+
// The original long-typed get_item is no longer needed once both halves exist.
1683+
Range().Remove(tree);
1684+
// hiResult is passed twice: once as the high half and once as the
// 'insertResultAfter' argument of FinalizeDecomposition.
1685+
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
1686+
}
1687+
1688+
#endif // FEATURE_SIMD
1689+
15651690
//------------------------------------------------------------------------
15661691
// StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't,
15671692
// store the node to a var. Then decompose the new LclVar.

src/jit/decomposelongs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ class DecomposeLongs
5555
GenTree* DecomposeRotate(LIR::Use& use);
5656
GenTree* DecomposeMul(LIR::Use& use);
5757
GenTree* DecomposeUMod(LIR::Use& use);
58+
GenTree* DecomposeSimd(LIR::Use& use);
59+
GenTree* DecomposeSimdGetItem(LIR::Use& use);
5860

5961
// Helper functions
6062
GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter);

src/jit/ee_il_dll.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,13 +409,16 @@ unsigned CILJit::getMaxIntrinsicSIMDVectorLength(DWORD cpuCompileFlags)
409409
{
410410
if (JitConfig.EnableAVX() != 0)
411411
{
412+
JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 32\n");
412413
return 32;
413414
}
414415
}
415416
#endif // FEATURE_AVX_SUPPORT
417+
JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 16\n");
416418
return 16;
417419
#endif // _TARGET_XARCH_
418420
#else // !FEATURE_SIMD
421+
JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 0\n");
419422
return 0;
420423
#endif // !FEATURE_SIMD
421424
}

src/jit/emitxarch.cpp

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,6 @@ bool emitter::IsAVXInstruction(instruction ins)
5757
#endif
5858
}
5959

60-
#ifdef _TARGET_AMD64_
61-
#define REX_PREFIX_MASK 0xFF00000000LL
62-
#endif // _TARGET_AMD64_
63-
6460
#ifdef FEATURE_AVX_SUPPORT
6561
// Returns true if the AVX instruction is a binary operator that requires 3 operands.
6662
// When we emit an instruction with only two operands, we will duplicate the destination
@@ -717,12 +713,10 @@ unsigned emitter::emitGetPrefixSize(code_t code)
717713
return 3;
718714
}
719715

720-
#ifdef _TARGET_AMD64_
721-
if (code & REX_PREFIX_MASK)
716+
if (hasRexPrefix(code))
722717
{
723718
return 1;
724719
}
725-
#endif // _TARGET_AMD64_
726720

727721
return 0;
728722
}
@@ -1882,10 +1876,9 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
18821876
}
18831877
}
18841878

1885-
#ifdef _TARGET_AMD64_
18861879
size += emitGetVexPrefixAdjustedSize(ins, attrSize, code);
18871880

1888-
if (code & REX_PREFIX_MASK)
1881+
if (hasRexPrefix(code))
18891882
{
18901883
// REX prefix
18911884
size += emitGetRexPrefixSize(ins);
@@ -1900,7 +1893,6 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code)
19001893
// Should have a REX byte
19011894
size += emitGetRexPrefixSize(ins);
19021895
}
1903-
#endif // _TARGET_AMD64_
19041896

19051897
if (rgx == REG_NA)
19061898
{
@@ -2303,9 +2295,7 @@ void emitter::emitIns(instruction ins)
23032295
}
23042296
#endif // DEBUG
23052297

2306-
#ifdef _TARGET_AMD64_
2307-
assert((code & REX_PREFIX_MASK) == 0); // Can't have a REX bit with no operands, right?
2308-
#endif // _TARGET_AMD64_
2298+
assert(!hasRexPrefix(code)); // Can't have a REX bit with no operands, right?
23092299

23102300
if (code & 0xFF000000)
23112301
{
@@ -3997,16 +3987,14 @@ void emitter::emitIns_C_I(instruction ins, emitAttr attr, CORINFO_FIELD_HANDLE f
39973987
code_t code = insCodeMI(ins);
39983988
UNATIVE_OFFSET sz = emitInsSizeCV(id, code, val);
39993989

4000-
#ifdef _TARGET_AMD64_
40013990
// Vex prefix
40023991
sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeMI(ins));
40033992

40043993
// REX prefix, if not already included in "code"
4005-
if (TakesRexWPrefix(ins, attr) && (code & REX_PREFIX_MASK) == 0)
3994+
if (TakesRexWPrefix(ins, attr) && !hasRexPrefix(code))
40063995
{
40073996
sz += emitGetRexPrefixSize(ins);
40083997
}
4009-
#endif // _TARGET_AMD64_
40103998

40113999
id->idAddr()->iiaFieldHnd = fldHnd;
40124000
id->idCodeSize(sz);

src/jit/emitxarch.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,16 @@ void SetUseSSE3_4(bool value)
109109
}
110110
bool Is4ByteSSE4Instruction(instruction ins);
111111

112+
// hasRexPrefix: Report whether the given instruction encoding carries a REX prefix.
//
// On _TARGET_AMD64_ this returns true when any bit covered by REX_PREFIX_MASK
// (0xFF00000000, i.e. bits 32..39 of 'code') is set. On every other target it
// always returns false, so callers can use it without their own target #ifdefs.
// NOTE(review): presumably bits 32..39 are where the emitter stores the REX
// prefix byte within a code_t -- confirm against the encoding helpers.
bool hasRexPrefix(code_t code)
113+
{
114+
#ifdef _TARGET_AMD64_
// The mask is a function-local constant, so it is not visible to other code.
115+
const code_t REX_PREFIX_MASK = 0xFF00000000LL;
116+
return (code & REX_PREFIX_MASK) != 0;
117+
#else // !_TARGET_AMD64_
118+
return false;
119+
#endif // !_TARGET_AMD64_
120+
}
121+
112122
#ifdef FEATURE_AVX_SUPPORT
113123

114124
// 3-byte VEX prefix starts with byte 0xC4
@@ -178,7 +188,7 @@ bool IsThreeOperandAVXInstruction(instruction ins)
178188
}
179189
bool Is4ByteAVXInstruction(instruction ins);
180190
#else // !FEATURE_AVX_SUPPORT
181-
bool UseAVX()
191+
bool UseAVX()
182192
{
183193
return false;
184194
}

src/jit/gentree.cpp

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7581,9 +7581,7 @@ void Compiler::gtBlockOpInit(GenTreePtr result, GenTreePtr dst, GenTreePtr srcOr
75817581

75827582
if (dst->OperIsLocal() && varTypeIsStruct(dst))
75837583
{
7584-
unsigned lclNum = dst->AsLclVarCommon()->GetLclNum();
7585-
LclVarDsc* lclVarDsc = &lvaTable[lclNum];
7586-
lclVarDsc->lvUsedInSIMDIntrinsic = true;
7584+
setLclRelatedToSIMDIntrinsic(dst);
75877585
}
75887586
}
75897587
}
@@ -16869,14 +16867,15 @@ bool FieldSeqNode::IsPseudoField()
1686916867
GenTreeSIMD* Compiler::gtNewSIMDNode(
1687016868
var_types type, GenTreePtr op1, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
1687116869
{
16872-
// TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
16873-
// marked lvUsedInSIMDIntrinsic.
1687416870
assert(op1 != nullptr);
1687516871
if (op1->OperGet() == GT_LCL_VAR)
1687616872
{
16877-
unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
16878-
LclVarDsc* lclVarDsc = &lvaTable[lclNum];
16879-
lclVarDsc->lvUsedInSIMDIntrinsic = true;
16873+
setLclRelatedToSIMDIntrinsic(op1);
16874+
}
16875+
else if ((op1->OperGet() == GT_OBJ) && (op1->gtOp.gtOp1->OperGet() == GT_ADDR) &&
16876+
op1->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
16877+
{
16878+
setLclRelatedToSIMDIntrinsic(op1->gtOp.gtOp1->gtOp.gtOp1);
1688016879
}
1688116880

1688216881
return new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
@@ -16885,21 +16884,28 @@ GenTreeSIMD* Compiler::gtNewSIMDNode(
1688516884
GenTreeSIMD* Compiler::gtNewSIMDNode(
1688616885
var_types type, GenTreePtr op1, GenTreePtr op2, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
1688716886
{
16888-
// TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
16889-
// marked lvUsedInSIMDIntrinsic.
1689016887
assert(op1 != nullptr);
1689116888
if (op1->OperIsLocal())
1689216889
{
16893-
unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
16894-
LclVarDsc* lclVarDsc = &lvaTable[lclNum];
16895-
lclVarDsc->lvUsedInSIMDIntrinsic = true;
16890+
setLclRelatedToSIMDIntrinsic(op1);
16891+
}
16892+
else if ((op1->OperGet() == GT_OBJ) && (op1->gtOp.gtOp1->OperGet() == GT_ADDR) &&
16893+
op1->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
16894+
{
16895+
setLclRelatedToSIMDIntrinsic(op1->gtOp.gtOp1->gtOp.gtOp1);
1689616896
}
1689716897

16898-
if (op2 != nullptr && op2->OperIsLocal())
16898+
if (op2 != nullptr)
1689916899
{
16900-
unsigned lclNum = op2->AsLclVarCommon()->GetLclNum();
16901-
LclVarDsc* lclVarDsc = &lvaTable[lclNum];
16902-
lclVarDsc->lvUsedInSIMDIntrinsic = true;
16900+
if (op2->OperIsLocal())
16901+
{
16902+
setLclRelatedToSIMDIntrinsic(op2);
16903+
}
16904+
else if ((op2->OperGet() == GT_OBJ) && (op2->gtOp.gtOp1->OperGet() == GT_ADDR) &&
16905+
op2->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
16906+
{
16907+
setLclRelatedToSIMDIntrinsic(op2->gtOp.gtOp1->gtOp.gtOp1);
16908+
}
1690316909
}
1690416910

1690516911
return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);

src/jit/gschecks.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,9 @@ void Compiler::gsParamsToShadows()
413413
lvaTable[shadowVar].lvBaseType = varDsc->lvBaseType;
414414
}
415415
#endif
416+
416417
lvaTable[shadowVar].lvRegStruct = varDsc->lvRegStruct;
418+
lvaTable[shadowVar].lvExactSize = varDsc->lvExactSize;
417419

418420
lvaTable[shadowVar].lvAddrExposed = varDsc->lvAddrExposed;
419421
lvaTable[shadowVar].lvDoNotEnregister = varDsc->lvDoNotEnregister;

src/jit/importer.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1514,10 +1514,8 @@ var_types Compiler::impNormStructType(CORINFO_CLASS_HANDLE structHnd,
15141514
{
15151515
*pSimdBaseType = simdBaseType;
15161516
}
1517-
#ifdef _TARGET_AMD64_
1518-
// Amd64: also indicate that we use floating point registers
1517+
// Also indicate that we use floating point registers.
15191518
compFloatingPointUsed = true;
1520-
#endif
15211519
}
15221520
}
15231521
}

src/jit/instr.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3513,13 +3513,25 @@ instruction CodeGen::ins_CopyIntToFloat(var_types srcType, var_types dstType)
35133513
{
35143514
// On SSE2/AVX - the same instruction is used for moving double/quad word to XMM/YMM register.
35153515
assert((srcType == TYP_INT) || (srcType == TYP_UINT) || (srcType == TYP_LONG) || (srcType == TYP_ULONG));
3516+
3517+
#if !defined(_TARGET_64BIT_)
3518+
// No 64-bit registers on x86.
3519+
assert((srcType != TYP_LONG) && (srcType != TYP_ULONG));
3520+
#endif // !defined(_TARGET_64BIT_)
3521+
35163522
return INS_mov_i2xmm;
35173523
}
35183524

35193525
// ins_CopyFloatToInt: Return the instruction used to copy a value from an
// XMM/YMM register to an integer register.
//
// Arguments:
//    srcType - not used by the body visible here.
//    dstType - the destination integer type; asserted to be TYP_INT, TYP_UINT,
//              TYP_LONG or TYP_ULONG, and additionally asserted not to be
//              TYP_LONG/TYP_ULONG when _TARGET_64BIT_ is not defined.
//
// Return Value:
//    Always INS_mov_xmm2i.
//
instruction CodeGen::ins_CopyFloatToInt(var_types srcType, var_types dstType)
35203526
{
35213527
// On SSE2/AVX - the same instruction is used for moving double/quad word of XMM/YMM to an integer register.
35223528
assert((dstType == TYP_INT) || (dstType == TYP_UINT) || (dstType == TYP_LONG) || (dstType == TYP_ULONG));
3529+
3530+
#if !defined(_TARGET_64BIT_)
3531+
// No 64-bit registers on x86.
3532+
assert((dstType != TYP_LONG) && (dstType != TYP_ULONG));
3533+
#endif // !defined(_TARGET_64BIT_)
3534+
35233535
return INS_mov_xmm2i;
35243536
}
35253537

0 commit comments

Comments
 (0)