This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Enable SIMD for RyuJIT/x86 #8644

Merged · 1 commit · Feb 6, 2017
6 changes: 3 additions & 3 deletions src/inc/clrconfigvalues.h
@@ -562,13 +562,13 @@ CONFIG_DWORD_INFO_EX(INTERNAL_JitLoopHoistStats, W("JitLoopHoistStats"), 0, "Dis
CONFIG_DWORD_INFO_EX(INTERNAL_JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0, "In debug builds log places where loop cloning optimizations are performed on the fast path.", CLRConfig::REGUTIL_default);
CONFIG_DWORD_INFO_EX(INTERNAL_JitVNMapSelLimit, W("JitVNMapSelLimit"), 0, "If non-zero, assert if # of VNF_MapSelect applications considered reaches this", CLRConfig::REGUTIL_default)
RETAIL_CONFIG_DWORD_INFO(INTERNAL_JitVNMapSelBudget, W("JitVNMapSelBudget"), 100, "Max # of MapSelect's considered for a particular top-level invocation.")
-#if defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)

[Review comment] Why wouldn't we use TARGET_XARCH here?

[Author] The JIT defines _TARGET_XARCH_ but the VM does not.

#define EXTERNAL_FeatureSIMD_Default 1
#define EXTERNAL_JitEnableAVX_Default 1
-#else // !defined(_TARGET_AMD64_)
+#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
#define EXTERNAL_FeatureSIMD_Default 0
#define EXTERNAL_JitEnableAVX_Default 0
-#endif // !defined(_TARGET_AMD64_)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_FeatureSIMD, W("FeatureSIMD"), EXTERNAL_FeatureSIMD_Default, "Enable SIMD support with companion SIMDVector.dll", CLRConfig::REGUTIL_default)
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_EnableAVX, W("EnableAVX"), EXTERNAL_JitEnableAVX_Default, "Enable AVX instruction set for wide operations as default", CLRConfig::REGUTIL_default)
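
These RETAIL/EXTERNAL config values can also be overridden at run time via the COMPlus_ environment variables that CLRConfig reads (e.g. COMPlus_FeatureSIMD=0 to turn SIMD back off). A minimal standalone C++ sketch of how the compiled-in default now falls out per target (an assumed simplification of the macros above, not the real CLRConfig plumbing):

    #include <cstdio>

    // Mirrors the diff above: SIMD now defaults on for both AMD64 and x86.
    #if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
    #define FEATURE_SIMD_DEFAULT 1
    #else
    #define FEATURE_SIMD_DEFAULT 0
    #endif

    int main()
    {
        // CLRConfig would consult the environment first, then fall back to this.
        std::printf("FeatureSIMD default: %d\n", FEATURE_SIMD_DEFAULT);
        return 0;
    }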

3 changes: 2 additions & 1 deletion src/jit/codegenlinear.h
@@ -93,10 +93,11 @@ void genSIMDCheck(GenTree* treeNode);
// their size rounded to TARGET_POINTER_SIZE (which is 8 bytes on 64-bit targets) and hence
// Vector3 locals could be treated as TYP_SIMD16 while reading/writing.
void genStoreIndTypeSIMD12(GenTree* treeNode);
-void genStoreLclFldTypeSIMD12(GenTree* treeNode);
void genLoadIndTypeSIMD12(GenTree* treeNode);
+void genStoreLclTypeSIMD12(GenTree* treeNode);
void genLoadLclTypeSIMD12(GenTree* treeNode);
#ifdef _TARGET_X86_
+void genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg);
void genPutArgStkSIMD12(GenTree* treeNode);
#endif // _TARGET_X86_
#endif // FEATURE_SIMD
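
The new genStoreSIMD12ToStack declaration exists because a Vector3 is 12 bytes while XMM stores naturally write 8 or 16. A runnable sketch of the two-part store such a helper has to perform (an assumed illustration using SSE intrinsics, not the JIT's emitter code):

    #include <xmmintrin.h> // SSE

    // Write exactly 12 bytes (x, y, z) from an XMM value without touching
    // the 4 bytes past the destination.
    void storeSimd12(float* dst, __m128 v)
    {
        _mm_storel_pi((__m64*)dst, v);                             // low 8 bytes: x and y
        __m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));  // broadcast z to lane 0
        _mm_store_ss(dst + 2, z);                                  // remaining 4 bytes: z
    }

This mirrors the movsd/pshufd/movss-style sequence such a store needs a scratch XMM register for, which is presumably why genStoreSIMD12ToStack takes a tmpReg.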
100 changes: 69 additions & 31 deletions src/jit/codegenxarch.cpp
@@ -1491,10 +1491,11 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
// storing of TYP_SIMD12 (i.e. Vector3) field
if (treeNode->TypeGet() == TYP_SIMD12)
{
-genStoreLclFldTypeSIMD12(treeNode);
+genStoreLclTypeSIMD12(treeNode);
break;
}
-#endif
+#endif // FEATURE_SIMD
+
GenTreePtr op1 = treeNode->gtGetOp1();
genConsumeRegs(op1);
emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
@@ -1531,6 +1532,13 @@ void CodeGen::genCodeForTreeNode(GenTreePtr treeNode)
#endif // !defined(_TARGET_64BIT_)

#ifdef FEATURE_SIMD
+// storing of TYP_SIMD12 (i.e. Vector3) field
+if (treeNode->TypeGet() == TYP_SIMD12)
+{
+genStoreLclTypeSIMD12(treeNode);
+break;
+}
+
if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
{
// This is only possible for a zero-init.
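
For SIMD locals the only integer-constant initializer is zero, and zeroing a vector register needs no memory load at all. A tiny sketch (assumed illustration) of the cheap form such a zero-init can take:

    #include <xmmintrin.h>

    __m128 zeroVector()
    {
        return _mm_setzero_ps(); // compiles to xorps xmm, xmm -- no load needed
    }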
@@ -7450,14 +7458,20 @@ unsigned CodeGen::getBaseVarForPutArgStk(GenTreePtr treeNode)

#ifdef _TARGET_X86_
//---------------------------------------------------------------------
-// adjustStackForPutArgStk:
+// genAdjustStackForPutArgStk:
// adjust the stack pointer for a putArgStk node if necessary.
//
// Arguments:
// putArgStk - the putArgStk node.
//
// Returns: true if the stack pointer was adjusted; false otherwise.
+//
+// Notes:
+// Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
+// false if the stack arg needs to be stored at the current stack
+// pointer address. This is exactly the opposite of the return value
+// of this function.
//
bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
{
#ifdef FEATURE_SIMD
@@ -7515,11 +7529,10 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
}
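
The push-versus-store distinction in the Notes above can be made concrete with a small simulation (an assumed simplification, not JIT code): pushing writes at a moving stack pointer, while the store form adjusts the pointer once and then writes at fixed offsets.

    #include <cstdio>
    #include <cstring>

    unsigned char stack[64];
    unsigned sp = sizeof(stack); // x86 stack grows downward

    void pushSlot(unsigned value) // m_pushStkArg == true
    {
        sp -= 4;
        std::memcpy(&stack[sp], &value, sizeof(value));
    }

    void storeSlot(unsigned value, unsigned offset) // m_pushStkArg == false
    {
        std::memcpy(&stack[sp + offset], &value, sizeof(value)); // sp pre-adjusted
    }

    int main()
    {
        // Push form: fields are emitted highest-offset first.
        pushSlot(0x22222222); // field at offset 4
        pushSlot(0x11111111); // field at offset 0
        std::printf("sp after pushes: %u\n", sp);

        // Store form: one adjustment, then in-order stores.
        sp -= 8;
        storeSlot(0x33333333, 0);
        storeSlot(0x44444444, 4);
        std::printf("sp after stores: %u\n", sp);
        return 0;
    }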

//---------------------------------------------------------------------
-// genPutArgStkFieldList - generate code for passing an arg on the stack.
+// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
//
// Arguments
-// treeNode - the GT_PUTARG_STK node
-// targetType - the type of the treeNode
+// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
//
// Return value:
// None
@@ -7531,24 +7544,36 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)

// Set m_pushStkArg and pre-adjust the stack if necessary.
const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);

// For now, we only support the "push" case; we will push a full slot for the first field of each slot
// within the struct.
assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);

-// If we have pre-adjusted the stack and are simply storing the fields in order) set the offset to 0.
+// If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
+// (Note that this mode is not currently being used.)
// If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
// in reverse order, so we start with the current field offset at the size of the struct arg (which must be
// a multiple of the target pointer size).
unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
unsigned prevFieldOffset = currentOffset;
-regNumber tmpReg = REG_NA;
+regNumber intTmpReg = REG_NA;
+regNumber simdTmpReg = REG_NA;
if (putArgStk->gtRsvdRegs != RBM_NONE)
{
-assert(genCountBits(putArgStk->gtRsvdRegs) == 1);
-tmpReg = genRegNumFromMask(putArgStk->gtRsvdRegs);
-assert(genIsValidIntReg(tmpReg));
+regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
+if ((rsvdRegs & RBM_ALLINT) != 0)
+{
+intTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLINT);
+assert(genIsValidIntReg(intTmpReg));
+}
+if ((rsvdRegs & RBM_ALLFLOAT) != 0)
+{
+simdTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLFLOAT);
+assert(genIsValidFloatReg(simdTmpReg));
+}
+assert(genCountBits(rsvdRegs) == ((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
}

for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
{
GenTree* const fieldNode = current->Current();
@@ -7576,7 +7601,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
// able to detect stores into the outgoing argument area of the stack on x86.
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
int adjustment = roundUp(currentOffset - fieldOffset, 4);
-if (fieldIsSlot)
+if (fieldIsSlot && !varTypeIsSIMD(fieldType))
{
fieldType = genActualType(fieldType);
unsigned pushSize = genTypeSize(fieldType);
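
A worked example (assumed field layout) of the fieldIsSlot test and the adjustment computation above: a field occupies a whole 4-byte slot only when it is 4-byte aligned and the previously processed (higher) field is at least 4 bytes away.

    #include <cstdio>

    unsigned roundUp(unsigned x, unsigned m) { return (x + m - 1) / m * m; }

    int main()
    {
        // Offsets visited in descending order, like the GT_FIELD_LIST loop.
        const unsigned fieldOffsets[] = {8, 6, 4, 0};
        unsigned currentOffset = 12; // arg size, a multiple of the pointer size
        unsigned prevFieldOffset = currentOffset;

        for (unsigned fieldOffset : fieldOffsets)
        {
            bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
            unsigned adjustment = roundUp(currentOffset - fieldOffset, 4);
            std::printf("offset %u: fieldIsSlot=%d adjustment=%u\n", fieldOffset, (int)fieldIsSlot, adjustment);
            prevFieldOffset = fieldOffset;
            currentOffset = fieldOffset; // simplification: assume the field is handled here
        }
        return 0;
    }

Here the fields at offsets 6 and 4 share a slot, so neither passes the test (6 is misaligned; 4 is only 2 bytes below 6), while the fields at 8 and 0 can each be pushed as a whole slot.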
@@ -7594,12 +7619,13 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
else
{
m_pushStkArg = false;

// We always "push" floating point fields (i.e. they are full slot values that don't
// require special handling).
-assert(varTypeIsIntegralOrI(fieldNode));
+assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));

// If we can't push this field, it needs to be in a register so that we can store
// it to the stack location.
-assert(tmpReg != REG_NA);
if (adjustment != 0)
{
// This moves the stack pointer to fieldOffset.
@@ -7611,15 +7637,16 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
}

// Does it need to be in a byte register?
-// If so, we'll use tmpReg, which must have been allocated as a byte register.
+// If so, we'll use intTmpReg, which must have been allocated as a byte register.
// If it's already in a register, but not a byteable one, then move it.
if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
{
-noway_assert((genRegMask(tmpReg) & RBM_BYTE_REGS) != 0);
+assert(intTmpReg != REG_NA);
+noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
if (argReg != REG_NA)
{
-inst_RV_RV(INS_mov, tmpReg, argReg, fieldType);
-argReg = tmpReg;
+inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
+argReg = intTmpReg;
}
}
}
@@ -7630,6 +7657,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
{
if (fieldNode->isUsedFromSpillTemp())
{
+assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
assert(fieldNode->IsRegOptional());
TempDsc* tmp = getSpillTempDsc(fieldNode);
getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
@@ -7662,25 +7690,35 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
}
else
{
-// The stack has been adjusted and we will load the field to tmpReg and then store it on the stack.
+// The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
assert(varTypeIsIntegralOrI(fieldNode));
switch (fieldNode->OperGet())
{
case GT_LCL_VAR:
-inst_RV_TT(INS_mov, tmpReg, fieldNode);
+inst_RV_TT(INS_mov, intTmpReg, fieldNode);
break;
case GT_CNS_INT:
-genSetRegToConst(tmpReg, fieldNode->TypeGet(), fieldNode);
+genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
break;
default:
unreached();
}
-genStoreRegToStackArg(fieldType, tmpReg, fieldOffset - currentOffset);
+genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
}
}
else
{
-genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+if (fieldType == TYP_SIMD12)
+{
+assert(genIsValidFloatReg(simdTmpReg));
+genStoreSIMD12ToStack(argReg, simdTmpReg);
+}
+else
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+{
+genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+}
if (m_pushStkArg)
{
// We always push a slot-rounded size
@@ -7715,14 +7753,6 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)

#ifdef _TARGET_X86_

-#ifdef FEATURE_SIMD
-if (targetType == TYP_SIMD12)
-{
-genPutArgStkSIMD12(putArgStk);
-return;
-}
-#endif // FEATURE_SIMD
-
if (varTypeIsStruct(targetType))
{
(void)genAdjustStackForPutArgStk(putArgStk);
@@ -7950,6 +7980,14 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
{
var_types targetType = putArgStk->TypeGet();

+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+if (targetType == TYP_SIMD12)
+{
+genPutArgStkSIMD12(putArgStk);
+return;
+}
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+
if (varTypeIsSIMD(targetType))
{
regNumber srcReg = genConsumeReg(putArgStk->gtGetOp1());
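
For full-width SIMD values the store to the outgoing area is simpler than the SIMD12 case handled above: a single unaligned 16-byte store suffices. A sketch (assumed illustration, not the JIT's emitter code) of the contrast:

    #include <emmintrin.h> // SSE2

    void storeSimd16(void* dst, __m128i v)
    {
        _mm_storeu_si128((__m128i*)dst, v); // one movdqu covers all 16 bytes
    }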
35 changes: 33 additions & 2 deletions src/jit/compiler.h
@@ -670,7 +670,7 @@ class LclVarDsc
#endif // defined(_TARGET_64BIT_)
}

-unsigned lvSize() // Size needed for storage representation. Only used for structs or TYP_BLK.
+unsigned lvSize() const // Size needed for storage representation. Only used for structs or TYP_BLK.
{
// TODO-Review: Sometimes we get called on ARM with HFA struct variables that have been promoted,
// where the struct itself is no longer used because all access is via its member fields.
@@ -688,7 +688,8 @@

#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
// For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
-// this for arguments, which must be passed according the defined ABI.
+// this for arguments, which must be passed according the defined ABI. We don't want to do this for
+// dependently promoted struct fields, but we don't know that here. See lvaMapSimd12ToSimd16().
if ((lvType == TYP_SIMD12) && !lvIsParam)
{
assert(lvExactSize == 12);
@@ -1980,6 +1981,7 @@ class Compiler
SIMDIntrinsicID simdIntrinsicID,
var_types baseType,
unsigned size);
+void SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op);
#endif

GenTreePtr gtNewLclLNode(unsigned lnum, var_types type, IL_OFFSETX ILoffs = BAD_IL_OFFSET);
@@ -2652,6 +2654,35 @@ class Compiler
bool lvaIsFieldOfDependentlyPromotedStruct(const LclVarDsc* varDsc);
bool lvaIsGCTracked(const LclVarDsc* varDsc);

+#if defined(FEATURE_SIMD)
+bool lvaMapSimd12ToSimd16(const LclVarDsc* varDsc)
+{
+assert(varDsc->lvType == TYP_SIMD12);
+assert(varDsc->lvExactSize == 12);
+
+#if defined(_TARGET_64BIT_)
+assert(varDsc->lvSize() == 16);
+return true;
+#else // !defined(_TARGET_64BIT_)
+
+// For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. lvSize()
+// already does this calculation. However, we also need to prevent mapping types if the var is a
+// depenendently promoted struct field, which must remain its exact size within its parent struct.
+// However, we don't know this until late, so we may have already pretended the field is bigger
+// before that.
+if ((varDsc->lvSize() == 16) && !lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+{
+return true;
+}
+else
+{
+return false;
+}
+
+#endif // !defined(_TARGET_64BIT_)
+}
+#endif // defined(FEATURE_SIMD)

BYTE* lvaGetGcLayout(unsigned varNum);
bool lvaTypeIsGC(unsigned varNum);
unsigned lvaGSSecurityCookie; // LclVar number
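
The dependent-promotion carve-out in lvaMapSimd12ToSimd16 can be seen with an ordinary struct layout (an assumed illustration; the real decision concerns promoted JIT locals, not C++ structs): when a 12-byte field has a neighbor at offset 12, widening the field to 16 bytes would overwrite that neighbor.

    #include <cstddef>

    struct Vector3Like { float x, y, z; };        // exactly 12 bytes, 4-byte aligned
    struct Parent { Vector3Like v; float tail; }; // tail sits right at offset 12

    static_assert(sizeof(Vector3Like) == 12, "no tail padding expected");
    static_assert(offsetof(Parent, tail) == 12, "a 16-byte write to v would clobber tail");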