Remove AVX/SSE transition penalties #8588
Changes from all commits: cc169ea, 933257e, 3fdc10a, c0627f8, ec2f1de, eecbbbf, 3d5e08f, 0981bb4, b6d69f5
@@ -10583,6 +10583,7 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree
 // funclet frames: this will be FuncletInfo.fiSpDelta.
 void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 {
+    genVzeroupperIfNeeded(false);
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;

     // Only callee saved floating point registers should be in regMask
@@ -10621,16 +10622,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
-
-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
 }

 // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
@@ -10651,6 +10642,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     // fast path return
     if (regMask == RBM_NONE)
     {
+        genVzeroupperIfNeeded();
         return;
     }

Review comment: Rather than duplicate this code, perhaps this could be factored out into a small method.

Author reply: Yes, will refactor it into a small method. Those flags do not need to be reset, since they have already been reset in Compiler::compSetProcessor().
@@ -10682,16 +10674,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     assert((offset % 16) == 0);
 #endif // _TARGET_AMD64_

-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
-
     for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
     {
         regMaskTP regBit = genRegMask(reg);
@@ -10706,7 +10688,41 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
             offset -= XMM_REGSIZE_BYTES;
         }
     }
+    genVzeroupperIfNeeded();
 }

+// Generate a Vzeroupper instruction as needed to zero out the upper 128 bits of all YMM registers, so
+// that AVX/legacy-SSE transition penalties can be avoided. This function is used by
+// genPreserveCalleeSavedFltRegs (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in
+// the prolog if the method contains 128-bit or 256-bit AVX code, to avoid the legacy-SSE-to-AVX
+// transition penalty, which could happen when native code containing legacy SSE calls into jitted AVX
+// code (e.g. reverse PInvoke). Issue VZEROUPPER in the epilog if the method contains 256-bit AVX code,
+// to avoid the AVX-to-legacy-SSE transition penalty.
+//
+// Params
+//    check256bitOnly - true to generate Vzeroupper only if the function contains 256-bit AVX
+//                      instructions, false to generate it if the function contains any AVX
+//                      instructions (either 128-bit or 256-bit).
+//
+void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    bool emitVzeroUpper = false;
+    if (check256bitOnly)
+    {
+        emitVzeroUpper = getEmitter()->Contains256bitAVX();
+    }
+    else
+    {
+        emitVzeroUpper = getEmitter()->ContainsAVX();
+    }
+
+    if (emitVzeroUpper)
+    {
+        assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
+        instGen(INS_vzeroupper);
+    }
+#endif
+}

Review comment: Here too we don't need to check for the instruction set, because lowering makes that check before setting the flags on the emitter. Instead we should assert.

Author reply: Changed accordingly.
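The code snippet that accompanied this suggestion was garbled in the page capture; a plausible reconstruction, using only names that appear in the final code above, is:

    bool emitVzeroUpper = check256bitOnly ? getEmitter()->Contains256bitAVX()
                                          : getEmitter()->ContainsAVX();
    if (emitVzeroUpper)
    {
        // Lowering sets these emitter flags only when the AVX instruction set is
        // available, so assert here instead of re-checking the instruction set.
        assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
        instGen(INS_vzeroupper);
    }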
 #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87

 //-----------------------------------------------------------------------------------
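Taken together, the codegen changes above implement a simple policy: vzeroupper in the prolog if the method contains any AVX code, and vzeroupper in the epilog only if it contains 256-bit AVX code. A standalone sketch of that decision rule, with hypothetical stand-ins for the emitter flags (this is not CoreCLR code):

    #include <cstdio>

    struct MethodFlags
    {
        bool containsAVX;        // any VEX-encoded AVX instruction (128- or 256-bit) was emitted
        bool contains256bitAVX;  // any 256-bit (YMM) instruction was emitted
    };

    // Decide whether a vzeroupper is needed at a prolog or epilog boundary.
    static bool needsVzeroupper(const MethodFlags& f, bool isEpilog)
    {
        if (isEpilog)
        {
            // Only dirty upper YMM halves (256-bit AVX) penalize a legacy-SSE caller.
            return f.contains256bitAVX;
        }
        // Any AVX code pays the legacy-SSE-to-AVX penalty if a native legacy-SSE
        // caller (e.g. via reverse PInvoke) left the upper YMM state dirty.
        return f.containsAVX;
    }

    int main()
    {
        MethodFlags m = {true, false}; // a method that uses only 128-bit AVX
        printf("prolog vzeroupper: %d, epilog vzeroupper: %d\n",
               needsVzeroupper(m, false), needsVzeroupper(m, true));
        return 0;
    }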
@@ -166,7 +166,8 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
     Compiler* compiler = comp;

     TreeNodeInfo* info = &(tree->gtLsraInfo);

+    // A floating-point type generates AVX instructions (vmovss etc.) on AVX hardware, so set the flag.
+    SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
     switch (tree->OperGet())
     {
         GenTree* op1;
@@ -1773,6 +1774,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
             {
                 MakeSrcContained(blkNode, source);
            }
+            // An XMM register is used to fill with constants; that is an AVX instruction on AVX hardware, so set the flag.
+            SetContainsAVXFlags();
        }
        blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
@@ -1954,6 +1957,9 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
             // series of 16-byte loads and stores.
             blkNode->gtLsraInfo.internalFloatCount = 1;
             blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
+            // This uses an XMM register for the loads and stores; on AVX hardware those are
+            // AVX instructions, so set the ContainsAVX flag.
+            SetContainsAVXFlags();
         }

         // If src or dst are on stack, we don't have to generate the address into a register
@@ -2732,6 +2738,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
     TreeNodeInfo* info = &(tree->gtLsraInfo);
     LinearScan*   lsra = m_lsra;
     info->dstCount     = 1;
+    SetContainsAVXFlags(true, simdTree->gtSIMDSize);
     switch (simdTree->gtSIMDIntrinsicID)
     {
         GenTree* op1;
@@ -4572,6 +4579,31 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
     }
 }

+//------------------------------------------------------------------------------
+// SetContainsAVXFlags: Set the ContainsAVX flag when the type is a floating-point type,
+//                      and set the Contains256bitAVX flag when the SIMD vector size is
+//                      32 bytes.
+//
+// Arguments:
+//    isFloatingPointType - true if it is a floating-point type
+//    sizeOfSIMDVector    - SIMD vector size
+//
+void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    if (isFloatingPointType)
+    {
+        if (comp->getFloatingPointInstructionSet() == InstructionSet_AVX)
+        {
+            comp->getEmitter()->SetContainsAVX(true);
+        }
+        if (sizeOfSIMDVector == 32 && comp->getSIMDInstructionSet() == InstructionSet_AVX)
+        {
+            comp->getEmitter()->SetContains256bitAVX(true);
+        }
+    }
+#endif
+}

 //------------------------------------------------------------------------------
 // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
 //

Review comment: To be accurate, I would suggest writing it so that both checks are nested under if (isFloatingPointType).

Author reply: Changed.
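The lowering side can likewise be modeled standalone. Below is a sketch with hypothetical stand-ins for the compiler and emitter state (not CoreCLR code; the real code consults getFloatingPointInstructionSet and getSIMDInstructionSet separately):

    #include <cstdio>

    enum InstructionSet { ISA_SSE2, ISA_AVX };

    struct EmitterFlags
    {
        bool containsAVX       = false;
        bool contains256bitAVX = false;
    };

    // Mirror of SetContainsAVXFlags: only floating-point/SIMD nodes can emit AVX
    // instructions, and only 32-byte vectors require 256-bit (YMM) instructions.
    static void setContainsAVXFlags(EmitterFlags& e, InstructionSet isa, bool isFloatingPointType, unsigned simdSize = 0)
    {
        if (isFloatingPointType && (isa == ISA_AVX))
        {
            e.containsAVX = true;
            if (simdSize == 32)
            {
                e.contains256bitAVX = true;
            }
        }
    }

    int main()
    {
        EmitterFlags e;
        setContainsAVXFlags(e, ISA_AVX, true);     // scalar FP node: 128-bit AVX only
        setContainsAVXFlags(e, ISA_AVX, true, 32); // 32-byte SIMD node: 256-bit AVX
        printf("ContainsAVX=%d Contains256bitAVX=%d\n", e.containsAVX, e.contains256bitAVX);
        return 0;
    }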
Review comment (question):

AFAIK from the Intel manual, one incurs a transition penalty from legacy SSE (scalar or 128-bit) to AVX (128- or 256-bit), or vice versa. Will there be a transition penalty on transitioning from legacy SSE (scalar or 128-bit) into scalar AVX, or vice versa?

If the answer is no, then we don't need to track whether a method contains AVX instructions; tracking whether it contains 128/256-bit instructions alone would be sufficient. In that case we are unnecessarily issuing vzeroupper in the prologs of some methods.

If the answer is yes, don't we need to issue a vzeroupper before a user PInvoke call (in codegenxarch.cpp)? Right now that logic issues a vzeroupper only if the method has 256-bit AVX instructions.
Author reply:

The transition penalty happens on the transition from 256-bit AVX to legacy SSE (case 1) and from legacy SSE to 128-bit or 256-bit AVX (case 2); case 2 only happens after a preceding case-1 transition (AVX-256 to legacy SSE). So for the first part, the answer is yes: there is a transition penalty from legacy SSE to AVX (either 128-bit or 256-bit). RyuJIT generates 128-bit and/or 256-bit AVX instructions, so we insert a vzeroupper in the prolog to avoid it. For the second part, the answer is also yes, but the penalty occurs only on 256-bit AVX to legacy SSE; there is no transition penalty from 128-bit AVX to legacy SSE, which is why that logic issues a vzeroupper only if the method has 256-bit AVX instructions.
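To make the two cases concrete, here is a minimal sketch in C++ intrinsics (not from the PR; the function names and the two-file split are hypothetical). sse_part.cpp would be compiled without AVX support so its intrinsics get legacy (non-VEX) SSE encodings, while avx_part.cpp is compiled with AVX enabled and stands in for jitted AVX code:

    // avx_part.cpp -- compiled with AVX enabled (stands in for jitted AVX code).
    #include <immintrin.h>

    float sum8(const float* p)
    {
        __m256 v = _mm256_loadu_ps(p);                   // 256-bit load: dirties the upper YMM halves
        __m128 s = _mm_add_ps(_mm256_castps256_ps128(v), // reduce to 128 bits
                              _mm256_extractf128_ps(v, 1));
        s = _mm_hadd_ps(s, s);
        s = _mm_hadd_ps(s, s);
        _mm256_zeroupper(); // the "epilog vzeroupper": without it, the legacy-SSE caller below
                            // hits case 1 (AVX-256 -> legacy SSE), and its later transitions
                            // back into AVX code then hit case 2 (legacy SSE -> AVX)
        return _mm_cvtss_f32(s);
    }

    // sse_part.cpp -- compiled WITHOUT AVX, so these intrinsics use legacy SSE encodings.
    #include <xmmintrin.h>

    float scale_half(float x)
    {
        __m128 v = _mm_mul_ss(_mm_set_ss(x), _mm_set_ss(0.5f)); // legacy SSE: penalized while
                                                                // the upper YMM state is dirty
        return _mm_cvtss_f32(v);
    }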
Review comment (follow-up):

Consider a managed method that uses scalar floating-point operations but no SIMD (128- or 256-bit), running on AVX hardware. The JIT would produce scalar AVX floating-point instructions for the method, and in the method prolog we would generate a vzeroupper, because the logic in lowerxarch sets a flag on the emitter indicating that the method uses AVX instructions (but no 256-bit ones).

One reason I could think of is reverse-PInvoking such a managed method: the native code could be using legacy SSE (scalar or 128-bit), and the call into the managed method could result in a legacy-SSE-to-scalar-AVX transition. Can you please confirm that this will incur a transition penalty? It is not clear to me what the Intel manual says in this regard.