Skip to content

Commit 6d877c5

Browse files
Update where and when vzeroupper is emitted (#98261)
* Update where and when vzeroupper is emitted
* Ensure we emit vzeroupper for JIT helpers that need it
* Make sure vzeroupper is in genRestoreCalleeSavedFltRegs
* Scope when vzeroupper is emitted to fewer places
* Revert the simplification done to SetContainsAVXFlags
* Try to minify the TP impact of the improved vzeroupper handling
1 parent b9bd1de commit 6d877c5

File tree

8 files changed

+135
-45
lines changed

8 files changed

+135
-45
lines changed

src/coreclr/jit/codegen.h

-2
Original file line numberDiff line numberDiff line change
@@ -476,8 +476,6 @@ class CodeGen final : public CodeGenInterface
476476
// Save/Restore callee saved float regs to stack
477477
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
478478
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
479-
// Generate VZeroupper instruction to avoid AVX/SSE transition penalty
480-
void genVzeroupperIfNeeded(bool check256bitOnly = true);
481479

482480
#endif // TARGET_XARCH
483481

src/coreclr/jit/codegenxarch.cpp

+39-43
Original file line numberDiff line numberDiff line change
@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
60746074
}
60756075
#endif // defined(DEBUG) && defined(TARGET_X86)
60766076

6077-
// When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
6078-
// if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
6079-
// transition penalty, assuming the user function contains legacy SSE instruction.
6080-
// To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
6081-
// VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
6082-
// when there's preceding 256-bit AVX to legacy SSE transition penalty.
6083-
// This applies to 512bit AVX512 instructions as well.
6084-
if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
6085-
{
6086-
assert(compiler->canUseVexEncoding());
6077+
if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
6078+
{
6079+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
6080+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
6081+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
6082+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
6083+
// register) and before any call to an unknown function.
6084+
6085+
// This method contains a call that needs vzeroupper but also uses 256-bit or higher
6086+
// AVX itself. This means we couldn't optimize to only emitting a single vzeroupper in
6087+
// the method prologue and instead need to insert one before each call that needs it.
6088+
60876089
instGen(INS_vzeroupper);
60886090
}
60896091

@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1118811190
// funclet frames: this will be FuncletInfo.fiSpDelta.
1118911191
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
1119011192
{
11191-
genVzeroupperIfNeeded(false);
1119211193
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1119311194

1119411195
// Only callee saved floating point registers should be in regMask
1119511196
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1119611197

11198+
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11199+
{
11200+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11201+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11202+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11203+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11204+
// register) and before any call to an unknown function.
11205+
11206+
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11207+
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue.
11208+
// This reduces the overall amount of codegen, particularly for more common paths not using any
11209+
// SIMD or floating-point.
11210+
11211+
instGen(INS_vzeroupper);
11212+
}
11213+
1119711214
// fast path return
1119811215
if (regMask == RBM_NONE)
1119911216
{
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1124111258
// Only callee saved floating point registers should be in regMask
1124211259
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1124311260

11261+
if (GetEmitter()->Contains256bitOrMoreAVX())
11262+
{
11263+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11264+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11265+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11266+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11267+
// register) and before any call to an unknown function.
11268+
11269+
instGen(INS_vzeroupper);
11270+
}
11271+
1124411272
// fast path return
1124511273
if (regMask == RBM_NONE)
1124611274
{
11247-
genVzeroupperIfNeeded();
1124811275
return;
1124911276
}
1125011277

@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1128711314
offset -= XMM_REGSIZE_BYTES;
1128811315
}
1128911316
}
11290-
genVzeroupperIfNeeded();
11291-
}
11292-
11293-
// Generate Vzeroupper instruction as needed to zero out the upper 128 bits of all YMM registers so that the
11294-
// AVX/Legacy SSE transition penalties can be avoided. This function is used in genPreserveCalleeSavedFltRegs
11295-
// (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains
11296-
// 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native
11297-
// code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog
11298-
// if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty.
11299-
//
11300-
// Params
11301-
// check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
11302-
// instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
11303-
//
11304-
void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
11305-
{
11306-
bool emitVzeroUpper = false;
11307-
if (check256bitOnly)
11308-
{
11309-
emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
11310-
}
11311-
else
11312-
{
11313-
emitVzeroUpper = GetEmitter()->ContainsAVX();
11314-
}
11315-
11316-
if (emitVzeroUpper)
11317-
{
11318-
assert(compiler->canUseVexEncoding());
11319-
instGen(INS_vzeroupper);
11320-
}
1132111317
}
1132211318

1132311319
//-----------------------------------------------------------------------------------

src/coreclr/jit/compiler.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,7 @@ void Compiler::compSetProcessor()
23122312
// Assume each JITted method does not contain AVX instruction at first
23132313
codeGen->GetEmitter()->SetContainsAVX(false);
23142314
codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
2315+
codeGen->GetEmitter()->SetContainsCallNeedingVzeroupper(false);
23152316
}
23162317
if (canUseEvexEncoding())
23172318
{

src/coreclr/jit/compiler.h

+2
Original file line numberDiff line numberDiff line change
@@ -9383,6 +9383,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
93839383
}
93849384

93859385
#ifdef TARGET_XARCH
9386+
public:
93869387
bool canUseVexEncoding() const
93879388
{
93889389
return compOpportunisticallyDependsOn(InstructionSet_AVX);
@@ -9399,6 +9400,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
93999400
return compOpportunisticallyDependsOn(InstructionSet_AVX512F);
94009401
}
94019402

9403+
private:
94029404
//------------------------------------------------------------------------
94039405
// DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding.
94049406
//

src/coreclr/jit/emitxarch.h

+10
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,16 @@ void SetContains256bitOrMoreAVX(bool value)
468468
contains256bitOrMoreAVXInstruction = value;
469469
}
470470

471+
bool containsCallNeedingVzeroupper = false;
472+
bool ContainsCallNeedingVzeroupper() const
473+
{
474+
return containsCallNeedingVzeroupper;
475+
}
476+
void SetContainsCallNeedingVzeroupper(bool value)
477+
{
478+
containsCallNeedingVzeroupper = value;
479+
}
480+
471481
bool IsDstDstSrcAVXInstruction(instruction ins) const;
472482
bool IsDstSrcSrcAVXInstruction(instruction ins) const;
473483
bool IsThreeOperandAVXInstruction(instruction ins) const;

src/coreclr/jit/gentree.cpp

+69
Original file line numberDiff line numberDiff line change
@@ -2076,6 +2076,75 @@ void CallArgs::Remove(CallArg* arg)
20762076
assert(!"Did not find arg to remove in CallArgs::Remove");
20772077
}
20782078

2079+
#ifdef TARGET_XARCH
2080+
//---------------------------------------------------------------
2081+
// NeedsVzeroupper: Determines if the call needs a vzeroupper emitted before it is invoked
2082+
//
2083+
// Parameters:
2084+
// comp - the compiler
2085+
//
2086+
// Returns:
2087+
// true if a vzeroupper needs to be emitted; otherwise, false
2088+
//
2089+
bool GenTreeCall::NeedsVzeroupper(Compiler* comp)
2090+
{
2091+
bool needsVzeroupper = false;
2092+
2093+
if (IsPInvoke() && comp->canUseVexEncoding())
2094+
{
2095+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
2096+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
2097+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
2098+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
2099+
// register) and before any call to an unknown function.
2100+
2101+
switch (gtCallType)
2102+
{
2103+
case CT_USER_FUNC:
2104+
case CT_INDIRECT:
2105+
{
2106+
// Since P/Invokes are not compiled by the runtime, they are typically "unknown" since they
2107+
// may use the legacy encoding. This includes both CT_USER_FUNC and CT_INDIRECT.
2108+
2109+
needsVzeroupper = true;
2110+
break;
2111+
}
2112+
2113+
case CT_HELPER:
2114+
{
2115+
// Most helpers are well known to not use any floating-point or SIMD logic internally, but
2116+
// a few do exist so we need to ensure they are handled. They are identified by taking or
2117+
// returning a floating-point or SIMD type, regardless of how it is actually passed/returned.
2118+
2119+
if (varTypeUsesFloatReg(this))
2120+
{
2121+
needsVzeroupper = true;
2122+
}
2123+
else
2124+
{
2125+
for (CallArg& arg : gtArgs.Args())
2126+
{
2127+
if (varTypeUsesFloatReg(arg.GetSignatureType()))
2128+
{
2129+
needsVzeroupper = true;
2130+
break;
2131+
}
2132+
}
2133+
}
2134+
break;
2135+
}
2136+
2137+
default:
2138+
{
2139+
unreached();
2140+
}
2141+
}
2142+
}
2143+
2144+
return needsVzeroupper;
2145+
}
2146+
#endif // TARGET_XARCH
2147+
20792148
//---------------------------------------------------------------
20802149
// GetOtherRegMask: Get the reg mask of gtOtherRegs of call node
20812150
//

src/coreclr/jit/gentree.h

+4
Original file line numberDiff line numberDiff line change
@@ -5124,6 +5124,10 @@ struct GenTreeCall final : public GenTree
51245124
#endif
51255125
}
51265126

5127+
#ifdef TARGET_XARCH
5128+
bool NeedsVzeroupper(Compiler* comp);
5129+
#endif // TARGET_XARCH
5130+
51275131
// Get reg mask of all the valid registers of gtOtherRegs array
51285132
regMaskTP GetOtherRegMask() const;
51295133

src/coreclr/jit/lsraxarch.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,16 @@ int LinearScan::BuildCall(GenTreeCall* call)
13411341
srcCount += BuildOperandUses(ctrlExpr, ctrlExprCandidates);
13421342
}
13431343

1344+
if (call->NeedsVzeroupper(compiler))
1345+
{
1346+
// Much like for Contains256bitOrMoreAVX, we want to track if any
1347+
// call needs a vzeroupper inserted. This allows us to reduce
1348+
// the total number of vzeroupper being inserted for cases where
1349+
// no 256+ AVX is used directly by the method.
1350+
1351+
compiler->GetEmitter()->SetContainsCallNeedingVzeroupper(true);
1352+
}
1353+
13441354
buildInternalRegisterUses();
13451355

13461356
// Now generate defs and kills.

0 commit comments

Comments
 (0)