@@ -6074,16 +6074,18 @@ void CodeGen::genCall(GenTreeCall* call)
6074
6074
}
6075
6075
#endif // defined(DEBUG) && defined(TARGET_X86)
6076
6076
6077
- // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
6078
- // if the function contains 256bit AVX instructions, this is to avoid AVX-256 to Legacy SSE
6079
- // transition penalty, assuming the user function contains legacy SSE instruction.
6080
- // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
6081
- // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
6082
- // when there's preceding 256-bit AVX to legacy SSE transition penalty.
6083
- // This applies to 512bit AVX512 instructions as well.
6084
- if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
6085
- {
6086
- assert(compiler->canUseVexEncoding());
6077
+ if (GetEmitter()->Contains256bitOrMoreAVX() && call->NeedsVzeroupper(compiler))
6078
+ {
6079
+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
6080
+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
6081
+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
6082
+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
6083
+ // register) and before any call to an unknown function.
6084
+
6085
+ // This method contains a call that needs vzeroupper but also uses 256-bit or higher
6086
+ // AVX itself. This means we cannot reduce this to emitting only a single vzeroupper in
6087
+ // the method prologue and instead need to insert one before each call that needs it.
6088
+
6087
6089
instGen(INS_vzeroupper);
6088
6090
}
6089
6091
@@ -11188,12 +11190,27 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
11188
11190
// funclet frames: this will be FuncletInfo.fiSpDelta.
11189
11191
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
11190
11192
{
11191
- genVzeroupperIfNeeded(false);
11192
11193
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
11193
11194
11194
11195
// Only callee saved floating point registers should be in regMask
11195
11196
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
11196
11197
11198
+ if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11199
+ {
11200
+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11201
+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11202
+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11203
+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11204
+ // register) and before any call to an unknown function.
11205
+
11206
+ // This method contains a call that needs vzeroupper but doesn't use 256-bit or higher
11207
+ // AVX itself. Thus we can optimize by emitting only a single vzeroupper in the function prologue.
11208
+ // This reduces the overall amount of codegen, particularly for more common paths not using any
11209
+ // SIMD or floating-point.
11210
+
11211
+ instGen(INS_vzeroupper);
11212
+ }
11213
+
11197
11214
// fast path return
11198
11215
if (regMask == RBM_NONE)
11199
11216
{
@@ -11241,10 +11258,20 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
11241
11258
// Only callee saved floating point registers should be in regMask
11242
11259
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
11243
11260
11261
+ if (GetEmitter()->Contains256bitOrMoreAVX())
11262
+ {
11263
+ // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11264
+ // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11265
+ // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11266
+ // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11267
+ // register) and before any call to an unknown function.
11268
+
11269
+ instGen(INS_vzeroupper);
11270
+ }
11271
+
11244
11272
// fast path return
11245
11273
if (regMask == RBM_NONE)
11246
11274
{
11247
- genVzeroupperIfNeeded();
11248
11275
return;
11249
11276
}
11250
11277
@@ -11287,37 +11314,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
11287
11314
offset -= XMM_REGSIZE_BYTES;
11288
11315
}
11289
11316
}
11290
- genVzeroupperIfNeeded();
11291
- }
11292
-
11293
- // Generate Vzeroupper instruction as needed to zero out the upper 128 bits of all YMM registers so that the
11294
- // AVX/Legacy SSE transition penalties can be avoided. This function is used in genPreserveCalleeSavedFltRegs
11295
- // (prolog) and genRestoreCalleeSavedFltRegs (epilog). Issue VZEROUPPER in Prolog if the method contains
11296
- // 128-bit or 256-bit AVX code, to avoid legacy SSE to AVX transition penalty, which could happen when native
11297
- // code contains legacy SSE code calling into JIT AVX code (e.g. reverse pinvoke). Issue VZEROUPPER in Epilog
11298
- // if the method contains 256-bit AVX code, to avoid AVX to legacy SSE transition penalty.
11299
- //
11300
- // Params
11301
- // check256bitOnly - true to check if the function contains 256-bit AVX instruction and generate Vzeroupper
11302
- // instruction, false to check if the function contains AVX instruction (either 128-bit or 256-bit).
11303
- //
11304
- void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
11305
- {
11306
- bool emitVzeroUpper = false;
11307
- if (check256bitOnly)
11308
- {
11309
- emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
11310
- }
11311
- else
11312
- {
11313
- emitVzeroUpper = GetEmitter()->ContainsAVX();
11314
- }
11315
-
11316
- if (emitVzeroUpper)
11317
- {
11318
- assert(compiler->canUseVexEncoding());
11319
- instGen(INS_vzeroupper);
11320
- }
11321
11317
}
11322
11318
11323
11319
//-----------------------------------------------------------------------------------
0 commit comments