@@ -10454,8 +10454,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
     }
 #endif
 
+    genClearAvxStateInEpilog();
+
     // Restore float registers that were saved to stack before SP is modified.
-    genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
+    genRestoreCalleeSavedFltRegs();
 
 #ifdef JIT32_GCENCODER
     // When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10915,6 +10917,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
 
     // This is the end of the OS-reported prolog for purposes of unwinding
     compiler->unwindEndProlog();
+
+    genClearAvxStateInProlog();
 }
 
 /*****************************************************************************
@@ -10935,6 +10939,8 @@ void CodeGen::genFuncletEpilog()
 
     ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
 
+    genClearAvxStateInEpilog();
+
     inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
     instGen_Return(0);
 }
@@ -11032,6 +11038,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
     // Add a padding for 16-byte alignment
     inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
 #endif
+
+    genClearAvxStateInProlog();
 }
 
 /*****************************************************************************
@@ -11050,6 +11058,8 @@ void CodeGen::genFuncletEpilog()
 
     ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
 
+    genClearAvxStateInEpilog();
+
 #ifdef UNIX_X86_ABI
     // Revert a padding that was added for 16-byte alignment
     inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11339,40 +11349,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
 // Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
 // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
 // Here offset = 16-byte aligned offset after pushing integer registers.
-//
-// Params
-//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
-//                  non-funclet: this will be compLclFrameSize.
-//                  funclet frames: this will be FuncletInfo.fiSpDelta.
-void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
+void CodeGen::genPreserveCalleeSavedFltRegs()
 {
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
-    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
-    {
-        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
-        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
-        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
-        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
-        // register) and before any call to an unknown function.
-
-        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
-        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
-        // This reduces the overall amount of codegen, particularly for more common paths not using any
-        // SIMD or floating-point.
-
-        instGen(INS_vzeroupper);
-    }
-
     // fast path return
     if (regMask == RBM_NONE)
     {
         return;
     }
 
+    unsigned lclFrameSize = compiler->compLclFrameSize;
+
 #ifdef TARGET_AMD64
     unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
     unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11402,35 +11393,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
 // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
 // down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
 // Here offset = 16-byte aligned offset after pushing integer registers.
-//
-// Params
-//   lclFrameSize - Fixed frame size excluding callee pushed int regs.
-//                  non-funclet: this will be compLclFrameSize.
-//                  funclet frames: this will be FuncletInfo.fiSpDelta.
-void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
+void CodeGen::genRestoreCalleeSavedFltRegs()
 {
     regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
 
     // Only callee saved floating point registers should be in regMask
     assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
 
-    if (GetEmitter()->Contains256bitOrMoreAVX())
-    {
-        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
-        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
-        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
-        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
-        // register) and before any call to an unknown function.
-
-        instGen(INS_vzeroupper);
-    }
-
     // fast path return
     if (regMask == RBM_NONE)
     {
         return;
     }
 
+    unsigned lclFrameSize = compiler->compLclFrameSize;
+
 #ifdef TARGET_AMD64
     unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
     instruction copyIns = ins_Copy(TYP_FLOAT);
@@ -11472,6 +11449,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
     }
 }
 
+//-----------------------------------------------------------------------------------
+// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
+//
+void CodeGen::genClearAvxStateInProlog()
+{
+    if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        // This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
+        // AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
+        // This reduces the overall amount of codegen, particularly for more common paths not using any
+        // SIMD or floating-point.
+
+        instGen(INS_vzeroupper);
+    }
+}
+
+//-----------------------------------------------------------------------------------
+// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
+//
+void CodeGen::genClearAvxStateInEpilog()
+{
+    if (GetEmitter()->Contains256bitOrMoreAVX())
+    {
+        // The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
+        // Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
+        // between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
+        // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
+        // register) and before any call to an unknown function.
+
+        instGen(INS_vzeroupper);
+    }
+}
+
 //-----------------------------------------------------------------------------------
 // instGen_MemoryBarrier: Emit a MemoryBarrier instruction
 //
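Not part of the diff: the sketch below is a minimal user-level C++ illustration of the Intel guidance quoted in genClearAvxStateInProlog/genClearAvxStateInEpilog above, assuming a compiler with AVX enabled (e.g. -mavx). The function name and surrounding code are hypothetical; the point is that a function which uses 256-bit AVX but does not return a YMM value can clear the upper YMM state before returning, which mirrors the vzeroupper the JIT now emits in the epilog.

#include <immintrin.h>

// Illustrative only: sums 8 floats using a 256-bit AVX load, then returns a
// scalar. The 256-bit operation dirties the upper YMM state; since the result
// is not a YMM value, clearing that state before returning helps callers that
// use legacy SSE encodings avoid AVX<->SSE transition penalties.
float sum8(const float* p)
{
    __m256 v  = _mm256_loadu_ps(p);          // 256-bit VEX instruction
    __m128 lo = _mm256_castps256_ps128(v);   // lower 128 bits
    __m128 hi = _mm256_extractf128_ps(v, 1); // upper 128 bits
    __m128 s  = _mm_add_ps(lo, hi);          // combine into 4 partial sums
    s = _mm_hadd_ps(s, s);                   // horizontal add: 4 -> 2 lanes
    s = _mm_hadd_ps(s, s);                   // horizontal add: 2 -> 1 lane
    float r = _mm_cvtss_f32(s);
    _mm256_zeroupper();                      // same role as the vzeroupper emitted in the epilog
    return r;
}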