Skip to content

Commit a9114e3

Browse files
authored
Move vzeroupper emit back to JIT (#115748)
vzeroupper is an AVX instruction, so it cannot be executed unconditionally in static asm helpers. Fixes #115672.
1 parent 00b280a commit a9114e3

File tree

7 files changed

+65
-67
lines changed

7 files changed

+65
-67
lines changed

src/coreclr/jit/codegen.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface
515515
#if defined(TARGET_XARCH)
516516

517517
// Save/Restore callee saved float regs to stack
518-
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
519-
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
518+
void genPreserveCalleeSavedFltRegs();
519+
void genRestoreCalleeSavedFltRegs();
520+
521+
// Generate vzeroupper instruction to clear AVX state if necessary
522+
void genClearAvxStateInProlog();
523+
void genClearAvxStateInEpilog();
520524

521525
#endif // TARGET_XARCH
522526

src/coreclr/jit/codegencommon.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog()
52855285
#endif // TARGET_ARMARCH
52865286

52875287
#if defined(TARGET_XARCH)
5288+
genClearAvxStateInProlog();
5289+
52885290
// Preserve callee saved float regs to stack.
5289-
genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
5291+
genPreserveCalleeSavedFltRegs();
52905292
#endif // defined(TARGET_XARCH)
52915293

52925294
#ifdef TARGET_AMD64

src/coreclr/jit/codegenxarch.cpp

Lines changed: 56 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10454,8 +10454,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
1045410454
}
1045510455
#endif
1045610456

10457+
genClearAvxStateInEpilog();
10458+
1045710459
// Restore float registers that were saved to stack before SP is modified.
10458-
genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
10460+
genRestoreCalleeSavedFltRegs();
1045910461

1046010462
#ifdef JIT32_GCENCODER
1046110463
// When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10915,6 +10917,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1091510917

1091610918
// This is the end of the OS-reported prolog for purposes of unwinding
1091710919
compiler->unwindEndProlog();
10920+
10921+
genClearAvxStateInProlog();
1091810922
}
1091910923

1092010924
/*****************************************************************************
@@ -10935,6 +10939,8 @@ void CodeGen::genFuncletEpilog()
1093510939

1093610940
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1093710941

10942+
genClearAvxStateInEpilog();
10943+
1093810944
inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
1093910945
instGen_Return(0);
1094010946
}
@@ -11032,6 +11038,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1103211038
// Add a padding for 16-byte alignment
1103311039
inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
1103411040
#endif
11041+
11042+
genClearAvxStateInProlog();
1103511043
}
1103611044

1103711045
/*****************************************************************************
@@ -11050,6 +11058,8 @@ void CodeGen::genFuncletEpilog()
1105011058

1105111059
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1105211060

11061+
genClearAvxStateInEpilog();
11062+
1105311063
#ifdef UNIX_X86_ABI
1105411064
// Revert a padding that was added for 16-byte alignment
1105511065
inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11339,40 +11349,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1133911349
// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1134011350
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1134111351
// Here offset = 16-byte aligned offset after pushing integer registers.
11342-
//
11343-
// Params
11344-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11345-
// non-funclet: this will be compLclFrameSize.
11346-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11347-
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
11352+
void CodeGen::genPreserveCalleeSavedFltRegs()
1134811353
{
1134911354
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1135011355

1135111356
// Only callee saved floating point registers should be in regMask
1135211357
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1135311358

11354-
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11355-
{
11356-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11357-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11358-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11359-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11360-
// register) and before any call to an unknown function.
11361-
11362-
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11363-
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11364-
// This reduces the overall amount of codegen, particularly for more common paths not using any
11365-
// SIMD or floating-point.
11366-
11367-
instGen(INS_vzeroupper);
11368-
}
11369-
1137011359
// fast path return
1137111360
if (regMask == RBM_NONE)
1137211361
{
1137311362
return;
1137411363
}
1137511364

11365+
unsigned lclFrameSize = compiler->compLclFrameSize;
11366+
1137611367
#ifdef TARGET_AMD64
1137711368
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1137811369
unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11402,35 +11393,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
1140211393
// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1140311394
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1140411395
// Here offset = 16-byte aligned offset after pushing integer registers.
11405-
//
11406-
// Params
11407-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11408-
// non-funclet: this will be compLclFrameSize.
11409-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11410-
void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
11396+
void CodeGen::genRestoreCalleeSavedFltRegs()
1141111397
{
1141211398
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1141311399

1141411400
// Only callee saved floating point registers should be in regMask
1141511401
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1141611402

11417-
if (GetEmitter()->Contains256bitOrMoreAVX())
11418-
{
11419-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11420-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11421-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11422-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11423-
// register) and before any call to an unknown function.
11424-
11425-
instGen(INS_vzeroupper);
11426-
}
11427-
1142811403
// fast path return
1142911404
if (regMask == RBM_NONE)
1143011405
{
1143111406
return;
1143211407
}
1143311408

11409+
unsigned lclFrameSize = compiler->compLclFrameSize;
11410+
1143411411
#ifdef TARGET_AMD64
1143511412
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1143611413
instruction copyIns = ins_Copy(TYP_FLOAT);
@@ -11472,6 +11449,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1147211449
}
1147311450
}
1147411451

11452+
//-----------------------------------------------------------------------------------
11453+
// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
11454+
//
11455+
void CodeGen::genClearAvxStateInProlog()
11456+
{
11457+
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11458+
{
11459+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11460+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11461+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11462+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11463+
// register) and before any call to an unknown function.
11464+
11465+
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11466+
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11467+
// This reduces the overall amount of codegen, particularly for more common paths not using any
11468+
// SIMD or floating-point.
11469+
11470+
instGen(INS_vzeroupper);
11471+
}
11472+
}
11473+
11474+
//-----------------------------------------------------------------------------------
11475+
// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
11476+
//
11477+
void CodeGen::genClearAvxStateInEpilog()
11478+
{
11479+
if (GetEmitter()->Contains256bitOrMoreAVX())
11480+
{
11481+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11482+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11483+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11484+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11485+
// register) and before any call to an unknown function.
11486+
11487+
instGen(INS_vzeroupper);
11488+
}
11489+
}
11490+
1147511491
//-----------------------------------------------------------------------------------
1147611492
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
1147711493
//

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,19 +245,13 @@ NESTED_END RhpRethrow, _TEXT
245245

246246
alloc_stack stack_alloc_size
247247

248-
// Mirror clearing of AVX state done by regular method prologs
249-
vzeroupper
250-
251248
END_PROLOGUE
252249
.endm
253250

254251
//
255252
// Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
256253
//
257254
.macro FUNCLET_CALL_EPILOGUE
258-
// Mirror clearing of AVX state done by regular method epilogs
259-
vzeroupper
260-
261255
free_stack stack_alloc_size
262256

263257
pop_nonvol_reg rbp

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
308308

309309
alloc_stack stack_alloc_size
310310

311-
;; Mirror clearing of AVX state done by regular method prologs
312-
vzeroupper
313-
314311
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
315312
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
316313
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -329,9 +326,6 @@ endm
329326
;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
330327
;;
331328
FUNCLET_CALL_EPILOGUE macro
332-
;; Mirror clearing of AVX state done by regular method epilogs
333-
vzeroupper
334-
335329
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
336330
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
337331
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/AsmHelpers.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -506,9 +506,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
506506

507507
alloc_stack stack_alloc_size
508508

509-
;; Mirror clearing of AVX state done by regular method prologs
510-
vzeroupper
511-
512509
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
513510
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
514511
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -527,9 +524,6 @@ endm
527524
;; Epilogue of all funclet calling helpers (CallXXXXFunclet)
528525
;;
529526
FUNCLET_CALL_EPILOGUE macro
530-
;; Mirror clearing of AVX state done by regular method epilogs
531-
vzeroupper
532-
533527
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
534528
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
535529
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/asmhelpers.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -359,19 +359,13 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT
359359

360360
alloc_stack stack_alloc_size
361361

362-
// Mirror clearing of AVX state done by regular method prologs
363-
vzeroupper
364-
365362
END_PROLOGUE
366363
.endm
367364

368365
//
369366
// Epilogue of all funclet calling helpers (CallXXXXFunclet)
370367
//
371368
.macro FUNCLET_CALL_EPILOGUE
372-
// Mirror clearing of AVX state done by regular method epilogs
373-
vzeroupper
374-
375369
free_stack stack_alloc_size
376370

377371
pop_nonvol_reg rbp

0 commit comments

Comments
 (0)