Skip to content

Commit 52ac82f

Browse files
committed
Move vzeroupper emit back to JIT
vzeroupper is an AVX instruction, and so it cannot be executed unconditionally in static asm helpers. Fixes #115019
1 parent 29638e8 commit 52ac82f

File tree

7 files changed

+65
-67
lines changed

7 files changed

+65
-67
lines changed

src/coreclr/jit/codegen.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -515,8 +515,12 @@ class CodeGen final : public CodeGenInterface
515515
#if defined(TARGET_XARCH)
516516

517517
// Save/Restore callee saved float regs to stack
518-
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
519-
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
518+
void genPreserveCalleeSavedFltRegs();
519+
void genRestoreCalleeSavedFltRegs();
520+
521+
// Generate vzeroupper instruction to clear AVX state if necessary
522+
void genClearAvxStateInProlog();
523+
void genClearAvxStateInEpilog();
520524

521525
#endif // TARGET_XARCH
522526

src/coreclr/jit/codegencommon.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5285,8 +5285,10 @@ void CodeGen::genFnProlog()
52855285
#endif // TARGET_ARMARCH
52865286

52875287
#if defined(TARGET_XARCH)
5288+
genClearAvxStateInProlog();
5289+
52885290
// Preserve callee saved float regs to stack.
5289-
genPreserveCalleeSavedFltRegs(compiler->compLclFrameSize);
5291+
genPreserveCalleeSavedFltRegs();
52905292
#endif // defined(TARGET_XARCH)
52915293

52925294
#ifdef TARGET_AMD64

src/coreclr/jit/codegenxarch.cpp

Lines changed: 56 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -10452,8 +10452,10 @@ void CodeGen::genFnEpilog(BasicBlock* block)
1045210452
}
1045310453
#endif
1045410454

10455+
genClearAvxStateInEpilog();
10456+
1045510457
// Restore float registers that were saved to stack before SP is modified.
10456-
genRestoreCalleeSavedFltRegs(compiler->compLclFrameSize);
10458+
genRestoreCalleeSavedFltRegs();
1045710459

1045810460
#ifdef JIT32_GCENCODER
1045910461
// When using the JIT32 GC encoder, we do not start the OS-reported portion of the epilog until after
@@ -10913,6 +10915,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1091310915

1091410916
// This is the end of the OS-reported prolog for purposes of unwinding
1091510917
compiler->unwindEndProlog();
10918+
10919+
genClearAvxStateInProlog();
1091610920
}
1091710921

1091810922
/*****************************************************************************
@@ -10933,6 +10937,8 @@ void CodeGen::genFuncletEpilog()
1093310937

1093410938
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1093510939

10940+
genClearAvxStateInEpilog();
10941+
1093610942
inst_RV_IV(INS_add, REG_SPBASE, genFuncletInfo.fiSpDelta, EA_PTRSIZE);
1093710943
instGen_Return(0);
1093810944
}
@@ -11030,6 +11036,8 @@ void CodeGen::genFuncletProlog(BasicBlock* block)
1103011036
// Add a padding for 16-byte alignment
1103111037
inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
1103211038
#endif
11039+
11040+
genClearAvxStateInProlog();
1103311041
}
1103411042

1103511043
/*****************************************************************************
@@ -11048,6 +11056,8 @@ void CodeGen::genFuncletEpilog()
1104811056

1104911057
ScopedSetVariable<bool> _setGeneratingEpilog(&compiler->compGeneratingEpilog, true);
1105011058

11059+
genClearAvxStateInEpilog();
11060+
1105111061
#ifdef UNIX_X86_ABI
1105211062
// Revert a padding that was added for 16-byte alignment
1105311063
inst_RV_IV(INS_add, REG_SPBASE, 12, EA_PTRSIZE);
@@ -11337,40 +11347,21 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu
1133711347
// Save compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1133811348
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1133911349
// Here offset = 16-byte aligned offset after pushing integer registers.
11340-
//
11341-
// Params
11342-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11343-
// non-funclet: this will be compLclFrameSize.
11344-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11345-
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
11350+
void CodeGen::genPreserveCalleeSavedFltRegs()
1134611351
{
1134711352
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1134811353

1134911354
// Only callee saved floating point registers should be in regMask
1135011355
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1135111356

11352-
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11353-
{
11354-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11355-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11356-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11357-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11358-
// register) and before any call to an unknown function.
11359-
11360-
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11361-
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11362-
// This reduces the overall amount of codegen, particularly for more common paths not using any
11363-
// SIMD or floating-point.
11364-
11365-
instGen(INS_vzeroupper);
11366-
}
11367-
1136811357
// fast path return
1136911358
if (regMask == RBM_NONE)
1137011359
{
1137111360
return;
1137211361
}
1137311362

11363+
unsigned lclFrameSize = compiler->compLclFrameSize;
11364+
1137411365
#ifdef TARGET_AMD64
1137511366
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1137611367
unsigned offset = lclFrameSize - firstFPRegPadding - XMM_REGSIZE_BYTES;
@@ -11400,35 +11391,21 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
1140011391
// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
1140111392
// down the stack to the largest register number stored at [RSP+offset-(genCountBits(regMask)-1)*XMM_REG_SIZE]
1140211393
// Here offset = 16-byte aligned offset after pushing integer registers.
11403-
//
11404-
// Params
11405-
// lclFrameSize - Fixed frame size excluding callee pushed int regs.
11406-
// non-funclet: this will be compLclFrameSize.
11407-
// funclet frames: this will be FuncletInfo.fiSpDelta.
11408-
void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
11394+
void CodeGen::genRestoreCalleeSavedFltRegs()
1140911395
{
1141011396
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
1141111397

1141211398
// Only callee saved floating point registers should be in regMask
1141311399
assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
1141411400

11415-
if (GetEmitter()->Contains256bitOrMoreAVX())
11416-
{
11417-
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11418-
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11419-
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11420-
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11421-
// register) and before any call to an unknown function.
11422-
11423-
instGen(INS_vzeroupper);
11424-
}
11425-
1142611401
// fast path return
1142711402
if (regMask == RBM_NONE)
1142811403
{
1142911404
return;
1143011405
}
1143111406

11407+
unsigned lclFrameSize = compiler->compLclFrameSize;
11408+
1143211409
#ifdef TARGET_AMD64
1143311410
unsigned firstFPRegPadding = compiler->lvaIsCalleeSavedIntRegCountEven() ? REGSIZE_BYTES : 0;
1143411411
instruction copyIns = ins_Copy(TYP_FLOAT);
@@ -11470,6 +11447,45 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
1147011447
}
1147111448
}
1147211449

11450+
//-----------------------------------------------------------------------------------
11451+
// genClearAvxStateInProlog: Generate vzeroupper instruction to clear AVX state if necessary in a prolog
11452+
//
11453+
void CodeGen::genClearAvxStateInProlog()
11454+
{
11455+
if (GetEmitter()->ContainsCallNeedingVzeroupper() && !GetEmitter()->Contains256bitOrMoreAVX())
11456+
{
11457+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11458+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11459+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11460+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11461+
// register) and before any call to an unknown function.
11462+
11463+
// This method contains a call that needs vzeroupper but also doesn't use 256-bit or higher
11464+
// AVX itself. Thus we can optimize to only emitting a single vzeroupper in the function prologue
11465+
// This reduces the overall amount of codegen, particularly for more common paths not using any
11466+
// SIMD or floating-point.
11467+
11468+
instGen(INS_vzeroupper);
11469+
}
11470+
}
11471+
11472+
//-----------------------------------------------------------------------------------
11473+
// genClearAvxStateInEpilog: Generate vzeroupper instruction to clear AVX state if necessary in an epilog
11474+
//
11475+
void CodeGen::genClearAvxStateInEpilog()
11476+
{
11477+
if (GetEmitter()->Contains256bitOrMoreAVX())
11478+
{
11479+
// The Intel optimization manual guidance in `3.11.5.3 Fixing Instruction Slowdowns` states:
11480+
// Insert a VZEROUPPER to tell the hardware that the state of the higher registers is clean
11481+
// between the VEX and the legacy SSE instructions. Often the best way to do this is to insert a
11482+
// VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
11483+
// register) and before any call to an unknown function.
11484+
11485+
instGen(INS_vzeroupper);
11486+
}
11487+
}
11488+
1147311489
//-----------------------------------------------------------------------------------
1147411490
// instGen_MemoryBarrier: Emit a MemoryBarrier instruction
1147511491
//

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,19 +245,13 @@ NESTED_END RhpRethrow, _TEXT
245245

246246
alloc_stack stack_alloc_size
247247

248-
// Mirror clearing of AVX state done by regular method prologs
249-
vzeroupper
250-
251248
END_PROLOGUE
252249
.endm
253250

254251
//
255252
// Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
256253
//
257254
.macro FUNCLET_CALL_EPILOGUE
258-
// Mirror clearing of AVX state done by regular method epilogs
259-
vzeroupper
260-
261255
free_stack stack_alloc_size
262256

263257
pop_nonvol_reg rbp

src/coreclr/nativeaot/Runtime/amd64/ExceptionHandling.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
308308

309309
alloc_stack stack_alloc_size
310310

311-
;; Mirror clearing of AVX state done by regular method prologs
312-
vzeroupper
313-
314311
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
315312
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
316313
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -329,9 +326,6 @@ endm
329326
;; Epilogue of all funclet calling helpers (RhpCallXXXXFunclet)
330327
;;
331328
FUNCLET_CALL_EPILOGUE macro
332-
;; Mirror clearing of AVX state done by regular method epilogs
333-
vzeroupper
334-
335329
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
336330
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
337331
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/AsmHelpers.asm

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -523,9 +523,6 @@ FUNCLET_CALL_PROLOGUE macro localsCount, alignStack
523523

524524
alloc_stack stack_alloc_size
525525

526-
;; Mirror clearing of AVX state done by regular method prologs
527-
vzeroupper
528-
529526
save_xmm128_postrsp xmm6, (arguments_scratch_area_size + 0 * 10h)
530527
save_xmm128_postrsp xmm7, (arguments_scratch_area_size + 1 * 10h)
531528
save_xmm128_postrsp xmm8, (arguments_scratch_area_size + 2 * 10h)
@@ -544,9 +541,6 @@ endm
544541
;; Epilogue of all funclet calling helpers (CallXXXXFunclet)
545542
;;
546543
FUNCLET_CALL_EPILOGUE macro
547-
;; Mirror clearing of AVX state done by regular method epilogs
548-
vzeroupper
549-
550544
movdqa xmm6, [rsp + arguments_scratch_area_size + 0 * 10h]
551545
movdqa xmm7, [rsp + arguments_scratch_area_size + 1 * 10h]
552546
movdqa xmm8, [rsp + arguments_scratch_area_size + 2 * 10h]

src/coreclr/vm/amd64/asmhelpers.S

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -378,19 +378,13 @@ LEAF_END ThisPtrRetBufPrecodeWorker, _TEXT
378378

379379
alloc_stack stack_alloc_size
380380

381-
// Mirror clearing of AVX state done by regular method prologs
382-
vzeroupper
383-
384381
END_PROLOGUE
385382
.endm
386383

387384
//
388385
// Epilogue of all funclet calling helpers (CallXXXXFunclet)
389386
//
390387
.macro FUNCLET_CALL_EPILOGUE
391-
// Mirror clearing of AVX state done by regular method epilogs
392-
vzeroupper
393-
394388
free_stack stack_alloc_size
395389

396390
pop_nonvol_reg rbp

0 commit comments

Comments (0)