
Commit 4d72c6b

Use AVX2 in codegen for GT_STORE_BLK (#55604)
* Saving current state, current state does not currently pass tests
* Getting available vector register size from compiler instance instead, resolves failing tests.
* Emit vzeroupper if there is potentially an AVX-SSE transition
* Copy remainder using same tempReg allocated during SIMD moves instead of allocating a GPR
* Better guard for maximum SIMD register size
* Fix typo
* Remove commented out lines
* Insert vzeroupper if using AVX2
* Add another vzeroupper
* Remove vzeroupper, inserting the instruction hurts performance
* Adding vzeroupper again for testing
* Remove debug lines
* Add assert before copying remaining bytes, change regSize condition to depend on AVX, remove vzeroupper inserts.
* Fix typo
* Added check to ensure size is not 0 to prevent allocating register for structs of size 0.
* Using YMM registers during init block if block is large enough
* Add some comments
* Formatting
* Rename remainder variable to shiftBack, better describes the logic for the block.
* Use shift-back technique for init block as well.
* Add assert for init block
* Use xmm register to move remainder if it fits, shift-back for GPR for remainder instead of step-down
* Use xmm in init block for remainder if it is an appropriate size, shift-back for GPR in init instead of step-down register size
* Avoid allocating GPR during LSRA for BlkOpKindUnroll
* Remove shift-back of GPR for remainder, added back the step-down to resolve test failures on x86
* Shift-back GPR when handling remainder for AMD64 only
1 parent c980180 commit 4d72c6b
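The shift-back technique mentioned in several of the commit messages above can be illustrated outside the JIT. Below is a minimal standalone C++ sketch (not the JIT code itself) of the init-block strategy: fill with the widest SIMD store available (32 bytes when AVX2 can be used), then cover the tail by shifting the final store back so it ends exactly at the end of the block. The function name, the `haveAvx2` parameter, and the use of compiler intrinsics are illustrative assumptions; the committed code emits the equivalent instructions through the JIT emitter, and building the 256-bit path below requires compiling with AVX2 enabled (e.g. `-mavx2`).

```cpp
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Illustrative sketch of the "shift-back" init-block strategy: zero-fill with
// the widest SIMD store available, then redo the last store at (size - regSize)
// so it overlaps the previous one and covers the tail without a scalar loop.
// Assumes size >= 16; haveAvx2 stands in for the JIT's ISA check.
static void ZeroBlockSketch(uint8_t* dst, size_t size, bool haveAvx2)
{
    size_t regSize = (haveAvx2 && size >= 32) ? 32 : 16;
    size_t offset  = 0;

    // Full-width stores while a whole register still fits.
    for (; offset + regSize <= size; offset += regSize)
    {
        if (regSize == 32)
            _mm256_storeu_si256((__m256i*)(dst + offset), _mm256_setzero_si256());
        else
            _mm_storeu_si128((__m128i*)(dst + offset), _mm_setzero_si128());
    }

    if (offset < size)
    {
        // Remainder: drop to an XMM store if the tail fits in 16 bytes, then
        // shift the store back so it ends exactly at the end of the block.
        if (size - offset <= 16)
            regSize = 16;
        offset = size - regSize;
        if (regSize == 32)
            _mm256_storeu_si256((__m256i*)(dst + offset), _mm256_setzero_si256());
        else
            _mm_storeu_si128((__m128i*)(dst + offset), _mm_setzero_si128());
    }
}
```

The trade-off is that a few bytes near the end are written twice, which is cheaper than stepping down through progressively smaller stores or reserving a general-purpose register just for the tail.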

File tree

3 files changed: +173 -13 lines changed


src/coreclr/jit/codegenxarch.cpp

Lines changed: 170 additions & 10 deletions
@@ -2782,7 +2782,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         assert(src->IsIntegralConst(0));
         assert(willUseSimdMov);
 #ifdef TARGET_AMD64
-        assert(size % 16 == 0);
+        assert(size >= XMM_REGSIZE_BYTES);
 #else
         assert(size % 8 == 0);
 #endif
@@ -2797,24 +2797,33 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
         if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
         {
             // If the source is constant 0 then always use xorps, it's faster
             // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
+            emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
         }
         else
         {
             emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
+
+            if (regSize == YMM_REGSIZE_BYTES)
+            {
+                // Extend the bytes in the lower lanes to the upper lanes
+                emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
+            }
 #ifdef TARGET_X86
             // For x86, we need one more to convert it from 8 bytes to 16 bytes.
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
 #endif
         }
 
         instruction simdMov = simdUnalignedMovIns();
-        unsigned regSize = XMM_REGSIZE_BYTES;
         unsigned bytesWritten = 0;
 
         while (bytesWritten < size)
@@ -2828,8 +2837,21 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
             if (bytesWritten + regSize > size)
             {
+#ifdef TARGET_AMD64
+                if (size - bytesWritten <= XMM_REGSIZE_BYTES)
+                {
+                    regSize = XMM_REGSIZE_BYTES;
+                }
+
+                // Shift dstOffset back to use full SIMD move
+                unsigned shiftBack = regSize - (size - bytesWritten);
+                assert(shiftBack <= regSize);
+                bytesWritten -= shiftBack;
+                dstOffset -= shiftBack;
+#else
                 assert(srcIntReg != REG_NA);
                 break;
+#endif
             }
 
             if (dstLclNum != BAD_VAR_NUM)
@@ -2849,14 +2871,51 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         size -= bytesWritten;
     }
 
-    // Fill the remainder using normal stores.
+    // Fill the remainder using normal stores.
+#ifdef TARGET_AMD64
+    unsigned regSize = REGSIZE_BYTES;
+
+    while (regSize > size)
+    {
+        regSize /= 2;
+    }
+
+    for (; size > regSize; size -= regSize, dstOffset += regSize)
+    {
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+
+    if (size > 0)
+    {
+        unsigned shiftBack = regSize - size;
+        assert(shiftBack <= regSize);
+        dstOffset -= shiftBack;
+
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+#else // TARGET_X86
     for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
     {
         while (regSize > size)
         {
             regSize /= 2;
         }
-
         if (dstLclNum != BAD_VAR_NUM)
         {
             emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
@@ -2867,6 +2926,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                                 dstAddrIndexScale, dstOffset);
         }
     }
+#endif
 }
 
 #ifdef TARGET_AMD64
@@ -3017,8 +3077,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
         instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
-             size -= regSize, srcOffset += regSize, dstOffset += regSize)
+
+        // Get the largest SIMD register available if the size is large enough
+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
+        for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             if (srcLclNum != BAD_VAR_NUM)
             {
@@ -3041,15 +3106,109 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
             }
         }
 
-        // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
-        // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
-        // allocate a GPR just for the remainder.
+        if (size > 0)
+        {
+            if (size <= XMM_REGSIZE_BYTES)
+            {
+                regSize = XMM_REGSIZE_BYTES;
+            }
+
+            // Copy the remainder by moving the last regSize bytes of the buffer
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        return;
     }
 
+    // Fill the remainder with normal loads/stores
     if (size > 0)
     {
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT);
 
+#ifdef TARGET_AMD64
+        unsigned regSize = REGSIZE_BYTES;
+
+        while (regSize > size)
+        {
+            regSize /= 2;
+        }
+
+        for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
+        {
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        if (size > 0)
+        {
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+#else // TARGET_X86
         for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             while (regSize > size)
@@ -3077,6 +3236,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
                                     dstAddrIndexScale, dstOffset);
             }
         }
+#endif
     }
 }
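For `genCodeForCpBlkUnroll` the same idea is applied to a copy: load the last `regSize` bytes of the source and store them at the end of the destination, overlapping the previous chunk, so no general-purpose register is needed for the remainder. Below is a minimal standalone sketch under the same assumptions as before (`haveAvx2` and the function name are illustrative; `src` and `dst` are assumed not to overlap and `size` is assumed to be at least 16); it is not the JIT code itself.

```cpp
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Illustrative sketch of the unrolled copy with a shifted-back SIMD tail.
// The final load/store pair may overlap the previous chunk of dst, which is
// harmless because every byte is re-read from the non-overlapping src.
static void CopyBlockSketch(uint8_t* dst, const uint8_t* src, size_t size, bool haveAvx2)
{
    size_t regSize = (haveAvx2 && size >= 32) ? 32 : 16;
    size_t offset  = 0;

    // Copy full-width chunks while a whole register still fits.
    for (; offset + regSize <= size; offset += regSize)
    {
        if (regSize == 32)
        {
            __m256i v = _mm256_loadu_si256((const __m256i*)(src + offset));
            _mm256_storeu_si256((__m256i*)(dst + offset), v);
        }
        else
        {
            __m128i v = _mm_loadu_si128((const __m128i*)(src + offset));
            _mm_storeu_si128((__m128i*)(dst + offset), v);
        }
    }

    if (offset < size)
    {
        // Copy the remainder by re-copying the last regSize bytes of the
        // buffer, shifted back so the chunk ends at the end of the block.
        if (size - offset <= 16)
            regSize = 16;
        offset = size - regSize;
        if (regSize == 32)
        {
            __m256i v = _mm256_loadu_si256((const __m256i*)(src + offset));
            _mm256_storeu_si256((__m256i*)(dst + offset), v);
        }
        else
        {
            __m128i v = _mm_loadu_si128((const __m128i*)(src + offset));
            _mm_storeu_si128((__m128i*)(dst + offset), v);
        }
    }
}
```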

src/coreclr/jit/lowerxarch.cpp

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
     {
         const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
 #ifdef TARGET_AMD64
-        const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+        const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
 #else
         const bool willUseOnlySimdMov = (size % 8 == 0);
 #endif
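The lowering change above widens the set of block sizes that AMD64 unrolls purely with SIMD moves. A small worked example of the predicate follows; the constant value and helper name are illustrative stand-ins (`XMM_REGSIZE_BYTES` is 16 on xarch), not the JIT's own code.

```cpp
// Before: only sizes that were an exact multiple of 16 bytes qualified, so a
// 24-byte block fell back to GPR stores for its last 8 bytes.
// After: any block of at least one XMM register qualifies, because the
// shifted-back tail move covers the remainder.
constexpr unsigned kXmmRegSizeBytes = 16; // stand-in for XMM_REGSIZE_BYTES

static bool WillUseOnlySimdMovSketch(unsigned size, bool canUse16BytesSimdMov)
{
    return canUse16BytesSimdMov && (size >= kXmmRegSizeBytes);
}

// Example: size == 24
//   old condition: 24 % 16 == 0                 -> false
//   new condition: WillUseOnlySimdMovSketch(24) -> true (two overlapping 16-byte moves)
```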

src/coreclr/jit/lsraxarch.cpp

Lines changed: 2 additions & 2 deletions
@@ -1364,7 +1364,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         switch (blkNode->gtBlkOpKind)
         {
             case GenTreeBlk::BlkOpKindUnroll:
-                if ((size % XMM_REGSIZE_BYTES) != 0)
+                if (size < XMM_REGSIZE_BYTES)
                 {
                     regMaskTP regMask = allRegs(TYP_INT);
 #ifdef TARGET_X86
@@ -1381,7 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                 if (size >= XMM_REGSIZE_BYTES)
                 {
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    SetContainsAVXFlags();
+                    SetContainsAVXFlags(size);
                 }
                 break;

Comments (0)