Use AVX2 in codegen for GT_STORE_BLK #55604

Merged
merged 32 commits on Sep 18, 2021
Commits
89538c1
Saving current state, current state does not currently pass tests
alexcovington Jul 7, 2021
068f5bf
Getting available vector register size from compiler instance instead…
alexcovington Jul 8, 2021
1e11d3e
Emit vzeroupper if there is potentially an AVX-SSE transition
alexcovington Jul 8, 2021
0b7f138
Copy remainder using same tempReg allocated during SIMD moves instead…
alexcovington Jul 8, 2021
fc0cf3d
Better guard for maximum SIMD register size
alexcovington Jul 9, 2021
0c982dc
Fix typo
alexcovington Jul 9, 2021
9952bbd
Remove commented out lines
alexcovington Jul 9, 2021
754ab73
Merge branch 'main' of https://github.com/dotnet/runtime into enable-…
alexcovington Jul 12, 2021
f385295
Insert vzeroupper if using AVX2
alexcovington Jul 12, 2021
58be507
Add another vzeroupper
alexcovington Jul 12, 2021
c784c45
Remove vzeroupper, inserting the instruction hurts performance
alexcovington Jul 12, 2021
de244c7
Merge branch 'main' into enable-avx2-gt-store-blk
alexcovington Jul 13, 2021
e21209c
Merge branch 'main' into enable-avx2-gt-store-blk
alexcovington Jul 13, 2021
b0199b1
Adding vzeroupper again for testing
alexcovington Jul 13, 2021
47fcb1d
Remove debug lines
alexcovington Jul 13, 2021
4cce390
Add assert before copying remaining bytes, change regSize condition t…
alexcovington Jul 13, 2021
98180b4
Fix typo
alexcovington Jul 13, 2021
eba6f31
Added check to ensure size is not 0 to prevent allocating register fo…
alexcovington Jul 15, 2021
6b31381
Using YMM registers during init block if block is large enough
alexcovington Jul 16, 2021
5517bdc
Add some comments
alexcovington Jul 16, 2021
8f434cb
Formatting
alexcovington Jul 16, 2021
e2e6259
Rename remainder variable to shiftBack, better describes the logic fo…
alexcovington Jul 19, 2021
dd7a85b
Use shift-back technique for init block as well.
alexcovington Jul 19, 2021
be3dfb2
Add assert for init block
alexcovington Jul 19, 2021
afa2060
Use xmm register to move remainder if it fits, shift-back for GPR for…
alexcovington Jul 28, 2021
3fddd55
Use xmm in init block for remainder if it is an appropriate size, shi…
alexcovington Jul 28, 2021
710941a
Avoid allocating GPR during LSRA for BlkOpKindUnroll
alexcovington Jul 30, 2021
fccef32
Merge branch 'main' of https://github.com/dotnet/runtime into enable-…
alexcovington Aug 3, 2021
c1ee2c4
Merge branch 'main' of https://github.com/dotnet/runtime into enable-…
alexcovington Aug 5, 2021
fa81b67
Remove shift-back of GPR for remainder, added back the step-down to r…
alexcovington Aug 5, 2021
c1497d1
Shift-back GPR when handling remainder for AMD64 only
alexcovington Aug 5, 2021
fd2a9d4
Merge branch 'main' of https://github.com/dotnet/runtime into enable-…
alexcovington Sep 16, 2021
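
Taken together, the commits do two things: pick a wider SIMD register when the block and the hardware allow it, and handle remainders by shifting the last move back instead of stepping down through ever smaller moves. A rough standalone sketch of the width selection follows; the constant names mirror the JIT's, but the helper itself and the hasAvx parameter (a stand-in for compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) are illustrative only.

    constexpr unsigned kXmmRegSizeBytes = 16; // XMM_REGSIZE_BYTES
    constexpr unsigned kYmmRegSizeBytes = 32; // YMM_REGSIZE_BYTES

    // Use the widest register only when the block can absorb at least one full
    // 32-byte store and AVX is available; otherwise stay with 16-byte XMM moves.
    static unsigned ChooseSimdRegSize(unsigned blockSize, bool hasAvx)
    {
        return ((blockSize >= kYmmRegSizeBytes) && hasAvx) ? kYmmRegSizeBytes : kXmmRegSizeBytes;
    }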
src/coreclr/jit/codegenxarch.cpp (170 additions, 10 deletions)
@@ -2782,7 +2782,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
assert(src->IsIntegralConst(0));
assert(willUseSimdMov);
#ifdef TARGET_AMD64
assert(size % 16 == 0);
assert(size >= XMM_REGSIZE_BYTES);
#else
assert(size % 8 == 0);
#endif
@@ -2797,24 +2797,33 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
{
// If the source is constant 0 then always use xorps, it's faster
// than copying the constant from a GPR to a XMM register.
emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
}
else
{
emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);

if (regSize == YMM_REGSIZE_BYTES)
{
// Extend the bytes in the lower lanes to the upper lanes
emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
}
#ifdef TARGET_X86
// For x86, we need one more to convert it from 8 bytes to 16 bytes.
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
#endif
}

instruction simdMov = simdUnalignedMovIns();
unsigned regSize = XMM_REGSIZE_BYTES;
unsigned bytesWritten = 0;

while (bytesWritten < size)
@@ -2828,8 +2837,21 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
#endif
if (bytesWritten + regSize > size)
{
#ifdef TARGET_AMD64
if (size - bytesWritten <= XMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}

// Shift dstOffset back to use full SIMD move
unsigned shiftBack = regSize - (size - bytesWritten);
assert(shiftBack <= regSize);
bytesWritten -= shiftBack;
dstOffset -= shiftBack;
#else
assert(srcIntReg != REG_NA);
break;
#endif
}

if (dstLclNum != BAD_VAR_NUM)
@@ -2849,14 +2871,51 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
size -= bytesWritten;
}

// Fill the remainder using normal stores.
// Fill the remainder using normal stores.
#ifdef TARGET_AMD64
unsigned regSize = REGSIZE_BYTES;

while (regSize > size)
{
regSize /= 2;
}

for (; size > regSize; size -= regSize, dstOffset += regSize)
{
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

if (size > 0)
{
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);
dstOffset -= shiftBack;

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}
#else // TARGET_X86
for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
{
while (regSize > size)
{
regSize /= 2;
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
Expand All @@ -2867,6 +2926,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
dstAddrIndexScale, dstOffset);
}
}
#endif
}

#ifdef TARGET_AMD64
@@ -3017,8 +3077,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);

instruction simdMov = simdUnalignedMovIns();
for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
size -= regSize, srcOffset += regSize, dstOffset += regSize)

// Get the largest SIMD register available if the size is large enough
unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
if (srcLclNum != BAD_VAR_NUM)
{
@@ -3041,15 +3106,109 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
}

// TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
// On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
// allocate a GPR just for the remainder.
if (size > 0)
{
if (size <= XMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}

// Copy the remainder by moving the last regSize bytes of the buffer
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);

srcOffset -= shiftBack;
dstOffset -= shiftBack;

if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

return;
}

// Fill the remainder with normal loads/stores
if (size > 0)
{
regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT);

#ifdef TARGET_AMD64
unsigned regSize = REGSIZE_BYTES;

while (regSize > size)
{
regSize /= 2;
}

for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

if (size > 0)
{
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);

srcOffset -= shiftBack;
dstOffset -= shiftBack;

if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}
#else // TARGET_X86
for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
while (regSize > size)
@@ -3077,6 +3236,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
dstAddrIndexScale, dstOffset);
}
}
#endif
}
}
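
The remainder handling in the hunks above replaces the old step-down-only approach with a "shift-back": the final store is moved back so that one full-width move ends exactly at the end of the block, rewriting a few already-written bytes instead of issuing a chain of narrower moves. Below is a minimal standalone sketch of the idea against plain memory; memcpy stands in for the SIMD or GPR moves the JIT emits, and all names are illustrative.

    #include <cassert>
    #include <cstddef>
    #include <cstring>

    // Copy 'size' bytes in 16-byte chunks, handling the tail with the shift-back trick.
    static void CopyBlockShiftBack(unsigned char* dst, const unsigned char* src, size_t size)
    {
        const size_t regSize = 16;   // stand-in for XMM_REGSIZE_BYTES
        assert(size >= regSize);     // mirrors the size >= XMM_REGSIZE_BYTES guard in lowering

        size_t offset = 0;
        for (; size >= regSize; size -= regSize, offset += regSize)
        {
            std::memcpy(dst + offset, src + offset, regSize); // full-width move
        }

        if (size > 0)
        {
            // Shift the last move back so it ends exactly at the end of the block.
            // The overlapping bytes are simply written twice with the same values.
            size_t shiftBack = regSize - size;
            offset -= shiftBack;
            std::memcpy(dst + offset, src + offset, regSize);
        }
    }

Per the commit messages, the shift-back for the scalar (GPR) remainder is applied on AMD64 only; the x86 path keeps the existing step-down loop (regSize /= 2) shown in the diff.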

src/coreclr/jit/lowerxarch.cpp (1 addition, 1 deletion)
@@ -225,7 +225,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
{
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
#ifdef TARGET_AMD64
const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
#else
const bool willUseOnlySimdMov = (size % 8 == 0);
#endif
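
This relaxation pairs with the shift-back remainder handling in codegen: a SIMD-only unroll no longer requires the size to be a 16-byte multiple, only that it is at least one XMM register wide. A hedged standalone restatement of the AMD64 predicate; the helper name is illustrative, not the JIT's.

    constexpr unsigned kXmmRegSizeBytes = 16; // XMM_REGSIZE_BYTES

    static bool WillUseOnlySimdMov(unsigned size, bool canUse16BytesSimdMov)
    {
        // Before this change: canUse16BytesSimdMov && (size % kXmmRegSizeBytes == 0)
        return canUse16BytesSimdMov && (size >= kXmmRegSizeBytes);
    }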
src/coreclr/jit/lsraxarch.cpp (2 additions, 2 deletions)
@@ -1364,7 +1364,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
switch (blkNode->gtBlkOpKind)
{
case GenTreeBlk::BlkOpKindUnroll:
if ((size % XMM_REGSIZE_BYTES) != 0)
if (size < XMM_REGSIZE_BYTES)
{
regMaskTP regMask = allRegs(TYP_INT);
#ifdef TARGET_X86
@@ -1381,7 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
if (size >= XMM_REGSIZE_BYTES)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
SetContainsAVXFlags();
SetContainsAVXFlags(size);
}
break;
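
Passing the block size into SetContainsAVXFlags lets LSRA record not just that AVX is in use but that 256-bit registers may be live, which feeds the later AVX/SSE transition handling (the vzeroupper experiments in the commit log). The following is a sketch of the intent only; the struct and field names are hypothetical, not the JIT's actual implementation.

    constexpr unsigned kYmmRegSizeBytes = 32; // YMM_REGSIZE_BYTES

    struct AvxUsageFlagsSketch
    {
        bool containsAvx       = false;
        bool contains256bitAvx = false;

        // Hypothetical stand-in for the effect of SetContainsAVXFlags(size): any SIMD
        // use marks the method as AVX-using; a block wide enough for 32-byte moves
        // additionally marks it as touching the upper YMM halves.
        void SetContainsAVXFlags(unsigned simdRegSizeBytes)
        {
            containsAvx = true;
            if (simdRegSizeBytes >= kYmmRegSizeBytes)
            {
                contains256bitAvx = true;
            }
        }
    };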

Expand Down