Skip to content

Use SIMD for block inits with GC fields #102132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 79 additions & 41 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3231,6 +3231,18 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
assert(size <= INT32_MAX);
assert(dstOffset < (INT32_MAX - static_cast<int>(size)));

auto emitStore = [&](instruction ins, unsigned width, regNumber target) {
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(ins, EA_ATTR(width), target, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(ins, EA_ATTR(width), target, dstAddrBaseReg, dstAddrIndexReg, dstAddrIndexScale,
dstOffset);
}
};

#ifdef FEATURE_SIMD
if (willUseSimdMov)
{
Expand All @@ -3244,18 +3256,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
instruction simdMov = simdUnalignedMovIns();
unsigned bytesWritten = 0;

auto emitSimdMovs = [&]() {
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
};

while (bytesWritten < size)
{
if (bytesWritten + regSize > size)
Expand All @@ -3264,7 +3264,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
break;
}

emitSimdMovs();
emitStore(simdMov, regSize, srcXmmReg);
dstOffset += regSize;
bytesWritten += regSize;
}
Expand All @@ -3279,10 +3279,71 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

// Rewind dstOffset so we can fit a vector for the remainder
dstOffset -= (regSize - size);
emitSimdMovs();
emitStore(simdMov, regSize, srcXmmReg);
size = 0;
}
}
else if (node->IsOnHeapAndContainsReferences() && ((internalRegisters.GetAll(node) & RBM_ALLFLOAT) != 0))
{
// For blocks with GC refs we can still use SIMD, but only for continuous
// non-GC parts, where the atomicity guarantees are not as strict.
assert(!willUseSimdMov);
ClassLayout* layout = node->GetLayout();

regNumber simdZeroReg = REG_NA;
unsigned slots = layout->GetSlotCount();
unsigned slot = 0;
while (slot < slots)
{
if (!layout->IsGCPtr(slot))
{
// How many continuous non-GC slots do we have?
unsigned nonGcSlotCount = 0;
do
{
nonGcSlotCount++;
slot++;
} while ((slot < slots) && !layout->IsGCPtr(slot));

for (unsigned nonGcSlot = 0; nonGcSlot < nonGcSlotCount; nonGcSlot++)
{
// Are the remaining continuous non-GC slots enough to use SIMD?
unsigned simdSize = compiler->roundDownSIMDSize((nonGcSlotCount - nonGcSlot) * REGSIZE_BYTES);
if (simdSize > 0)
{
// Initialize simdZeroReg with zero on demand
if (simdZeroReg == REG_NA)
{
simdZeroReg = internalRegisters.GetSingle(node, RBM_ALLFLOAT);
// SIMD16 is sufficient for any SIMD size
simd_t vecCon = {};
genSetRegToConst(simdZeroReg, TYP_SIMD16, &vecCon);
}

emitStore(simdUnalignedMovIns(), simdSize, simdZeroReg);
dstOffset += (int)simdSize;
nonGcSlot += (simdSize / REGSIZE_BYTES) - 1;
}
else
{
emitStore(INS_mov, REGSIZE_BYTES, srcIntReg);
dstOffset += REGSIZE_BYTES;
}
}
}
else
{
// GC slot - update atomically
emitStore(INS_mov, REGSIZE_BYTES, srcIntReg);
dstOffset += REGSIZE_BYTES;
slot++;
}
}

// There are no trailing elements
assert((layout->GetSize() % TARGET_POINTER_SIZE) == 0);
size = 0;
}
#endif // FEATURE_SIMD

assert((srcIntReg != REG_NA) || (size == 0));
Expand All @@ -3298,15 +3359,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

for (; size > regSize; size -= regSize, dstOffset += regSize)
{
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
emitStore(INS_mov, regSize, srcIntReg);
}

// Handle the non-SIMD remainder by overlapping with previously processed data if needed
Expand All @@ -3322,15 +3375,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
assert(shiftBack <= regSize);
dstOffset -= shiftBack;

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
emitStore(INS_mov, regSize, srcIntReg);
}
#else // TARGET_X86
for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
Expand All @@ -3339,15 +3384,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
regSize /= 2;
}
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}

emitStore(INS_mov, regSize, srcIntReg);
}
#endif
}
Expand Down
27 changes: 24 additions & 3 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1430,9 +1430,30 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
{
case GenTreeBlk::BlkOpKindUnroll:
{
const bool canUse16BytesSimdMov =
!blkNode->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
bool willUseSimdMov = compiler->IsBaselineSimdIsaSupported() && (size >= XMM_REGSIZE_BYTES);
if (willUseSimdMov && blkNode->IsOnHeapAndContainsReferences())
{
ClassLayout* layout = blkNode->GetLayout();

unsigned xmmCandidates = 0;
unsigned continuousNonGc = 0;
for (unsigned slot = 0; slot < layout->GetSlotCount(); slot++)
{
if (layout->IsGCPtr(slot))
{
xmmCandidates += ((continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES);
continuousNonGc = 0;
}
else
{
continuousNonGc++;
}
}
xmmCandidates += ((continuousNonGc * TARGET_POINTER_SIZE) / XMM_REGSIZE_BYTES);

// Just one XMM candidate is not profitable
willUseSimdMov = xmmCandidates > 1;
}

if (willUseSimdMov)
{
Expand Down
Loading