Skip to content

Commit

Permalink
Merge pull request #7318 from knn-k/aarch64acInlining
Browse files Browse the repository at this point in the history
AArch64: Improve arraycopy inlining
  • Loading branch information
0xdaryl authored May 15, 2024
2 parents 0331584 + 84de48b commit 57a22b2
Showing 1 changed file with 66 additions and 52 deletions.
118 changes: 66 additions & 52 deletions compiler/aarch64/codegen/OMRTreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7181,65 +7181,71 @@ inlineConstantLengthForwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regist
if (byteLen == 0)
return;

int64_t iteration64 = byteLen >> 6;
int32_t residue64 = byteLen & 0x3F;
int64_t iteration = byteLen >> 7;
int32_t residue = byteLen & 0x7F;
TR::Register *dataReg1 = (byteLen >= 16) ? cg->allocateRegister(TR_VRF) : NULL;
TR::Register *dataReg2 = (residue64 & 0xF) ? cg->allocateRegister() : NULL;
TR::Register *dataReg2 = (byteLen >= 32) ? cg->allocateRegister(TR_VRF) : NULL;
TR::Register *dataReg3 = ((iteration > 1) || (residue & 0xF)) ? cg->allocateRegister() : NULL;

if (iteration64 > 1)
if (iteration > 1)
{
TR::Register *cntReg = cg->allocateRegister();
loadConstant64(cg, node, iteration64, cntReg);
TR::Register *cntReg = dataReg3;
loadConstant64(cg, node, iteration, cntReg);

TR::LabelSymbol *loopLabel = generateLabelSymbol(cg);
generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel);

// Copy 16x4 bytes in a loop
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1);
// Copy 32x4 bytes in a loop
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32));
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subimmx, node, cntReg, cntReg, 1);
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2);
generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, cntReg, loopLabel);

cg->stopUsingRegister(cntReg);
}
else if (iteration64 == 1)
else if (iteration == 1)
{
residue += 128;
}

while (residue >= 32)
{
residue64 += 64;
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2);
residue -= 32;
}

int32_t offset = 0;
while (residue64 > 0)
while (residue > 0)
{
TR::InstOpCode::Mnemonic loadOp;
TR::InstOpCode::Mnemonic storeOp;
int32_t dataSize;
TR::Register *dataReg = (residue64 >= 16) ? dataReg1 : dataReg2;
TR::Register *dataReg = (residue >= 16) ? dataReg1 : dataReg3;

if (residue64 >= 16)
if (residue >= 16)
{
loadOp = TR::InstOpCode::vldrimmq;
storeOp = TR::InstOpCode::vstrimmq;
dataSize = 16;
}
else if (residue64 >= 8)
else if (residue >= 8)
{
loadOp = TR::InstOpCode::ldrimmx;
storeOp = TR::InstOpCode::strimmx;
dataSize = 8;
}
else if (residue64 >= 4)
else if (residue >= 4)
{
loadOp = TR::InstOpCode::ldrimmw;
storeOp = TR::InstOpCode::strimmw;
dataSize = 4;
}
else if (residue64 >= 2)
else if (residue >= 2)
{
loadOp = TR::InstOpCode::ldrhimm;
storeOp = TR::InstOpCode::strhimm;
Expand All @@ -7255,13 +7261,15 @@ inlineConstantLengthForwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regist
generateTrg1MemInstruction(cg, loadOp, node, dataReg, TR::MemoryReference::createWithDisplacement(cg, srcReg, offset));
generateMemSrc1Instruction(cg, storeOp, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, offset), dataReg);
offset += dataSize;
residue64 -= dataSize;
residue -= dataSize;
}

if (dataReg1)
cg->stopUsingRegister(dataReg1);
if (dataReg2)
cg->stopUsingRegister(dataReg2);
if (dataReg3)
cg->stopUsingRegister(dataReg3);

return;
}
Expand All @@ -7272,68 +7280,74 @@ inlineConstantLengthBackwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regis
if (byteLen == 0)
return;

int64_t iteration64 = byteLen >> 6;
int32_t residue64 = byteLen & 0x3F;
int64_t iteration = byteLen >> 7;
int32_t residue = byteLen & 0x7F;
TR::Register *dataReg1 = (byteLen >= 16) ? cg->allocateRegister(TR_VRF) : NULL;
TR::Register *dataReg2 = (residue64 & 0xF) ? cg->allocateRegister() : NULL;
TR::Register *dataReg2 = (byteLen >= 32) ? cg->allocateRegister(TR_VRF) : NULL;
TR::Register *dataReg3 = ((iteration > 1) || (residue & 0xF)) ? cg->allocateRegister() : NULL;

// Adjusting scrReg and dstReg
addConstant64(cg, node, srcReg, srcReg, byteLen);
addConstant64(cg, node, dstReg, dstReg, byteLen);

if (iteration64 > 1)
if (iteration > 1)
{
TR::Register *cntReg = cg->allocateRegister();
loadConstant64(cg, node, iteration64, cntReg);
TR::Register *cntReg = dataReg3;
loadConstant64(cg, node, iteration, cntReg);

TR::LabelSymbol *loopLabel = generateLabelSymbol(cg);
generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel);

// Copy 16x4 bytes in a loop
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1);
generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16));
generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1);
// Copy 32x4 bytes in a loop
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2);
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32));
generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subimmx, node, cntReg, cntReg, 1);
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2);
generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, cntReg, loopLabel);

cg->stopUsingRegister(cntReg);
}
else if (iteration64 == 1)
else if (iteration == 1)
{
residue += 128;
}

while (residue >= 32)
{
residue64 += 64;
generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32));
generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2);
residue -= 32;
}

while (residue64 > 0)
while (residue > 0)
{
TR::InstOpCode::Mnemonic loadOp;
TR::InstOpCode::Mnemonic storeOp;
int32_t dataSize;
TR::Register *dataReg = (residue64 >= 16) ? dataReg1 : dataReg2;
TR::Register *dataReg = (residue >= 16) ? dataReg1 : dataReg3;

if (residue64 >= 16)
if (residue >= 16)
{
loadOp = TR::InstOpCode::vldrpreq;
storeOp = TR::InstOpCode::vstrpreq;
dataSize = 16;
}
else if (residue64 >= 8)
else if (residue >= 8)
{
loadOp = TR::InstOpCode::ldrprex;
storeOp = TR::InstOpCode::strprex;
dataSize = 8;
}
else if (residue64 >= 4)
else if (residue >= 4)
{
loadOp = TR::InstOpCode::ldrprew;
storeOp = TR::InstOpCode::strprew;
dataSize = 4;
}
else if (residue64 >= 2)
else if (residue >= 2)
{
loadOp = TR::InstOpCode::ldrhpre;
storeOp = TR::InstOpCode::strhpre;
Expand All @@ -7348,7 +7362,7 @@ inlineConstantLengthBackwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regis

generateTrg1MemInstruction(cg, loadOp, node, dataReg, TR::MemoryReference::createWithDisplacement(cg, srcReg, -dataSize));
generateMemSrc1Instruction(cg, storeOp, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -dataSize), dataReg);
residue64 -= dataSize;
residue -= dataSize;
}

if (dataReg1)
Expand Down

0 comments on commit 57a22b2

Please sign in to comment.