From 84de48bf7c2f7bb6df3ea7ae240302971c74c990 Mon Sep 17 00:00:00 2001 From: KONNO Kazuhiro Date: Thu, 25 Apr 2024 13:12:04 +0900 Subject: [PATCH] AArch64: Improve arraycopy inlining This commit improves the inlined code of arraycopy for AArch64, by using ldp/stp instructions with a pair of SIMD registers, which can load/store 32 bytes at a time. Signed-off-by: KONNO Kazuhiro --- compiler/aarch64/codegen/OMRTreeEvaluator.cpp | 118 ++++++++++-------- 1 file changed, 66 insertions(+), 52 deletions(-) diff --git a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp index 84146fd2b9..6ad15ef8eb 100644 --- a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp +++ b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp @@ -7181,65 +7181,71 @@ inlineConstantLengthForwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regist if (byteLen == 0) return; - int64_t iteration64 = byteLen >> 6; - int32_t residue64 = byteLen & 0x3F; + int64_t iteration = byteLen >> 7; + int32_t residue = byteLen & 0x7F; TR::Register *dataReg1 = (byteLen >= 16) ? cg->allocateRegister(TR_VRF) : NULL; - TR::Register *dataReg2 = (residue64 & 0xF) ? cg->allocateRegister() : NULL; + TR::Register *dataReg2 = (byteLen >= 32) ? cg->allocateRegister(TR_VRF) : NULL; + TR::Register *dataReg3 = ((iteration > 1) || (residue & 0xF)) ? cg->allocateRegister() : NULL; - if (iteration64 > 1) + if (iteration > 1) { - TR::Register *cntReg = cg->allocateRegister(); - loadConstant64(cg, node, iteration64, cntReg); + TR::Register *cntReg = dataReg3; + loadConstant64(cg, node, iteration, cntReg); TR::LabelSymbol *loopLabel = generateLabelSymbol(cg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel); - // Copy 16x4 bytes in a loop - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpostq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, 16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 16), dataReg1); + // Copy 32x4 bytes in a loop + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32)); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subimmx, node, cntReg, cntReg, 1); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2); generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, cntReg, loopLabel); - - cg->stopUsingRegister(cntReg); } - else if (iteration64 == 1) + else if (iteration == 1) + { + residue += 128; + } + + while (residue >= 32) { - residue64 += 64; + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppostq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, 32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppostq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, 32), dataReg1, dataReg2); + residue -= 32; } int32_t offset = 0; - while (residue64 > 0) + while (residue > 0) { TR::InstOpCode::Mnemonic loadOp; TR::InstOpCode::Mnemonic storeOp; int32_t dataSize; - TR::Register *dataReg = (residue64 >= 16) ? dataReg1 : dataReg2; + TR::Register *dataReg = (residue >= 16) ? dataReg1 : dataReg3; - if (residue64 >= 16) + if (residue >= 16) { loadOp = TR::InstOpCode::vldrimmq; storeOp = TR::InstOpCode::vstrimmq; dataSize = 16; } - else if (residue64 >= 8) + else if (residue >= 8) { loadOp = TR::InstOpCode::ldrimmx; storeOp = TR::InstOpCode::strimmx; dataSize = 8; } - else if (residue64 >= 4) + else if (residue >= 4) { loadOp = TR::InstOpCode::ldrimmw; storeOp = TR::InstOpCode::strimmw; dataSize = 4; } - else if (residue64 >= 2) + else if (residue >= 2) { loadOp = TR::InstOpCode::ldrhimm; storeOp = TR::InstOpCode::strhimm; @@ -7255,13 +7261,15 @@ inlineConstantLengthForwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regist generateTrg1MemInstruction(cg, loadOp, node, dataReg, TR::MemoryReference::createWithDisplacement(cg, srcReg, offset)); generateMemSrc1Instruction(cg, storeOp, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, offset), dataReg); offset += dataSize; - residue64 -= dataSize; + residue -= dataSize; } if (dataReg1) cg->stopUsingRegister(dataReg1); if (dataReg2) cg->stopUsingRegister(dataReg2); + if (dataReg3) + cg->stopUsingRegister(dataReg3); return; } @@ -7272,68 +7280,74 @@ inlineConstantLengthBackwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regis if (byteLen == 0) return; - int64_t iteration64 = byteLen >> 6; - int32_t residue64 = byteLen & 0x3F; + int64_t iteration = byteLen >> 7; + int32_t residue = byteLen & 0x7F; TR::Register *dataReg1 = (byteLen >= 16) ? cg->allocateRegister(TR_VRF) : NULL; - TR::Register *dataReg2 = (residue64 & 0xF) ? cg->allocateRegister() : NULL; + TR::Register *dataReg2 = (byteLen >= 32) ? cg->allocateRegister(TR_VRF) : NULL; + TR::Register *dataReg3 = ((iteration > 1) || (residue & 0xF)) ? cg->allocateRegister() : NULL; // Adjusting scrReg and dstReg addConstant64(cg, node, srcReg, srcReg, byteLen); addConstant64(cg, node, dstReg, dstReg, byteLen); - if (iteration64 > 1) + if (iteration > 1) { - TR::Register *cntReg = cg->allocateRegister(); - loadConstant64(cg, node, iteration64, cntReg); + TR::Register *cntReg = dataReg3; + loadConstant64(cg, node, iteration, cntReg); TR::LabelSymbol *loopLabel = generateLabelSymbol(cg); generateLabelInstruction(cg, TR::InstOpCode::label, node, loopLabel); - // Copy 16x4 bytes in a loop - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1); - generateTrg1MemInstruction(cg, TR::InstOpCode::vldrpreq, node, dataReg1, TR::MemoryReference::createWithDisplacement(cg, srcReg, -16)); - generateMemSrc1Instruction(cg, TR::InstOpCode::vstrpreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -16), dataReg1); + // Copy 32x4 bytes in a loop + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2); + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32)); generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::subimmx, node, cntReg, cntReg, 1); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2); generateCompareBranchInstruction(cg, TR::InstOpCode::cbnzx, node, cntReg, loopLabel); - - cg->stopUsingRegister(cntReg); } - else if (iteration64 == 1) + else if (iteration == 1) + { + residue += 128; + } + + while (residue >= 32) { - residue64 += 64; + generateTrg2MemInstruction(cg, TR::InstOpCode::vldppreq, node, dataReg1, dataReg2, TR::MemoryReference::createWithDisplacement(cg, srcReg, -32)); + generateMemSrc2Instruction(cg, TR::InstOpCode::vstppreq, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -32), dataReg1, dataReg2); + residue -= 32; } - while (residue64 > 0) + while (residue > 0) { TR::InstOpCode::Mnemonic loadOp; TR::InstOpCode::Mnemonic storeOp; int32_t dataSize; - TR::Register *dataReg = (residue64 >= 16) ? dataReg1 : dataReg2; + TR::Register *dataReg = (residue >= 16) ? dataReg1 : dataReg3; - if (residue64 >= 16) + if (residue >= 16) { loadOp = TR::InstOpCode::vldrpreq; storeOp = TR::InstOpCode::vstrpreq; dataSize = 16; } - else if (residue64 >= 8) + else if (residue >= 8) { loadOp = TR::InstOpCode::ldrprex; storeOp = TR::InstOpCode::strprex; dataSize = 8; } - else if (residue64 >= 4) + else if (residue >= 4) { loadOp = TR::InstOpCode::ldrprew; storeOp = TR::InstOpCode::strprew; dataSize = 4; } - else if (residue64 >= 2) + else if (residue >= 2) { loadOp = TR::InstOpCode::ldrhpre; storeOp = TR::InstOpCode::strhpre; @@ -7348,7 +7362,7 @@ inlineConstantLengthBackwardArrayCopy(TR::Node *node, int64_t byteLen, TR::Regis generateTrg1MemInstruction(cg, loadOp, node, dataReg, TR::MemoryReference::createWithDisplacement(cg, srcReg, -dataSize)); generateMemSrc1Instruction(cg, storeOp, node, TR::MemoryReference::createWithDisplacement(cg, dstReg, -dataSize), dataReg); - residue64 -= dataSize; + residue -= dataSize; } if (dataReg1)