diff --git a/compiler/aarch64/codegen/OMRCodeGenerator.cpp b/compiler/aarch64/codegen/OMRCodeGenerator.cpp
index 8317e198dc3..cbd8098aa07 100644
--- a/compiler/aarch64/codegen/OMRCodeGenerator.cpp
+++ b/compiler/aarch64/codegen/OMRCodeGenerator.cpp
@@ -710,6 +710,8 @@ bool OMR::ARM64::CodeGenerator::getSupportsOpCodeForAutoSIMD(TR::CPU *cpu, TR::I
       case TR::vmbitswap:
       case TR::vbyteswap:
       case TR::vmbyteswap:
+      case TR::mmAllTrue:
+      case TR::mmAnyTrue:
          // Float/ Double are not supported
          return (et == TR::Int8 || et == TR::Int16 || et == TR::Int32 || et == TR::Int64);
       case TR::vload:
diff --git a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
index 222738c2588..35540da350a 100644
--- a/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
+++ b/compiler/aarch64/codegen/OMRTreeEvaluator.cpp
@@ -1232,13 +1232,73 @@ OMR::ARM64::TreeEvaluator::mAllTrueEvaluator(TR::Node *node, TR::CodeGenerator *
 TR::Register*
 OMR::ARM64::TreeEvaluator::mmAnyTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg)
    {
-   return TR::TreeEvaluator::unImpOpEvaluator(node, cg);
+   TR::Node *firstChild = node->getFirstChild();
+   TR::Node *secondChild = node->getSecondChild();
+   TR_ASSERT_FATAL_WITH_NODE(node, firstChild->getDataType().getVectorLength() == TR::VectorLength128,
+                             "Only 128-bit vectors are supported %s", firstChild->getDataType().toString());
+
+   TR::Register *maskReg = cg->evaluate(firstChild);
+   TR::Register *mask2Reg = cg->evaluate(secondChild);
+   TR::Register *resultReg = cg->allocateRegister(TR_GPR);
+   TR::Register *tempReg = cg->allocateRegister(TR_VRF);
+
+   /*
+    * and   v2.16b, v0.16b, v1.16b
+    * ; umaxp is fast if arrangement specifier is 4s.
+    * umaxp v2.4s, v2.4s, v2.4s
+    * ; now relevant data is in lower 64bit of v2.
+    * umov  x0, v2.2d[0]
+    * cmp   x0, #0
+    * cset  x0, ne
+    */
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vand16b, node, tempReg, maskReg, mask2Reg);
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vumaxp4s, node, tempReg, tempReg, tempReg);
+   generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resultReg, tempReg, 0);
+   generateCompareImmInstruction(cg, node, resultReg, 0, true);
+   generateCSetInstruction(cg, node, resultReg, TR::CC_NE);
+
+   cg->stopUsingRegister(tempReg);
+   node->setRegister(resultReg);
+   cg->decReferenceCount(firstChild);
+   cg->decReferenceCount(secondChild);
+
+   return resultReg;
    }
 
 TR::Register*
 OMR::ARM64::TreeEvaluator::mmAllTrueEvaluator(TR::Node *node, TR::CodeGenerator *cg)
    {
-   return TR::TreeEvaluator::unImpOpEvaluator(node, cg);
+   TR::Node *firstChild = node->getFirstChild();
+   TR::Node *secondChild = node->getSecondChild();
+   TR_ASSERT_FATAL_WITH_NODE(node, firstChild->getDataType().getVectorLength() == TR::VectorLength128,
+                             "Only 128-bit vectors are supported %s", firstChild->getDataType().toString());
+
+   TR::Register *maskReg = cg->evaluate(firstChild);
+   TR::Register *mask2Reg = cg->evaluate(secondChild);
+   TR::Register *resultReg = cg->allocateRegister(TR_GPR);
+   TR::Register *tempReg = cg->allocateRegister(TR_VRF);
+
+   /*
+    * and   v2.16b, v0.16b, v1.16b
+    * ; uminp is fast if arrangement specifier is 4s.
+    * uminp v2.4s, v2.4s, v2.4s
+    * ; now relevant data is in lower 64bit of v2.
+    * umov  x0, v2.2d[0]
+    * cmn   x0, #1
+    * cset  x0, eq
+    */
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vand16b, node, tempReg, maskReg, mask2Reg);
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vuminp4s, node, tempReg, tempReg, tempReg);
+   generateMovVectorElementToGPRInstruction(cg, TR::InstOpCode::umovxd, node, resultReg, tempReg, 0);
+   generateCompareImmInstruction(cg, node, resultReg, -1, true);
+   generateCSetInstruction(cg, node, resultReg, TR::CC_EQ);
+
+   cg->stopUsingRegister(tempReg);
+   node->setRegister(resultReg);
+   cg->decReferenceCount(firstChild);
+   cg->decReferenceCount(secondChild);
+
+   return resultReg;
    }
 
 TR::Register*
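
Side note on the instruction selection above, not part of the patch: the inputs to mmAnyTrue/mmAllTrue are lane masks whose elements are all-ones or all-zeros, so a single pairwise reduction with the 4s arrangement folds the full 128 bits into the low 64 bits of the vector. umaxp leaves those bits nonzero iff some lane is set; uminp leaves them all-ones iff every lane is set, which is exactly what the cmp #0 / cset ne and cmn #1 / cset eq sequences test. A minimal NEON-intrinsics sketch of the same reductions (helper names are hypothetical; uint32x4_t is picked arbitrarily for illustration):

#include <arm_neon.h>
#include <stdint.h>

// Sketch only: mask lanes are assumed to be all-ones or all-zeros,
// which is what the evaluators above rely on.
static inline bool mmAnyTrueSketch(uint32x4_t mask1, uint32x4_t mask2)
   {
   uint32x4_t m = vandq_u32(mask1, mask2);    // and   v2.16b, v0.16b, v1.16b
   m = vpmaxq_u32(m, m);                      // umaxp v2.4s, v2.4s, v2.4s
   // Low 64 bits now summarize all four lanes: nonzero iff any lane was set.
   return vgetq_lane_u64(vreinterpretq_u64_u32(m), 0) != 0;   // umov; cmp #0; cset ne
   }

static inline bool mmAllTrueSketch(uint32x4_t mask1, uint32x4_t mask2)
   {
   uint32x4_t m = vandq_u32(mask1, mask2);    // and   v2.16b, v0.16b, v1.16b
   m = vpminq_u32(m, m);                      // uminp v2.4s, v2.4s, v2.4s
   // All-ones iff every lane was all-ones; cmn x0, #1 / cset eq tests
   // x0 == ~0 since x0 + 1 == 0 exactly when x0 == 0xFFFFFFFFFFFFFFFF.
   return vgetq_lane_u64(vreinterpretq_u64_u32(m), 0) == UINT64_MAX;   // umov; cmn #1; cset eq
   }

Because a mask lane is homogeneous down to the byte level, regrouping the vector into 32-bit lanes preserves both the any and the all answers, which is why the one 4s reduction covers all of Int8 through Int64 without per-type sequences.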