Skip to content

Commit ca0e587

Browse files
JamesChesterman authored and NickGuy-Arm committed
[AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to USDOT
Add lowering for PARTIAL_REDUCE_U/SMLA nodes to USDOT instructions. This happens when the second operand of the ISD node is a MUL instruction whose two operands are extended with opposite signedness.
1 parent c979ce7 commit ca0e587

File tree

4 files changed

+109
-146
lines changed

4 files changed

+109
-146
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -924,8 +924,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
924924
/// illegal ResNo in that case.
925925
bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
926926
// See if the target wants to custom lower this node.
927-
if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
928-
return false;
927+
unsigned Opcode = N->getOpcode();
928+
bool IsPRMLAOpcode =
929+
Opcode == ISD::PARTIAL_REDUCE_UMLA || Opcode == ISD::PARTIAL_REDUCE_SMLA;
930+
931+
if (IsPRMLAOpcode) {
932+
if (TLI.getPartialReduceMLAAction(N->getValueType(0),
933+
N->getOperand(1).getValueType()) !=
934+
TargetLowering::Custom)
935+
return false;
936+
} else {
937+
if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
938+
return false;
939+
}
929940

930941
SmallVector<SDValue, 8> Results;
931942
if (LegalizeResult)

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7734,6 +7734,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
77347734
return LowerFLDEXP(Op, DAG);
77357735
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
77367736
return LowerVECTOR_HISTOGRAM(Op, DAG);
7737+
case ISD::PARTIAL_REDUCE_UMLA:
7738+
case ISD::PARTIAL_REDUCE_SMLA:
7739+
return LowerPARTIAL_REDUCE_MLA(Op, DAG);
77377740
}
77387741
}
77397742

@@ -27500,6 +27503,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
2750027503
if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
2750127504
Results.push_back(Res);
2750227505
return;
27506+
case ISD::PARTIAL_REDUCE_UMLA:
27507+
case ISD::PARTIAL_REDUCE_SMLA:
27508+
Results.push_back(LowerPARTIAL_REDUCE_MLA(SDValue(N, 0), DAG));
27509+
return;
2750327510
case ISD::ADD:
2750427511
case ISD::FADD:
2750527512
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
@@ -29428,6 +29435,80 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
2942829435
return Scatter;
2942929436
}
2943029437

29438+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(MulOpLHS), SEXT(MulOpRHS)), Splat 1)
29439+
// to USDOT(Acc, MulOpLHS, MulOpRHS)
29440+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(MulOpLHS), ZEXT(MulOpRHS)), Splat 1)
29441+
// to USDOT(Acc, MulOpRHS, MulOpLHS)
29442+
SDValue
29443+
AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
29444+
SelectionDAG &DAG) const {
29445+
bool Scalable = Op.getValueType().isScalableVector();
29446+
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29447+
if (Scalable && !Subtarget.isSVEorStreamingSVEAvailable())
29448+
return SDValue();
29449+
if (!Scalable && (!Subtarget.isNeonAvailable() || !Subtarget.hasDotProd()))
29450+
return SDValue();
29451+
if (!Subtarget.hasMatMulInt8())
29452+
return SDValue();
29453+
SDLoc DL(Op);
29454+
29455+
if (Op.getOperand(1).getOpcode() != ISD::MUL)
29456+
return SDValue();
29457+
29458+
SDValue Acc = Op.getOperand(0);
29459+
SDValue Mul = Op.getOperand(1);
29460+
29461+
APInt ConstantOne;
29462+
if (!ISD::isConstantSplatVector(Op.getOperand(2).getNode(), ConstantOne) ||
29463+
!ConstantOne.isOne())
29464+
return SDValue();
29465+
29466+
SDValue ExtMulOpLHS = Mul.getOperand(0);
29467+
SDValue ExtMulOpRHS = Mul.getOperand(1);
29468+
unsigned ExtMulOpLHSOpcode = ExtMulOpLHS.getOpcode();
29469+
unsigned ExtMulOpRHSOpcode = ExtMulOpRHS.getOpcode();
29470+
if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
29471+
!ISD::isExtOpcode(ExtMulOpRHSOpcode))
29472+
return SDValue();
29473+
29474+
SDValue MulOpLHS = ExtMulOpLHS.getOperand(0);
29475+
SDValue MulOpRHS = ExtMulOpRHS.getOperand(0);
29476+
EVT MulOpLHSVT = MulOpLHS.getValueType();
29477+
if (MulOpLHSVT != MulOpRHS.getValueType())
29478+
return SDValue();
29479+
29480+
bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
29481+
bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
29482+
if (LHSIsSigned == RHSIsSigned)
29483+
return SDValue();
29484+
29485+
EVT AccVT = Acc.getValueType();
29486+
// There is no nxv2i64 version of usdot
29487+
if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
29488+
return SDValue();
29489+
29490+
// USDOT expects the signed operand to be last
29491+
if (!RHSIsSigned)
29492+
std::swap(MulOpLHS, MulOpRHS);
29493+
29494+
unsigned Opcode = AArch64ISD::USDOT;
29495+
// Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
29496+
// product followed by a zero / sign extension
29497+
// Don't want this to be split because there is no nxv2i64 version of usdot
29498+
if ((AccVT == MVT::nxv4i64 && MulOpLHSVT == MVT::nxv16i8) ||
29499+
(AccVT == MVT::v4i64 && MulOpLHSVT == MVT::v16i8)) {
29500+
EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
29501+
29502+
SDValue DotI32 =
29503+
DAG.getNode(Opcode, DL, AccVTI32, DAG.getConstant(0, DL, AccVTI32),
29504+
MulOpLHS, MulOpRHS);
29505+
SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
29506+
return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
29507+
}
29508+
29509+
return DAG.getNode(Opcode, DL, AccVT, Acc, MulOpLHS, MulOpRHS);
29510+
}
29511+
2943129512
SDValue
2943229513
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
2943329514
SelectionDAG &DAG) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ class AArch64TargetLowering : public TargetLowering {
11971197
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11981198
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11991199
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
1200+
SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
12001201
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
12011202
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
12021203
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 14 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -174,25 +174,7 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
174174
;
175175
; CHECK-NEWLOWERING-LABEL: usdot:
176176
; CHECK-NEWLOWERING: // %bb.0: // %entry
177-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
178-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
179-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
180-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
181-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
182-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
183-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
184-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
185-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
186-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
187-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
188-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
189-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
190-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
191-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
192-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
193-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
194-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
195-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
177+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b
196178
; CHECK-NEWLOWERING-NEXT: ret
197179
entry:
198180
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -233,25 +215,7 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
233215
;
234216
; CHECK-NEWLOWERING-LABEL: sudot:
235217
; CHECK-NEWLOWERING: // %bb.0: // %entry
236-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
237-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
238-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
239-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
240-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
241-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
242-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
243-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
244-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
245-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
246-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
247-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
248-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
249-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
250-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
251-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
252-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
253-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
254-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
218+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b
255219
; CHECK-NEWLOWERING-NEXT: ret
256220
entry:
257221
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -457,59 +421,12 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
457421
;
458422
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
459423
; CHECK-NEWLOWERING: // %bb.0: // %entry
460-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
461-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
462-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
463-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
464-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
465-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
466-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
467-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
468-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
469-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
470-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
471-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
472-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
473-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
474-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
475-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
476-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
477-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
478-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
479-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
480-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
481-
; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
482-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
483-
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
484-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
485-
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
486-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
487-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
488-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
489-
; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
490-
; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
491-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
492-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
493-
; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
494-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
495-
; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
496-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
497-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
498-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
499-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
500-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
501-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
502-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
503-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
504-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
505-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
506-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
507-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
508-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
509-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
510-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
511-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
512-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
424+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
425+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z2.b, z3.b
426+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
427+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
428+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
429+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
513430
; CHECK-NEWLOWERING-NEXT: ret
514431
entry:
515432
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -590,59 +507,12 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
590507
;
591508
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
592509
; CHECK-NEWLOWERING: // %bb.0: // %entry
593-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
594-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
595-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
596-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
597-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
598-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
599-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
600-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
601-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
602-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
603-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
604-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
605-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
606-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
607-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
608-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
609-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
610-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
611-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
612-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
613-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
614-
; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
615-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
616-
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
617-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
618-
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
619-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
620-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
621-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
622-
; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
623-
; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
624-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
625-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
626-
; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
627-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
628-
; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
629-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
630-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
631-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
632-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
633-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
634-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
635-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
636-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
637-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
638-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
639-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
640-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
641-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
642-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
643-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
644-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
645-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
510+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
511+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z3.b, z2.b
512+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
513+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
514+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
515+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
646516
; CHECK-NEWLOWERING-NEXT: ret
647517
entry:
648518
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>

0 commit comments

Comments
 (0)