Skip to content

Commit c967e33

Browse files
JamesChesterman authored and NickGuy-Arm committed
[AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to USDOT
Add lowering of PARTIAL_REDUCE_UMLA/PARTIAL_REDUCE_SMLA nodes to USDOT instructions. The lowering applies when the second operand of the ISD node is a MUL whose two operands are extends of opposite signedness (one sign-extended, the other zero-extended).
1 parent 364835d commit c967e33

File tree

4 files changed

+109
-146
lines changed

4 files changed

+109
-146
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -920,8 +920,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
920920
/// illegal ResNo in that case.
921921
bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
922922
// See if the target wants to custom lower this node.
923-
if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
924-
return false;
923+
unsigned Opcode = N->getOpcode();
924+
bool IsPRMLAOpcode =
925+
Opcode == ISD::PARTIAL_REDUCE_UMLA || Opcode == ISD::PARTIAL_REDUCE_SMLA;
926+
927+
if (IsPRMLAOpcode) {
928+
if (TLI.getPartialReduceMLAAction(N->getValueType(0),
929+
N->getOperand(1).getValueType()) !=
930+
TargetLowering::Custom)
931+
return false;
932+
} else {
933+
if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
934+
return false;
935+
}
925936

926937
SmallVector<SDValue, 8> Results;
927938
if (LegalizeResult)

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7736,6 +7736,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
77367736
return LowerFLDEXP(Op, DAG);
77377737
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
77387738
return LowerVECTOR_HISTOGRAM(Op, DAG);
7739+
case ISD::PARTIAL_REDUCE_UMLA:
7740+
case ISD::PARTIAL_REDUCE_SMLA:
7741+
return LowerPARTIAL_REDUCE_MLA(Op, DAG);
77397742
}
77407743
}
77417744

@@ -27474,6 +27477,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
2747427477
if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
2747527478
Results.push_back(Res);
2747627479
return;
27480+
case ISD::PARTIAL_REDUCE_UMLA:
27481+
case ISD::PARTIAL_REDUCE_SMLA:
27482+
Results.push_back(LowerPARTIAL_REDUCE_MLA(SDValue(N, 0), DAG));
27483+
return;
2747727484
case ISD::ADD:
2747827485
case ISD::FADD:
2747927486
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
@@ -29396,6 +29403,80 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
2939629403
return Scatter;
2939729404
}
2939829405

29406+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(MulOpLHS), SEXT(MulOpRHS)), Splat 1)
29407+
// to USDOT(Acc, MulOpLHS, MulOpRHS)
29408+
// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(MulOpLHS), ZEXT(MulOpRHS)), Splat 1)
29409+
// to USDOT(Acc, MulOpRHS, MulOpLHS)
29410+
SDValue
29411+
AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
29412+
SelectionDAG &DAG) const {
29413+
bool Scalable = Op.getValueType().isScalableVector();
29414+
auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29415+
if (Scalable && !Subtarget.isSVEorStreamingSVEAvailable())
29416+
return SDValue();
29417+
if (!Scalable && (!Subtarget.isNeonAvailable() || !Subtarget.hasDotProd()))
29418+
return SDValue();
29419+
if (!Subtarget.hasMatMulInt8())
29420+
return SDValue();
29421+
SDLoc DL(Op);
29422+
29423+
if (Op.getOperand(1).getOpcode() != ISD::MUL)
29424+
return SDValue();
29425+
29426+
SDValue Acc = Op.getOperand(0);
29427+
SDValue Mul = Op.getOperand(1);
29428+
29429+
APInt ConstantOne;
29430+
if (!ISD::isConstantSplatVector(Op.getOperand(2).getNode(), ConstantOne) ||
29431+
!ConstantOne.isOne())
29432+
return SDValue();
29433+
29434+
SDValue ExtMulOpLHS = Mul.getOperand(0);
29435+
SDValue ExtMulOpRHS = Mul.getOperand(1);
29436+
unsigned ExtMulOpLHSOpcode = ExtMulOpLHS.getOpcode();
29437+
unsigned ExtMulOpRHSOpcode = ExtMulOpRHS.getOpcode();
29438+
if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
29439+
!ISD::isExtOpcode(ExtMulOpRHSOpcode))
29440+
return SDValue();
29441+
29442+
SDValue MulOpLHS = ExtMulOpLHS.getOperand(0);
29443+
SDValue MulOpRHS = ExtMulOpRHS.getOperand(0);
29444+
EVT MulOpLHSVT = MulOpLHS.getValueType();
29445+
if (MulOpLHSVT != MulOpRHS.getValueType())
29446+
return SDValue();
29447+
29448+
bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
29449+
bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
29450+
if (LHSIsSigned == RHSIsSigned)
29451+
return SDValue();
29452+
29453+
EVT AccVT = Acc.getValueType();
29454+
// There is no nxv2i64 version of usdot
29455+
if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
29456+
return SDValue();
29457+
29458+
// USDOT expects the signed operand to be last
29459+
if (!RHSIsSigned)
29460+
std::swap(MulOpLHS, MulOpRHS);
29461+
29462+
unsigned Opcode = AArch64ISD::USDOT;
29463+
// Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
29464+
// product followed by a zero / sign extension
29465+
// Don't want this to be split because there is no nxv2i64 version of usdot
29466+
if ((AccVT == MVT::nxv4i64 && MulOpLHSVT == MVT::nxv16i8) ||
29467+
(AccVT == MVT::v4i64 && MulOpLHSVT == MVT::v16i8)) {
29468+
EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
29469+
29470+
SDValue DotI32 =
29471+
DAG.getNode(Opcode, DL, AccVTI32, DAG.getConstant(0, DL, AccVTI32),
29472+
MulOpLHS, MulOpRHS);
29473+
SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
29474+
return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
29475+
}
29476+
29477+
return DAG.getNode(Opcode, DL, AccVT, Acc, MulOpLHS, MulOpRHS);
29478+
}
29479+
2939929480
SDValue
2940029481
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
2940129482
SelectionDAG &DAG) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ class AArch64TargetLowering : public TargetLowering {
11971197
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11981198
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
11991199
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
1200+
SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
12001201
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
12011202
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
12021203
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;

llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Lines changed: 14 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -106,25 +106,7 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
106106
;
107107
; CHECK-NEWLOWERING-LABEL: usdot:
108108
; CHECK-NEWLOWERING: // %bb.0: // %entry
109-
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
110-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
111-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
112-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
113-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
114-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
115-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
116-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
117-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
118-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
119-
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
120-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
121-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
122-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
123-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
124-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
125-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
126-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
127-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
109+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b
128110
; CHECK-NEWLOWERING-NEXT: ret
129111
entry:
130112
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -165,25 +147,7 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
165147
;
166148
; CHECK-NEWLOWERING-LABEL: sudot:
167149
; CHECK-NEWLOWERING: // %bb.0: // %entry
168-
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
169-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
170-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
171-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
172-
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
173-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
174-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
175-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
176-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
177-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
178-
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
179-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
180-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
181-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
182-
; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
183-
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
184-
; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
185-
; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
186-
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
150+
; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b
187151
; CHECK-NEWLOWERING-NEXT: ret
188152
entry:
189153
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -389,59 +353,12 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
389353
;
390354
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
391355
; CHECK-NEWLOWERING: // %bb.0: // %entry
392-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
393-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
394-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
395-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
396-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
397-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
398-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
399-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
400-
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
401-
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
402-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
403-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
404-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
405-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
406-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
407-
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
408-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
409-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
410-
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
411-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
412-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
413-
; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
414-
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
415-
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
416-
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
417-
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
418-
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
419-
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
420-
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
421-
; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
422-
; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
423-
; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
424-
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
425-
; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
426-
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
427-
; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
428-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
429-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
430-
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
431-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
432-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
433-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
434-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
435-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
436-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
437-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
438-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
439-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
440-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
441-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
442-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
443-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
444-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
356+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
357+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z2.b, z3.b
358+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
359+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
360+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
361+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
445362
; CHECK-NEWLOWERING-NEXT: ret
446363
entry:
447364
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -522,59 +439,12 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
522439
;
523440
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
524441
; CHECK-NEWLOWERING: // %bb.0: // %entry
525-
; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
526-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
527-
; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
528-
; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
529-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
530-
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
531-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
532-
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
533-
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
534-
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
535-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
536-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
537-
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
538-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
539-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
540-
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
541-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
542-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
543-
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
544-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
545-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
546-
; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
547-
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
548-
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
549-
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
550-
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
551-
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
552-
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
553-
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
554-
; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
555-
; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
556-
; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
557-
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
558-
; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
559-
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
560-
; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
561-
; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
562-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
563-
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
564-
; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
565-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
566-
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
567-
; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
568-
; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
569-
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
570-
; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
571-
; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
572-
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
573-
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
574-
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
575-
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
576-
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
577-
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
442+
; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
443+
; CHECK-NEWLOWERING-NEXT: usdot z4.s, z3.b, z2.b
444+
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
445+
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
446+
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
447+
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
578448
; CHECK-NEWLOWERING-NEXT: ret
579449
entry:
580450
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>

0 commit comments

Comments
 (0)