Skip to content

Commit e3c8e17

Browse files
Reland "[DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA (#127083)"
This relands commit 7a06681.
1 parent c7dbf20 commit e3c8e17

File tree

3 files changed

+341
-260
lines changed

3 files changed

+341
-260
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,7 @@ namespace {
545545
SDValue visitMGATHER(SDNode *N);
546546
SDValue visitMSCATTER(SDNode *N);
547547
SDValue visitMHISTOGRAM(SDNode *N);
548+
SDValue visitPARTIAL_REDUCE_MLA(SDNode *N);
548549
SDValue visitVPGATHER(SDNode *N);
549550
SDValue visitVPSCATTER(SDNode *N);
550551
SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1973,6 +1974,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
19731974
case ISD::MSCATTER: return visitMSCATTER(N);
19741975
case ISD::MSTORE: return visitMSTORE(N);
19751976
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
1977+
case ISD::PARTIAL_REDUCE_SMLA:
1978+
case ISD::PARTIAL_REDUCE_UMLA:
1979+
return visitPARTIAL_REDUCE_MLA(N);
19761980
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
19771981
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
19781982
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12492,6 +12496,58 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
1249212496
return SDValue();
1249312497
}
1249412498

12499+
// Folds PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(LHSExtOp), ZEXT(RHSExtOp)),
12500+
// Splat(1)) into
12501+
// PARTIAL_REDUCE_UMLA(Acc, LHSExtOp, RHSExtOp).
12502+
// Folds PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(LHSExtOp), SEXT(RHSExtOp)),
12503+
// Splat(1)) into
12504+
// PARTIAL_REDUCE_SMLA(Acc, LHSExtOp, RHSExtOp).
// Both multiply operands must use the same extend opcode and extend from the
// same type. Only SIGN_EXTEND selects the SMLA form; every other opcode
// accepted by ISD::isExtOpcode selects the UMLA form. Returns an empty SDValue
// when the pattern does not match.
12505+
SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
12506+
SDLoc DL(N);
12507+
12508+
SDValue Acc = N->getOperand(0);
12509+
SDValue Op1 = N->getOperand(1);
12510+
SDValue Op2 = N->getOperand(2);
12511+
12512+
// Operand 1 must be a MUL and operand 2 must be a constant splat of one.
APInt ConstantOne;
12513+
if (Op1->getOpcode() != ISD::MUL ||
12514+
!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) ||
12515+
!ConstantOne.isOne())
12516+
return SDValue();
12517+
12518+
// Both operands of the multiply must themselves be extend nodes.
SDValue LHS = Op1->getOperand(0);
12519+
SDValue RHS = Op1->getOperand(1);
12520+
unsigned LHSOpcode = LHS->getOpcode();
12521+
unsigned RHSOpcode = RHS->getOpcode();
12522+
if (!ISD::isExtOpcode(LHSOpcode) || !ISD::isExtOpcode(RHSOpcode))
12523+
return SDValue();
12524+
12525+
// The two extends must share both their source type and their opcode.
SDValue LHSExtOp = LHS->getOperand(0);
12526+
SDValue RHSExtOp = RHS->getOperand(0);
12527+
EVT LHSExtOpVT = LHSExtOp.getValueType();
12528+
if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode)
12529+
return SDValue();
12530+
12531+
// FIXME: Add a check to only perform the DAG combine if there is lowering
12532+
// provided by the target
12533+
12534+
bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND;
12535+
12536+
// A signedness mismatch between the operand extends and the node itself is
12537+
// only tolerated when the multiply already has the accumulator's element
12538+
// type. Otherwise this is a 2-stage extend with differing signedness, which
// cannot be folded into only a signed or unsigned node, so bail out.
12539+
bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
12540+
EVT AccElemVT = Acc.getValueType().getVectorElementType();
12541+
if (ExtIsSigned != NodeIsSigned &&
12542+
Op1.getValueType().getVectorElementType() != AccElemVT)
12543+
return SDValue();
12544+
12545+
// The signedness of the replacement node follows the operand extends, and the
// node is built directly on the pre-extension operands.
unsigned NewOpcode =
12546+
ExtIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
12547+
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
12548+
RHSExtOp);
12549+
}
12550+
1249512551
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
1249612552
auto *SLD = cast<VPStridedLoadSDNode>(N);
1249712553
EVT EltVT = SLD->getValueType(0).getVectorElementType();

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 93 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
1212
;
1313
; CHECK-NODOT-LABEL: udot:
1414
; CHECK-NODOT: // %bb.0:
15-
; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16-
; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17-
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18-
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19-
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20-
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21-
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
15+
; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16+
; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17+
; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18+
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19+
; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20+
; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21+
; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22+
; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23+
; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
2224
; CHECK-NODOT-NEXT: ret
2325
%u.wide = zext <16 x i8> %u to <16 x i32>
2426
%s.wide = zext <16 x i8> %s to <16 x i32>
@@ -50,18 +52,20 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
5052
; CHECK-NODOT-NEXT: mov x8, xzr
5153
; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
5254
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
53-
; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
54-
; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
55+
; CHECK-NODOT-NEXT: ldr q0, [x1, x8]
56+
; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
5557
; CHECK-NODOT-NEXT: add x8, x8, #16
5658
; CHECK-NODOT-NEXT: cmp x8, #16
57-
; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
58-
; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
59+
; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
60+
; CHECK-NODOT-NEXT: ushll2 v4.8h, v2.16b, #0
61+
; CHECK-NODOT-NEXT: ushll v5.8h, v0.8b, #0
62+
; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
5963
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
60-
; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
61-
; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
62-
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
63-
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
64-
; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
64+
; CHECK-NODOT-NEXT: umull v6.4s, v4.4h, v3.4h
65+
; CHECK-NODOT-NEXT: umlal v1.4s, v2.4h, v5.4h
66+
; CHECK-NODOT-NEXT: umlal2 v6.4s, v2.8h, v5.8h
67+
; CHECK-NODOT-NEXT: umlal2 v1.4s, v4.8h, v3.8h
68+
; CHECK-NODOT-NEXT: add v1.4s, v6.4s, v1.4s
6569
; CHECK-NODOT-NEXT: b.ne .LBB1_1
6670
; CHECK-NODOT-NEXT: // %bb.2: // %end
6771
; CHECK-NODOT-NEXT: ret
@@ -95,17 +99,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
9599
;
96100
; CHECK-NODOT-LABEL: udot_narrow:
97101
; CHECK-NODOT: // %bb.0:
98-
; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
102+
; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
103+
; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
99104
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
100-
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
101-
; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
102-
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
103-
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
105+
; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
106+
; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
107+
; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
108+
; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
109+
; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
104110
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
105-
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
106-
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
107-
; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
111+
; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
112+
; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
108113
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
114+
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
109115
; CHECK-NODOT-NEXT: ret
110116
%u.wide = zext <8 x i8> %u to <8 x i32>
111117
%s.wide = zext <8 x i8> %s to <8 x i32>
@@ -122,13 +128,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
122128
;
123129
; CHECK-NODOT-LABEL: sdot:
124130
; CHECK-NODOT: // %bb.0:
125-
; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
126-
; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
127-
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
128-
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
129-
; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
130-
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
131-
; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
131+
; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
132+
; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
133+
; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
134+
; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
135+
; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
136+
; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
137+
; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
138+
; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
139+
; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
132140
; CHECK-NODOT-NEXT: ret
133141
%u.wide = sext <16 x i8> %u to <16 x i32>
134142
%s.wide = sext <16 x i8> %s to <16 x i32>
@@ -145,17 +153,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
145153
;
146154
; CHECK-NODOT-LABEL: sdot_narrow:
147155
; CHECK-NODOT: // %bb.0:
148-
; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
156+
; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
157+
; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
149158
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
150-
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
151-
; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
152-
; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
153-
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
159+
; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
160+
; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
161+
; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
162+
; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
163+
; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
154164
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
155-
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
156-
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
157-
; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
165+
; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
166+
; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
158167
; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
168+
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
159169
; CHECK-NODOT-NEXT: ret
160170
%u.wide = sext <8 x i8> %u to <8 x i32>
161171
%s.wide = sext <8 x i8> %s to <8 x i32>
@@ -407,19 +417,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
407417
;
408418
; CHECK-NODOT-LABEL: udot_8to64:
409419
; CHECK-NODOT: // %bb.0: // %entry
410-
; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
411-
; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
412-
; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
413-
; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
420+
; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
421+
; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
422+
; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
423+
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
424+
; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
425+
; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
414426
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
415-
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
416-
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
417-
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
418-
; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
419-
; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
420-
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
421-
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
422-
; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
427+
; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
428+
; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
429+
; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
430+
; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
431+
; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
432+
; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
433+
; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
434+
; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
435+
; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
436+
; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
437+
; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
438+
; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
439+
; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
440+
; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
423441
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
424442
; CHECK-NODOT-NEXT: ret
425443
entry:
@@ -442,19 +460,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
442460
;
443461
; CHECK-NODOT-LABEL: sdot_8to64:
444462
; CHECK-NODOT: // %bb.0: // %entry
445-
; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
446-
; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
447-
; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
448-
; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
463+
; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
464+
; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
465+
; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
466+
; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
467+
; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
468+
; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
449469
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
450-
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
451-
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
452-
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
453-
; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
454-
; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
455-
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
456-
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
457-
; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
470+
; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
471+
; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
472+
; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
473+
; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
474+
; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
475+
; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
476+
; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
477+
; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
478+
; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
479+
; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
480+
; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
481+
; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
482+
; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
483+
; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
458484
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
459485
; CHECK-NODOT-NEXT: ret
460486
entry:
@@ -771,9 +797,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
771797
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
772798
; CHECK-LABEL: not_udot:
773799
; CHECK: // %bb.0:
774-
; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
775-
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
776-
; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
800+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
801+
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
802+
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
803+
; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
777804
; CHECK-NEXT: ret
778805
%u.wide = zext <8 x i8> %u to <8 x i32>
779806
%s.wide = zext <8 x i8> %s to <8 x i32>

0 commit comments

Comments
 (0)