[RISCV] Add DAG combine for forming VAADDU_VL from VP intrinsics. #124848

Merged 5 commits on Jan 30, 2025. (Changes shown from 4 commits.)
110 changes: 98 additions & 12 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1526,18 +1526,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
if (Subtarget.hasVInstructions())
setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER,
ISD::MSCATTER, ISD::VP_GATHER,
ISD::VP_SCATTER, ISD::SRA,
ISD::SRL, ISD::SHL,
ISD::STORE, ISD::SPLAT_VECTOR,
ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE,
ISD::MUL, ISD::SDIV,
ISD::UDIV, ISD::SREM,
ISD::UREM, ISD::INSERT_VECTOR_ELT,
ISD::ABS, ISD::CTPOP,
ISD::VECTOR_SHUFFLE, ISD::VSELECT});
setTargetDAGCombine(
Review comment (Collaborator): Stray format change.

Reply (Collaborator, Author): I added VP_TRUNCATE to the list. It ended up in the middle because that's where it was in my downstream.

{ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA,
ISD::SRL, ISD::SHL, ISD::STORE,
ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE,
ISD::MUL, ISD::SDIV, ISD::UDIV,
ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT,
ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE,
ISD::VSELECT});

if (Subtarget.hasVendorXTHeadMemPair())
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -16373,6 +16371,92 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
VPStore->isTruncatingStore(), VPStore->isCompressingStore());
}

// Peephole avgceil pattern.
// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, splat (i32 1)
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, splat (i32 1)
// %6 = trunc <N x i32> %5 to <N x i8>
static SDValue performVP_TRUNCATECombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
EVT VT = N->getValueType(0);

// Ignore fixed vectors.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!VT.isScalableVector() || !TLI.isTypeLegal(VT))
return SDValue();

SDValue In = N->getOperand(0);
SDValue Mask = N->getOperand(1);
SDValue VL = N->getOperand(2);

// Input should be a vp_srl with same mask and VL.
if (In.getOpcode() != ISD::VP_SRL || In.getOperand(2) != Mask ||
In.getOperand(3) != VL)
return SDValue();

// Shift amount should be 1.
if (!isOneOrOneSplat(In.getOperand(1)))
return SDValue();

// Shifted value should be a vp_add with same mask and VL.
SDValue LHS = In.getOperand(0);
if (LHS.getOpcode() != ISD::VP_ADD || LHS.getOperand(2) != Mask ||
LHS.getOperand(3) != VL)
return SDValue();

SDValue Operands[3];

// Matches another VP_ADD with same VL and Mask.
auto FindAdd = [&](SDValue V, SDValue Other) {
if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask ||
V.getOperand(3) != VL)
return false;

Operands[0] = Other;
Operands[1] = V.getOperand(1);
Operands[2] = V.getOperand(0);
return true;
};

// We need to find another VP_ADD in one of the operands.
SDValue LHS0 = LHS.getOperand(0);
SDValue LHS1 = LHS.getOperand(1);
if (!FindAdd(LHS0, LHS1) && !FindAdd(LHS1, LHS0))
return SDValue();

// Now we have three operands of two additions. Check that one of them is a
// constant vector with ones.
auto I = llvm::find_if(Operands,
[](const SDValue &Op) { return isOneOrOneSplat(Op); });
Review comment on lines +16431 to +16432 (Contributor): Can you pass the predicate directly?

Suggested change:
  auto I = llvm::find_if(Operands,
                         [](const SDValue &Op) { return isOneOrOneSplat(Op); });
becomes:
  auto I = llvm::find_if(Operands, isOneOrOneSplat);

Reply (Collaborator, Author):
This doesn't compile, possibly because isOneOrOneSplat has a second defaulted argument.
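As an aside, a minimal self-contained sketch of why the direct form fails, using a stand-in isOne for isOneOrOneSplat (the names below are illustrative, not LLVM APIs): a defaulted parameter belongs to the call expression, not to the function's type, so the name decays to a two-argument function pointer and find_if cannot invoke it with a single element.

#include <algorithm>
#include <vector>

// Stand-in for isOneOrOneSplat(SDValue V, bool AllowUndefs = false).
static bool isOne(int V, bool AllowUndefs = false) {
  (void)AllowUndefs;
  return V == 1;
}

int main() {
  std::vector<int> Operands{2, 1, 3};
  // Does not compile: the decayed pointer has type bool(*)(int, bool), and
  // find_if calls its predicate with exactly one argument.
  // auto I = std::find_if(Operands.begin(), Operands.end(), isOne);
  // Compiles: the lambda is a unary callable; the default argument is
  // applied at the actual call to isOne inside its body.
  auto I = std::find_if(Operands.begin(), Operands.end(),
                        [](int V) { return isOne(V); });
  return I == Operands.end() ? 1 : 0;
}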

if (I == std::end(Operands))
return SDValue();
// We found a vector with ones; move it to the end of the Operands array.
std::swap(*I, Operands[2]);

// Make sure the other 2 operands can be promoted from the result type.
for (SDValue Op : drop_end(Operands)) {
if (Op.getOpcode() != ISD::VP_ZERO_EXTEND || Op.getOperand(1) != Mask ||
Op.getOperand(2) != VL)
return SDValue();
// Input must be the same size or smaller than our result.
if (Op.getOperand(0).getScalarValueSizeInBits() > VT.getScalarSizeInBits())
return SDValue();
}

// Pattern is detected.
// Rebuild the zero extends in case the inputs are smaller than our result.
SDValue NewOp0 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT,
Operands[0].getOperand(0), Mask, VL);
SDValue NewOp1 = DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT,
Operands[1].getOperand(0), Mask, VL);
// Build a VAADDU with RNU rounding mode.
SDLoc DL(N);
return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT,
Review comment (Collaborator): AVGCEILU_VL appears to be previously unused code. Do we have any other such nodes that need to be deleted?

Reply (Collaborator, Author):
It's not unused; it's used via a macro in getRISCVVLOp.
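For context, a hedged, self-contained sketch of the token-pasting dispatch being referred to; OP_CASE and the stand-in enums below are assumptions for illustration, not quotes from getRISCVVLOp. The point is that RISCVISD::AVGCEILU_VL can be reached even though the identifier is never spelled out, which is why a textual search makes it look unused.

#include <cassert>

// Stand-in enums; the real ISD/RISCVISD opcode enums live in LLVM headers.
namespace ISD { enum { AVGCEILU = 1 }; }
namespace RISCVISD { enum { AVGCEILU_VL = 100 }; }

// Hypothetical macro mirroring the style of getRISCVVLOp's opcode mapping.
#define OP_CASE(NODE)                                                          \
  case ISD::NODE:                                                              \
    return RISCVISD::NODE##_VL;

static unsigned getVLOpcodeSketch(unsigned GenericOpc) {
  switch (GenericOpc) {
    OP_CASE(AVGCEILU) // expands to: case ISD::AVGCEILU: return RISCVISD::AVGCEILU_VL;
  default:
    return 0;
  }
}
#undef OP_CASE

int main() {
  assert(getVLOpcodeSketch(ISD::AVGCEILU) == RISCVISD::AVGCEILU_VL);
  return 0;
}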

{NewOp0, NewOp1, DAG.getUNDEF(VT), Mask, VL});
}
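A short aside on why RNU is the right rounding mode here (my gloss, not text from the patch): with vxrm set to 0 (rnu), vaaddu.vv computes ((a + b) >> 1) + ((a + b) & 1) per the RVV fixed-point rounding rules, which equals the (a + b + 1) >> 1 ceiling average that the zext/add/add-1/lshr/trunc chain expresses (the zero extends guarantee the sum cannot wrap). That is also why the tests below begin with csrwi vxrm, 0. A minimal check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 256; ++A)
    for (uint32_t B = 0; B < 256; ++B) {
      uint32_t CeilAvg = (A + B + 1) >> 1;             // matched IR pattern
      uint32_t Rnu = ((A + B) >> 1) + ((A + B) & 1);   // vaaddu under rnu
      assert(CeilAvg == Rnu);
    }
  return 0;
}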

// Convert from one FMA opcode to another based on whether we are negating the
// multiply result and/or the accumulator.
// NOTE: Only supports RVV operations with VL.
@@ -17930,6 +18014,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = combineTruncOfSraSext(N, DAG))
return V;
return combineTruncToVnclip(N, DAG, Subtarget);
case ISD::VP_TRUNCATE:
return performVP_TRUNCATECombine(N, DAG, Subtarget);
case ISD::TRUNCATE:
return performTRUNCATECombine(N, DAG, Subtarget);
case ISD::SELECT:
169 changes: 169 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/vp-vaaddu.ll
@@ -0,0 +1,169 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

declare <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)

define <vscale x 2 x i8> @vaaddu_1(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_1:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

define <vscale x 2 x i8> @vaaddu_2(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_2:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

define <vscale x 2 x i8> @vaaddu_3(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_3:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

define <vscale x 2 x i8> @vaaddu_4(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_4:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

define <vscale x 2 x i8> @vaaddu_5(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_5:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

define <vscale x 2 x i8> @vaaddu_6(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_6:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

; Test where the size is reduced by 4x instead of 2x.
define <vscale x 2 x i8> @vaaddu_7(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}

; Test where the zext can't be completely removed.
define <vscale x 2 x i16> @vaaddu_8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vzext.vf2 v10, v8, v0.t
; CHECK-NEXT: csrwi vxrm, 0
; CHECK-NEXT: vzext.vf2 v8, v9, v0.t
; CHECK-NEXT: vaaddu.vv v8, v10, v8, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i16> %d
}

; Negative test. The truncate has a smaller type than the zero extend.
; TODO: Could still handle this by truncating after an i16 vaaddu.
define <vscale x 2 x i8> @vaaddu_9(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vaaddu_9:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v10, v8, v9, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT: vadd.vi v8, v10, 1, v0.t
; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: ret
%xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1> %m, i32 %vl)
%yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 %vl)
%a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
%b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> splat (i32 1), <vscale x 2 x i1> %m, i32 %vl)
%d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
ret <vscale x 2 x i8> %d
}