[RISCV] Add DAG combine for forming VAADDU_VL from VP intrinsics. #124848
Conversation
This adds a VP version of an existing DAG combine. I've put it in RISCVISelLowering since we would need to add an ISD::VP_AVGCEIL opcode otherwise. This pattern appears in 525.x264_r.
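For context, the IR pattern this targets is rounding (ceiling) pixel averaging; a minimal C++ sketch of the kind of loop that produces it (the exact routine in 525.x264_r is an assumption, not taken from the patch):

#include <cstddef>
#include <cstdint>

// dst[i] = (a[i] + b[i] + 1) >> 1 on u8 data. Vectorized with EVL tail folding,
// the arithmetic is widened, yielding the vp.zext / vp.add / add-splat-1 /
// vp.lshr / vp.trunc chain this combine matches and turns into vaaddu (RNU).
void avg_u8(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n) {
  for (size_t i = 0; i != n; ++i)
    dst[i] = static_cast<uint8_t>((a[i] + b[i] + 1) >> 1);
}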
@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

This adds a VP version of an existing DAG combine. I've put it in RISCVISelLowering since we would need to add an ISD::VP_AVGCEIL opcode otherwise. This pattern appears in 525.x264_r.

I extracted this code from our downstream compiler. Hopefully it works with the current EVL vectorizer in upstream.

@lukel97 can you double check that this pattern triggers on 525.x264_r with the EVL vectorizer.

Patch is 20.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124848.diff

2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 8d09e534b1858b..f004a00b19d205 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1526,18 +1526,16 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
if (Subtarget.hasVInstructions())
- setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER,
- ISD::MSCATTER, ISD::VP_GATHER,
- ISD::VP_SCATTER, ISD::SRA,
- ISD::SRL, ISD::SHL,
- ISD::STORE, ISD::SPLAT_VECTOR,
- ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
- ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE,
- ISD::MUL, ISD::SDIV,
- ISD::UDIV, ISD::SREM,
- ISD::UREM, ISD::INSERT_VECTOR_ELT,
- ISD::ABS, ISD::CTPOP,
- ISD::VECTOR_SHUFFLE, ISD::VSELECT});
+ setTargetDAGCombine(
+ {ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
+ ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA,
+ ISD::SRL, ISD::SHL, ISD::STORE,
+ ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
+ ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE,
+ ISD::MUL, ISD::SDIV, ISD::UDIV,
+ ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT,
+ ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE,
+ ISD::VSELECT});
if (Subtarget.hasVendorXTHeadMemPair())
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -16373,6 +16371,101 @@ static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
VPStore->isTruncatingStore(), VPStore->isCompressingStore());
}
+// Peephole avgceil pattern.
+// %1 = zext <N x i8> %a to <N x i32>
+// %2 = zext <N x i8> %b to <N x i32>
+// %3 = add nuw nsw <N x i32> %1, splat (i32 1)
+// %4 = add nuw nsw <N x i32> %3, %2
+// %5 = lshr <N x i32> %N, <i32 1 x N>
+// %6 = trunc <N x i32> %5 to <N x i8>
+static SDValue performVP_TRUNCATECombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ // Ignore fixed vectors.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!VT.isScalableVector() || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+ SDValue In = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ SDValue VL = N->getOperand(2);
+
+ // Input should be a vp_srl with same mask and VL.
+ if (In.getOpcode() != ISD::VP_SRL || In.getOperand(2) != Mask ||
+ In.getOperand(3) != VL)
+ return SDValue();
+
+ // Shift amount should be 1.
+ if (!isOneOrOneSplat(In.getOperand(1)))
+ return SDValue();
+
+ // Shifted value should be a vp_add with same mask and VL.
+ SDValue LHS = In.getOperand(0);
+ if (LHS.getOpcode() != ISD::VP_ADD || LHS.getOperand(2) != Mask ||
+ LHS.getOperand(3) != VL)
+ return SDValue();
+
+ SDValue Operands[3];
+ Operands[0] = LHS.getOperand(0);
+ Operands[1] = LHS.getOperand(1);
+
+ // Matches another VP_ADD with same VL and Mask.
+ auto FindAdd = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
+ if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask ||
+ V.getOperand(3) != VL)
+ return false;
+
+ Op0 = V.getOperand(0);
+ Op1 = V.getOperand(1);
+ return true;
+ };
+
+ // We need to find another VP_ADD in one of the operands.
+ SDValue Op0, Op1;
+ if (FindAdd(Operands[0], Op0, Op1))
+ Operands[0] = Operands[1];
+ else if (!FindAdd(Operands[1], Op0, Op1))
+ return SDValue();
+ Operands[2] = Op0;
+ Operands[1] = Op1;
+
+ // Now we have three operands of two additions. Check that one of them is a
+ // constant vector with ones.
+ auto I = llvm::find_if(Operands,
+ [](const SDValue &Op) { return isOneOrOneSplat(Op); });
+ if (I == std::end(Operands))
+ return SDValue();
+ // We found a vector with ones, move if it to the end of the Operands array.
+ std::swap(Operands[I - std::begin(Operands)], Operands[2]);
+
+ // Make sure the other 2 operands can be promoted from the result type.
+ for (int i = 0; i < 2; ++i) {
+ if (Operands[i].getOpcode() != ISD::VP_ZERO_EXTEND ||
+ Operands[i].getOperand(1) != Mask || Operands[i].getOperand(2) != VL)
+ return SDValue();
+ // Input must be smaller than our result.
+ if (Operands[i].getOperand(0).getScalarValueSizeInBits() >
+ VT.getScalarSizeInBits())
+ return SDValue();
+ }
+
+ // Pattern is detected.
+ Op0 = Operands[0].getOperand(0);
+ Op1 = Operands[1].getOperand(0);
+ // Rebuild the zero extends if the inputs are smaller than our result.
+ if (Op0.getValueType() != VT)
+ Op0 =
+ DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT, Op0, Mask, VL);
+ if (Op1.getValueType() != VT)
+ Op1 =
+ DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT, Op1, Mask, VL);
+ // Build a VAADDU with RNU rounding mode.
+ SDLoc DL(N);
+ return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT,
+ {Op0, Op1, DAG.getUNDEF(VT), Mask, VL});
+}
+
// Convert from one FMA opcode to another based on whether we are negating the
// multiply result and/or the accumulator.
// NOTE: Only supports RVV operations with VL.
@@ -17930,6 +18023,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue V = combineTruncOfSraSext(N, DAG))
return V;
return combineTruncToVnclip(N, DAG, Subtarget);
+ case ISD::VP_TRUNCATE:
+ return performVP_TRUNCATECombine(N, DAG, Subtarget);
case ISD::TRUNCATE:
return performTRUNCATECombine(N, DAG, Subtarget);
case ISD::SELECT:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vaaddu.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vaaddu.ll
new file mode 100644
index 00000000000000..42f8e38d5ac163
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vaaddu.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s
+
+declare <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vaaddu_1(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_2(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_3(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_4(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %xz, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_5(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+define <vscale x 2 x i8> @vaaddu_6(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i16> %xz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %yz, <vscale x 2 x i16> %a, <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i16> @llvm.vp.lshr.nxv2i16(<vscale x 2 x i16> %b, <vscale x 2 x i16> shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+; Test where the size is reduced by 4x instead of 2x.
+define <vscale x 2 x i8> @vaaddu_7(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vaaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i8> %d
+}
+
+; Test where the zext can't be completely removed.
+define <vscale x 2 x i16> @vaaddu_8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vzext.vf2 v8, v9, v0.t
+; CHECK-NEXT: vaaddu.vv v8, v10, v8, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %d = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> %c, <vscale x 2 x i1> %m, i32 %vl)
+ ret <vscale x 2 x i16> %d
+}
+
+; Negative test. The truncate has a smaller type than the zero extend.
+; TODO: Could still handle this by truncating after an i16 vaaddu.
+define <vscale x 2 x i8> @vaaddu_9(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 zeroext %vl) {
+; CHECK-LABEL: vaaddu_9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vwaddu.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vi v8, v10, 1, v0.t
+; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %xz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1> %m, i32 %vl)
+ %yz = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> %y, <vscale x 2 x i1> %m, i32 %vl)
+ %a = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %xz, <vscale x 2 x i32> %yz, <vscale x 2 x i1> %m, i32 %vl)
+ %b = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale x 2 x i32> poison, i32 1, i32 0), <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i1> %m, i32 %vl)
+ %c = call <vscale x 2 x i32> @llvm.vp.lshr.nxv2i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> shufflevector (<vscale x 2 x i32> insertelement (<vscale...
[truncated]
Just checked, this patch triggers on 525.x264_r with EVL tail folding.
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, splat (i32 1)
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %N, <i32 1 x N>
I think %N should be %4 here? And the second operand looks invalid?
Yes
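For reference, the corrected comment would presumably read:

// %1 = zext <N x i8> %a to <N x i32>
// %2 = zext <N x i8> %b to <N x i32>
// %3 = add nuw nsw <N x i32> %1, splat (i32 1)
// %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, splat (i32 1)
// %6 = trunc <N x i32> %5 to <N x i8>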
// We need to find another VP_ADD in one of the operands.
SDValue Op0, Op1;
if (FindAdd(Operands[0], Op0, Op1))
The multiple uses of Operands[0] seem a tad confusing to me. The logic appears correct, but might it be simpler to pass LHS.getOperand(X) to the helper and update Operands[1] and Operands[2] by reference?
This code was lifted from some X86 code that was removed here ea2ee5d
I'll look into improving it
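A rough sketch of that restructuring, reusing the names from the patch (one possible reading of the suggestion, not the committed fix): the helper fills the inner add's operands by reference, and the remaining outer-add operand lands in Operands[0].

SDValue Operands[3];

// Matches another VP_ADD with the same mask and VL and extracts its operands.
auto FindAdd = [&](SDValue V, SDValue &A, SDValue &B) {
  if (V.getOpcode() != ISD::VP_ADD || V.getOperand(2) != Mask ||
      V.getOperand(3) != VL)
    return false;
  A = V.getOperand(0);
  B = V.getOperand(1);
  return true;
};

if (FindAdd(LHS.getOperand(0), Operands[1], Operands[2]))
  Operands[0] = LHS.getOperand(1);
else if (FindAdd(LHS.getOperand(1), Operands[1], Operands[2]))
  Operands[0] = LHS.getOperand(0);
else
  return SDValue();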
    Operands[i].getOperand(1) != Mask || Operands[i].getOperand(2) != VL)
  return SDValue();
// Input must be smaller than our result.
if (Operands[i].getOperand(0).getScalarValueSizeInBits() >
A zero extend's input should always be less than or equal in width, right? If so, is this check needed? And don't you need to check for the degenerate equal-sized type case? Or do we simplify those at construction?
This is checking the destination type of the VP_TRUNCATE against the input type of VP_ZERO_EXTEND.
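Concretely, from the tests in this patch:

// vaaddu_8: vp.zext nxv2i8 -> nxv2i32, vp.trunc nxv2i32 -> nxv2i16.
//   The zext input (i8) is narrower than the truncate result (i16), so the
//   combine fires but a vp.zext to i16 must be rebuilt ("zext can't be
//   completely removed").
// vaaddu_9: vp.zext nxv2i16 -> nxv2i32, vp.trunc nxv2i32 -> nxv2i8.
//   The zext input (i16) is wider than the truncate result (i8), so the check
//   fails and the combine bails (the negative test).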
std::swap(Operands[I - std::begin(Operands)], Operands[2]);

// Make sure the other 2 operands can be promoted from the result type.
for (int i = 0; i < 2; ++i) {
Might be easier to read as for (auto Op : Operands.take_front(2)). -- Actually, ignore this, we had take_front on ArrayRef, but not SmallVector. Might be worth adding, but definitely non-blocking for this review.
Note there is no SmallVector in this code. It's a plain C array. We can use drop_end on it to skip the last element.
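A sketch of what that might look like with llvm::drop_end from ADT/STLExtras.h, which over the plain C array visits every element but the last (this mirrors the loop in the patch; the committed form may differ):

for (SDValue &Op : llvm::drop_end(Operands)) {
  if (Op.getOpcode() != ISD::VP_ZERO_EXTEND || Op.getOperand(1) != Mask ||
      Op.getOperand(2) != VL)
    return SDValue();
  // Input must be no wider than the truncate's result element type.
  if (Op.getOperand(0).getScalarValueSizeInBits() > VT.getScalarSizeInBits())
    return SDValue();
}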
}

// Pattern is detected.
Op0 = Operands[0].getOperand(0);
The reuse of Op0, Op1 here is a bit odd. Maybe reuse Operands instead, and drop the splat of 1 above rather than moving it to the end? That would simplify the loop just above too. Though I guess that complicates the debug loc management just below.
Operands is a plain C array so we can't really "drop" the splat of 1.
if (Op0.getValueType() != VT)
  Op0 =
      DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[0]), VT, Op0, Mask, VL);
if (Op1.getValueType() != VT)
Surely, we short circuit these on construction and can just unconditionally call the getNode API?
      DAG.getNode(ISD::VP_ZERO_EXTEND, SDLoc(Operands[1]), VT, Op1, Mask, VL);
// Build a VAADDU with RNU rounding mode.
SDLoc DL(N);
return DAG.getNode(RISCVISD::AVGCEILU_VL, DL, VT,
AVGCEILU_VL appears to be previously unused code. Do we have any other such nodes which need to be deleted?
It's not unused; it's used via a macro in getRISCVVLOp.
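From memory, the dispatch in getRISCVVLOp is macro-generated, roughly like this (shape approximated from recollection, not quoted from the tree):

// RISCVISelLowering.cpp: map a generic opcode to its VL-predicated RISCVISD node.
#define OP_CASE(NODE)                                                          \
  case ISD::NODE:                                                              \
    return RISCVISD::NODE##_VL;
// ...
//   OP_CASE(AVGCEILU)  // expands to: case ISD::AVGCEILU: return RISCVISD::AVGCEILU_VL;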
ISD::UREM, ISD::INSERT_VECTOR_ELT,
ISD::ABS, ISD::CTPOP,
ISD::VECTOR_SHUFFLE, ISD::VSELECT});
setTargetDAGCombine(
Stray format change.
I added VP_TRUNCATE to it. I added it to the middle because that's where it was in my downstream.
Not opposed to doing this in RISCV DAG, but an alternative in case you haven't considered it would be to emit an AVGCEIL node for the full type width (no predication), and then a VP.VSELECT just after it. I believe the VLOptimizer should fold this back into the predicated form without issue. This works for all instructions which are non-faulting (i.e. we can speculatively execute the additional lanes).
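As a hedged sketch only (the ISD::VP_SELECT operand order of mask, true value, false value, EVL is assumed, and whether the VL optimizer folds this back is the claim above, not something verified here), that alternative might look like:

// Compute the average unpredicated for the full type, then keep only the active lanes.
SDValue Avg = DAG.getNode(ISD::AVGCEILU, DL, VT, Op0, Op1);
return DAG.getNode(ISD::VP_SELECT, DL, VT, Mask, Avg, DAG.getUNDEF(VT), VL);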
LGTM w/minor comment.
if (Op.getOpcode() != ISD::VP_ZERO_EXTEND || Op.getOperand(1) != Mask ||
    Op.getOperand(2) != VL)
  return SDValue();
// Input must be smaller than our result.
The comment says "smaller", but the code checks less than or equal. I think the comment just needs to be updated.
LGTM
Could we also do this after legalization and try and match it on TRUNCATE_VECTOR_VL? That way we would also match fixed length vectors.
auto I = llvm::find_if(Operands,
                       [](const SDValue &Op) { return isOneOrOneSplat(Op); });
Can you pass the predicate directly?
Suggested change:
-  auto I = llvm::find_if(Operands,
-                         [](const SDValue &Op) { return isOneOrOneSplat(Op); });
+  auto I = llvm::find_if(Operands, isOneOrOneSplat);
This doesn't compile. Possibly because isOneOrOneSplat has a second defaulted argument.
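That is consistent with how default arguments work in C++: they are not part of the function type, so the bare name decays to a two-parameter function pointer that find_if cannot call with a single argument. Illustrative only (the exact declaration is assumed):

bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false); // assumed signature
// llvm::find_if(Operands, isOneOrOneSplat);               // ill-formed: the predicate would receive one argument
auto I = llvm::find_if(Operands,
                       [](SDValue Op) { return isOneOrOneSplat(Op); }); // OK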
Don't we have a generic DAGCombine for fixed vectors using non-VP operations?
Yeah, but this would be for fixed length VP operations. Just putting the idea out there.
I think moving it after legalization is a little more complex. We'll have to handle VP_TRUNCATE i32->i8 being split into 2 TRUNCATE_VECTOR_VLs and we'll have to look for combinations of VWADDU_VL and VWADDU_W_VL.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/14941
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/12743
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/190/builds/13807