[RISCV] Support memcmp expansion for vectors #114517


Merged

Conversation

wangpc-pp
Contributor

@wangpc-pp wangpc-pp commented Nov 1, 2024

This patch adds the support of generating vector instructions for
memcmp. This implementation is inspired by X86's.

We convert integer comparisons (eq/ne only) into vector comparisons
and perform a vector AND reduction to get the result.

The range of supported load sizes is (XLEN, VLEN * LMUL8] and
non-power-of-2 types are not supported.

Fixes #143294.
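
For illustration, here is a minimal caller-side sketch (not part of the patch; the function and names below are hypothetical) of the kind of fixed-size equality check this expansion targets:

#include <cstring>

// Hypothetical example: a 16-byte equality comparison. With the vector
// extension enabled, this memcmp can be expanded inline into a vector
// load/compare/reduction sequence instead of a libcall or a chain of
// scalar loads, because only the eq/ne outcome of the comparison is used.
bool keysEqual(const unsigned char *a, const unsigned char *b) {
  return std::memcmp(a, b, 16) == 0;
}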

@llvmbot
Member

llvmbot commented Nov 1, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

Patch is 404.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114517.diff

4 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+100-3)
  • (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+5)
  • (modified) llvm/test/CodeGen/RISCV/memcmp-optsize.ll (+920-530)
  • (modified) llvm/test/CodeGen/RISCV/memcmp.ll (+4570-1843)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3b3f8772a08940..89b4f22a1260db 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -14474,17 +14475,116 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D
   return true;
 }
 
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+  if (X.getOpcode() == ISD::OR)
+    return isOrXorXorTree(X.getOperand(0), false) &&
+           isOrXorXorTree(X.getOperand(1), false);
+  if (Root)
+    return false;
+  return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
+                                EVT VecVT, EVT CmpVT) {
+  SDValue Op0 = X.getOperand(0);
+  SDValue Op1 = X.getOperand(1);
+  if (X.getOpcode() == ISD::OR) {
+    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT);
+    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT);
+    if (VecVT != CmpVT)
+      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+  }
+  if (X.getOpcode() == ISD::XOR) {
+    SDValue A = DAG.getBitcast(VecVT, Op0);
+    SDValue B = DAG.getBitcast(VecVT, Op1);
+    if (VecVT != CmpVT)
+      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+  }
+  llvm_unreachable("Impossible");
+}
+
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+  EVT OpVT = X.getValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
+  unsigned OpSize = OpVT.getSizeInBits();
+
+  // We're looking for an oversized integer equality comparison.
+  if (!Subtarget.hasVInstructions() || !OpVT.isScalarInteger() ||
+      OpSize < Subtarget.getRealMinVLen() ||
+      OpSize > Subtarget.getRealMinVLen() * 8)
+    return SDValue();
+
+  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+      !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  bool NoImplicitFloatOps =
+      DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (!NoImplicitFloatOps && Subtarget.hasVInstructions()) {
+    unsigned VecSize = OpSize / 8;
+    EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
+    EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+
+    SDValue Cmp;
+    if (IsOrXorXorTreeCCZero) {
+      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT);
+    } else {
+      SDValue VecX = DAG.getBitcast(VecVT, X);
+      SDValue VecY = DAG.getBitcast(VecVT, Y);
+      Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+    }
+    return DAG.getSetCC(DL, VT,
+                        DAG.getNode(ISD::VECREDUCE_AND, DL, XLenVT, Cmp),
+                        DAG.getConstant(0, DL, XLenVT), CC);
+  }
+
+  return SDValue();
+}
+
 // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
 // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
 // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
 // can become a sext.w instead of a shift pair.
 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  // Looking for an equality compare.
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (Cond == ISD::SETNE || Cond == ISD::SETEQ) {
+    if (SDValue V = combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG,
+                                                    Subtarget))
+      return V;
+  }
+
   if (OpVT != MVT::i64 || !Subtarget.is64Bit())
     return SDValue();
 
@@ -14499,8 +14599,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
       N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
     return SDValue();
 
-  // Looking for an equality compare.
-  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
   if (!isIntEqualitySetCC(Cond))
     return SDValue();
 
@@ -14512,7 +14610,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
 
   const APInt &C1 = N1C->getAPIntValue();
 
-  SDLoc dl(N);
   // If the constant is larger than 2^32 - 1 it is impossible for both sides
   // to be equal.
   if (C1.getActiveBits() > 32)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5f5a18e2868730..d7b05001185f32 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2504,5 +2504,10 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     Options.LoadSizes = {8, 4, 2, 1};
   else
     Options.LoadSizes = {4, 2, 1};
+  if (IsZeroCmp && ST->hasVInstructions()) {
+    unsigned RealMinVLen = ST->getRealMinVLen() / 8;
+    for (int LMUL = 1; LMUL <= 8; LMUL *= 2)
+      Options.LoadSizes.insert(Options.LoadSizes.begin(), RealMinVLen * LMUL);
+  }
   return Options;
 }
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 06fb88b02ea4a6..ba702b4921f098 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2910,190 +2910,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a4, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a5, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a6, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 4(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 5(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 6(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 7(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 4(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 5(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 6(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 7(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 8(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 9(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 10(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 11(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 8(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 9(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 10(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu t0, 11(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli t0, t0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, t0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a5, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 12(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 13(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 14(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 15(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 12(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a7, 13(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a6, 14(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 15(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a5, a7, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a6, a6, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a6
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-ALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-ALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV32-V-NEXT:    vmnot.m v8, v8
+; CHECK-ALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 4(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 5(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a4, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 6(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 7(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a5, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a6, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a3, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 4(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 5(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 6(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a7, 7(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a7, a7, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a7, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 8(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 9(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 10(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 11(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 12(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 13(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 14(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a0, 15(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 8(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 9(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 10(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 11(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 12(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a6, 13(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 14(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 15(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 32
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-ALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-ALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV64-V-NEXT:    vmnot.m v8, v8
+; CHECK-ALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_16:
@@ -3194,34 +3028,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmnot.m v8, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmnot.m v8, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 16)
@@ -3229,15 +3053,15 @@ entry:
 }
 
 define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_31:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 31
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -3249,6 +3073,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK...
[truncated]

kadircet and others added 2 commits June 12, 2025 17:34
Contributor

@lukel97 lukel97 left a comment


LGTM

  if (IsZeroCmp && ST->hasVInstructions()) {
    unsigned RealMinVLen = ST->getRealMinVLen();
    // Support Fractional LMULs if the lengths are larger than XLen.
    // TODO: Support non-power-of-2 types.
Collaborator


Should we create a github ticket for this?

Contributor Author


I have implemented it: #114971.

@hiraditya
Collaborator

The range of supported load sizes is (XLEN, VLEN * LMUL8]

Is there a way for users to change this? Depending on the platform, users may want to expand memcmps on larger sequences.

@topperc
Collaborator

topperc commented Jun 12, 2025

Please don't @ me in the commit message. Sometimes when this commit gets pulled into some other fork of LLVM, I'll get an email that I don't want.

Collaborator

@topperc topperc left a comment


LGTM

@topperc
Collaborator

topperc commented Jun 12, 2025

The range of supported load sizes is (XLEN, VLEN * LMUL8]

Is there a way for users to change this? Depending on the platform, users may want to expand memcmps on larger sequences.

The number of loads is determined by the earlier `Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);`.

@wangpc-pp
Contributor Author

Please don't @ me in the commit message. Sometimes when this commit gets pulled into some other fork of LLVM, I'll get an email that I don't want.

Sorry about the noise; I will remove it.

@wangpc-pp wangpc-pp changed the base branch from users/wangpc-pp/spr/main.riscv-support-memcmp-expansion-for-vectors to main June 13, 2025 06:31
@wangpc-pp wangpc-pp merged commit 4903c11 into main Jun 13, 2025
7 checks passed
@wangpc-pp wangpc-pp deleted the users/wangpc-pp/spr/riscv-support-memcmp-expansion-for-vectors branch June 13, 2025 06:31
@wangpc-pp
Contributor Author

The range of supported load sizes is (XLEN, VLEN * LMUL8]

Is there a way for users to change this? Depending on the platform, users may want to expand memcmps on larger sequences.

The number of loads is determined by the earlier `Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);`.

Yes. You can also refer to the -max-loads-per-memcmp option.
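
As a rough illustration of the point above, here is a minimal sketch (the names are made up and not LLVM API) of how the effective expansion bound follows from the largest allowed load size and the Options.MaxNumLoads budget that -max-loads-per-memcmp can raise:

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical helper: the expansion can use at most MaxNumLoads loads per
// operand, so the largest memcmp size it handles is roughly the biggest
// allowed load size times that budget; larger calls remain libcalls.
uint64_t maxExpandableBytes(const std::vector<uint64_t> &LoadSizes,
                            unsigned MaxNumLoads) {
  uint64_t Largest = 0;
  for (uint64_t Size : LoadSizes)
    Largest = std::max(Largest, Size);
  return Largest * MaxNumLoads;
}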

llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Jun 13, 2025
This patch adds the support of generating vector instructions for
`memcmp`. This implementation is inspired by X86's.

We convert integer comparisons (eq/ne only) into vector comparisons
and perform a vector AND reduction to get the result.

The range of supported load sizes is (XLEN, VLEN * LMUL8] and
non-power-of-2 types are not supported.

Fixes #143294.

Reviewers: lukel97, asb, preames, topperc, dtcxzyw

Reviewed By: topperc, lukel97

Pull Request: llvm/llvm-project#114517
tomtor pushed a commit to tomtor/llvm-project that referenced this pull request Jun 14, 2025
This patch adds the support of generating vector instructions for
`memcmp`. This implementation is inspired by X86's.

We convert integer comparisons (eq/ne only) into vector comparisons
and perform a vector AND reduction to get the result.

The range of supported load sizes is (XLEN, VLEN * LMUL8] and
non-power-of-2 types are not supported.

Fixes llvm#143294.

Reviewers: lukel97, asb, preames, topperc, dtcxzyw

Reviewed By: topperc, lukel97

Pull Request: llvm#114517
Successfully merging this pull request may close these issues.

RISC-V: clang generates bcmp for memcmp for small comparisons