[RISCV] Support memcmp expansion for vectors #114517
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

Patch is 404.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114517.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3b3f8772a08940..89b4f22a1260db 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -14474,17 +14475,116 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D
return true;
}
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+  if (X.getOpcode() == ISD::OR)
+    return isOrXorXorTree(X.getOperand(0), false) &&
+           isOrXorXorTree(X.getOperand(1), false);
+  if (Root)
+    return false;
+  return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
+                                EVT VecVT, EVT CmpVT) {
+  SDValue Op0 = X.getOperand(0);
+  SDValue Op1 = X.getOperand(1);
+  if (X.getOpcode() == ISD::OR) {
+    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT);
+    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT);
+    if (VecVT != CmpVT)
+      return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+    return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+  }
+  if (X.getOpcode() == ISD::XOR) {
+    SDValue A = DAG.getBitcast(VecVT, Op0);
+    SDValue B = DAG.getBitcast(VecVT, Op1);
+    if (VecVT != CmpVT)
+      return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+    return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+  }
+  llvm_unreachable("Impossible");
+}
+
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+  EVT OpVT = X.getValueType();
+  MVT XLenVT = Subtarget.getXLenVT();
+  unsigned OpSize = OpVT.getSizeInBits();
+
+  // We're looking for an oversized integer equality comparison.
+  if (!Subtarget.hasVInstructions() || !OpVT.isScalarInteger() ||
+      OpSize < Subtarget.getRealMinVLen() ||
+      OpSize > Subtarget.getRealMinVLen() * 8)
+    return SDValue();
+
+  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+  if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+      !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  bool NoImplicitFloatOps =
+      DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat);
+  if (!NoImplicitFloatOps && Subtarget.hasVInstructions()) {
+    unsigned VecSize = OpSize / 8;
+    EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
+    EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+
+    SDValue Cmp;
+    if (IsOrXorXorTreeCCZero) {
+      Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT);
+    } else {
+      SDValue VecX = DAG.getBitcast(VecVT, X);
+      SDValue VecY = DAG.getBitcast(VecVT, Y);
+      Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
+    }
+    return DAG.getSetCC(DL, VT,
+                        DAG.getNode(ISD::VECREDUCE_AND, DL, XLenVT, Cmp),
+                        DAG.getConstant(0, DL, XLenVT), CC);
+  }
+
+  return SDValue();
+}
+
// Replace (seteq (i64 (and X, 0xffffffff)), C1) with
// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
// can become a sext.w instead of a shift pair.
static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                   const RISCVSubtarget &Subtarget) {
+  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT OpVT = N0.getValueType();
+  // Looking for an equality compare.
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (Cond == ISD::SETNE || Cond == ISD::SETEQ) {
+    if (SDValue V = combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG,
+                                                    Subtarget))
+      return V;
+  }
+
  if (OpVT != MVT::i64 || !Subtarget.is64Bit())
    return SDValue();
@@ -14499,8 +14599,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
      N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
    return SDValue();
-  // Looking for an equality compare.
-  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
  if (!isIntEqualitySetCC(Cond))
    return SDValue();
@@ -14512,7 +14610,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
  const APInt &C1 = N1C->getAPIntValue();
-  SDLoc dl(N);
  // If the constant is larger than 2^32 - 1 it is impossible for both sides
  // to be equal.
  if (C1.getActiveBits() > 32)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 5f5a18e2868730..d7b05001185f32 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2504,5 +2504,10 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
    Options.LoadSizes = {8, 4, 2, 1};
  else
    Options.LoadSizes = {4, 2, 1};
+  if (IsZeroCmp && ST->hasVInstructions()) {
+    unsigned RealMinVLen = ST->getRealMinVLen() / 8;
+    for (int LMUL = 1; LMUL <= 8; LMUL *= 2)
+      Options.LoadSizes.insert(Options.LoadSizes.begin(), RealMinVLen * LMUL);
+  }
  return Options;
}
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 06fb88b02ea4a6..ba702b4921f098 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2910,190 +2910,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
;
; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_16:
; CHECK-ALIGNED-RV32-V: # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: or a2, a4, a2
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a3, a5, a3
-; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a6, a4
-; CHECK-ALIGNED-RV32-V-NEXT: or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 4(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 5(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: xor a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a3, 6(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 7(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV32-V-NEXT: slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a3, a6, a3
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 4(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 5(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: or a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 6(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a7, 7(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 8(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 9(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: xor a3, a3, a4
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a4, 10(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a7, 11(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a6, a6, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a5, a6, a5
-; CHECK-ALIGNED-RV32-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a7, a7, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a7, a4
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 8(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a7, 9(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: or a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 10(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu t0, 11(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli t0, t0, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a5, t0, a5
-; CHECK-ALIGNED-RV32-V-NEXT: or a5, a5, a6
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 12(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a7, 13(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: xor a4, a4, a5
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 14(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a0, 15(a0)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a6, a7, a6
-; CHECK-ALIGNED-RV32-V-NEXT: slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a5
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a5, 12(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a7, 13(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: or a0, a0, a6
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a6, 14(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: lbu a1, 15(a1)
-; CHECK-ALIGNED-RV32-V-NEXT: slli a7, a7, 8
-; CHECK-ALIGNED-RV32-V-NEXT: or a5, a7, a5
-; CHECK-ALIGNED-RV32-V-NEXT: slli a6, a6, 16
-; CHECK-ALIGNED-RV32-V-NEXT: slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a6
-; CHECK-ALIGNED-RV32-V-NEXT: or a1, a1, a5
-; CHECK-ALIGNED-RV32-V-NEXT: xor a0, a0, a1
-; CHECK-ALIGNED-RV32-V-NEXT: or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT: or a0, a4, a0
-; CHECK-ALIGNED-RV32-V-NEXT: or a0, a2, a0
-; CHECK-ALIGNED-RV32-V-NEXT: snez a0, a0
+; CHECK-ALIGNED-RV32-V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV32-V-NEXT: vle8.v v8, (a0)
+; CHECK-ALIGNED-RV32-V-NEXT: vle8.v v9, (a1)
+; CHECK-ALIGNED-RV32-V-NEXT: vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV32-V-NEXT: vmnot.m v8, v8
+; CHECK-ALIGNED-RV32-V-NEXT: vcpop.m a0, v8
+; CHECK-ALIGNED-RV32-V-NEXT: seqz a0, a0
; CHECK-ALIGNED-RV32-V-NEXT: ret
;
; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_16:
; CHECK-ALIGNED-RV64-V: # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 4(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 5(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: or a2, a4, a2
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 6(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 7(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a5, a3
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a6, a4
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 32
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: or a2, a3, a2
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 4(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 5(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 6(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a7, 7(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a7, a7, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a7, a4
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a4, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 32
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 8(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 9(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: xor a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 10(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 11(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 12(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 13(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 14(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a0, 15(a0)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a4
-; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a0, a0, 32
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 8(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 9(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: or a0, a0, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a3, 10(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 11(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a5, a5, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a4, a5, a4
-; CHECK-ALIGNED-RV64-V-NEXT: slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a6, a3
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a5, 12(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a6, 13(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: or a3, a3, a4
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a4, 14(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: lbu a1, 15(a1)
-; CHECK-ALIGNED-RV64-V-NEXT: slli a6, a6, 8
-; CHECK-ALIGNED-RV64-V-NEXT: or a5, a6, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a5
-; CHECK-ALIGNED-RV64-V-NEXT: slli a1, a1, 32
-; CHECK-ALIGNED-RV64-V-NEXT: or a1, a1, a3
-; CHECK-ALIGNED-RV64-V-NEXT: xor a0, a0, a1
-; CHECK-ALIGNED-RV64-V-NEXT: or a0, a2, a0
-; CHECK-ALIGNED-RV64-V-NEXT: snez a0, a0
+; CHECK-ALIGNED-RV64-V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-ALIGNED-RV64-V-NEXT: vle8.v v8, (a0)
+; CHECK-ALIGNED-RV64-V-NEXT: vle8.v v9, (a1)
+; CHECK-ALIGNED-RV64-V-NEXT: vmseq.vv v8, v8, v9
+; CHECK-ALIGNED-RV64-V-NEXT: vmnot.m v8, v8
+; CHECK-ALIGNED-RV64-V-NEXT: vcpop.m a0, v8
+; CHECK-ALIGNED-RV64-V-NEXT: seqz a0, a0
; CHECK-ALIGNED-RV64-V-NEXT: ret
;
; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_16:
@@ -3194,34 +3028,24 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
;
; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
; CHECK-UNALIGNED-RV32-V: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT: lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT: xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT: xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT: xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT: xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT: or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT: or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT: or a0, a2, a0
-; CHECK-UNALIGNED-RV32-V-NEXT: snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT: vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT: vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT: vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT: vmnot.m v8, v8
+; CHECK-UNALIGNED-RV32-V-NEXT: vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT: seqz a0, a0
; CHECK-UNALIGNED-RV32-V-NEXT: ret
;
; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
; CHECK-UNALIGNED-RV64-V: # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT: ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT: ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT: ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT: ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT: xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT: xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT: or a0, a2, a0
-; CHECK-UNALIGNED-RV64-V-NEXT: snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT: vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT: vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT: vmseq.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT: vmnot.m v8, v8
+; CHECK-UNALIGNED-RV64-V-NEXT: vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT: seqz a0, a0
; CHECK-UNALIGNED-RV64-V-NEXT: ret
entry:
%bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 16)
@@ -3229,15 +3053,15 @@ entry:
}
define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_31:
-; CHECK-RV32: # %bb.0: # %entry
-; CHECK-RV32-NEXT: addi sp, sp, -16
-; CHECK-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT: li a2, 31
-; CHECK-RV32-NEXT: call bcmp
-; CHECK-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT: addi sp, sp, 16
-; CHECK-RV32-NEXT: ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32: # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT: li a2, 31
+; CHECK-ALIGNED-RV32-NEXT: call bcmp
+; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT: ret
;
; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
; CHECK-ALIGNED-RV64: # %bb.0: # %entry
@@ -3249,6 +3073,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
; CHECK-ALIGNED-RV64-NEXT: addi sp, sp, 16
; CHECK-ALIGNED-RV64-NEXT: ret
;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBB: # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT: li a2, 31
+; CHECK-ALIGNED-RV32-ZBB-NEXT: call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT: addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT: ret
+;
; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
; CHECK-ALIGNED-RV64-ZBB: # %bb.0: # %entry
; CHECK...
[truncated]
LGTM
  if (IsZeroCmp && ST->hasVInstructions()) {
    unsigned RealMinVLen = ST->getRealMinVLen();
    // Support Fractional LMULs if the lengths are larger than XLen.
    // TODO: Support non-power-of-2 types.
Should we create a github ticket for this?
I have implemented it: #114971.
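For context, a worked example of what this change enables, assuming the `RealMinVLen / 8` version of the loop shown in the diff above and VLEN = 128: the loop prepends vector load sizes of 16, 32, 64, and 128 bytes to the scalar LoadSizes list, so a 32-byte zero-equality `bcmp` can be expanded into a single oversized integer compare along the lines of the sketch below, which the new DAG combine should then lower with a single LMUL=2 vector compare; a non-power-of-2 length such as 31 still ends up as a `bcmp` libcall, as the `bcmp_size_31` checks above show. The IR is a hypothetical illustration, not taken from the test files.

```llvm
; Hypothetical result of memcmp expansion for bcmp(s1, s2, 32) == 0 on a
; Zvl128b subtarget: one 32-byte load pair and an oversized integer compare
; that the new combine turns into a vector compare plus reduction.
define i1 @bcmp32_eq_zero(ptr %s1, ptr %s2) {
entry:
  %a = load i256, ptr %s1, align 1
  %b = load i256, ptr %s2, align 1
  %eq = icmp eq i256 %a, %b
  ret i1 %eq
}
```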
Is there a way for users to change this? Depending on the platform, users may want to expand memcmp calls on larger sequences.
Please don't @ me in the commit message. Sometimes when this commit gets pulled into some other fork of llvm I'll get an email that I don't want.
LGTM
The number of loads is determined from the earlier …
Sorry about the noise, I will remove it.
Yes. And you may refer to …
This patch adds support for generating vector instructions for `memcmp`. The implementation is inspired by X86's. We convert integer equality comparisons (eq/ne only) into vector comparisons and perform a vector AND reduction to get the result. The range of supported load sizes is (XLEN, VLEN * LMUL8]; non-power-of-2 types are not supported.

Fixes #143294.

Reviewers: lukel97, asb, preames, topperc, dtcxzyw

Reviewed By: topperc, lukel97

Pull Request: llvm/llvm-project#114517
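Below is a hedged illustration of the second shape the new combine recognizes, not taken from this patch's tests: when a zero-equality compare is split across more than one load pair, memcmp expansion can produce an OR-of-XOR tree compared against zero, and `isOrXorXorTree`/`emitOrXorXorTree` handle that form by emitting one vector compare per XOR leaf. The sketch assumes a subtarget with VLEN >= 128 so the i128 leaves fall inside the supported range; the function name, offsets, and alignment are illustrative.

```llvm
; Hypothetical input IR for the OR-of-XOR-tree path: two 16-byte halves are
; loaded, XOR'd pairwise, the differences are OR'd together, and the result
; is compared against zero. isOrXorXorTree() matches this shape when the
; right-hand side of the compare is the constant 0.
define i1 @eq32_or_xor_tree(ptr %s1, ptr %s2) {
entry:
  %s1.hi = getelementptr inbounds i8, ptr %s1, i64 16
  %s2.hi = getelementptr inbounds i8, ptr %s2, i64 16
  %a0 = load i128, ptr %s1, align 1
  %b0 = load i128, ptr %s2, align 1
  %a1 = load i128, ptr %s1.hi, align 1
  %b1 = load i128, ptr %s2.hi, align 1
  %x0 = xor i128 %a0, %b0
  %x1 = xor i128 %a1, %b1
  %or = or i128 %x0, %x1
  %eq = icmp eq i128 %or, 0
  ret i1 %eq
}
```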