[RISCV] Fold vector shift of sext/zext to widening multiply #121563
base: main
Conversation
pfusik commented on Jan 3, 2025
(shl (sext X), C) -> (vwmulsu X, 1u << C)
(shl (zext X), C) -> (vwmulu X, 1u << C)
@llvm/pr-subscribers-backend-risc-v
Author: Piotr Fusik (pfusik)
Changes
Patch is 115.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121563.diff 7 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 04dd23d9cdaa20..955a15393ca8a1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17341,6 +17341,78 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(Pop, DL, VT);
}
+static SDValue combineSHL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ // (shl (zext x), y) -> (vwsll x, y)
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ return V;
+
+ // (shl (sext x), C) -> (vwmulsu x, 1u << C)
+ // (shl (zext x), C) -> (vwmulu x, 1u << C)
+
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse())
+ return SDValue();
+ unsigned Opcode;
+ switch (LHS.getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ Opcode = RISCVISD::VWMULSU_VL;
+ break;
+ case ISD::ZERO_EXTEND:
+ Opcode = RISCVISD::VWMULU_VL;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue RHS = N->getOperand(1);
+ APInt ShAmt;
+ if (!ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
+ return SDValue();
+
+ // Better foldings:
+ // (shl (sext x), 1) -> (vwadd x, x)
+ // (shl (zext x), 1) -> (vwaddu x, x)
+ uint64_t ShAmtInt = ShAmt.getZExtValue();
+ if (ShAmtInt <= 1)
+ return SDValue();
+
+ SDValue NarrowOp = LHS.getOperand(0);
+ EVT NarrowVT = NarrowOp.getValueType();
+ uint64_t NarrowBits = NarrowVT.getScalarSizeInBits();
+ if (ShAmtInt >= NarrowBits)
+ return SDValue();
+ EVT VT = N->getValueType(0);
+ if (NarrowBits * 2 != VT.getScalarSizeInBits())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ SDValue Passthru, Mask, VL;
+ switch (N->getOpcode()) {
+ case ISD::SHL:
+ if (!VT.isScalableVector())
+ return SDValue();
+ Passthru = DAG.getUNDEF(VT);
+ std::tie(Mask, VL) =
+ getDefaultScalableVLOps(VT.getSimpleVT(), DL, DAG, Subtarget);
+ break;
+ case RISCVISD::SHL_VL:
+ Passthru = N->getOperand(2);
+ Mask = N->getOperand(3);
+ VL = N->getOperand(4);
+ break;
+ default:
+ llvm_unreachable("Expected SHL");
+ }
+ return DAG.getNode(Opcode, DL, VT, NarrowOp,
+ DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT),
+ Passthru, Mask, VL);
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -17970,7 +18042,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineSHL(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -17995,7 +18067,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineSHL(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 9ee2324f615dd8..0fad09f27007c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -775,11 +775,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -791,10 +791,11 @@ define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -815,10 +816,11 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x
define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -840,10 +842,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vsca
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -863,10 +866,9 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i32(ptr %base, <vscale x 8 x i32>
;
; RV64-LABEL: mgather_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -1034,11 +1036,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -1050,11 +1052,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1074,11 +1076,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x
define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1099,11 +1101,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vsca
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -1124,10 +1126,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i32_nxv8i64(ptr %base, <vscale x
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
@@ -1147,10 +1150,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1171,10 +1175,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vzext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1845,11 +1850,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -1861,10 +1866,11 @@ define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1885,10 +1891,11 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale
define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1910,10 +1917,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vs
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -1933,10 +1941,9 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8f32(ptr %base, <vscale x 8 x i3
;
; RV64-LABEL: mgather_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -2104,11 +2111,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -2120,11 +2127,11 @@ define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2144,11 +2151,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale
define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2169,11 +2176,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <v
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -2194,10 +2201,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i32_nxv8f64(ptr %base, <vscale
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inb...
[truncated]
In the absence of Zvbb vwsll.vi, it can be profitable to use a widening multiply instead of a sign/zero extension followed by a left shift.
This is the case on the BPI-F3. Each of vsext.vf2, vsll.vi and vwmul[s]u.vx has 2*LMUL cycles throughput: https://camel-cdr.github.io/rvv-bench-results/bpi_f3/
I confirmed this transform improves some benchmarks on a BPI-F3 board.
Looking at https://camel-cdr.github.io/rvv-bench-results/canmv_k230/, it should also apply there.
I don't know whether this is profitable for other RVV CPUs. Please advise if this should be restricted to certain CPUs and which ones.
  SDValue LHS = N->getOperand(0);
  if (!LHS.hasOneUse())
    return SDValue();
For now, this only handles single-use of sext/zext. I can rewrite it to be part of combineOp_VLToVWOp_VL so that it handles multi-use too.
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-RV32-NEXT: vzext.vf2 v10, v8
; CHECK-RV32-NEXT: vsll.vi v8, v10, 2, v0.t
The transform did not apply on RV32 here:
17373 if (!ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
(gdb)
17374 return SDValue();
(gdb) call RHS->dump()
t20: nxv2i64 = splat_vector_parts Constant:i32<2>, Constant:i32<0>
I'm guessing it's an effect of type legalization. Please advise on how to fix.
splat_vector_parts is used to splat an e64 element on rv32, i.e. when the EEW > XLEN. You could manually handle it in a follow-up PR; I don't think it's critical for this PR.
rv32 RVV is rare, I suppose?
I'd prefer to do it right away, so that the test in vwsll-sdnode.ll doesn't disappear. Handling it here prioritizes vwmulu over vwsll -- I'll debug it.
Done. combineOp_VLToVWOp_VL handles RISCVISD::VMV_V_X_VL in AfterLegalizeDAG instead of ISD::SPLAT_VECTOR_PARTS in AfterLegalizeTypes. I do the same, so that vwsll has priority over vwmulu.
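Roughly, the shift-amount matching then accepts both forms, along the lines of the sketch below (the operand index of the scalar in RISCVISD::VMV_V_X_VL is assumed here; the actual committed code may differ):

  // Sketch: after DAG legalization the splat shift amount may appear as a
  // RISCVISD::VMV_V_X_VL of a constant scalar rather than a generic constant
  // splat, so accept both forms.
  SDValue RHS = N->getOperand(1);
  APInt ShAmt;
  if (RHS.getOpcode() == RISCVISD::VMV_V_X_VL &&
      isa<ConstantSDNode>(RHS.getOperand(1)))
    ShAmt = RHS.getConstantOperandAPInt(1); // scalar operand (index assumed)
  else if (!ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
    return SDValue();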
; CHECK-NEXT: vzext.vf2 v10, v8
; CHECK-NEXT: vsll.vi v8, v10, 2
; CHECK-NEXT: ret
;
This is the same problem as in the vwsll-vp.ll file - see my comment there.
Makes sense to me. I guess multiplies might not have the same number of available execution ports, but it's still fewer vector ops at the end of the day.
    return SDValue();

  SDValue NarrowOp = LHS.getOperand(0);
  EVT NarrowVT = NarrowOp.getValueType();
It's better to use MVT + getSimpleValueType explicitly since this is past type legalization
Why? I use EVT for DAG.getConstant.
NarrowVT.getScalarSizeInBits() generates less code if NarrowVT is an MVT.
Done.
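For reference, the suggested change amounts to roughly this (a sketch, not necessarily the exact committed lines):

  // Sketch: the combine runs after type legalization, so the narrow type is
  // guaranteed to be simple and MVT can be used directly.
  SDValue NarrowOp = LHS.getOperand(0);
  MVT NarrowVT = NarrowOp.getSimpleValueType();
  uint64_t NarrowBits = NarrowVT.getScalarSizeInBits();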
    return SDValue();
  unsigned Opcode;
  switch (LHS.getOpcode()) {
  case ISD::SIGN_EXTEND:
Does it make a difference if we also handle RISCVISD::VSEXT_VL/RISCVISD::VZEXT_VL?
We probably should, as other widening optimizations do.
There's no rvv test covering this, so I'd need to add one.
How to handle the mask and EVL values? Can this be an operand of ISD::SHL or RISCVISD::SHL_VL, or both?
If you're asking whether VZ/SEXT_VL might appear in RISCVISD::SHL_VL, I think we generate SHL_VL + VZEXT_VL when dealing with some of the interleave vector cases.
Can SHL_VL have different mask/EVL from its VZ/SEXT_VL operand?
> Can SHL_VL have different mask/EVL from its VZ/SEXT_VL operand?

Technically it's possible.
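If the VL forms are handled, the opcode switch could grow two more cases along these lines (untested sketch; the extend's mask/EVL would still have to be reconciled with the shift's, per the discussion above):

  switch (LHS.getOpcode()) {
  case ISD::SIGN_EXTEND:
  case RISCVISD::VSEXT_VL: // sketch: also accept the VL form
    Opcode = RISCVISD::VWMULSU_VL;
    break;
  case ISD::ZERO_EXTEND:
  case RISCVISD::VZEXT_VL: // sketch: also accept the VL form
    Opcode = RISCVISD::VWMULU_VL;
    break;
  default:
    return SDValue();
  }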
    if (!VT.isScalableVector())
      return SDValue();
It might be worthwhile to leave a TODO to handle fixed-length vectors later. You would need to use the ContainerVT = getContainerForFixedLengthVector(...) pattern that we use elsewhere.
TODO added
Done
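For the later fixed-length change, the usual container pattern would look roughly like this (sketch only; helper names and signatures are assumed to match those used elsewhere in RISCVISelLowering.cpp):

  // Sketch: operate on the scalable container type and convert back.
  if (VT.isFixedLengthVector()) {
    MVT SimpleVT = VT.getSimpleVT();
    MVT ContainerVT = getContainerForFixedLengthVector(DAG, SimpleVT, Subtarget);
    MVT NarrowContainerVT =
        ContainerVT.changeVectorElementType(MVT::getIntegerVT(NarrowBits));
    SDValue Src =
        convertToScalableVector(NarrowContainerVT, NarrowOp, DAG, Subtarget);
    auto [Mask, VL] = getDefaultVLOps(SimpleVT, ContainerVT, DL, DAG, Subtarget);
    SDValue Mul = DAG.getNode(Opcode, DL, ContainerVT, Src,
                              DAG.getConstant(1ULL << ShAmtInt, DL,
                                              NarrowContainerVT),
                              DAG.getUNDEF(ContainerVT), Mask, VL);
    return convertFromScalableVector(SimpleVT, Mul, DAG, Subtarget);
  }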
; RV32ZVE32F-NEXT: andi a3, t0, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
vsext + vsll + vadd folded into vwmaccus.
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vzext.vf2 v10, v8
; RV32-NEXT: vsll.vi v8, v10, 2
This did not fold on RV32. I will investigate.
; CHECK-ZVBB-RV64: # %bb.0:
; CHECK-ZVBB-RV64-NEXT: li a0, 4
; CHECK-ZVBB-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-RV64-NEXT: vwmulu.vx v10, v8, a0
Pessimized. I will investigate.
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
; CHECK-ZVBB-NEXT: vwmulu.vx v10, v8, a0
Pessimized.
    if (!VT.isScalableVector())
      return SDValue();
Done