[RISCV] Use ri.vzip2{a,b} for interleave2 if available #136364
Conversation
If XRivosVizip is available, the ri.vzip2a and ri.vzip2b instructions can be used to perform an interleave shuffle. This patch only affects the intrinsic lowering (and thus scalable vectors). Fixed vectors go through shuffle lowering, and the zip2a (but not zip2b) case is already handled there.
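As a quick illustration of the operation being lowered, the following is a scalar C++ model of what a two-source interleave computes; the array sizes, names, and printout are illustrative assumptions, not part of the patch. With this change, the low half of such a result comes from a single ri.vzip2a and the high half from a single ri.vzip2b instead of the widening-arithmetic sequence.

// Hedged sketch: a scalar model of a two-source interleave, for illustration
// only. Sizes and names here are made up and do not come from the PR.
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
std::array<int, 2 * N> interleave2(const std::array<int, N> &A,
                                   const std::array<int, N> &B) {
  std::array<int, 2 * N> Out{};
  for (std::size_t I = 0; I < N; ++I) {
    Out[2 * I] = A[I];     // even result lanes come from the first operand
    Out[2 * I + 1] = B[I]; // odd result lanes come from the second operand
  }
  return Out;
}

int main() {
  std::array<int, 4> A{0, 1, 2, 3}, B{10, 11, 12, 13};
  for (int V : interleave2(A, B))
    std::printf("%d ", V); // prints: 0 10 1 11 2 12 3 13
  std::printf("\n");
}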
@llvm/pr-subscribers-backend-risc-v
Author: Philip Reames (preames)
Changes: If XRivosVizip is available, the ri.vzip2a and ri.vzip2b instructions can be used to perform an interleave shuffle. This patch only affects the intrinsic lowering (and thus scalable vectors). Fixed vectors go through shuffle lowering, and the zip2a (but not zip2b) case is already handled there.
Patch is 67.37 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136364.diff
4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 98fba9e86e88a..6abcba4fa4935 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5018,8 +5018,8 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
const SDLoc &DL, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
assert(RISCVISD::RI_VZIPEVEN_VL == Opc || RISCVISD::RI_VZIPODD_VL == Opc ||
- RISCVISD::RI_VZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2A_VL == Opc ||
- RISCVISD::RI_VUNZIP2B_VL == Opc);
+ RISCVISD::RI_VZIP2A_VL == Opc || RISCVISD::RI_VZIP2B_VL == Opc ||
+ RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc);
assert(Op0.getSimpleValueType() == Op1.getSimpleValueType());
MVT VT = Op0.getSimpleValueType();
@@ -6935,7 +6935,7 @@ static bool hasPassthruOp(unsigned Opcode) {
Opcode <= RISCVISD::LAST_STRICTFP_OPCODE &&
"not a RISC-V target specific op");
static_assert(
- RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 132 &&
+ RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 133 &&
RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 &&
"adding target specific op should update this function");
if (Opcode >= RISCVISD::ADD_VL && Opcode <= RISCVISD::VFMAX_VL)
@@ -6959,7 +6959,7 @@ static bool hasMaskOp(unsigned Opcode) {
Opcode <= RISCVISD::LAST_STRICTFP_OPCODE &&
"not a RISC-V target specific op");
static_assert(
- RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 132 &&
+ RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 133 &&
RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 &&
"adding target specific op should update this function");
if (Opcode >= RISCVISD::TRUNCATE_VECTOR_VL && Opcode <= RISCVISD::SETCC_VL)
@@ -11753,6 +11753,17 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
return DAG.getMergeValues(Loads, DL);
}
+ // Use ri.vzip2{a,b} if available
+ // TODO: Figure out the best lowering for the spread variants
+ if (Subtarget.hasVendorXRivosVizip() &&
+ !Op.getOperand(0).isUndef() && !Op.getOperand(1).isUndef()) {
+ SDValue V1 = Op->getOperand(0);
+ SDValue V2 = Op->getOperand(1);
+ SDValue Lo = lowerVZIP(RISCVISD::RI_VZIP2A_VL, V1, V2, DL, DAG, Subtarget);
+ SDValue Hi = lowerVZIP(RISCVISD::RI_VZIP2B_VL, V1, V2, DL, DAG, Subtarget);
+ return DAG.getMergeValues({Lo, Hi}, DL);
+ }
+
// If the element type is smaller than ELEN, then we can interleave with
// vwaddu.vv and vwmaccu.vx
if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) {
@@ -22256,6 +22267,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RI_VZIPEVEN_VL)
NODE_NAME_CASE(RI_VZIPODD_VL)
NODE_NAME_CASE(RI_VZIP2A_VL)
+ NODE_NAME_CASE(RI_VZIP2B_VL)
NODE_NAME_CASE(RI_VUNZIP2A_VL)
NODE_NAME_CASE(RI_VUNZIP2B_VL)
NODE_NAME_CASE(READ_CSR)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index baf1b2e4d8e6e..6e50ab8e1f296 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -408,6 +408,7 @@ enum NodeType : unsigned {
RI_VZIPEVEN_VL,
RI_VZIPODD_VL,
RI_VZIP2A_VL,
+ RI_VZIP2B_VL,
RI_VUNZIP2A_VL,
RI_VUNZIP2B_VL,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
index 147f89850765a..110dfdff7f29a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td
@@ -71,6 +71,7 @@ defm RI_VUNZIP2B_V : VALU_IV_V<"ri.vunzip2b", 0b011000>;
def ri_vzipeven_vl : SDNode<"RISCVISD::RI_VZIPEVEN_VL", SDT_RISCVIntBinOp_VL>;
def ri_vzipodd_vl : SDNode<"RISCVISD::RI_VZIPODD_VL", SDT_RISCVIntBinOp_VL>;
def ri_vzip2a_vl : SDNode<"RISCVISD::RI_VZIP2A_VL", SDT_RISCVIntBinOp_VL>;
+def ri_vzip2b_vl : SDNode<"RISCVISD::RI_VZIP2B_VL", SDT_RISCVIntBinOp_VL>;
def ri_vunzip2a_vl : SDNode<"RISCVISD::RI_VUNZIP2A_VL", SDT_RISCVIntBinOp_VL>;
def ri_vunzip2b_vl : SDNode<"RISCVISD::RI_VUNZIP2B_VL", SDT_RISCVIntBinOp_VL>;
@@ -84,6 +85,7 @@ let Predicates = [HasVendorXRivosVizip],
defm PseudoRI_VZIPEVEN : RIVPseudoVALU_VV;
defm PseudoRI_VZIPODD : RIVPseudoVALU_VV;
defm PseudoRI_VZIP2A : RIVPseudoVALU_VV;
+defm PseudoRI_VZIP2B : RIVPseudoVALU_VV;
defm PseudoRI_VUNZIP2A : RIVPseudoVALU_VV;
defm PseudoRI_VUNZIP2B : RIVPseudoVALU_VV;
}
@@ -102,6 +104,7 @@ multiclass RIVPatBinaryVL_VV<SDPatternOperator vop, string instruction_name,
defm : RIVPatBinaryVL_VV<ri_vzipeven_vl, "PseudoRI_VZIPEVEN">;
defm : RIVPatBinaryVL_VV<ri_vzipodd_vl, "PseudoRI_VZIPODD">;
defm : RIVPatBinaryVL_VV<ri_vzip2a_vl, "PseudoRI_VZIP2A">;
+defm : RIVPatBinaryVL_VV<ri_vzip2b_vl, "PseudoRI_VZIP2B">;
defm : RIVPatBinaryVL_VV<ri_vunzip2a_vl, "PseudoRI_VUNZIP2A">;
defm : RIVPatBinaryVL_VV<ri_vunzip2b_vl, "PseudoRI_VUNZIP2B">;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index b0eac8bdf48dd..a6322c50ff233 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -1,34 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
-; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,V,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,V,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,V,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,V,RV64
; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV32
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV64
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP
; Integers
define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) {
-; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmerge.vim v14, v10, 1, v0
-; CHECK-NEXT: srli a1, a1, 2
-; CHECK-NEXT: vwaddu.vv v8, v14, v12
-; CHECK-NEXT: vwmaccu.vx v8, a0, v12
-; CHECK-NEXT: vmsne.vi v12, v10, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v0, v12, a1
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv32i1_nxv16i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmv1r.v v9, v0
+; V-NEXT: vmv1r.v v0, v8
+; V-NEXT: vmv.v.i v10, 0
+; V-NEXT: li a0, -1
+; V-NEXT: csrr a1, vlenb
+; V-NEXT: vmerge.vim v12, v10, 1, v0
+; V-NEXT: vmv1r.v v0, v9
+; V-NEXT: vmerge.vim v14, v10, 1, v0
+; V-NEXT: srli a1, a1, 2
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: vmsne.vi v12, v10, 0
+; V-NEXT: vmsne.vi v0, v8, 0
+; V-NEXT: add a0, a1, a1
+; V-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; V-NEXT: vslideup.vx v0, v12, a1
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
; ZVBB: # %bb.0:
@@ -49,20 +50,40 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
; ZVBB-NEXT: vslideup.vx v0, v8, a1
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv32i1_nxv16i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmv1r.v v9, v0
+; ZIP-NEXT: vmv1r.v v0, v8
+; ZIP-NEXT: vmv.v.i v10, 0
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: vmerge.vim v12, v10, 1, v0
+; ZIP-NEXT: vmv1r.v v0, v9
+; ZIP-NEXT: vmerge.vim v8, v10, 1, v0
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v14, v8, v12
+; ZIP-NEXT: vmsne.vi v8, v10, 0
+; ZIP-NEXT: vmsne.vi v0, v14, 0
+; ZIP-NEXT: add a1, a0, a0
+; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT: vslideup.vx v0, v8, a0
+; ZIP-NEXT: ret
%res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
ret <vscale x 32 x i1> %res
}
define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: vector_interleave_nxv32i8_nxv16i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv2r.v v12, v10
-; CHECK-NEXT: vmv2r.v v14, v8
-; CHECK-NEXT: vwaddu.vv v8, v14, v12
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v8, a0, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv32i8_nxv16i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv32i8_nxv16i8:
; ZVBB: # %bb.0:
@@ -72,20 +93,29 @@ define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv16i8(<vscale x 16 x i8>
; ZVBB-NEXT: vwsll.vi v8, v12, 8
; ZVBB-NEXT: vwaddu.wv v8, v8, v14
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv32i8_nxv16i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
%res = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 32 x i8> %res
}
define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv2r.v v12, v10
-; CHECK-NEXT: vmv2r.v v14, v8
-; CHECK-NEXT: vwaddu.vv v8, v14, v12
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v8, a0, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv16i16_nxv8i16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv16i16_nxv8i16:
; ZVBB: # %bb.0:
@@ -95,20 +125,29 @@ define <vscale x 16 x i16> @vector_interleave_nxv16i16_nxv8i16(<vscale x 8 x i16
; ZVBB-NEXT: vwsll.vi v8, v12, 16
; ZVBB-NEXT: vwaddu.wv v8, v8, v14
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv16i16_nxv8i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
%res = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
ret <vscale x 16 x i16> %res
}
define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
-; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv2r.v v12, v10
-; CHECK-NEXT: vmv2r.v v14, v8
-; CHECK-NEXT: vwaddu.vv v8, v14, v12
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwmaccu.vx v8, a0, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv8i32_nxv4i32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv8i32_nxv4i32:
; ZVBB: # %bb.0:
@@ -119,25 +158,34 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32(<vscale x 4 x i32>
; ZVBB-NEXT: vwsll.vx v8, v12, a0
; ZVBB-NEXT: vwaddu.wv v8, v8, v14
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8i32_nxv4i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
ret <vscale x 8 x i32> %res
}
define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) {
-; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; CHECK-NEXT: vid.v v12
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vand.vi v13, v12, 1
-; CHECK-NEXT: vmsne.vi v0, v13, 0
-; CHECK-NEXT: vsrl.vi v16, v12, 1
-; CHECK-NEXT: vadd.vx v16, v16, a0, v0.t
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv4i64_nxv2i64:
+; V: # %bb.0:
+; V-NEXT: csrr a0, vlenb
+; V-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; V-NEXT: vid.v v12
+; V-NEXT: srli a0, a0, 2
+; V-NEXT: vand.vi v13, v12, 1
+; V-NEXT: vmsne.vi v0, v13, 0
+; V-NEXT: vsrl.vi v16, v12, 1
+; V-NEXT: vadd.vx v16, v16, a0, v0.t
+; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT: vrgatherei16.vv v12, v8, v16
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv4i64_nxv2i64:
; ZVBB: # %bb.0:
@@ -153,32 +201,41 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
; ZVBB-NEXT: vmv.v.v v8, v12
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4i64_nxv2i64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
%res = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 4 x i64> %res
}
define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
-; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v9, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmv.v.i v24, 0
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmerge.vim v24, v24, 1, v0
-; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT: vwaddu.vv v8, v24, v16
-; CHECK-NEXT: vwaddu.vv v0, v28, v20
-; CHECK-NEXT: vwmaccu.vx v8, a0, v16
-; CHECK-NEXT: vwmaccu.vx v0, a0, v20
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v16, v8, 0
-; CHECK-NEXT: vmsne.vi v8, v0, 0
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv128i1_nxv64i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmv1r.v v9, v0
+; V-NEXT: vmv1r.v v0, v8
+; V-NEXT: vmv.v.i v24, 0
+; V-NEXT: li a0, -1
+; V-NEXT: vmerge.vim v16, v24, 1, v0
+; V-NEXT: vmv1r.v v0, v9
+; V-NEXT: vmerge.vim v24, v24, 1, v0
+; V-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; V-NEXT: vwaddu.vv v8, v24, v16
+; V-NEXT: vwaddu.vv v0, v28, v20
+; V-NEXT: vwmaccu.vx v8, a0, v16
+; V-NEXT: vwmaccu.vx v0, a0, v20
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmsne.vi v16, v8, 0
+; V-NEXT: vmsne.vi v8, v0, 0
+; V-NEXT: vmv1r.v v0, v16
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv128i1_nxv64i1:
; ZVBB: # %bb.0:
@@ -197,22 +254,42 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
; ZVBB-NEXT: vmsne.vi v8, v0, 0
; ZVBB-NEXT: vmv1r.v v0, v16
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv128i1_nxv64i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmv1r.v v9, v0
+; ZIP-NEXT: vmv1r.v v0, v8
+; ZIP-NEXT: vmv.v.i v24, 0
+; ZIP-NEXT: vmerge.vim v16, v24, 1, v0
+; ZIP-NEXT: vmv1r.v v0, v9
+; ZIP-NEXT: vmerge.vim v8, v24, 1, v0
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v4, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v28, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v0, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v24, v12, v20
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmsne.vi v9, v0, 0
+; ZIP-NEXT: vmsne.vi v8, v24, 0
+; ZIP-NEXT: vmv1r.v v0, v9
+; ZIP-NEXT: ret
%res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
ret <vscale x 128 x i1> %res
}
define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b) {
-; CHECK-LABEL: vector_interleave_nxv128i8_nxv64i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vwaddu.vv v8, v24, v16
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwaddu.vv v0, v28, v20
-; CHECK-NEXT: vwmaccu.vx v8, a0, v16
-; CHECK-NEXT: vwmaccu.vx v0, a0, v20
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_interleave_nxv128i8_nxv64i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vwaddu.vv v8, v24, v16
+; V-NEXT: li a0, -1
+; V-NEXT: vwaddu.vv v0, v28, v20
+; V-NEXT: vwmaccu.vx v8, a0, v16
+; V-NEXT: vwmaccu.vx v0, a0, v20
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv128i8_nxv64i8:
; ZVBB: # %bb.0:
@@ -224,22 +301,33 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
; ZVBB-NEXT: vmv8r.v v8, v24
; ZVBB-NEXT: vmv8r.v v16, v0
; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv128i8_nxv64i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%res = call <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b)
ret <vscale x 128 x i8> %res
}
define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i16> %a, <vscale x 32 x i16> %b) {
-; CHECK-LABEL: vector_interleave_nxv64i16_nxv32i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vwaddu.vv v8, v24, v16
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: vwaddu.vv v0, v28, v20
-; CHECK-NEXT: vwmaccu.vx v8, a...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
if (Subtarget.hasVendorXRivosVizip() && !Op.getOperand(0).isUndef() &&
    !Op.getOperand(1).isUndef()) {
Good catch with the undef case. But should we only skip undefs if VecVT.getScalarSizeInBits() < Subtarget.getELen()? At e64 I don't think we get the vzext/vwsll.vx since it gets lowered as a vrgather.vv.
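A rough sketch of what that narrower guard might look like (the names mirror the patch, but this is an illustration, not code from the PR):

// Hedged sketch: only treat undef operands as a reason to skip the zip
// lowering when the widening (vwaddu/vwsll) path is actually reachable,
// i.e. when the element width is below ELEN.
bool BelowELen = VecVT.getScalarSizeInBits() < Subtarget.getELen();
bool HasUndefOperand =
    Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef();
if (Subtarget.hasVendorXRivosVizip() && !(BelowELen && HasUndefOperand)) {
  // ... emit ri.vzip2a / ri.vzip2b as in the patch ...
}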
The vrgather case doesn't have a problem with undef propagation because there's only one use of each operand. The guard here honestly isn't entirely sufficient; we probably need to be using freeze in both this case and the unzip2a/b case I added. The guard I added wasn't directly motivated by the legality issue (I'm going to return to that shortly in its own patch); it was motivated by profitability. Losing the zero extend lowering for the spread2 case causes a few of the tests to regress. I'm going to post a patch to reorganize things, but doing it as a separate change seemed easier.
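A freeze-based variant along those lines might look roughly like the following; this is only a sketch of the idea, not code from this PR (SelectionDAG::getFreeze is the existing hook used here):

// Hedged sketch: freeze both operands before emitting the two zips so a
// poison/undef input cannot be observed with two different values.
SDValue V1 = DAG.getFreeze(Op->getOperand(0));
SDValue V2 = DAG.getFreeze(Op->getOperand(1));
SDValue Lo = lowerVZIP(RISCVISD::RI_VZIP2A_VL, V1, V2, DL, DAG, Subtarget);
SDValue Hi = lowerVZIP(RISCVISD::RI_VZIP2B_VL, V1, V2, DL, DAG, Subtarget);
return DAG.getMergeValues({Lo, Hi}, DL);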
Losing the zero extend lowering for the spread2 case causes a few of the tests to regress.
I wasn't even aware of the legality issue; I just spotted the poison tests and assumed this was so we don't lose the combine. Happy to look at this in a follow-up.