[RISCV] Use ri.vunzip2{a,b} for e64 fixed length deinterleave(2) shuffles #137217
Conversation
If we have xrivosvizip, we can use the vunzip2{a,b} instructions for these cases *provided* that we can prove the layout in the two registers matches the fixed-length semantics.

The majority of this patch is a straightforward port of the existing vnsrl logic, which has the same requirement (though for slightly different reasoning).

The one complicated bit is the addition of the scalable splitting logic inside lowerVZIP to exploit the independent register operands and allow the use of a lower LMUL. This bit is annoyingly complicated, and really "should" be a DAG combine - except that the VL and mask reduction becomes hard when the VL is not known to be a constant.
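For context, here is a minimal IR sketch of the single-source e64 deinterleave(2) shuffles this patch targets (the function names are illustrative, not taken from the patch's tests): an index-0 (even-element) mask maps to ri.vunzip2a and an index-1 (odd-element) mask to ri.vunzip2b.

```llvm
; Sketch only: factor-2 deinterleave of a fixed-length e64 vector.
define <2 x i64> @deinterleave2_even(<4 x i64> %v) {
  ; Index 0 (even elements, mask <0, 2>) -> ri.vunzip2a.vv with +xrivosvizip.
  %even = shufflevector <4 x i64> %v, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %even
}

define <2 x i64> @deinterleave2_odd(<4 x i64> %v) {
  ; Index 1 (odd elements, mask <1, 3>) -> ri.vunzip2b.vv with +xrivosvizip.
  %odd = shufflevector <4 x i64> %v, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %odd
}
```

The llvm.vector.deinterleave2 cases in vector-deinterleave-fixed.ll reach the same shuffle lowering, which is why their ZIP check lines below switch to ri.vunzip2a/ri.vunzip2b as well.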
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes

If we have xrivosvizip, we can use the vunzip2{a,b} instructions for these cases provided that we can prove the layout in the two registers matches the fixed-length semantics.

The majority of this patch is a straightforward port of the existing vnsrl logic, which has the same requirement (though for slightly different reasoning).

The one complicated bit is the addition of the scalable splitting logic inside lowerVZIP to exploit the independent register operands and allow the use of a lower LMUL. This bit is annoyingly complicated, and really "should" be a DAG combine - except that the VL and mask reduction becomes hard when the VL is not known to be a constant.

Full diff: https://github.com/llvm/llvm-project/pull/137217.diff

3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index c440df5a3e638..a20d89cd8f8d1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5062,9 +5062,27 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
}
- auto [Mask, VL] = getDefaultVLOps(IntVT, ContainerVT, DL, DAG, Subtarget);
- SDValue Passthru = DAG.getUNDEF(ContainerVT);
- SDValue Res = DAG.getNode(Opc, DL, ContainerVT, Op0, Op1, Passthru, Mask, VL);
+ MVT InnerVT = ContainerVT;
+ auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget);
+ if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+ (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) {
+ InnerVT = ContainerVT.getHalfNumVectorElementsVT();
+ VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL,
+ Subtarget.getXLenVT());
+ Mask = getAllOnesMask(InnerVT, VL, DL, DAG);
+ unsigned HighIdx = InnerVT.getVectorElementCount().getKnownMinValue();
+ Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InnerVT, Op0,
+ DAG.getVectorIdxConstant(HighIdx, DL));
+ Op0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InnerVT, Op0,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
+ SDValue Passthru = DAG.getUNDEF(InnerVT);
+ SDValue Res = DAG.getNode(Opc, DL, InnerVT, Op0, Op1, Passthru, Mask, VL);
+ if (InnerVT.bitsLT(ContainerVT))
+ Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Res,
+ DAG.getVectorIdxConstant(0, DL));
if (IntVT.isFixedLengthVector())
Res = convertFromScalableVector(IntVT, Res, DAG, Subtarget);
Res = DAG.getBitcast(VT, Res);
@@ -5768,6 +5786,26 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}
+ // If this is a deinterleave(2), try using vunzip{a,b}. This mostly catches
+ // e64 which can't match above.
+ unsigned Index = 0;
+ if (Subtarget.hasVendorXRivosVizip() &&
+ ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2, Index) &&
+ 1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
+ unsigned Opc =
+ Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
+ if (V2.isUndef())
+ return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+ if (SDValue Src = foldConcatVector(V1, V2)) {
+ EVT NewVT = VT.getDoubleNumVectorElementsVT();
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, Src, ZeroIdx);
+ SDValue Res =
+ lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, ZeroIdx);
+ }
+ }
+
if (SDValue V =
lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
index c65d7c36a2198..b692a80159288 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
@@ -347,9 +347,8 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) {
; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; ZIP-NEXT: vle64.v v8, (a0)
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 2
-; ZIP-NEXT: vslideup.vi v8, v9, 1
-; ZIP-NEXT: vse64.v v8, (a1)
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: vse64.v v10, (a1)
; ZIP-NEXT: ret
entry:
%0 = load <4 x i64>, ptr %in, align 8
@@ -383,8 +382,7 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; ZIP-NEXT: vle64.v v8, (a0)
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 2
-; ZIP-NEXT: ri.vzipodd.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v10, v8, v9
; ZIP-NEXT: vse64.v v10, (a1)
; ZIP-NEXT: ret
entry:
@@ -417,10 +415,9 @@ define void @vnsrl_0_double(ptr %in, ptr %out) {
; ZIP: # %bb.0: # %entry
; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; ZIP-NEXT: vle64.v v8, (a0)
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 2
-; ZIP-NEXT: vslideup.vi v8, v9, 1
-; ZIP-NEXT: vse64.v v8, (a1)
+; ZIP-NEXT: vse64.v v10, (a1)
; ZIP-NEXT: ret
entry:
%0 = load <4 x double>, ptr %in, align 8
@@ -453,9 +450,8 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
; ZIP: # %bb.0: # %entry
; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
; ZIP-NEXT: vle64.v v8, (a0)
+; ZIP-NEXT: ri.vunzip2b.vv v10, v8, v9
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 2
-; ZIP-NEXT: ri.vzipodd.vv v10, v8, v9
; ZIP-NEXT: vse64.v v10, (a1)
; ZIP-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f6b5a35aa06d6..3263ad121a9db 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -85,11 +85,11 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
;
; ZIP-LABEL: vector_deinterleave_v2i64_v4i64:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v10, v8, 2
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT: vslideup.vi v8, v10, 1
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v11, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: vmv.v.v v9, v11
; ZIP-NEXT: ret
%retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
ret {<2 x i64>, <2 x i64>} %retval
@@ -129,62 +129,51 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
; ZIP-LABEL: vector_deinterleave_v4i64_v8i64:
; ZIP: # %bb.0:
; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v12, v8, 1
-; ZIP-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT: vmv.v.i v0, 2
-; ZIP-NEXT: vmv.v.i v14, 12
-; ZIP-NEXT: vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT: vslidedown.vi v16, v8, 4
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v10, v8, 2
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT: vslidedown.vi v12, v8, 2, v0.t
-; ZIP-NEXT: ri.vzip2a.vv v18, v8, v10
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v8, v16, 2
-; ZIP-NEXT: vmv1r.v v0, v14
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT: ri.vzip2a.vv v12, v16, v8, v0.t
-; ZIP-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT: vslideup.vi v8, v16, 2
-; ZIP-NEXT: vslideup.vi v8, v16, 1, v0.t
-; ZIP-NEXT: vmv1r.v v0, v14
-; ZIP-NEXT: vmerge.vvm v8, v18, v8, v0
-; ZIP-NEXT: vmv2r.v v10, v12
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
; ZIP-NEXT: ret
%retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec)
ret {<4 x i64>, <4 x i64>} %retval
}
define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_v8i64_v16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 85
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v0, -16
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v24, v8, 8
-; CHECK-NEXT: vmv.s.x v12, a0
-; CHECK-NEXT: li a0, 170
-; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vv v20, v16, v16
-; CHECK-NEXT: vmv.s.x v21, a0
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vcompress.vm v16, v8, v12
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v22, v20, -8
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT: vcompress.vm v12, v8, v21
-; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT: vadd.vi v8, v20, -7
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v16, v24, v22, v0.t
-; CHECK-NEXT: vrgatherei16.vv v12, v24, v8, v0.t
-; CHECK-NEXT: vmv.v.v v8, v16
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_v8i64_v16i64:
+; V: # %bb.0:
+; V-NEXT: li a0, 85
+; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT: vmv.v.i v0, -16
+; V-NEXT: vid.v v16
+; V-NEXT: vsetivli zero, 8, e64, m8, ta, ma
+; V-NEXT: vslidedown.vi v24, v8, 8
+; V-NEXT: vmv.s.x v12, a0
+; V-NEXT: li a0, 170
+; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT: vadd.vv v20, v16, v16
+; V-NEXT: vmv.s.x v21, a0
+; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT: vcompress.vm v16, v8, v12
+; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT: vadd.vi v22, v20, -8
+; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT: vcompress.vm v12, v8, v21
+; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT: vadd.vi v8, v20, -7
+; V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
+; V-NEXT: vrgatherei16.vv v16, v24, v22, v0.t
+; V-NEXT: vrgatherei16.vv v12, v24, v8, v0.t
+; V-NEXT: vmv.v.v v8, v16
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_v8i64_v16i64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v20, v8, v12
+; ZIP-NEXT: vmv.v.v v8, v16
+; ZIP-NEXT: vmv.v.v v12, v20
+; ZIP-NEXT: ret
%retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec)
ret {<8 x i64>, <8 x i64>} %retval
}
@@ -622,11 +611,11 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
;
; ZIP-LABEL: vector_deinterleave_v2f64_v4f64:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v10, v8, 2
; ZIP-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT: ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT: vslideup.vi v8, v10, 1
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v12, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: vmv.v.v v9, v12
; ZIP-NEXT: ret
%retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
ret {<2 x double>, <2 x double>} %retval
@@ -665,31 +654,11 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
;
; ZIP-LABEL: vector_deinterleave_v4f64_v8f64:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT: vslidedown.vi v16, v8, 4
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v12, v8, 2
-; ZIP-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT: vmv.v.i v10, 12
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT: vslideup.vi v14, v16, 2
-; ZIP-NEXT: vslideup.vi v14, v16, 1, v0.t
-; ZIP-NEXT: ri.vzip2a.vv v18, v8, v12
-; ZIP-NEXT: vmv1r.v v0, v10
-; ZIP-NEXT: vmerge.vvm v12, v18, v14, v0
-; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT: vslidedown.vi v14, v16, 2
-; ZIP-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT: vmv.v.i v0, 2
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT: ri.vzip2a.vv v18, v16, v14
-; ZIP-NEXT: vslidedown.vi v14, v8, 1
-; ZIP-NEXT: vslidedown.vi v14, v8, 2, v0.t
-; ZIP-NEXT: vmv1r.v v0, v10
-; ZIP-NEXT: vmerge.vvm v10, v14, v18, v0
-; ZIP-NEXT: vmv2r.v v8, v12
+; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v16, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v16
; ZIP-NEXT: ret
%retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec)
ret {<4 x double>, <4 x double>} %retval