[RISCV] Optimize two source deinterleave2 via ri.vunzip2{a,b} #142667
Conversation
The motivation is basically the same as the vnsrl cases; we'd rather do three simple linear-in-LMUL operations than fall back to a vrgather on at least one source.
This allows us to use a single instruction instead of needing to split and slide.
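For reference, this is the kind of shuffle being targeted; the function below is taken from the fixed-vectors-shuffle-deinterleave2.ll test in this patch. A two source deinterleave2 collects the even elements of %a followed by the even elements of %b.

; Two-source deinterleave2: the result is <a0, a2, b0, b2>.
define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
entry:
  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64> %c
}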
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes

As done for the existing vnsrl cases, we can split a two source deinterleave2 into two single source deinterleave2 and a slideup. We can also use a concat-then-deinterleave2 tactic. Both are equally valid (except in the m8 source type case), and the concat-then-deinterleave2 saves one instruction for fractional LMUL cases.

Additionally, if we happen to know the exact VLEN and our fixed vectors exactly fill a whole number of vector registers, we can avoid the need to split or concat entirely and just use both register sources.

In the review, I included these as separate changes since I find that slightly easier to follow. I can either land these squashed or individually as reviewers prefer.

Full diff: https://github.com/llvm/llvm-project/pull/142667.diff

2 Files Affected:
- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f74ca2a1c5492..777f4f91908d4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5830,6 +5830,9 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
if (V2.isUndef())
return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+ if (auto VLEN = Subtarget.getRealVLen();
+ VLEN && VT.getSizeInBits().getKnownMinValue() % *VLEN == 0)
+ return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
if (SDValue Src = foldConcatVector(V1, V2)) {
EVT NewVT = VT.getDoubleNumVectorElementsVT();
Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
@@ -5837,6 +5840,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
return DAG.getExtractSubvector(DL, VT, Res, 0);
}
+ // Narrow each source and concatenate them.
+ // FIXME: For small LMUL it is better to concatenate first.
+ if (1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+ 1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx >= (int)Mask.size(); })) {
+ SDValue Lo = lowerVZIP(Opc, V1, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+ SDValue Hi = lowerVZIP(Opc, V2, DAG.getUNDEF(VT), DL, DAG, Subtarget);
+
+ MVT SubVT = VT.getHalfNumVectorElementsVT();
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
+ DAG.getExtractSubvector(DL, SubVT, Lo, 0),
+ DAG.getExtractSubvector(DL, SubVT, Hi, 0));
+ }
}
if (SDValue V =
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
index 9c884454aa025..14b0e8352efa3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
@@ -1364,13 +1364,11 @@ define <4 x i64> @unzip2a_dual_v4i64(<4 x i64> %a, <4 x i64> %b) {
;
; ZIP-LABEL: unzip2a_dual_v4i64:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv.v.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1502,16 +1500,11 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) {
; ZIP-LABEL: unzip2a_dual_v16i64:
; ZIP: # %bb.0: # %entry
; ZIP-NEXT: vsetivli zero, 8, e64, m2, ta, ma
-; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v10
-; ZIP-NEXT: vsetivli zero, 16, e16, m1, ta, ma
-; ZIP-NEXT: vid.v v8
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: vadd.vv v8, v8, v8
-; ZIP-NEXT: vmv.s.x v0, a0
-; ZIP-NEXT: vadd.vi v8, v8, -16
-; ZIP-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; ZIP-NEXT: vrgatherei16.vv v16, v12, v8, v0.t
-; ZIP-NEXT: vmv.v.v v8, v16
+; ZIP-NEXT: ri.vunzip2a.vv v16, v12, v14
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
+; ZIP-NEXT: vslideup.vi v12, v16, 8
+; ZIP-NEXT: vmv.v.v v8, v12
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1557,13 +1550,9 @@ define <4 x i64> @unzip2a_dual_v4i64_exact(<4 x i64> %a, <4 x i64> %b) vscale_ra
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1609,13 +1598,12 @@ define <4 x i64> @unzip2a_dual_v4i64_exact_nf2(<4 x i64> %a, <4 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v4i64_exact_nf2:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vmv.v.i v0, 8
-; ZIP-NEXT: vslideup.vi v10, v9, 2
-; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.i v0, 12
-; ZIP-NEXT: ri.vunzip2a.vv v11, v8, v9
-; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v11, v9, v10
+; ZIP-NEXT: ri.vunzip2a.vv v9, v8, v10
+; ZIP-NEXT: vsetvli zero, zero, e64, m1, tu, ma
+; ZIP-NEXT: vslideup.vi v9, v11, 2
+; ZIP-NEXT: vmv1r.v v8, v9
; ZIP-NEXT: ret
entry:
%c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1740,37 +1728,9 @@ define <16 x i64> @unzip2a_dual_v16i64_exact(<16 x i64> %a, <16 x i64> %b) vscal
;
; ZIP-LABEL: unzip2a_dual_v16i64_exact:
; ZIP: # %bb.0: # %entry
-; ZIP-NEXT: vsetivli zero, 4, e64, m1, ta, mu
-; ZIP-NEXT: vslideup.vi v18, v15, 2
-; ZIP-NEXT: vmv.v.i v16, 8
-; ZIP-NEXT: vmv.v.i v17, 12
-; ZIP-NEXT: vslideup.vi v20, v13, 2
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v15, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v15, v14, v19
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v15, v15, v18, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v20, v13, 1, v0.t
-; ZIP-NEXT: ri.vunzip2a.vv v14, v12, v13
-; ZIP-NEXT: vslideup.vi v12, v11, 2
-; ZIP-NEXT: vslideup.vi v18, v9, 2
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v14, v14, v20, v0
-; ZIP-NEXT: li a0, -256
-; ZIP-NEXT: ri.vunzip2a.vv v20, v10, v13
-; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v19
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v12, v11, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v13, v20, v12, v0
-; ZIP-NEXT: vmv.v.v v0, v16
-; ZIP-NEXT: vslideup.vi v18, v9, 1, v0.t
-; ZIP-NEXT: vmv.v.v v0, v17
-; ZIP-NEXT: vmerge.vvm v12, v10, v18, v0
-; ZIP-NEXT: vmv.s.x v0, a0
; ZIP-NEXT: vsetivli zero, 16, e64, m4, ta, ma
-; ZIP-NEXT: vmerge.vvm v8, v12, v12, v0
+; ZIP-NEXT: ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT: vmv.v.v v8, v16
; ZIP-NEXT: ret
entry:
%c = shufflevector <16 x i64> %a, <16 x i64> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
This saves one unzip instruction, and avoids a vsetvl toggle.
Are we intentionally missing coverage for vunzip2b?
No reason other than that it didn't seem interesting, given the opcode and matching bits weren't changing. Happy to duplicate the tests if you'd like.
LGTM
As long as it was intentional I don't care that much. I wanted to make sure there weren't tests that should have been updated but weren't.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/18797

Here is the relevant piece of the build log for reference.
As done for the existing vnsrl cases, we can split a two source deinterleave2
into two single source deinterleave2 and a slideup. We can also use a
concat-then-deinterleave2 tactic. Both are equally valid (except in the m8
source type case), and the concat-then-deinterleave2 saves one instruction
for fractional LMUL cases.
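As a minimal sketch (the function name is illustrative, not from the patch), the split tactic corresponds to deinterleaving each source on its own and then combining the two low halves; the combining shuffle is what becomes the vslideup in the generated code.

; Illustrative IR-level equivalent of the split tactic.
define <4 x i64> @unzip2a_split_sketch(<4 x i64> %a, <4 x i64> %b) {
entry:
  ; Deinterleave each source independently (single source deinterleave2).
  %lo = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
  %hi = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
  ; Concatenate the two low halves; this maps to the vslideup.vi in the tests above.
  %c = shufflevector <4 x i64> %lo, <4 x i64> %hi, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %c
}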
Additionally, if we happen to know the exact VLEN and our fixed vectors exactly
fill a whole number of vector registers, we can avoid the need to split or
concat entirely and just use both register sources.
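A hedged sketch of the exact VLEN case, assuming VLEN=256 (vscale_range(4,4) with 64-bit RVV blocks) so that <4 x i64> exactly fills one vector register; the function name and the specific range are illustrative, but the shape mirrors the *_exact tests in the patch.

; Assumed VLEN=256: <4 x i64> is exactly one register, so no split or
; concat is needed and both register sources feed a single ri.vunzip2a.vv.
define <4 x i64> @unzip2a_dual_exact_sketch(<4 x i64> %a, <4 x i64> %b) vscale_range(4,4) {
entry:
  %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i64> %c
}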
In the review, I included these as separate changes since I find that slightly
easier to follow. I can either land these squashed or individually as reviewers
prefer.