
Commit 0c6e03e

topperc and 4vtomat authored
[RISCV] Fold vp.store(vp.reverse(VAL), ADDR, MASK) -> vp.strided.store(VAL, NEW_ADDR, -1, MASK) (#123123)
Co-authored-by: Brandon Wu <brandon.wu@sifive.com>
1 parent 65cd9e4 commit 0c6e03e
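
The combine runs on the SelectionDAG, but its effect is easiest to see written out as LLVM IR. The sketch below is illustrative only: it uses opaque-pointer syntax and the nxv2f32 element type from the new test, and the intrinsic mangling suffixes and the %lastidx/%base names are assumptions rather than lines from the commit. Note that the "-1" in the title means one element backwards; in the code the byte stride is minus the element size, so -4 for f32.

  ; before: the value is reversed in registers, then stored contiguously
  %rev = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x i1> splat (i1 true), i32 %evl)
  call void @llvm.vp.store.nxv2f32.p0(<vscale x 2 x float> %rev, ptr %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)

  ; after (conceptually): store %val directly, starting at the address of the
  ; last active element and stepping backwards 4 bytes per element
  %lastidx = add i32 %evl, -1
  %base = getelementptr float, ptr %ptr, i32 %lastidx
  call void @llvm.experimental.vp.strided.store.nxv2f32.p0.i64(<vscale x 2 x float> %val, ptr %base, i64 -4, <vscale x 2 x i1> splat (i1 true), i32 %evl)

If the vp.store mask is itself an unmasked vp.reverse with the same EVL, the combine keeps the original (un-reversed) mask for the strided store; otherwise the fold is not applied.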

2 files changed: +153 -7 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 72 additions & 7 deletions
@@ -1524,13 +1524,17 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                        ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
   if (Subtarget.hasVInstructions())
-    setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
-                         ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
-                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
-                         ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
-                         ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
-                         ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
-                         ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP,
+    setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER,
+                         ISD::MSCATTER, ISD::VP_GATHER,
+                         ISD::VP_SCATTER, ISD::SRA,
+                         ISD::SRL, ISD::SHL,
+                         ISD::STORE, ISD::SPLAT_VECTOR,
+                         ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
+                         ISD::VP_STORE, ISD::EXPERIMENTAL_VP_REVERSE,
+                         ISD::MUL, ISD::SDIV,
+                         ISD::UDIV, ISD::SREM,
+                         ISD::UREM, ISD::INSERT_VECTOR_ELT,
+                         ISD::ABS, ISD::CTPOP,
                          ISD::VECTOR_SHUFFLE});
   if (Subtarget.hasVendorXTHeadMemPair())
     setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -16294,6 +16298,65 @@ static SDValue performVP_REVERSECombine(SDNode *N, SelectionDAG &DAG,
   return Ret;
 }
 
+static SDValue performVP_STORECombine(SDNode *N, SelectionDAG &DAG,
+                                      const RISCVSubtarget &Subtarget) {
+  // Fold:
+  //    vp.store(vp.reverse(VAL), ADDR, MASK) -> vp.strided.store(VAL, NEW_ADDR,
+  //    -1, MASK)
+  auto *VPStore = cast<VPStoreSDNode>(N);
+
+  if (VPStore->getValue().getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE)
+    return SDValue();
+
+  SDValue VPReverse = VPStore->getValue();
+  EVT ReverseVT = VPReverse->getValueType(0);
+
+  // We do not have a strided_store version for masks, and the evl of vp.reverse
+  // and vp.store should always be the same.
+  if (!ReverseVT.getVectorElementType().isByteSized() ||
+      VPStore->getVectorLength() != VPReverse.getOperand(2) ||
+      !VPReverse.hasOneUse())
+    return SDValue();
+
+  SDValue StoreMask = VPStore->getMask();
+  // If Mask is all ones, then load is unmasked and can be reversed.
+  if (!isOneOrOneSplat(StoreMask)) {
+    // If the mask is not all ones, we can reverse the store if the mask was
+    // also reversed by an unmasked vp.reverse with the same EVL.
+    if (StoreMask.getOpcode() != ISD::EXPERIMENTAL_VP_REVERSE ||
+        !isOneOrOneSplat(StoreMask.getOperand(1)) ||
+        StoreMask.getOperand(2) != VPStore->getVectorLength())
+      return SDValue();
+    StoreMask = StoreMask.getOperand(0);
+  }
+
+  // Base = StoreAddr + (NumElem - 1) * ElemWidthByte
+  SDLoc DL(N);
+  MVT XLenVT = Subtarget.getXLenVT();
+  SDValue NumElem = VPStore->getVectorLength();
+  uint64_t ElemWidthByte = VPReverse.getValueType().getScalarSizeInBits() / 8;
+
+  SDValue Temp1 = DAG.getNode(ISD::SUB, DL, XLenVT, NumElem,
+                              DAG.getConstant(1, DL, XLenVT));
+  SDValue Temp2 = DAG.getNode(ISD::MUL, DL, XLenVT, Temp1,
+                              DAG.getConstant(ElemWidthByte, DL, XLenVT));
+  SDValue Base =
+      DAG.getNode(ISD::ADD, DL, XLenVT, VPStore->getBasePtr(), Temp2);
+  SDValue Stride = DAG.getConstant(-ElemWidthByte, DL, XLenVT);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo(VPStore->getAddressSpace());
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, VPStore->getMemOperand()->getFlags(),
+      LocationSize::beforeOrAfterPointer(), VPStore->getAlign());
+
+  return DAG.getStridedStoreVP(
+      VPStore->getChain(), DL, VPReverse.getOperand(0), Base,
+      VPStore->getOffset(), Stride, StoreMask, VPStore->getVectorLength(),
+      VPStore->getMemoryVT(), MMO, VPStore->getAddressingMode(),
+      VPStore->isTruncatingStore(), VPStore->isCompressingStore());
+}
+
 // Convert from one FMA opcode to another based on whether we are negating the
 // multiply result and/or the accumulator.
 // NOTE: Only supports RVV operations with VL.
@@ -18474,6 +18537,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXPERIMENTAL_VP_REVERSE:
     return performVP_REVERSECombine(N, DAG, Subtarget);
+  case ISD::VP_STORE:
+    return performVP_STORECombine(N, DAG, Subtarget);
   case ISD::BITCAST: {
     assert(Subtarget.useRVVForFixedLengthVectors());
     SDValue N0 = N->getOperand(0);
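
To make the Base = StoreAddr + (NumElem - 1) * ElemWidthByte step concrete, here is the arithmetic for the 4-byte nxv2f32 case, annotated against the CHECK lines of the first new test (a0 holds %ptr, a1 holds %evl; the instructions are quoted from the test, the comments are editorial):

  Base   = a0 + (a1 - 1) * 4  =  a0 + a1 * 4 - 4
  Stride = -4 bytes (one f32 backwards)

  slli a2, a1, 2                     # a2 = evl * 4
  add a0, a2, a0                     # a0 = ptr + evl * 4
  addi a0, a0, -4                    # a0 = ptr + (evl - 1) * 4  (Base)
  li a2, -4                          # a2 = Stride
  vsetvli zero, a1, e32, m1, ta, ma  # VL = evl, SEW = 32
  vsse32.v v8, (a0), a2              # one strided store replaces reverse + store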
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+f,+v -verify-machineinstrs < %s | FileCheck %s
+
+define void @test_store_reverse_combiner(<vscale x 2 x float> %val, <vscale x 2 x float>* %ptr, i32 zeroext %evl) {
+; CHECK-LABEL: test_store_reverse_combiner:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a2, a1, 2
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    addi a0, a0, -4
+; CHECK-NEXT:    li a2, -4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a2
+; CHECK-NEXT:    ret
+  %rev = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  call void @llvm.vp.store.nxv2f32.p0nxv2f32(<vscale x 2 x float> %rev, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  ret void
+}
+
+define void @test_store_mask_is_vp_reverse(<vscale x 2 x float> %val, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: test_store_mask_is_vp_reverse:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    slli a2, a1, 2
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    addi a0, a0, -4
+; CHECK-NEXT:    li a2, -4
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a2, v0.t
+; CHECK-NEXT:    ret
+  %storemask = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  %rev = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x i1> splat (i1 true), i32 %evl)
+  call void @llvm.vp.store.nxv2f32.p0nxv2f32(<vscale x 2 x float> %rev, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %storemask, i32 %evl)
+  ret void
+}
+
+define void @test_store_mask_not_all_one(<vscale x 2 x float> %val, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %notallones, i32 zeroext %evl) {
+; CHECK-LABEL: test_store_mask_not_all_one:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9, v0.t
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vrsub.vx v9, v9, a1, v0.t
+; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
+; CHECK-NEXT:    vse32.v v10, (a0), v0.t
+; CHECK-NEXT:    ret
+  %rev = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x i1> %notallones, i32 %evl)
+  call void @llvm.vp.store.nxv2f32.p0nxv2f32(<vscale x 2 x float> %rev, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %notallones, i32 %evl)
+  ret void
+}
+
+define void @test_different_evl(<vscale x 2 x float> %val, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %mask, i32 zeroext %evl1, i32 zeroext %evl2) {
+; CHECK-LABEL: test_different_evl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vx v9, v9, a1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrsub.vx v11, v11, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v10, v9
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v9, v8, v11
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vse32.v v9, (a0), v0.t
+; CHECK-NEXT:    ret
+  %storemask = call <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> splat (i1 true), i32 %evl1)
+  %rev = call <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float> %val, <vscale x 2 x i1> splat (i1 true), i32 %evl1)
+  call void @llvm.vp.store.nxv2f32.p0nxv2f32(<vscale x 2 x float> %rev, <vscale x 2 x float>* %ptr, <vscale x 2 x i1> %storemask, i32 %evl2)
+  ret void
+}
+
+declare <vscale x 2 x float> @llvm.experimental.vp.reverse.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.experimental.vp.reverse.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
+declare void @llvm.vp.store.nxv2f32.p0nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>* nocapture, <vscale x 2 x i1>, i32)
