Commit a214c52
[RISCV] Don't use zero-stride vector load for gather if not optimized
We may form a zero-stride vector load when lowering a gather to a strided load. As D137699 did, we use `load+splat` for this form when there is no optimized zero-stride load implementation. For now this is restricted to unmasked loads, given the complexity of handling all-false masks.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D137931
1 parent 85d3a41 commit a214c52
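For context, the difference between the two lowerings can be sketched in RISC-V vector assembly. This is a hand-written illustration, not taken from the patch or its tests; the register choices and VL setup are arbitrary, and the splat is shown as a plain vmv.v.x even though later combines (as in the updated test below) may fold it into a .vx arithmetic op such as vadd.vx.

    # Default: a zero-stride strided load broadcasts the byte at (a0) into every element.
    vsetvli zero, a1, e8, m1, ta, ma
    vlse8.v v8, (a0), zero

    # With +no-optimized-zero-stride-load (unmasked case only): load the scalar once, then splat it.
    lbu     a2, 0(a0)
    vsetvli zero, a1, e8, m1, ta, ma
    vmv.v.x v8, a2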

File tree

2 files changed: +98 -41 lines changed


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 40 additions & 23 deletions
@@ -5450,33 +5450,50 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       }
     }
 
+    auto *Load = cast<MemIntrinsicSDNode>(Op);
     SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+    SDValue Ptr = Op.getOperand(3);
+    SDValue Stride = Op.getOperand(4);
+    SDValue Result, Chain;
+
+    // TODO: We restrict this to unmasked loads currently in consideration of
+    // the complexity of hanlding all falses masks.
+    if (IsUnmasked && isNullConstant(Stride) &&
+        !Subtarget.hasOptimizedZeroStrideLoad()) {
+      MVT ScalarVT = ContainerVT.getVectorElementType();
+      SDValue ScalarLoad =
+          DAG.getExtLoad(ISD::ZEXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
+                         ScalarVT, Load->getMemOperand());
+      Chain = ScalarLoad.getValue(1);
+      Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
+                                Subtarget);
+    } else {
+      SDValue IntID = DAG.getTargetConstant(
+          IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+          XLenVT);
 
-    SDValue IntID = DAG.getTargetConstant(
-        IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
-        XLenVT);
+      SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
+      if (IsUnmasked)
+        Ops.push_back(DAG.getUNDEF(ContainerVT));
+      else
+        Ops.push_back(PassThru);
+      Ops.push_back(Ptr);
+      Ops.push_back(Stride);
+      if (!IsUnmasked)
+        Ops.push_back(Mask);
+      Ops.push_back(VL);
+      if (!IsUnmasked) {
+        SDValue Policy =
+            DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
+        Ops.push_back(Policy);
+      }
 
-    auto *Load = cast<MemIntrinsicSDNode>(Op);
-    SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
-    if (IsUnmasked)
-      Ops.push_back(DAG.getUNDEF(ContainerVT));
-    else
-      Ops.push_back(PassThru);
-    Ops.push_back(Op.getOperand(3)); // Ptr
-    Ops.push_back(Op.getOperand(4)); // Stride
-    if (!IsUnmasked)
-      Ops.push_back(Mask);
-    Ops.push_back(VL);
-    if (!IsUnmasked) {
-      SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
-      Ops.push_back(Policy);
+      SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+      Result =
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+                                  Load->getMemoryVT(), Load->getMemOperand());
+      Chain = Result.getValue(1);
     }
-
-    SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
-    SDValue Result =
-        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
-                                Load->getMemoryVT(), Load->getMemOperand());
-    SDValue Chain = Result.getValue(1);
     if (VT.isFixedLengthVector())
       Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
     return DAG.getMergeValues({Result, Chain}, DL);

llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll

Lines changed: 58 additions & 18 deletions
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
 
 %struct.foo = type { i32, i32, i32, i32 }
 
@@ -176,24 +178,62 @@ for.cond.cleanup: ; preds = %vector.body
 
 define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
 ;
-; CHECK-LABEL: gather_zero_stride:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a2, 0
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    li a4, 1024
-; CHECK-NEXT:  .LBB3_1: # %vector.body
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a1), zero
-; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    vle8.v v9, (a5)
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vse8.v v8, (a5)
-; CHECK-NEXT:    addi a2, a2, 32
-; CHECK-NEXT:    addi a1, a1, 160
-; CHECK-NEXT:    bne a2, a4, .LBB3_1
-; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT:    ret
+; V-LABEL: gather_zero_stride:
+; V:       # %bb.0: # %entry
+; V-NEXT:    li a2, 0
+; V-NEXT:    li a3, 32
+; V-NEXT:    li a4, 1024
+; V-NEXT:  .LBB3_1: # %vector.body
+; V-NEXT:    # =>This Inner Loop Header: Depth=1
+; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; V-NEXT:    vlse8.v v8, (a1), zero
+; V-NEXT:    add a5, a0, a2
+; V-NEXT:    vle8.v v9, (a5)
+; V-NEXT:    vadd.vv v8, v9, v8
+; V-NEXT:    vse8.v v8, (a5)
+; V-NEXT:    addi a2, a2, 32
+; V-NEXT:    addi a1, a1, 160
+; V-NEXT:    bne a2, a4, .LBB3_1
+; V-NEXT:  # %bb.2: # %for.cond.cleanup
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: gather_zero_stride:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    li a2, 0
+; ZVE32F-NEXT:    li a3, 32
+; ZVE32F-NEXT:    li a4, 1024
+; ZVE32F-NEXT:  .LBB3_1: # %vector.body
+; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
+; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; ZVE32F-NEXT:    vlse8.v v8, (a1), zero
+; ZVE32F-NEXT:    add a5, a0, a2
+; ZVE32F-NEXT:    vle8.v v9, (a5)
+; ZVE32F-NEXT:    vadd.vv v8, v9, v8
+; ZVE32F-NEXT:    vse8.v v8, (a5)
+; ZVE32F-NEXT:    addi a2, a2, 32
+; ZVE32F-NEXT:    addi a1, a1, 160
+; ZVE32F-NEXT:    bne a2, a4, .LBB3_1
+; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
+; ZVE32F-NEXT:    ret
+;
+; NOT-OPTIMIZED-LABEL: gather_zero_stride:
+; NOT-OPTIMIZED:       # %bb.0: # %entry
+; NOT-OPTIMIZED-NEXT:    li a2, 0
+; NOT-OPTIMIZED-NEXT:    li a3, 32
+; NOT-OPTIMIZED-NEXT:    li a4, 1024
+; NOT-OPTIMIZED-NEXT:  .LBB3_1: # %vector.body
+; NOT-OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
+; NOT-OPTIMIZED-NEXT:    lbu a5, 0(a1)
+; NOT-OPTIMIZED-NEXT:    add a6, a0, a2
+; NOT-OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; NOT-OPTIMIZED-NEXT:    vle8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT:    vadd.vx v8, v8, a5
+; NOT-OPTIMIZED-NEXT:    vse8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT:    addi a2, a2, 32
+; NOT-OPTIMIZED-NEXT:    addi a1, a1, 160
+; NOT-OPTIMIZED-NEXT:    bne a2, a4, .LBB3_1
+; NOT-OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
+; NOT-OPTIMIZED-NEXT:    ret
 entry:
   br label %vector.body
 