Skip to content

Commit d0899ba

Browse files
committed
[RISCV] Emit VP strided loads/stores in RISCVGatherScatterLowering
RISCVGatherScatterLowering is the main user of riscv_masked_strided_{load,store}, which we can remove if we replace them with their VP equivalents. Submitting early as a draft to show the regressions in the test diff; #97800 and #97798 (or the CGP, i.e. CodeGenPrepare, version) are needed to fix them.
1 parent 3f83a69 commit d0899ba

File tree

5 files changed

+158
-132
lines changed

5 files changed

+158
-132
lines changed

llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -515,17 +515,23 @@ bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II,
515515

516516
Builder.SetInsertPoint(II);
517517

518+
Value *EVL = Builder.CreateElementCount(
519+
IntegerType::get(Ctx, 32), cast<VectorType>(DataType)->getElementCount());
520+
518521
CallInst *Call;
519-
if (II->getIntrinsicID() == Intrinsic::masked_gather)
522+
if (II->getIntrinsicID() == Intrinsic::masked_gather) {
520523
Call = Builder.CreateIntrinsic(
521-
Intrinsic::riscv_masked_strided_load,
524+
Intrinsic::experimental_vp_strided_load,
522525
{DataType, BasePtr->getType(), Stride->getType()},
523-
{II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)});
524-
else
526+
{BasePtr, Stride, II->getArgOperand(2), EVL});
527+
Call = Builder.CreateIntrinsic(
528+
Intrinsic::vp_select, {DataType},
529+
{II->getOperand(2), Call, II->getArgOperand(3), EVL});
530+
} else
525531
Call = Builder.CreateIntrinsic(
526-
Intrinsic::riscv_masked_strided_store,
532+
Intrinsic::experimental_vp_strided_store,
527533
{DataType, BasePtr->getType(), Stride->getType()},
528-
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)});
534+
{II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3), EVL});
529535

530536
Call->takeName(II);
531537
II->replaceAllUsesWith(Call);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll

Lines changed: 27 additions & 60 deletions
Original file line number | Diff line number | Diff line change
@@ -144,9 +144,9 @@ define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture
144144
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
145145
; CHECK-NEXT: .LBB3_1: # %vector.body
146146
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
147-
; CHECK-NEXT: lbu a3, 0(a1)
148-
; CHECK-NEXT: vle8.v v8, (a0)
149-
; CHECK-NEXT: vadd.vx v8, v8, a3
147+
; CHECK-NEXT: vlse8.v v8, (a1), zero
148+
; CHECK-NEXT: vle8.v v9, (a0)
149+
; CHECK-NEXT: vadd.vv v8, v9, v8
150150
; CHECK-NEXT: vse8.v v8, (a0)
151151
; CHECK-NEXT: addi a0, a0, 32
152152
; CHECK-NEXT: addi a1, a1, 160
@@ -182,9 +182,9 @@ define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapt
182182
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
183183
; CHECK-NEXT: .LBB4_1: # %vector.body
184184
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
185-
; CHECK-NEXT: lw a3, 0(a1)
186-
; CHECK-NEXT: vle32.v v8, (a0)
187-
; CHECK-NEXT: vadd.vx v8, v8, a3
185+
; CHECK-NEXT: vlse32.v v8, (a1), zero
186+
; CHECK-NEXT: vle32.v v9, (a0)
187+
; CHECK-NEXT: vadd.vv v8, v9, v8
188188
; CHECK-NEXT: vse32.v v8, (a0)
189189
; CHECK-NEXT: addi a0, a0, 8
190190
; CHECK-NEXT: addi a1, a1, 160
@@ -214,57 +214,22 @@ for.cond.cleanup: ; preds = %vector.body
214214
}
215215

216216
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
217-
; V-LABEL: gather_zero_stride_unfold:
218-
; V: # %bb.0: # %entry
219-
; V-NEXT: addi a2, a0, 1024
220-
; V-NEXT: li a3, 32
221-
; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma
222-
; V-NEXT: .LBB5_1: # %vector.body
223-
; V-NEXT: # =>This Inner Loop Header: Depth=1
224-
; V-NEXT: vlse8.v v8, (a1), zero
225-
; V-NEXT: vle8.v v9, (a0)
226-
; V-NEXT: vdivu.vv v8, v8, v9
227-
; V-NEXT: vse8.v v8, (a0)
228-
; V-NEXT: addi a0, a0, 32
229-
; V-NEXT: addi a1, a1, 160
230-
; V-NEXT: bne a0, a2, .LBB5_1
231-
; V-NEXT: # %bb.2: # %for.cond.cleanup
232-
; V-NEXT: ret
233-
;
234-
; ZVE32F-LABEL: gather_zero_stride_unfold:
235-
; ZVE32F: # %bb.0: # %entry
236-
; ZVE32F-NEXT: addi a2, a0, 1024
237-
; ZVE32F-NEXT: li a3, 32
238-
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
239-
; ZVE32F-NEXT: .LBB5_1: # %vector.body
240-
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
241-
; ZVE32F-NEXT: vlse8.v v8, (a1), zero
242-
; ZVE32F-NEXT: vle8.v v9, (a0)
243-
; ZVE32F-NEXT: vdivu.vv v8, v8, v9
244-
; ZVE32F-NEXT: vse8.v v8, (a0)
245-
; ZVE32F-NEXT: addi a0, a0, 32
246-
; ZVE32F-NEXT: addi a1, a1, 160
247-
; ZVE32F-NEXT: bne a0, a2, .LBB5_1
248-
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
249-
; ZVE32F-NEXT: ret
250-
;
251-
; NOT-OPTIMIZED-LABEL: gather_zero_stride_unfold:
252-
; NOT-OPTIMIZED: # %bb.0: # %entry
253-
; NOT-OPTIMIZED-NEXT: addi a2, a0, 1024
254-
; NOT-OPTIMIZED-NEXT: li a3, 32
255-
; NOT-OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma
256-
; NOT-OPTIMIZED-NEXT: .LBB5_1: # %vector.body
257-
; NOT-OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1
258-
; NOT-OPTIMIZED-NEXT: lbu a3, 0(a1)
259-
; NOT-OPTIMIZED-NEXT: vle8.v v8, (a0)
260-
; NOT-OPTIMIZED-NEXT: vmv.v.x v9, a3
261-
; NOT-OPTIMIZED-NEXT: vdivu.vv v8, v9, v8
262-
; NOT-OPTIMIZED-NEXT: vse8.v v8, (a0)
263-
; NOT-OPTIMIZED-NEXT: addi a0, a0, 32
264-
; NOT-OPTIMIZED-NEXT: addi a1, a1, 160
265-
; NOT-OPTIMIZED-NEXT: bne a0, a2, .LBB5_1
266-
; NOT-OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup
267-
; NOT-OPTIMIZED-NEXT: ret
217+
; CHECK-LABEL: gather_zero_stride_unfold:
218+
; CHECK: # %bb.0: # %entry
219+
; CHECK-NEXT: addi a2, a0, 1024
220+
; CHECK-NEXT: li a3, 32
221+
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma
222+
; CHECK-NEXT: .LBB5_1: # %vector.body
223+
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
224+
; CHECK-NEXT: vlse8.v v8, (a1), zero
225+
; CHECK-NEXT: vle8.v v9, (a0)
226+
; CHECK-NEXT: vdivu.vv v8, v8, v9
227+
; CHECK-NEXT: vse8.v v8, (a0)
228+
; CHECK-NEXT: addi a0, a0, 32
229+
; CHECK-NEXT: addi a1, a1, 160
230+
; CHECK-NEXT: bne a0, a2, .LBB5_1
231+
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
232+
; CHECK-NEXT: ret
268233
entry:
269234
br label %vector.body
270235

@@ -962,9 +927,9 @@ define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocaptu
962927
; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
963928
; CHECK-NEXT: .LBB16_1: # %vector.body
964929
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
965-
; CHECK-NEXT: flw fa5, 0(a1)
966-
; CHECK-NEXT: vle32.v v8, (a0)
967-
; CHECK-NEXT: vfadd.vf v8, v8, fa5
930+
; CHECK-NEXT: vlse32.v v8, (a1), zero
931+
; CHECK-NEXT: vle32.v v9, (a0)
932+
; CHECK-NEXT: vfadd.vv v8, v9, v8
968933
; CHECK-NEXT: vse32.v v8, (a0)
969934
; CHECK-NEXT: addi a0, a0, 128
970935
; CHECK-NEXT: addi a1, a1, 640
@@ -992,3 +957,5 @@ vector.body: ; preds = %vector.body, %entry
992957
for.cond.cleanup: ; preds = %vector.body
993958
ret void
994959
}
960+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
961+
; NOT-OPTIMIZED: {{.*}}

0 commit comments

Comments
 (0)