Skip to content

Commit ceb5e02

Browse files
committed
[RISCV] Enable load clustering by default
Also tweaks the heuristic to cluster only if the operations are within a cache line of each other (as AMDGPU does in shouldScheduleLoadsNear). X86 does something similar, but uses `((Offset2 - Offset1) / 8 > 64)`. I'm not sure whether that is intentionally set to 512 bytes or whether the division is an error. Posting for comment, for people to test on their workloads, and for feedback on ideas for a tweaked heuristic, etc. Stacks on top of #73778.
1 parent 3b552e8 commit ceb5e02

36 files changed

+6915
-6889
lines changed

llvm/lib/Target/RISCV/RISCVInstrInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2282,9 +2282,9 @@ bool RISCVInstrInfo::shouldClusterMemOps(
22822282
return false;
22832283
}
22842284

2285-
// TODO: Use a more carefully chosen heuristic, e.g. only cluster if offsets
2286-
// indicate they likely share a cache line.
2287-
return ClusterSize <= 4;
2285+
// A cache line is typically 64 bytes, so cluster if the memory ops are on
2286+
// the same or a neighbouring cache line.
2287+
return std::abs(Offset1 - Offset2) < 64;
22882288
}
22892289

22902290
// Set BaseReg (the base register operand), Offset (the byte offset being

llvm/lib/Target/RISCV/RISCVTargetMachine.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,6 @@ static cl::opt<bool>
9595
cl::desc("Enable Split RegisterAlloc for RVV"),
9696
cl::init(false));
9797

98-
static cl::opt<bool> EnableMISchedLoadClustering(
99-
"riscv-misched-load-clustering", cl::Hidden,
100-
cl::desc("Enable load clustering in the machine scheduler"),
101-
cl::init(false));
102-
10398
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
10499
RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
105100
RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -350,15 +345,10 @@ class RISCVPassConfig : public TargetPassConfig {
350345
ScheduleDAGInstrs *
351346
createMachineScheduler(MachineSchedContext *C) const override {
352347
const RISCVSubtarget &ST = C->MF->getSubtarget<RISCVSubtarget>();
353-
ScheduleDAGMILive *DAG = nullptr;
354-
if (EnableMISchedLoadClustering) {
355-
DAG = createGenericSchedLive(C);
356-
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
357-
}
358-
if (ST.hasMacroFusion()) {
359-
DAG = DAG ? DAG : createGenericSchedLive(C);
348+
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
349+
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
350+
if (ST.hasMacroFusion())
360351
DAG->addMutation(createRISCVMacroFusionDAGMutation());
361-
}
362352
return DAG;
363353
}
364354

llvm/test/CodeGen/RISCV/add-before-shl.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -200,25 +200,25 @@ define i128 @add_wide_operand(i128 %a) nounwind {
200200
;
201201
; RV32C-LABEL: add_wide_operand:
202202
; RV32C: # %bb.0:
203-
; RV32C-NEXT: lw a6, 4(a1)
203+
; RV32C-NEXT: lw a6, 8(a1)
204204
; RV32C-NEXT: c.lw a3, 12(a1)
205-
; RV32C-NEXT: c.lw a4, 0(a1)
206-
; RV32C-NEXT: c.lw a1, 8(a1)
205+
; RV32C-NEXT: c.lw a2, 4(a1)
206+
; RV32C-NEXT: c.lw a1, 0(a1)
207207
; RV32C-NEXT: c.lui a5, 16
208208
; RV32C-NEXT: c.add a3, a5
209209
; RV32C-NEXT: c.slli a3, 3
210-
; RV32C-NEXT: srli a5, a1, 29
211-
; RV32C-NEXT: c.or a3, a5
212-
; RV32C-NEXT: srli a5, a4, 29
213-
; RV32C-NEXT: slli a2, a6, 3
214-
; RV32C-NEXT: c.or a2, a5
215210
; RV32C-NEXT: srli a5, a6, 29
211+
; RV32C-NEXT: c.or a3, a5
212+
; RV32C-NEXT: srli a5, a1, 29
213+
; RV32C-NEXT: slli a4, a2, 3
214+
; RV32C-NEXT: c.or a4, a5
215+
; RV32C-NEXT: c.srli a2, 29
216+
; RV32C-NEXT: c.slli a6, 3
217+
; RV32C-NEXT: or a2, a6, a2
216218
; RV32C-NEXT: c.slli a1, 3
217-
; RV32C-NEXT: c.or a1, a5
218-
; RV32C-NEXT: c.slli a4, 3
219-
; RV32C-NEXT: c.sw a4, 0(a0)
220-
; RV32C-NEXT: c.sw a1, 8(a0)
221-
; RV32C-NEXT: c.sw a2, 4(a0)
219+
; RV32C-NEXT: c.sw a1, 0(a0)
220+
; RV32C-NEXT: c.sw a2, 8(a0)
221+
; RV32C-NEXT: c.sw a4, 4(a0)
222222
; RV32C-NEXT: c.sw a3, 12(a0)
223223
; RV32C-NEXT: c.jr ra
224224
;

0 commit comments

Comments
 (0)