Skip to content

Commit 0dda3ad

Browse files
authored
[AMDGPU] Identify vector idiom to unlock SROA (llvm#3967)
2 parents 5a40266 + caa3e0b commit 0dda3ad

File tree

6 files changed

+965
-0
lines changed

6 files changed

+965
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
6969
FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
7070
AMDGPUUnifyDivergentExitNodesPass())
7171
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
72+
FUNCTION_PASS("amdgpu-vector-idiom",
73+
AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))
7274
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
7375
#undef FUNCTION_PASS
7476

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "AMDGPUTargetObjectFile.h"
3030
#include "AMDGPUTargetTransformInfo.h"
3131
#include "AMDGPUUnifyDivergentExitNodes.h"
32+
#include "AMDGPUVectorIdiom.h"
3233
#include "AMDGPUWaitSGPRHazards.h"
3334
#include "GCNDPPCombine.h"
3435
#include "GCNIterativeScheduler.h"
@@ -849,6 +850,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
849850
EnablePromoteKernelArguments)
850851
FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
851852

853+
// Run vector-idiom canonicalization early (after inlining) and before
854+
// InferAddressSpaces / SROA to maximize scalarization opportunities.
855+
// Cap MaxBytes at 32: the largest HIP vector types (double4 and
856+
// long4) are 32 bytes wide.
857+
FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32));
858+
852859
// Add infer address spaces pass to the opt pipeline after inlining
853860
// but before SROA to increase SROA opportunities.
854861
FPM.addPass(InferAddressSpacesPass());
@@ -911,6 +918,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
911918
if (EnableLowerModuleLDS)
912919
PM.addPass(AMDGPULowerModuleLDSPass(*this));
913920
if (Level != OptimizationLevel::O0) {
921+
PM.addPass(createModuleToFunctionPassAdaptor(
922+
AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)));
914923
// Do we really need internalization in LTO?
915924
if (InternalizeSymbols) {
916925
PM.addPass(InternalizePass(mustPreserveGV));

0 commit comments

Comments
 (0)