|
29 | 29 | #include "AMDGPUTargetObjectFile.h" |
30 | 30 | #include "AMDGPUTargetTransformInfo.h" |
31 | 31 | #include "AMDGPUUnifyDivergentExitNodes.h" |
| 32 | +#include "AMDGPUVectorIdiom.h" |
32 | 33 | #include "AMDGPUWaitSGPRHazards.h" |
33 | 34 | #include "GCNDPPCombine.h" |
34 | 35 | #include "GCNIterativeScheduler.h" |
@@ -849,6 +850,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { |
849 | 850 | EnablePromoteKernelArguments) |
850 | 851 | FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); |
851 | 852 |
|
| 853 | + // Run vector-idiom canonicalization early (after inlining) and before |
| 854 | + // InferAddressSpaces / SROA to maximize scalarization opportunities. |
| 855 | + // MaxBytes is 32 because the largest HIP vector types (double4 and |
| 856 | + // long4) are 32 bytes wide. |
| 857 | + FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)); |
| 858 | + |
852 | 859 | // Add infer address spaces pass to the opt pipeline after inlining |
853 | 860 | // but before SROA to increase SROA opportunities. |
854 | 861 | FPM.addPass(InferAddressSpacesPass()); |
@@ -911,6 +918,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { |
911 | 918 | if (EnableLowerModuleLDS) |
912 | 919 | PM.addPass(AMDGPULowerModuleLDSPass(*this)); |
913 | 920 | if (Level != OptimizationLevel::O0) { |
| 921 | + PM.addPass(createModuleToFunctionPassAdaptor( |
| 922 | + AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))); |
914 | 923 | // Do we really need internalization in LTO? |
915 | 924 | if (InternalizeSymbols) { |
916 | 925 | PM.addPass(InternalizePass(mustPreserveGV)); |
|
0 commit comments