Skip to content

Commit 0f7358d

Browse files
committed
[Attributor][AMDGPU] Improve indirect call support in closed modules
If we see all functions that can be called, i.e., we are in a "closed world", we can perform better reasoning in the presence of unknown callees of indirect calls. We now collect all indirectly callable functions and limit the potentially called functions to those. The AMDGPU backend is the only user for now. We should also enable this for AMDGPU (and NVIDIA GPUs in certain cases) when we run the Attributor (or OpenMP-opt) earlier in the pipeline.
1 parent a475301 commit 0f7358d

12 files changed

+2513
-1952
lines changed

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1435,7 +1435,7 @@ struct AttributorConfig {
14351435
/// Callback function to determine if indirect call targets should be made
14361436
/// direct call targets (with an if-cascade).
14371437
std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1438-
Function &AssummedCallee)>
1438+
Function &AssummedCallee, unsigned NumCallees)>
14391439
IndirectCalleeSpecializationCallback = nullptr;
14401440

14411441
/// Helper to update an underlying call graph and to delete functions.

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
19+
#include "llvm/IR/CallingConv.h"
1820
#include "llvm/IR/IntrinsicsAMDGPU.h"
1921
#include "llvm/IR/IntrinsicsR600.h"
22+
#include "llvm/Support/Casting.h"
2023
#include "llvm/Target/TargetMachine.h"
2124
#include "llvm/Transforms/IPO/Attributor.h"
25+
#include <optional>
2226

2327
#define DEBUG_TYPE "amdgpu-attributor"
2428

@@ -944,16 +948,29 @@ class AMDGPUAttributor : public ModulePass {
944948
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
945949
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
946950
&AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
947-
&AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
951+
&AAIndirectCallInfo::ID, &AAPotentialConstantValues::ID,
952+
&AAUnderlyingObjects::ID});
948953

949954
AttributorConfig AC(CGUpdater);
950955
AC.Allowed = &Allowed;
951956
AC.IsModulePass = true;
952957
AC.DefaultInitializeLiveInternals = false;
958+
AC.IsClosedWorldModule = true;
953959
AC.IPOAmendableCB = [](const Function &F) {
954960
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
955961
};
956962

963+
// Callback to determine if we should specialize an indirect call site with a
964+
// specific callee. It's effectively a heuristic and we can add checks for
965+
// the callee size, PGO, etc. For now, we check for single potential callees
966+
// and kernel arguments as they are known uniform values.
967+
AC.IndirectCalleeSpecializationCallback =
968+
[&](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
969+
Function &Callee, unsigned NumCallees) {
970+
return indirectCalleeSpecializationCallback(A, AA, CB, Callee,
971+
NumCallees);
972+
};
973+
957974
Attributor A(Functions, InfoCache, AC);
958975

959976
for (Function &F : M) {
@@ -975,6 +992,20 @@ class AMDGPUAttributor : public ModulePass {
975992
AU.addRequired<CycleInfoWrapperPass>();
976993
}
977994

995+
/// Helper to decide if we should specialize the indirect \p CB for \p Callee,
996+
/// which is one of the \p NumCallees potential callees.
997+
bool indirectCalleeSpecializationCallback(Attributor &A,
998+
const AbstractAttribute &AA,
999+
CallBase &CB, Function &Callee,
1000+
unsigned NumCallees) {
1001+
// Singleton functions should be specialized.
1002+
if (NumCallees == 1)
1003+
return true;
1004+
// Otherwise specialize uniform values.
1005+
const auto &TTI = TM->getTargetTransformInfo(*CB.getCaller());
1006+
return TTI.isAlwaysUniform(CB.getCalledOperand());
1007+
}
1008+
9781009
StringRef getPassName() const override { return "AMDGPU Attributor"; }
9791010
TargetMachine *TM;
9801011
static char ID;

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-indirect-call.ll

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2-
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s
2+
; RUN: llc -global-isel -stop-after=irtranslator -attributor-assume-closed-world=false -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CHECK %s
3+
; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope --check-prefixes=SAMEC,CWRLD %s
34

45
define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
56
; CHECK-LABEL: name: test_indirect_call_sgpr_ptr
@@ -52,24 +53,31 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(ptr %fptr) {
5253
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[LOAD]](p0), 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
5354
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
5455
; CHECK-NEXT: S_ENDPGM 0
56+
;
57+
; CWRLD-LABEL: name: test_indirect_call_sgpr_ptr
58+
; CWRLD: bb.1 (%ir-block.0):
59+
; CWRLD-NEXT: liveins: $sgpr4_sgpr5
60+
; CWRLD-NEXT: {{ $}}
61+
; CWRLD-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
62+
; CWRLD-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr)
5563
call void %fptr()
5664
ret void
5765
}
5866

5967
define amdgpu_gfx void @test_gfx_indirect_call_sgpr_ptr(ptr %fptr) {
60-
; CHECK-LABEL: name: test_gfx_indirect_call_sgpr_ptr
61-
; CHECK: bb.1 (%ir-block.0):
62-
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
63-
; CHECK-NEXT: {{ $}}
64-
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
65-
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
66-
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
67-
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
68-
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
69-
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
70-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
71-
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
72-
; CHECK-NEXT: SI_RETURN
68+
; SAMEC-LABEL: name: test_gfx_indirect_call_sgpr_ptr
69+
; SAMEC: bb.1 (%ir-block.0):
70+
; SAMEC-NEXT: liveins: $vgpr0, $vgpr1
71+
; SAMEC-NEXT: {{ $}}
72+
; SAMEC-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
73+
; SAMEC-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
74+
; SAMEC-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
75+
; SAMEC-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
76+
; SAMEC-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
77+
; SAMEC-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]](<4 x s32>)
78+
; SAMEC-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[MV]](p0), 0, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3
79+
; SAMEC-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
80+
; SAMEC-NEXT: SI_RETURN
7381
call amdgpu_gfx void %fptr()
7482
ret void
7583
}

0 commit comments

Comments
 (0)