Skip to content

Commit e763b93

Browse files
shiltian authored and ronlieb committed
Reapply "[Attributor][AMDGPU] Enable AAIndirectCallInfo for AMDAttributor (llvm#100952)"
This reverts commit 36467bf. Change-Id: I5bbe47ebb60385a5c2c398ca3fa943fae675b860
1 parent 7ce5da3 commit e763b93

10 files changed

+40
-25
lines changed

llvm/include/llvm/Transforms/IPO/Attributor.h

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1453,7 +1453,7 @@ struct AttributorConfig {
14531453
/// Callback function to determine if an indirect call targets should be made
14541454
/// direct call targets (with an if-cascade).
14551455
std::function<bool(Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1456-
Function &AssummedCallee)>
1456+
Function &AssumedCallee, unsigned NumAssumedCallees)>
14571457
IndirectCalleeSpecializationCallback = nullptr;
14581458

14591459
/// Helper to update an underlying call graph and to delete functions.
@@ -1723,10 +1723,11 @@ struct Attributor {
17231723
/// Return true if we should specialize the call site \b CB for the potential
17241724
/// callee \p Fn.
17251725
bool shouldSpecializeCallSiteForCallee(const AbstractAttribute &AA,
1726-
CallBase &CB, Function &Callee) {
1726+
CallBase &CB, Function &Callee,
1727+
unsigned NumAssumedCallees) {
17271728
return Configuration.IndirectCalleeSpecializationCallback
1728-
? Configuration.IndirectCalleeSpecializationCallback(*this, AA,
1729-
CB, Callee)
1729+
? Configuration.IndirectCalleeSpecializationCallback(
1730+
*this, AA, CB, Callee, NumAssumedCallees)
17301731
: true;
17311732
}
17321733

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
#include "GCNSubtarget.h"
1515
#include "Utils/AMDGPUBaseInfo.h"
1616
#include "llvm/Analysis/CycleAnalysis.h"
17+
#include "llvm/Analysis/TargetTransformInfo.h"
1718
#include "llvm/CodeGen/TargetPassConfig.h"
1819
#include "llvm/IR/IntrinsicsAMDGPU.h"
1920
#include "llvm/IR/IntrinsicsR600.h"
@@ -1045,13 +1046,26 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
10451046
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
10461047
&AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
10471048
&AAPointerInfo::ID, &AAPotentialConstantValues::ID,
1048-
&AAUnderlyingObjects::ID, &AAAddressSpace::ID});
1049+
&AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
1050+
&AAInstanceInfo::ID});
10491051

10501052
AttributorConfig AC(CGUpdater);
10511053
AC.IsClosedWorldModule = Options.IsClosedWorld;
10521054
AC.Allowed = &Allowed;
10531055
AC.IsModulePass = true;
10541056
AC.DefaultInitializeLiveInternals = false;
1057+
AC.IndirectCalleeSpecializationCallback =
1058+
[&TM](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
1059+
Function &Callee, unsigned NumAssumedCallees) {
1060+
if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
1061+
return false;
1062+
// Singleton functions can be specialized.
1063+
if (NumAssumedCallees == 1)
1064+
return true;
1065+
// Otherwise specialize uniform values.
1066+
const auto &TTI = TM.getTargetTransformInfo(*CB.getCaller());
1067+
return TTI.isAlwaysUniform(CB.getCalledOperand());
1068+
};
10551069
AC.IPOAmendableCB = [](const Function &F) {
10561070
return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
10571071
};

llvm/lib/Transforms/IPO/Attributor.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3836,7 +3836,7 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
38363836
if (MaxSpecializationPerCB.getNumOccurrences()) {
38373837
AC.IndirectCalleeSpecializationCallback =
38383838
[&](Attributor &, const AbstractAttribute &AA, CallBase &CB,
3839-
Function &Callee) {
3839+
Function &Callee, unsigned) {
38403840
if (MaxSpecializationPerCB == 0)
38413841
return false;
38423842
auto &Set = IndirectCalleeTrackingMap[&CB];

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12369,7 +12369,8 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
1236912369
SmallVector<Function *, 8> SkippedAssumedCallees;
1237012370
SmallVector<std::pair<CallInst *, Instruction *>> NewCalls;
1237112371
for (Function *NewCallee : AssumedCallees) {
12372-
if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee)) {
12372+
if (!A.shouldSpecializeCallSiteForCallee(*this, *CB, *NewCallee,
12373+
AssumedCallees.size())) {
1237312374
SkippedAssumedCallees.push_back(NewCallee);
1237412375
SpecializedForAllCallees = false;
1237512376
continue;

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-accesslist-offsetbins-out-of-sync.ll

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,26 +14,31 @@ define internal fastcc void @foo(ptr %kg) {
1414
; CHECK-NEXT: [[NUM_CLOSURE_I26_I:%.*]] = getelementptr i8, ptr [[KG]], i64 276
1515
; CHECK-NEXT: br label %[[WHILE_COND:.*]]
1616
; CHECK: [[WHILE_COND]]:
17-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[KG]], align 4
17+
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[KG]] to ptr addrspace(5)
18+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[TMP0]], align 4
1819
; CHECK-NEXT: [[IDXPROM_I:%.*]] = zext i32 [[TMP1]] to i64
1920
; CHECK-NEXT: switch i32 0, label %[[SW_BB92:.*]] [
2021
; CHECK-NEXT: i32 1, label %[[SW_BB92]]
2122
; CHECK-NEXT: i32 0, label %[[SUBD_TRIANGLE_PATCH_EXIT_I_I35:.*]]
2223
; CHECK-NEXT: ]
2324
; CHECK: [[SUBD_TRIANGLE_PATCH_EXIT_I_I35]]:
2425
; CHECK-NEXT: [[ARRAYIDX_I27_I:%.*]] = getelementptr float, ptr [[KG]], i64 [[IDXPROM_I]]
25-
; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX_I27_I]], align 4
26+
; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX_I27_I]] to ptr addrspace(5)
27+
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP5]], align 4
2628
; CHECK-NEXT: br label %[[WHILE_COND]]
2729
; CHECK: [[SW_BB92]]:
2830
; CHECK-NEXT: [[INSERT:%.*]] = insertelement <3 x i32> zeroinitializer, i32 [[TMP1]], i64 0
2931
; CHECK-NEXT: [[SPLAT_SPLATINSERT_I:%.*]] = bitcast <3 x i32> [[INSERT]] to <3 x float>
3032
; CHECK-NEXT: [[SHFL:%.*]] = shufflevector <3 x float> [[SPLAT_SPLATINSERT_I]], <3 x float> zeroinitializer, <4 x i32> zeroinitializer
31-
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[NUM_CLOSURE_I26_I]], align 4
33+
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
34+
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[TMP2]], align 4
3235
; CHECK-NEXT: [[IDXPROM_I27_I:%.*]] = sext i32 [[LOAD]] to i64
3336
; CHECK-NEXT: [[ARRAYIDX_I28_I:%.*]] = getelementptr [64 x %struct.ShaderClosure], ptr [[CLOSURE_I25_I]], i64 0, i64 [[IDXPROM_I27_I]]
34-
; CHECK-NEXT: store <4 x float> [[SHFL]], ptr [[ARRAYIDX_I28_I]], align 16
37+
; CHECK-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX_I28_I]] to ptr addrspace(5)
38+
; CHECK-NEXT: store <4 x float> [[SHFL]], ptr addrspace(5) [[TMP3]], align 16
3539
; CHECK-NEXT: [[INC_I30_I:%.*]] = or i32 [[LOAD]], 1
36-
; CHECK-NEXT: store i32 [[INC_I30_I]], ptr [[NUM_CLOSURE_I26_I]], align 4
40+
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[NUM_CLOSURE_I26_I]] to ptr addrspace(5)
41+
; CHECK-NEXT: store i32 [[INC_I30_I]], ptr addrspace(5) [[TMP4]], align 4
3742
; CHECK-NEXT: br label %[[WHILE_COND]]
3843
;
3944
entry:

llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,7 @@ define internal void @direct() {
1414
; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
1515
; CHECK-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
1616
; CHECK-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
17-
; CHECK-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
18-
; CHECK-NEXT: call void [[FP]]()
17+
; CHECK-NEXT: call void @indirect()
1918
; CHECK-NEXT: ret void
2019
;
2120
%fptr = alloca ptr, addrspace(5)
@@ -36,5 +35,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
3635
}
3736
;.
3837
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
39-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
38+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
4039
;.

llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26,8 +26,7 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
2626
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
2727
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
2828
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
29-
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
30-
; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
29+
; ATTRIBUTOR_GCN-NEXT: call void @indirect()
3130
; ATTRIBUTOR_GCN-NEXT: ret void
3231
;
3332
%fptr = alloca ptr, addrspace(5)
@@ -43,5 +42,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
4342
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
4443
;.
4544
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
46-
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
45+
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
4746
;.

llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,7 @@ define amdgpu_kernel void @foo(ptr noundef %fp) {
6060
; CHECK-NEXT: entry:
6161
; CHECK-NEXT: [[FP_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
6262
; CHECK-NEXT: store ptr [[FP]], ptr addrspace(5) [[FP_ADDR]], align 8
63-
; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(5) [[FP_ADDR]], align 8
64-
; CHECK-NEXT: call void [[LOAD]]()
63+
; CHECK-NEXT: call void [[FP]]()
6564
; CHECK-NEXT: ret void
6665
;
6766
entry:

llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
3535
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
3636
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
3737
; ATTRIBUTOR_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
38-
; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
39-
; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
38+
; ATTRIBUTOR_GCN-NEXT: call void @indirect()
4039
; ATTRIBUTOR_GCN-NEXT: ret void
4140
;
4241
; GFX9-LABEL: test_simple_indirect_call:
@@ -81,7 +80,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
8180
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
8281
;.
8382
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
84-
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
83+
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
8584
;.
8685
; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
8786
;.

revert_patches.txt

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -75,8 +75,6 @@ dfeb3991fb48 Remove the `x86_mmx` IR type. (#98505)
7575
b7e4fba6e5dc Cleanup x86_mmx after removing IR type (#100646) (Reason: dependent on dfeb3991fb48)
7676
Ron: still broken 9-6-24
7777
---
78-
revert: 1ca9fe6db334 Reapply "[Attributor][AMDGPU] Enable AAIndirectCallInfo for AMDAttributor
79-
---
8078
revert : breaks build of amdgpu flat intrinsics
8179
ee08d9cba561 AMDGPU: Remove global/flat atomic fadd intrinics (#97051)
8280
---

0 commit comments

Comments
 (0)