Skip to content

[AMDGPU] Add AMDGPU-specific module splitting #89245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
744 changes: 744 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSplitModule.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
//===- AMDGPUSplitModule.h -------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
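///
/// \file
/// Declares splitAMDGPUModule, the entry point for AMDGPU-specific module
/// splitting.
///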
//===----------------------------------------------------------------------===//

#ifndef LLVM_TARGET_AMDGPUSPLITMODULE_H
#define LLVM_TARGET_AMDGPUSPLITMODULE_H

#include "llvm/ADT/STLFunctionalExtras.h"
#include <memory>

namespace llvm {

class Module;
class AMDGPUTargetMachine;

/// Splits the module M into N linkable partitions. The function ModuleCallback
/// is called N times passing each individual partition as the MPart argument.
///
/// \param TM the AMDGPU target machine supplied by the caller
///        (AMDGPUTargetMachine::splitModule passes itself).
/// \param M the module to split.
/// \param N the number of partitions to produce.
/// \param ModuleCallback invoked once per partition; each partition is handed
///        over by value as a std::unique_ptr<Module>, so the callback takes
///        ownership of it.
void splitAMDGPUModule(
const AMDGPUTargetMachine &TM, Module &M, unsigned N,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback);

} // end namespace llvm

#endif // LLVM_TARGET_AMDGPUSPLITMODULE_H
8 changes: 8 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
Expand Down Expand Up @@ -806,6 +807,13 @@ AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
return AMDGPUAS::FLAT_ADDRESS;
}

/// Splits \p M into \p NumParts partitions by forwarding to
/// splitAMDGPUModule, passing this target machine and \p ModuleCallback
/// through unchanged.
///
/// \returns true unconditionally.
// NOTE(review): presumably `true` tells the caller that the target performed
// the split itself - confirm against the base-class splitModule contract.
bool AMDGPUTargetMachine::splitModule(
Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) const {
splitAMDGPUModule(*this, M, NumParts, ModuleCallback);
return true;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
getPredicatedAddrSpace(const Value *V) const override;

unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override;

/// Override of the base-class module-splitting hook. Splits \p M into
/// \p NumParts partitions and hands each one to \p ModuleCallback as a
/// std::unique_ptr<Module>.
// NOTE(review): the meaning of the bool result is defined by the base class,
// which is not visible here - confirm before relying on it.
bool splitModule(Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)>
ModuleCallback) const override;
};

//===----------------------------------------------------------------------===//
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURewriteOutArguments.cpp
AMDGPURewriteUndefForPHI.cpp
AMDGPUSetWavePriority.cpp
AMDGPUSplitModule.cpp
AMDGPUSubtarget.cpp
AMDGPUTargetMachine.cpp
AMDGPUTargetObjectFile.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s

; 3 kernels:
; - A does a direct call to HelperA
; - B is storing @HelperA
; - C does a direct call to HelperA
;
; The helper function will get externalized, which will force A and C into P0 as
; external functions cannot be duplicated.

; CHECK0: define hidden void @HelperA()
; CHECK0: define amdgpu_kernel void @A()
; CHECK0: declare amdgpu_kernel void @B(ptr)
; CHECK0: define amdgpu_kernel void @C()

; CHECK1: declare hidden void @HelperA()
; CHECK1: declare amdgpu_kernel void @A()
; CHECK1: declare amdgpu_kernel void @B(ptr)
; CHECK1: declare amdgpu_kernel void @C()

; CHECK2: declare hidden void @HelperA()
; CHECK2: declare amdgpu_kernel void @A()
; CHECK2: define amdgpu_kernel void @B(ptr %dst)
; CHECK2: declare amdgpu_kernel void @C()

; Internal helper: called directly by @A and @C, and its address is stored by
; @B.
define internal void @HelperA() {
ret void
}

; Kernel that calls @HelperA directly.
define amdgpu_kernel void @A() {
call void @HelperA()
ret void
}

; Kernel that takes the address of @HelperA and stores it through %dst; it
; never calls the helper.
define amdgpu_kernel void @B(ptr %dst) {
store ptr @HelperA, ptr %dst
ret void
}

; Kernel that calls @HelperA directly, like @A.
define amdgpu_kernel void @C() {
call void @HelperA()
ret void
}
37 changes: 37 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-kernel-threshold=0
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s

; 2 kernels:
; - A is isolated
; - B stores the address of either @HelperA or @HelperB
;
; The helper functions should get externalized (become hidden w/ external linkage)

; CHECK0: define hidden void @HelperA()
; CHECK0: define hidden void @HelperB()
; CHECK0: define amdgpu_kernel void @A()
; CHECK0: declare amdgpu_kernel void @B(i1, ptr)

; CHECK1: declare hidden void @HelperA()
; CHECK1: declare hidden void @HelperB()
; CHECK1: declare amdgpu_kernel void @A()
; CHECK1: define amdgpu_kernel void @B(i1 %cond, ptr %dst)

; Internal helper whose address is taken by @B.
define internal void @HelperA() {
ret void
}

; Second internal helper whose address is taken by @B.
define internal void @HelperB() {
ret void
}

; Kernel that references neither helper.
define amdgpu_kernel void @A() {
ret void
}

; Kernel that selects the address of @HelperA or @HelperB based on %cond and
; stores it through %dst; neither helper is called.
define amdgpu_kernel void @B(i1 %cond, ptr %dst) {
%addr = select i1 %cond, ptr @HelperA, ptr @HelperB
store ptr %addr, ptr %dst
ret void
}
20 changes: 20 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/debug-name-hiding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -debug -amdgpu-module-splitting-log-private 2>&1 | FileCheck %s --implicit-check-not=MyCustomKernel
; REQUIRES: asserts

; The debug output must show the SHA256 hashes of the kernel names instead of
; the names themselves.

; CHECK: a097723d21cf9f35d90e6fb7881995ac8c398b3366a6c97efc657404f9fe301c
; CHECK: 626bc23242de8fcfda7f0e66318d29455c081df6b5380e64d14703c95fcbcd59
; CHECK: c38d90a7ca71dc5d694bb9e093dadcdedfc4cb4adf7ed7e46d42fe95a0b4ef55

; Three empty kernels. Their bodies are irrelevant: the RUN line only inspects
; how the kernel names appear in the debug log.
define amdgpu_kernel void @MyCustomKernel0() {
ret void
}

define amdgpu_kernel void @MyCustomKernel1() {
ret void
}

define amdgpu_kernel void @MyCustomKernel2() {
ret void
}
45 changes: 45 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/kernels-alias-dependencies.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
; RUN: llvm-split -o %t %s -j 2 -mtriple amdgcn-amd-amdhsa
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s

; 3 kernels:
; - A calls nothing
; - B calls @PerryThePlatypus
; - C calls @Perry, an alias of @PerryThePlatypus
;
; We should see through the alias and put B/C in the same
; partition.
;
; Additionally, @PerryThePlatypus gets externalized as
; the alias counts as taking its address.

; CHECK0-NOT: define
; CHECK0: @Perry = internal alias ptr (), ptr @PerryThePlatypus
; CHECK0: define hidden void @PerryThePlatypus()
; CHECK0: define amdgpu_kernel void @B
; CHECK0: define amdgpu_kernel void @C
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define amdgpu_kernel void @A
; CHECK1-NOT: define

; Internal alias of @PerryThePlatypus; @C calls the function through it.
@Perry = internal alias ptr(), ptr @PerryThePlatypus

; Internal function reached directly from @B and through @Perry from @C.
define internal void @PerryThePlatypus() {
ret void
}

; Kernel with no callees.
define amdgpu_kernel void @A() {
ret void
}

; Kernel that calls @PerryThePlatypus by its real name.
define amdgpu_kernel void @B() {
call void @PerryThePlatypus()
ret void
}

; Kernel that calls @PerryThePlatypus through the alias @Perry.
define amdgpu_kernel void @C() {
call void @Perry()
ret void
}
54 changes: 54 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/kernels-cost-ranking.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s

; 3 kernels, each with its own dependencies, should go into 3
; distinct partitions. The most expensive kernel should be
; seen first and go into the last partition.

; CHECK0-NOT: define
; CHECK0: define amdgpu_kernel void @C
; CHECK0: define internal void @HelperC
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define amdgpu_kernel void @A
; CHECK1: define internal void @HelperA
; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define amdgpu_kernel void @B
; CHECK2: define internal void @HelperB
; CHECK2-NOT: define


; Kernel whose only callee is @HelperA.
define amdgpu_kernel void @A() {
call void @HelperA()
ret void
}

; Internal helper used only by @A.
define internal void @HelperA() {
ret void
}

; Kernel with the most instructions in this file: three stores through %x
; plus the call to @HelperB.
define amdgpu_kernel void @B(ptr %x) {
store i64 42, ptr %x
store i64 43, ptr %x
store i64 44, ptr %x
call void @HelperB()
ret void
}

; Internal helper used only by @B.
define internal void @HelperB() {
ret void
}

; Kernel whose only callee is @HelperC.
define amdgpu_kernel void @C() {
call void @HelperC()
ret void
}

; Internal helper used only by @C.
define internal void @HelperC() {
ret void
}
50 changes: 50 additions & 0 deletions llvm/test/tools/llvm-split/AMDGPU/kernels-dependencies.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s

; 3 kernels, each with its own dependencies, should go into 3
; distinct partitions.

; CHECK0-NOT: define
; CHECK0: define amdgpu_kernel void @C
; CHECK0: define internal void @HelperC
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define amdgpu_kernel void @B
; CHECK1: define internal void @HelperB
; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define amdgpu_kernel void @A
; CHECK2: define internal void @HelperA
; CHECK2-NOT: define


; Kernel whose only callee is @HelperA.
define amdgpu_kernel void @A() {
call void @HelperA()
ret void
}

; Internal helper used only by @A.
define internal void @HelperA() {
ret void
}

; Kernel whose only callee is @HelperB.
define amdgpu_kernel void @B() {
call void @HelperB()
ret void
}

; Internal helper used only by @B.
define internal void @HelperB() {
ret void
}

; Kernel whose only callee is @HelperC.
define amdgpu_kernel void @C() {
call void @HelperC()
ret void
}

; Internal helper used only by @C.
define internal void @HelperC() {
ret void
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa
; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s
; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s
; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s

; 3 kernels share a common helper; that helper should be
; cloned in all partitions.

; CHECK0-NOT: define
; CHECK0: define internal void @Helper
; CHECK0: define amdgpu_kernel void @C
; CHECK0-NOT: define

; CHECK1-NOT: define
; CHECK1: define internal void @Helper
; CHECK1: define amdgpu_kernel void @B
; CHECK1-NOT: define

; CHECK2-NOT: define
; CHECK2: define internal void @Helper
; CHECK2: define amdgpu_kernel void @A
; CHECK2-NOT: define

; Internal helper called by all three kernels below.
define internal void @Helper() {
ret void
}

; Kernel calling the shared @Helper.
define amdgpu_kernel void @A() {
call void @Helper()
ret void
}

; Kernel calling the shared @Helper.
define amdgpu_kernel void @B() {
call void @Helper()
ret void
}

; Kernel calling the shared @Helper.
define amdgpu_kernel void @C() {
call void @Helper()
ret void
}
Loading
Loading