Skip to content

Commit

Permalink
[CUDA][HIP] Externalize kernels with internal linkage
Browse files Browse the repository at this point in the history
This patch is a continuation of https://reviews.llvm.org/D123353.

Not only kernels in anonymous namespace, but also template
kernels with template arguments in anonymous namespace
need to be externalized.

To be more generic, this patch checks the linkage of a kernel
assuming the kernel does not have __global__ attribute. If
the linkage is internal then clang will externalize it.

This patch also fixes the postfix for externalized symbol
since nvptx does not allow '.' in symbol name.

Reviewed by: Artem Belevich

Differential Revision: https://reviews.llvm.org/D124189

Fixes: #54560
  • Loading branch information
yxsamliu committed Apr 22, 2022
1 parent baebe12 commit 04fb816
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 30 deletions.
4 changes: 3 additions & 1 deletion clang/lib/AST/ASTContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12298,7 +12298,9 @@ bool ASTContext::mayExternalize(const Decl *D) const {
// anonymous name space needs to be externalized to avoid duplicate symbols.
return (IsStaticVar &&
(D->hasAttr<HIPManagedAttr>() || IsExplicitDeviceVar)) ||
(D->hasAttr<CUDAGlobalAttr>() && D->isInAnonymousNamespace());
(D->hasAttr<CUDAGlobalAttr>() &&
basicGVALinkageForFunction(*this, cast<FunctionDecl>(D)) ==
GVA_Internal);
}

bool ASTContext::shouldExternalize(const Decl *D) const {
Expand Down
10 changes: 8 additions & 2 deletions clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6809,6 +6809,12 @@ bool CodeGenModule::stopAutoInit() {

void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS,
const Decl *D) const {
OS << (isa<VarDecl>(D) ? "__static__" : ".anon.")
<< getContext().getCUIDHash();
StringRef Tag;
// ptxas does not allow '.' in symbol names. On the other hand, HIP prefers
// postfix beginning with '.' since the symbol name can be demangled.
if (LangOpts.HIP)
Tag = (isa<VarDecl>(D) ? ".static." : ".intern.");
else
Tag = (isa<VarDecl>(D) ? "__static__" : "__intern__");
OS << Tag << getContext().getCUIDHash();
}
31 changes: 23 additions & 8 deletions clang/test/CodeGenCUDA/device-var-linkage.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn -fcuda-is-device \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,NORDC %s
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=DEV,RDC %s
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx \
// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-gnu-linux \
// RUN: -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx \
// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-gnu-linux \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device \
// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \
// RUN: | FileCheck -check-prefixes=CUDA %s

#include "Inputs/cuda.h"

Expand All @@ -24,7 +27,9 @@ __constant__ int v2;
// DEV-DAG: @v3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
// NORDC-H-DAG: @v3 = internal externally_initialized global i32* null
// RDC-H-DAG: @v3 = externally_initialized global i32* null
#if __HIP__
__managed__ int v3;
#endif

// DEV-DAG: @ev1 = external addrspace(1) global i32
// HOST-DAG: @ev1 = external global i32
Expand All @@ -34,25 +39,35 @@ extern __device__ int ev1;
extern __constant__ int ev2;
// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global i32 addrspace(1)*
// HOST-DAG: @ev3 = external externally_initialized global i32*
#if __HIP__
extern __managed__ int ev3;
#endif

// NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv1 = internal global i32 undef
// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
static __device__ int sv1;
// NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
// HOST-DAG: @_ZL3sv2 = internal global i32 undef
// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
static __constant__ int sv2;
// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null
#if __HIP__
static __managed__ int sv3;
#endif

__device__ __host__ int work(int *x);

__device__ __host__ int fun1() {
return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2)
#if __HIP__
+ work(&ev3) + work(&sv3)
#endif
;
}

// HOST: hipRegisterVar({{.*}}@v1
Expand Down
46 changes: 40 additions & 6 deletions clang/test/CodeGenCUDA/kernel-in-anon-ns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,53 @@
// RUN: -aux-triple amdgcn-amd-amdhsa -std=c++11 -fgpu-rdc \
// RUN: -emit-llvm -o - -x hip %s > %t.host

// RUN: cat %t.dev %t.host | FileCheck %s
// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=HIP,COMMON %s

// RUN: echo "GPU binary" > %t.fatbin

// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -cuid=abc \
// RUN: -aux-triple x86_64-unknown-linux-gnu -std=c++11 -fgpu-rdc \
// RUN: -emit-llvm -o - %s > %t.dev

// RUN: %clang_cc1 -triple x86_64-gnu-linux -cuid=abc \
// RUN: -aux-triple nvptx -std=c++11 -fgpu-rdc -fcuda-include-gpubinary %t.fatbin \
// RUN: -emit-llvm -o - %s > %t.host

// RUN: cat %t.dev %t.host | FileCheck -check-prefixes=CUDA,COMMON %s

#include "Inputs/cuda.h"

// CHECK: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv\.anon\.b04fd23c98500190]](
// CHECK: @[[STR:.*]] = {{.*}} c"[[KERN]]\00"
// CHECK: call i32 @__hipRegisterFunction({{.*}}@[[STR]]
// HIP-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv\.intern\.b04fd23c98500190]](
// HIP-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT_\.intern\.b04fd23c98500190]](
// HIP-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT_\.intern\.b04fd23c98500190]](

// CUDA-DAG: define weak_odr {{.*}}void @[[KERN1:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN2:_Z8tempKernIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]](
// CUDA-DAG: define weak_odr {{.*}}void @[[KERN3:_Z8tempKernIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]](

// COMMON-DAG: @[[STR1:.*]] = {{.*}} c"[[KERN1]]\00"
// COMMON-DAG: @[[STR2:.*]] = {{.*}} c"[[KERN2]]\00"
// COMMON-DAG: @[[STR3:.*]] = {{.*}} c"[[KERN3]]\00"

// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR1]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR2]]
// COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[STR3]]


template <typename T>
__global__ void tempKern(T x) {}

namespace {
__global__ void kernel() {
}
__global__ void kernel() {}
struct X {};
X x;
auto lambda = [](){};
}

void test() {
kernel<<<1, 1>>>();

tempKern<<<1, 1>>>(x);

tempKern<<<1, 1>>>(lambda);
}
12 changes: 5 additions & 7 deletions clang/test/CodeGenCUDA/managed-var.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// REQUIRES: x86-registered-target, amdgpu-registered-target

// RUN: %clang_cc1 -no-opaque-pointers -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \
// RUN: -emit-llvm -o - -x hip %s | FileCheck \
// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s
Expand Down Expand Up @@ -52,15 +50,15 @@ extern __managed__ int ex;

// NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4
// NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
// RDC-D-DAG: @_ZL2sx__static__[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4
// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// HOST-DAG: @_ZL2sx.managed = internal global i32 1
// HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null
// NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00"
// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH:.*]]\00"
// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00"

// POSTFIX: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx__static__[[HASH]]\00"
// POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 addrspace(1)* null
// POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00"
static __managed__ int sx = 1;

// DEV-DAG: @llvm.compiler.used
Expand Down
18 changes: 12 additions & 6 deletions clang/test/CodeGenCUDA/static-device-var-rdc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@
// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - -x hip %s > %t.host
// RUN: cat %t.host | FileCheck -check-prefix=HOST-NEG %s

// Check postfix for CUDA.

// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx -fcuda-is-device -cuid=abc \
// RUN: -std=c++11 -fgpu-rdc -emit-llvm -o - %s | FileCheck \
// RUN: -check-prefixes=CUDA %s

#include "Inputs/cuda.h"

Expand All @@ -55,11 +60,12 @@
// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"

// Test externalized static device variables
// EXT-DEV-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH:.*]]\00"
// EXT-DEV-DAG: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH:.*]]\00"
// CUDA-DAG: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0

// POSTFIX: @_ZL1x__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x__static__[[HASH]]\00"
// POSTFIX: @_ZL1x.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0
// POSTFIX: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.static.[[HASH]]\00"

static __device__ int x;

Expand All @@ -73,8 +79,8 @@ static __device__ int x2;
// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"

// Test externalized static device variables
// EXT-DEV-DAG: @_ZL1y__static__[[HASH]] = addrspace(4) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y__static__[[HASH]]\00"
// EXT-DEV-DAG: @_ZL1y.static.[[HASH]] = addrspace(4) externally_initialized global i32 0
// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.static.[[HASH]]\00"

static __constant__ int y;

Expand Down

0 comments on commit 04fb816

Please sign in to comment.