Skip to content

[SYCL][Fusion] Add HIP support #11003

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5557,8 +5557,9 @@ class OffloadingActionBuilder final {
DA.add(*DeviceWrappingAction, *TC, BoundArch, Action::OFK_SYCL);
continue;
}
if (IsNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) {
// When compiling for Nvidia/CUDA devices and the user requested the
if ((IsNVPTX || IsAMDGCN) &&
Args.hasArg(options::OPT_fsycl_embed_ir)) {
// When compiling for Nvidia/AMD devices and the user requested the
// IR to be embedded in the application (via option), run the output
// of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
// through the offload wrapper and link the resulting object to the
Expand Down
19 changes: 19 additions & 0 deletions clang/test/Driver/sycl-embed-ir.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/// Tests for -fsycl-embed-ir

// UNSUPPORTED: system-windows

// RUN: %clangxx -fsycl -fsycl-targets=nvidia_gpu_sm_80 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \
// RUN: FileCheck -check-prefix=CHECK-NV %s

// CHECK-NV: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, sm_80)
// CHECK-NV: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, sm_80)
// CHECK-NV: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, sm_80)
// CHECK-NV: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {[[WRAP]]}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {{{.*}}}, image

// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx1010 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \
// RUN: FileCheck -check-prefix=CHECK-AMD %s

// CHECK-AMD: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, gfx1010)
// CHECK-AMD: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, gfx1010)
// CHECK-AMD: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, gfx1010)
// CHECK-AMD: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {[[WRAP]]}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {{{.*}}}, image
26 changes: 25 additions & 1 deletion sycl-fusion/common/include/Kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,30 @@ namespace jit_compiler {

using BinaryAddress = const uint8_t *;

///
/// Flags describing the barrier to be introduced between fused kernels.
/// Each enumerator is a bit value; LocalAndGlobal is the union of Local and
/// Global.
enum class BarrierFlags : uint32_t {
  None = 0u,       // Do not insert barrier
  Local = 1u << 0, // Ensure correct ordering of memory operations to local
                   // memory
  Global = 1u << 1, // Ensure correct ordering of memory operations to global
                    // memory
  LocalAndGlobal = Local | Global
};

/// Flag value requesting that no barrier is inserted.
constexpr BarrierFlags getNoBarrierFlag() { return BarrierFlags::None; }

/// Flag value requesting ordering for both local and global memory.
constexpr BarrierFlags getLocalAndGlobalBarrierFlag() {
  return BarrierFlags::LocalAndGlobal;
}

/// True iff \p Flag carries no bits at all, i.e. it equals BarrierFlags::None.
constexpr bool isNoBarrierFlag(BarrierFlags Flag) {
  return static_cast<uint32_t>(Flag) == 0u;
}

/// True iff the Local bit is set in \p Flag.
constexpr bool hasLocalBarrierFlag(BarrierFlags Flag) {
  const uint32_t Bits = static_cast<uint32_t>(Flag);
  const uint32_t LocalBit = static_cast<uint32_t>(BarrierFlags::Local);
  return (Bits & LocalBit) != 0u;
}

/// True iff the Global bit is set in \p Flag.
constexpr bool hasGlobalBarrierFlag(BarrierFlags Flag) {
  const uint32_t Bits = static_cast<uint32_t>(Flag);
  const uint32_t GlobalBit = static_cast<uint32_t>(BarrierFlags::Global);
  return (Bits & GlobalBit) != 0u;
}

///
/// Enumerate possible kinds of parameters.
/// 1:1 correspondence with the definition in kernel_desc.hpp in the DPC++ SYCL
Expand All @@ -35,7 +59,7 @@ enum class ParameterKind : uint32_t {
};

/// Different binary formats supported as input to the JIT compiler.
enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX };
enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX, AMDGCN };

/// Information about a device intermediate representation module (e.g., SPIR-V,
/// LLVM IR) from DPC++.
Expand Down
1 change: 1 addition & 0 deletions sycl-fusion/common/lib/KernelIO.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ template <> struct ScalarEnumerationTraits<jit_compiler::BinaryFormat> {
IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM);
IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV);
IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX);
IO.enumCase(BF, "AMDGCN", jit_compiler::BinaryFormat::AMDGCN);
IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID);
}
};
Expand Down
7 changes: 7 additions & 0 deletions sycl-fusion/jit-compiler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ add_llvm_library(sycl-fusion
lib/fusion/ModuleHelper.cpp
lib/helper/ConfigHelper.cpp

DEPENDS
intrinsics_gen

LINK_COMPONENTS
BitReader
Core
Expand Down Expand Up @@ -50,6 +53,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for sycl-fusion only when the AMDGPU target
# is among LLVM_TARGETS_TO_BUILD; the JIT compiler sources use this macro to
# guard the AMDGCN translation path.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()

if (BUILD_SHARED_LIBS)
if(NOT MSVC AND NOT APPLE)
# Manage symbol visibility through the linker to make sure no LLVM symbols
Expand Down
3 changes: 2 additions & 1 deletion sycl-fusion/jit-compiler/include/JITContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "Hashing.h"
#include "Kernel.h"
#include "Options.h"
#include "Parameter.h"

namespace llvm {
Expand All @@ -28,7 +29,7 @@ class LLVMContext;
namespace jit_compiler {

using CacheKeyT =
std::tuple<std::vector<std::string>, ParamIdentList, int,
std::tuple<std::vector<std::string>, ParamIdentList, BarrierFlags,
std::vector<ParameterInternalization>, std::vector<JITConstant>,
// This field of the cache is optional because, if all of the
// ranges are equal, we will perform no remapping, so that fused
Expand Down
3 changes: 2 additions & 1 deletion sycl-fusion/jit-compiler/include/KernelFusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class KernelFusion {
const std::vector<SYCLKernelInfo> &KernelInformation,
const std::vector<std::string> &KernelsToFuse,
const std::string &FusedKernelName,
jit_compiler::ParamIdentList &Identities, int BarriersFlags,
jit_compiler::ParamIdentList &Identities,
BarrierFlags BarriersFlags,
const std::vector<jit_compiler::ParameterInternalization>
&Internalization,
const std::vector<jit_compiler::JITConstant> &JITConstants);
Expand Down
9 changes: 8 additions & 1 deletion sycl-fusion/jit-compiler/lib/KernelFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) {
#else // FUSION_JIT_SUPPORT_PTX
return false;
#endif // FUSION_JIT_SUPPORT_PTX
}
case BinaryFormat::AMDGCN: {
#ifdef FUSION_JIT_SUPPORT_AMDGCN
return true;
#else // FUSION_JIT_SUPPORT_AMDGCN
return false;
#endif // FUSION_JIT_SUPPORT_AMDGCN
}
default:
return false;
Expand All @@ -69,7 +76,7 @@ FusionResult KernelFusion::fuseKernels(
const std::vector<SYCLKernelInfo> &KernelInformation,
const std::vector<std::string> &KernelsToFuse,
const std::string &FusedKernelName, ParamIdentList &Identities,
int BarriersFlags,
BarrierFlags BarriersFlags,
const std::vector<jit_compiler::ParameterInternalization> &Internalization,
const std::vector<jit_compiler::JITConstant> &Constants) {
// Initialize the configuration helper to make the options for this invocation
Expand Down
4 changes: 2 additions & 2 deletions sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ static unsigned getFlatAddressSpace(Module &Mod) {
// Ideally, we could get this information from the TargetTransformInfo, but
// the SPIR-V backend does not yet seem to have an implementation for that.
llvm::Triple Tri(Mod.getTargetTriple());
if (Tri.isNVPTX()) {
if (Tri.isNVPTX() || Tri.isAMDGCN()) {
return 0;
}
if (Tri.isSPIRV() || Tri.isSPIR()) {
Expand All @@ -53,7 +53,7 @@ static unsigned getFlatAddressSpace(Module &Mod) {

std::unique_ptr<SYCLModuleInfo>
FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
int BarriersFlags) {
BarrierFlags BarriersFlags) {
// Perform the actual kernel fusion, i.e., generate a kernel function for the
// fused kernel from the kernel functions of the input kernels. This is done
// by the SYCLKernelFusion LLVM pass, which is run here through a custom LLVM
Expand Down
2 changes: 1 addition & 1 deletion sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class FusionPipeline {
/// contain an entry for the fused kernel.
static std::unique_ptr<SYCLModuleInfo>
runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo,
int BarriersFlags);
BarrierFlags BarriersFlags);
};
} // namespace fusion
} // namespace jit_compiler
Expand Down
80 changes: 80 additions & 0 deletions sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,14 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel,
KernelBin = *BinaryOrError;
break;
}
case BinaryFormat::AMDGCN: {
llvm::Expected<KernelBinary *> BinaryOrError =
translateToAMDGCN(Kernel, Mod, JITCtx);
if (auto Error = BinaryOrError.takeError())
return Error;
KernelBin = *BinaryOrError;
break;
}
default: {
return createStringError(
inconvertibleErrorCode(),
Expand Down Expand Up @@ -287,3 +295,75 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX);
#endif // FUSION_JIT_SUPPORT_PTX
}

///
/// Compile the LLVM IR module \p Mod to an AMDGCN object and register the
/// result as a kernel binary in \p JITCtx.
///
/// The target CPU and target features are taken from the "target-cpu" and
/// "target-features" function attributes of the kernel named
/// \p KernelInfo.Name, if that function exists in \p Mod and carries them.
/// Otherwise the CPU defaults to gfx906 and the feature string is empty.
///
/// \returns a pointer to the binary stored in \p JITCtx, or an error if AMDGCN
/// support is not compiled in (FUSION_JIT_SUPPORT_AMDGCN undefined), the
/// target cannot be looked up, the target machine cannot be created, or the
/// emission pipeline cannot be constructed.
llvm::Expected<KernelBinary *>
KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo,
                                    llvm::Module &Mod, JITContext &JITCtx) {
#ifndef FUSION_JIT_SUPPORT_AMDGCN
  (void)KernelInfo;
  (void)Mod;
  (void)JITCtx;
  return createStringError(inconvertibleErrorCode(),
                           "AMDGPU translation not supported in this build");
#else // FUSION_JIT_SUPPORT_AMDGCN

  // Register the AMDGPU backend components before looking up the target.
  LLVMInitializeAMDGPUTargetInfo();
  LLVMInitializeAMDGPUTarget();
  LLVMInitializeAMDGPUAsmPrinter();
  LLVMInitializeAMDGPUTargetMC();

  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";

  std::string TargetTriple{"amdgcn-amd-amdhsa"};

  std::string ErrorMessage;
  const auto *Target =
      llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage);

  if (!Target)
    return createStringError(
        inconvertibleErrorCode(),
        "Failed to load and translate AMDGCN LLVM IR module with error %s",
        ErrorMessage.c_str());

  // Set to the lowest tested target according to the GetStartedGuide, section
  // "Build DPC++ toolchain with support for HIP AMD"
  llvm::StringRef TargetCPU{"gfx906"};
  llvm::StringRef TargetFeatures{""};
  // Prefer the CPU/features recorded on the kernel function, if present.
  if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) {
    if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
      TargetCPU =
          KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
    }
    if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
      TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
                           .getValueAsString();
    }
  }

  // createTargetMachine hands back an owning raw pointer; keep it in a
  // unique_ptr so the target machine is released on every return path instead
  // of being leaked on each translation.
  // FIXME: Check whether we can provide more accurate target information here
  std::unique_ptr<llvm::TargetMachine> TM{Target->createTargetMachine(
      TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_,
      std::nullopt, llvm::CodeGenOptLevel::Default)};

  if (!TM)
    return createStringError(inconvertibleErrorCode(),
                             "Failed to create target machine for AMDGCN");

  std::string AMDObj;
  {
    // Limit the lifetime of the pass manager and the streams to this scope so
    // that they are destroyed before AMDObj is handed over to the JIT context.
    llvm::legacy::PassManager PM;
    llvm::raw_string_ostream OBJStream{AMDObj};
    llvm::buffer_ostream BufferedOBJ{OBJStream};

    if (TM->addPassesToEmitFile(PM, BufferedOBJ, nullptr,
                                llvm::CodeGenFileType::ObjectFile)) {
      return createStringError(
          inconvertibleErrorCode(),
          "Failed to construct pass pipeline to emit output");
    }

    PM.run(Mod);
    OBJStream.flush();
  }

  return &JITCtx.emplaceKernelBinary(std::move(AMDObj), BinaryFormat::AMDGCN);
#endif // FUSION_JIT_SUPPORT_AMDGCN
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class KernelTranslator {

static llvm::Expected<KernelBinary *>
translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx);

/// Translate \p Mod to an AMDGCN kernel binary that is stored in \p JITCtx.
/// Returns a pointer to the stored binary, or an error if AMDGCN support is
/// not compiled in or the translation fails.
static llvm::Expected<KernelBinary *>
translateToAMDGCN(SYCLKernelInfo &KernelInfo, llvm::Module &Mod,
JITContext &JITCtx);
};
} // namespace translation
} // namespace jit_compiler
Expand Down
8 changes: 8 additions & 0 deletions sycl-fusion/passes/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for the SYCLKernelFusion target only when
# the AMDGPU target is among LLVM_TARGETS_TO_BUILD.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()

# Static library for linking with the jit_compiler
add_llvm_library(SYCLKernelFusionPasses
SYCLFusionPasses.cpp
Expand Down Expand Up @@ -68,3 +72,7 @@ target_link_libraries(SYCLKernelFusionPasses
if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX)
endif()

# Define FUSION_JIT_SUPPORT_AMDGCN for the static SYCLKernelFusionPasses
# library only when the AMDGPU target is among LLVM_TARGETS_TO_BUILD.
if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_AMDGCN)
endif()
8 changes: 6 additions & 2 deletions sycl-fusion/passes/SYCLFusionPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"

#include "Kernel.h"

#include "internalization/Internalization.h"
#include "kernel-fusion/SYCLKernelFusion.h"
#include "kernel-info/SYCLKernelInfo.h"
#include "syclcp/SYCLCP.h"

using namespace llvm;
using namespace jit_compiler;

cl::opt<bool>
NoBarriers("sycl-kernel-fusion-no-barriers",
Expand All @@ -28,8 +31,9 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() {
[](StringRef Name, ModulePassManager &MPM,
ArrayRef<PassBuilder::PipelineElement>) {
if (Name == "sycl-kernel-fusion") {
int BarrierFlag =
(NoBarriers) ? -1 : SYCLKernelFusion::DefaultBarriersFlags;
BarrierFlags BarrierFlag =
(NoBarriers) ? getNoBarrierFlag()
: SYCLKernelFusion::DefaultBarriersFlags;
MPM.addPass(SYCLKernelFusion(BarrierFlag));
return true;
}
Expand Down
15 changes: 7 additions & 8 deletions sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,13 +234,12 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder,
return {Entry, CallInsertion, Exit};
}

static Expected<CallInst *>
createFusionCall(IRBuilderBase &Builder, Function *F,
ArrayRef<Value *> CallArgs,
const jit_compiler::NDRange &SrcNDRange,
const jit_compiler::NDRange &FusedNDRange, bool IsLast,
int BarriersFlags, jit_compiler::Remapper &Remapper,
bool ShouldRemap, TargetFusionInfo &TargetInfo) {
static Expected<CallInst *> createFusionCall(
IRBuilderBase &Builder, Function *F, ArrayRef<Value *> CallArgs,
const jit_compiler::NDRange &SrcNDRange,
const jit_compiler::NDRange &FusedNDRange, bool IsLast,
jit_compiler::BarrierFlags BarriersFlags, jit_compiler::Remapper &Remapper,
bool ShouldRemap, TargetFusionInfo &TargetInfo) {
const auto IPs =
addGuard(Builder, TargetInfo, SrcNDRange, FusedNDRange, IsLast);

Expand All @@ -266,7 +265,7 @@ createFusionCall(IRBuilderBase &Builder, Function *F,
Builder.SetInsertPoint(IPs.Exit);

// Insert barrier if needed
if (!IsLast && BarriersFlags > 0) {
if (!IsLast && !jit_compiler::isNoBarrierFlag(BarriersFlags)) {
TargetInfo.createBarrierCall(Builder, BarriersFlags);
}

Expand Down
11 changes: 4 additions & 7 deletions sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
constexpr static llvm::StringLiteral NDRangesMDKey{"sycl.kernel.nd-ranges"};

constexpr SYCLKernelFusion() = default;
constexpr explicit SYCLKernelFusion(int BarriersFlags)
constexpr explicit SYCLKernelFusion(jit_compiler::BarrierFlags BarriersFlags)
: BarriersFlags{BarriersFlags} {}

llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
Expand All @@ -45,7 +45,8 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
///
/// By default, correct ordering of memory operations to global memory is
/// ensured.
constexpr static int DefaultBarriersFlags{3};
constexpr static jit_compiler::BarrierFlags DefaultBarriersFlags{
jit_compiler::getLocalAndGlobalBarrierFlag()};

private:
// This needs to be in sync with the metadata kind
Expand Down Expand Up @@ -155,11 +156,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin<SYCLKernelFusion> {
///
/// Flags to apply to the barrier to be introduced between fused kernels.
///
/// Possible values:
/// - -1: Do not insert barrier
/// - 1: ensure correct ordering of memory operations to local memory
/// - 2: ensure correct ordering of memory operations to global memory
const int BarriersFlags{DefaultBarriersFlags};
const jit_compiler::BarrierFlags BarriersFlags{DefaultBarriersFlags};

///
/// Merge the content of Other into Attributes, adding, removing or updating
Expand Down
Loading