Skip to content

[SYCL] Add support for eliminated arg masks in SYCLBIN kernel bundles #19163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions sycl/source/detail/device_image_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include <detail/device_image_impl.hpp>
#include <detail/kernel_arg_mask.hpp>
#include <detail/kernel_bundle_impl.hpp>

namespace sycl {
Expand Down Expand Up @@ -47,10 +48,15 @@ std::shared_ptr<kernel_impl> device_image_impl::tryGetExtensionKernel(
&UrKernel);
// Kernel created by urKernelCreate is implicitly retained.

const KernelArgMask *ArgMask = nullptr;
if (auto ArgMaskIt = MEliminatedKernelArgMasks.find(AdjustedName);
ArgMaskIt != MEliminatedKernelArgMasks.end())
ArgMask = &ArgMaskIt->second;

return std::make_shared<kernel_impl>(
UrKernel, *detail::getSyclObjImpl(Context), shared_from_this(),
OwnerBundle.shared_from_this(),
/*ArgMask=*/nullptr, UrProgram, /*CacheMutex=*/nullptr);
OwnerBundle.shared_from_this(), ArgMask, UrProgram,
/*CacheMutex=*/nullptr);
}

} // namespace detail
Expand Down
33 changes: 29 additions & 4 deletions sycl/source/detail/device_image_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ class ManagedDeviceBinaries {

using MangledKernelNameMapT = std::map<std::string, std::string, std::less<>>;
using KernelNameSetT = std::set<std::string, std::less<>>;
using KernelNameToArgMaskMap = std::unordered_map<std::string, KernelArgMask>;

// Information unique to images compiled at runtime through the
// ext_oneapi_kernel_compiler extension.
Expand Down Expand Up @@ -260,12 +261,23 @@ class device_image_impl
MKernelIDs(std::move(KernelIDs)),
MSpecConstsDefValBlob(getSpecConstsDefValBlob()), MOrigins(Origins) {
updateSpecConstSymMap();
// SYCLBIN files have the kernel names embedded in the binaries, so we
// collect them.
if (BinImage && (MOrigins & ImageOriginSYCLBIN))
if (BinImage && (MOrigins & ImageOriginSYCLBIN)) {
// SYCLBIN files have the kernel names embedded in the binaries, so we
// collect them.
for (const sycl_device_binary_property &KNProp :
BinImage->getKernelNames())
MKernelNames.insert(KNProp->Name);

KernelArgMask ArgMask;
if (BinImage->getKernelParamOptInfo().isAvailable()) {
// Extract argument mask from the image.
const RTDeviceBinaryImage::PropertyRange &KPOIRange =
BinImage->getKernelParamOptInfo();
for (const auto &Info : KPOIRange)
MEliminatedKernelArgMasks[Info->Name] =
createKernelArgMask(DeviceBinaryProperty(Info).asByteArray());
}
}
}

device_image_impl(
Expand All @@ -276,10 +288,12 @@ class device_image_impl
const std::vector<unsigned char> &SpecConstsBlob, uint8_t Origins,
std::optional<KernelCompilerBinaryInfo> &&RTCInfo,
KernelNameSetT &&KernelNames,
KernelNameToArgMaskMap &&EliminatedKernelArgMasks,
std::unique_ptr<DynRTDeviceBinaryImage> &&MergedImageStorage, private_tag)
: MBinImage(BinImage), MContext(std::move(Context)),
MDevices(std::move(Devices)), MState(State), MProgram(Program),
MKernelIDs(std::move(KernelIDs)), MKernelNames{std::move(KernelNames)},
MEliminatedKernelArgMasks{std::move(EliminatedKernelArgMasks)},
MSpecConstsBlob(SpecConstsBlob),
MSpecConstsDefValBlob(getSpecConstsDefValBlob()),
MSpecConstSymMap(SpecConstMap), MOrigins(Origins),
Expand All @@ -289,11 +303,14 @@ class device_image_impl
device_image_impl(const RTDeviceBinaryImage *BinImage, const context &Context,
const std::vector<device> &Devices, bundle_state State,
ur_program_handle_t Program, syclex::source_language Lang,
KernelNameSetT &&KernelNames, private_tag)
KernelNameSetT &&KernelNames,
KernelNameToArgMaskMap &&EliminatedKernelArgMasks,
private_tag)
: MBinImage(BinImage), MContext(std::move(Context)),
MDevices(std::move(Devices)), MState(State), MProgram(Program),
MKernelIDs(std::make_shared<std::vector<kernel_id>>()),
MKernelNames{std::move(KernelNames)},
MEliminatedKernelArgMasks{std::move(EliminatedKernelArgMasks)},
MSpecConstsDefValBlob(getSpecConstsDefValBlob()),
MOrigins(ImageOriginKernelCompiler),
MRTCBinInfo(KernelCompilerBinaryInfo{Lang}) {
Expand Down Expand Up @@ -674,6 +691,10 @@ class device_image_impl

const KernelNameSetT &getKernelNames() const noexcept { return MKernelNames; }

const KernelNameToArgMaskMap &getEliminatedKernelArgMasks() const noexcept {
return MEliminatedKernelArgMasks;
}

bool isNonSYCLSourceBased() const noexcept {
return (getOriginMask() & ImageOriginKernelCompiler) &&
!isFromSourceLanguage(syclex::source_language::sycl);
Expand Down Expand Up @@ -1265,6 +1286,10 @@ class device_image_impl
// List of known kernel names.
KernelNameSetT MKernelNames;

// Map for storing kernel argument masks for kernels. This is currently only
// used for images created from SYCLBIN.
KernelNameToArgMaskMap MEliminatedKernelArgMasks;

// A mutex for sycnhronizing access to spec constants blob. Mutable because
// needs to be locked in the const method for getting spec constant value.
mutable std::mutex MSpecConstAccessMtx;
Expand Down
17 changes: 14 additions & 3 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2871,6 +2871,8 @@ ProgramManager::compile(const DevImgPlainWithDeps &ImgWithDeps,
setSpecializationConstants(InputImpl, Prog, Adapter);

KernelNameSetT KernelNames = InputImpl.getKernelNames();
std::unordered_map<std::string, KernelArgMask> EliminatedKernelArgMasks =
InputImpl.getEliminatedKernelArgMasks();

std::optional<detail::KernelCompilerBinaryInfo> RTCInfo =
InputImpl.getRTCInfo();
Expand All @@ -2881,7 +2883,7 @@ ProgramManager::compile(const DevImgPlainWithDeps &ImgWithDeps,
InputImpl.get_spec_const_data_ref(),
InputImpl.get_spec_const_blob_ref(), InputImpl.getOriginMask(),
std::move(RTCInfo), std::move(KernelNames),
/*MergedImageStorage = */ nullptr);
std::move(EliminatedKernelArgMasks), nullptr);

std::string CompileOptions;
applyCompileOptionsFromEnvironment(CompileOptions);
Expand Down Expand Up @@ -3066,20 +3068,25 @@ ProgramManager::link(const std::vector<device_image_plain> &Imgs,
RTCInfoPtrs;
RTCInfoPtrs.reserve(Imgs.size());
KernelNameSetT MergedKernelNames;
std::unordered_map<std::string, KernelArgMask> MergedEliminatedKernelArgMasks;
for (const device_image_plain &DevImg : Imgs) {
const DeviceImageImplPtr &DevImgImpl = getSyclObjImpl(DevImg);
CombinedOrigins |= DevImgImpl->getOriginMask();
RTCInfoPtrs.emplace_back(&(DevImgImpl->getRTCInfo()));
MergedKernelNames.insert(DevImgImpl->getKernelNames().begin(),
DevImgImpl->getKernelNames().end());
MergedEliminatedKernelArgMasks.insert(
DevImgImpl->getEliminatedKernelArgMasks().begin(),
DevImgImpl->getEliminatedKernelArgMasks().end());
}
auto MergedRTCInfo = detail::KernelCompilerBinaryInfo::Merge(RTCInfoPtrs);

DeviceImageImplPtr ExecutableImpl = device_image_impl::create(
NewBinImg, Context, std::vector<device>{Devs}, bundle_state::executable,
std::move(KernelIDs), LinkedProg, std::move(NewSpecConstMap),
std::move(NewSpecConstBlob), CombinedOrigins, std::move(MergedRTCInfo),
std::move(MergedKernelNames), std::move(MergedImageStorage));
std::move(MergedKernelNames), std::move(MergedEliminatedKernelArgMasks),
std::move(MergedImageStorage));

// TODO: Make multiple sets of device images organized by devices they are
// compiled for.
Expand Down Expand Up @@ -3147,11 +3154,15 @@ ProgramManager::build(const DevImgPlainWithDeps &DevImgWithDeps,
RTCInfoPtrs;
RTCInfoPtrs.reserve(DevImgWithDeps.size());
KernelNameSetT MergedKernelNames;
std::unordered_map<std::string, KernelArgMask> MergedEliminatedKernelArgMasks;
for (const device_image_plain &DevImg : DevImgWithDeps) {
const auto &DevImgImpl = getSyclObjImpl(DevImg);
RTCInfoPtrs.emplace_back(&(DevImgImpl->getRTCInfo()));
MergedKernelNames.insert(DevImgImpl->getKernelNames().begin(),
DevImgImpl->getKernelNames().end());
MergedEliminatedKernelArgMasks.insert(
DevImgImpl->getEliminatedKernelArgMasks().begin(),
DevImgImpl->getEliminatedKernelArgMasks().end());
}
auto MergedRTCInfo = detail::KernelCompilerBinaryInfo::Merge(RTCInfoPtrs);

Expand All @@ -3160,7 +3171,7 @@ ProgramManager::build(const DevImgPlainWithDeps &DevImgWithDeps,
bundle_state::executable, std::move(KernelIDs), ResProgram,
std::move(SpecConstMap), std::move(SpecConstBlob), CombinedOrigins,
std::move(MergedRTCInfo), std::move(MergedKernelNames),
std::move(MergedImageStorage));
std::move(MergedEliminatedKernelArgMasks), std::move(MergedImageStorage));
return createSyclObjFromImpl<device_image_plain>(std::move(ExecImpl));
}

Expand Down
49 changes: 49 additions & 0 deletions sycl/test-e2e/SYCLBIN/Inputs/dae.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "common.hpp"

#include <sycl/usm.hpp>

static constexpr size_t NUM = 1024;
static constexpr size_t WGSIZE = 16;
static constexpr float EPS = 0.001;

int main(int argc, char *argv[]) {
assert(argc == 2);

sycl::queue Q;

int Failed = CommonLoadCheck(Q.get_context(), argv[1]);

#if defined(SYCLBIN_INPUT_STATE)
auto KBInput = syclexp::get_kernel_bundle<sycl::bundle_state::input>(
Q.get_context(), std::string{argv[1]});
auto KBExe = sycl::build(KBInput);
#elif defined(SYCLBIN_OBJECT_STATE)
auto KBObj = syclexp::get_kernel_bundle<sycl::bundle_state::object>(
Q.get_context(), std::string{argv[1]});
auto KBExe = sycl::link(KBObj);
#else // defined(SYCLBIN_EXECUTABLE_STATE)
auto KBExe = syclexp::get_kernel_bundle<sycl::bundle_state::executable>(
Q.get_context(), std::string{argv[1]});
#endif

assert(KBExe.ext_oneapi_has_kernel("iota"));
sycl::kernel IotaKern = KBExe.ext_oneapi_get_kernel("iota");

float *Ptr = sycl::malloc_shared<float>(NUM, Q);
Q.submit([&](sycl::handler &CGH) {
// First arugment is unused, but should still be passed, even if eliminated
// by DAE.
CGH.set_args(3.14f, Ptr);
CGH.parallel_for(sycl::nd_range{{NUM}, {WGSIZE}}, IotaKern);
}).wait_and_throw();

for (int I = 0; I < NUM; I++) {
const float Truth = static_cast<float>(I);
if (std::abs(Ptr[I] - Truth) > EPS) {
std::cout << "Result: " << Ptr[I] << " expected " << I << "\n";
++Failed;
}
}
sycl::free(Ptr, Q);
return Failed;
}
10 changes: 10 additions & 0 deletions sycl/test-e2e/SYCLBIN/Inputs/dae_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include <sycl/sycl.hpp>

namespace syclexp = sycl::ext::oneapi::experimental;
namespace syclext = sycl::ext::oneapi;

extern "C" SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(syclexp::nd_range_kernel<1>)) void iota(float, float *ptr) {
size_t id = syclext::this_work_item::get_nd_item<1>().get_global_linear_id();
ptr[id] = static_cast<float>(id);
}
24 changes: 24 additions & 0 deletions sycl/test-e2e/SYCLBIN/dae_executable.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==--------- basic_executable.cpp --- SYCLBIN extension tests -------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// REQUIRES: aspect-usm_device_allocations

// -- Test for using a kernel from a SYCLBIN with a dead argument.

// Due to the regression in https://github.com/intel/llvm/issues/18432 it will
// fail to build the SYCLBIN with nvptx targets. Once this is fixed,
// %{sycl_target_opts} should be added to the SYCLBIN generation run-line.
// REQUIRES: target-spir

// RUN: %clangxx --offload-new-driver -fsyclbin=executable %S/Inputs/dae_kernel.cpp -o %t.syclbin
// RUN: %{build} -o %t.out
// RUN: %{l0_leak_check} %{run} %t.out %t.syclbin

#define SYCLBIN_EXECUTABLE_STATE

#include "Inputs/dae.hpp"
24 changes: 24 additions & 0 deletions sycl/test-e2e/SYCLBIN/dae_input.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==--------- basic_input.cpp --- SYCLBIN extension tests ------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// REQUIRES: aspect-usm_device_allocations

// -- Test for using a kernel from a SYCLBIN with a dead argument.

// Due to the regression in https://github.com/intel/llvm/issues/18432 it will
// fail to build the SYCLBIN with nvptx targets. Once this is fixed,
// %{sycl_target_opts} should be added to the SYCLBIN generation run-line.
// REQUIRES: target-spir

// RUN: %clangxx --offload-new-driver -fsyclbin=input %S/Inputs/dae_kernel.cpp -o %t.syclbin
// RUN: %{build} -o %t.out
// RUN: %{l0_leak_check} %{run} %t.out %t.syclbin

#define SYCLBIN_INPUT_STATE

#include "Inputs/dae.hpp"
24 changes: 24 additions & 0 deletions sycl/test-e2e/SYCLBIN/dae_object.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//==--------- basic_object.cpp --- SYCLBIN extension tests -----------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// REQUIRES: aspect-usm_device_allocations

// -- Test for using a kernel from a SYCLBIN with a dead argument.

// Due to the regression in https://github.com/intel/llvm/issues/18432 it will
// fail to build the SYCLBIN with nvptx targets. Once this is fixed,
// %{sycl_target_opts} should be added to the SYCLBIN generation run-line.
// REQUIRES: target-spir

// RUN: %clangxx --offload-new-driver -fsyclbin=object %S/Inputs/dae_kernel.cpp -o %t.syclbin
// RUN: %{build} -o %t.out
// RUN: %{l0_leak_check} %{run} %t.out %t.syclbin

#define SYCLBIN_OBJECT_STATE

#include "Inputs/dae.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// CHECK-DAG: README.md
// CHECK-DAG: lit.cfg.py
//
// CHECK-NUM-MATCHES: 25
// CHECK-NUM-MATCHES: 26
//
// This test verifies that `<sycl/sycl.hpp>` isn't used in E2E tests. Instead,
// fine-grained includes should used, see
Expand Down
Loading